23 files changed, 463 insertions, 35 deletions
diff --git a/Auxiliary/vim/syntax/cmake.vim b/Auxiliary/vim/syntax/cmake.vim
index f303bd4..1a47f67 100644
--- a/Auxiliary/vim/syntax/cmake.vim
+++ b/Auxiliary/vim/syntax/cmake.vim
@@ -128,7 +128,10 @@ syn keyword cmakeProperty contained
             \ CPACK_WIX_ACL
             \ CROSSCOMPILING_EMULATOR
             \ CUDA_ARCHITECTURES
+            \ CUDA_CUBIN_COMPILATION
             \ CUDA_EXTENSIONS
+            \ CUDA_FATBIN_COMPILATION
+            \ CUDA_OPTIX_COMPILATION
             \ CUDA_PTX_COMPILATION
             \ CUDA_RESOLVE_DEVICE_SYMBOLS
             \ CUDA_RUNTIME_LIBRARY
diff --git a/Help/manual/cmake-properties.7.rst b/Help/manual/cmake-properties.7.rst
index 01c9ce8..8559b0b 100644
--- a/Help/manual/cmake-properties.7.rst
+++ b/Help/manual/cmake-properties.7.rst
@@ -175,7 +175,10 @@ Properties on Targets
    /prop_tgt/CONFIG_POSTFIX
    /prop_tgt/CROSSCOMPILING_EMULATOR
    /prop_tgt/CUDA_ARCHITECTURES
+   /prop_tgt/CUDA_CUBIN_COMPILATION
    /prop_tgt/CUDA_EXTENSIONS
+   /prop_tgt/CUDA_FATBIN_COMPILATION
+   /prop_tgt/CUDA_OPTIX_COMPILATION
    /prop_tgt/CUDA_PTX_COMPILATION
    /prop_tgt/CUDA_RESOLVE_DEVICE_SYMBOLS
    /prop_tgt/CUDA_RUNTIME_LIBRARY
diff --git a/Help/prop_tgt/CUDA_CUBIN_COMPILATION.rst b/Help/prop_tgt/CUDA_CUBIN_COMPILATION.rst
new file mode 100644
index 0000000..f8860ae
--- /dev/null
+++ b/Help/prop_tgt/CUDA_CUBIN_COMPILATION.rst
@@ -0,0 +1,14 @@
+CUDA_CUBIN_COMPILATION
+----------------------
+
+.. versionadded:: 3.27
+
+Compile CUDA sources to ``.cubin`` files instead of ``.obj`` files
+within :ref:`Object Libraries`.
+
+For example:
+
+.. code-block:: cmake
+
+  add_library(mycubin OBJECT a.cu b.cu)
+  set_property(TARGET mycubin PROPERTY CUDA_CUBIN_COMPILATION ON)
diff --git a/Help/prop_tgt/CUDA_FATBIN_COMPILATION.rst b/Help/prop_tgt/CUDA_FATBIN_COMPILATION.rst
new file mode 100644
index 0000000..3d3c715
--- /dev/null
+++ b/Help/prop_tgt/CUDA_FATBIN_COMPILATION.rst
@@ -0,0 +1,14 @@
+CUDA_FATBIN_COMPILATION
+-----------------------
+
+.. versionadded:: 3.27
+
+Compile CUDA sources to ``.fatbin`` files instead of ``.obj`` files
+within :ref:`Object Libraries`.
+
+For example:
+
+.. code-block:: cmake
+
+  add_library(myfbins OBJECT a.cu b.cu)
+  set_property(TARGET myfbins PROPERTY CUDA_FATBIN_COMPILATION ON)
diff --git a/Help/prop_tgt/CUDA_OPTIX_COMPILATION.rst b/Help/prop_tgt/CUDA_OPTIX_COMPILATION.rst
new file mode 100644
index 0000000..c2a06a8
--- /dev/null
+++ b/Help/prop_tgt/CUDA_OPTIX_COMPILATION.rst
@@ -0,0 +1,14 @@
+CUDA_OPTIX_COMPILATION
+----------------------
+
+.. versionadded:: 3.27
+
+Compile CUDA sources to ``.optixir`` files instead of ``.obj`` files
+within :ref:`Object Libraries`.
+
+For example:
+
+.. code-block:: cmake
+
+  add_library(myoptix OBJECT a.cu b.cu)
+  set_property(TARGET myoptix PROPERTY CUDA_OPTIX_COMPILATION ON)
diff --git a/Help/release/dev/cuda-support-new-compile-modes.rst b/Help/release/dev/cuda-support-new-compile-modes.rst
new file mode 100644
index 0000000..2d24c16
--- /dev/null
+++ b/Help/release/dev/cuda-support-new-compile-modes.rst
@@ -0,0 +1,14 @@
+cuda-support-new-compile-modes
+------------------------------
+
+* A :prop_tgt:`CUDA_CUBIN_COMPILATION` target property was added to
+  :ref:`Object Libraries` to support compiling to ``.cubin`` files
+  instead of host object files. Currently only supported with NVIDIA.
+
+* A :prop_tgt:`CUDA_FATBIN_COMPILATION` target property was added to
+  :ref:`Object Libraries` to support compiling to ``.fatbin`` files
+  instead of host object files. Currently only supported with NVIDIA.
+
+* A :prop_tgt:`CUDA_OPTIX_COMPILATION` target property was added to
+  :ref:`Object Libraries` to support compiling to ``.optixir`` files
+  instead of host object files. Currently only supported with NVIDIA.
diff --git a/Modules/CMakeCUDAInformation.cmake b/Modules/CMakeCUDAInformation.cmake
index dea721e..e774088 100644
--- a/Modules/CMakeCUDAInformation.cmake
+++ b/Modules/CMakeCUDAInformation.cmake
@@ -134,7 +134,6 @@ include(CMakeCommonLanguageInclude)
 # CMAKE_CUDA_CREATE_SHARED_LIBRARY
 # CMAKE_CUDA_CREATE_SHARED_MODULE
 # CMAKE_CUDA_COMPILE_WHOLE_COMPILATION
-# CMAKE_CUDA_COMPILE_PTX_COMPILATION
 # CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION
 # CMAKE_CUDA_LINK_EXECUTABLE
 
diff --git a/Modules/Compiler/NVIDIA-CUDA.cmake b/Modules/Compiler/NVIDIA-CUDA.cmake
index 0823954..c839d1c 100644
--- a/Modules/Compiler/NVIDIA-CUDA.cmake
+++ b/Modules/Compiler/NVIDIA-CUDA.cmake
@@ -8,6 +8,11 @@ set(_CMAKE_COMPILE_AS_CUDA_FLAG "-x cu")
 set(_CMAKE_CUDA_WHOLE_FLAG "-c")
 set(_CMAKE_CUDA_RDC_FLAG "-rdc=true")
 set(_CMAKE_CUDA_PTX_FLAG "-ptx")
+set(_CMAKE_CUDA_CUBIN_FLAG "-cubin")
+set(_CMAKE_CUDA_FATBIN_FLAG "-fatbin")
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.7.0")
+  set(_CMAKE_CUDA_OPTIX_FLAG "-optix-ir")
+endif()
 
 if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2.89)
   # The -forward-unknown-to-host-compiler flag was only
diff --git a/Source/cmGeneratorTarget.cxx b/Source/cmGeneratorTarget.cxx
index 54cd8ba..4cfa1d7 100644
--- a/Source/cmGeneratorTarget.cxx
+++ b/Source/cmGeneratorTarget.cxx
@@ -3,6 +3,7 @@
 #include "cmGeneratorTarget.h"
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <cerrno>
 #include <cstddef>
@@ -1011,12 +1012,27 @@ const std::string& cmGeneratorTarget::GetObjectName(cmSourceFile const* file)
 
 const char* cmGeneratorTarget::GetCustomObjectExtension() const
 {
-  static std::string extension;
-  const bool has_ptx_extension =
-    this->GetPropertyAsBool("CUDA_PTX_COMPILATION");
-  if (has_ptx_extension) {
-    extension = ".ptx";
-    return extension.c_str();
+  struct compiler_mode
+  {
+    std::string variable;
+    std::string extension;
+  };
+  static std::array<compiler_mode, 4> const modes{
+    { { "CUDA_PTX_COMPILATION", ".ptx" },
+      { "CUDA_CUBIN_COMPILATION", ".cubin" },
+      { "CUDA_FATBIN_COMPILATION", ".fatbin" },
+      { "CUDA_OPTIX_COMPILATION", ".optixir" } }
+  };
+
+  std::string const& compiler =
+    this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID");
+  if (!compiler.empty()) {
+    for (const auto& m : modes) {
+      const bool has_extension = this->GetPropertyAsBool(m.variable);
+      if (has_extension) {
+        return m.extension.c_str();
+      }
+    }
   }
   return nullptr;
 }
diff --git a/Source/cmMakefileTargetGenerator.cxx b/Source/cmMakefileTargetGenerator.cxx
index e217dd9..2ead739 100644
--- a/Source/cmMakefileTargetGenerator.cxx
+++ b/Source/cmMakefileTargetGenerator.cxx
@@ -3,6 +3,7 @@
 #include "cmMakefileTargetGenerator.h"
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <cstdio>
 #include <iterator>
@@ -977,11 +978,23 @@ void cmMakefileTargetGenerator::WriteObjectRuleFiles(
           this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_RDC_FLAG");
         cudaCompileMode = cmStrCat(cudaCompileMode, rdcFlag, " ");
       }
-      if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
-        const std::string& ptxFlag =
-          this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_PTX_FLAG");
-        cudaCompileMode = cmStrCat(cudaCompileMode, ptxFlag);
-      } else {
+
+      static std::array<cm::string_view, 4> const compileModes{
+        { "PTX"_s, "CUBIN"_s, "FATBIN"_s, "OPTIX"_s }
+      };
+      bool useNormalCompileMode = true;
+      for (cm::string_view mode : compileModes) {
+        auto propName = cmStrCat("CUDA_", mode, "_COMPILATION");
+        auto defName = cmStrCat("_CMAKE_CUDA_", mode, "_FLAG");
+        if (this->GeneratorTarget->GetPropertyAsBool(propName)) {
+          const std::string& flag =
+            this->Makefile->GetRequiredDefinition(defName);
+          cudaCompileMode = cmStrCat(cudaCompileMode, flag);
+          useNormalCompileMode = false;
+          break;
+        }
+      }
+      if (useNormalCompileMode) {
         const std::string& wholeFlag =
           this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_WHOLE_FLAG");
         cudaCompileMode = cmStrCat(cudaCompileMode, wholeFlag);
diff --git a/Source/cmNinjaTargetGenerator.cxx b/Source/cmNinjaTargetGenerator.cxx
index 8663f46..dc56142 100644
--- a/Source/cmNinjaTargetGenerator.cxx
+++ b/Source/cmNinjaTargetGenerator.cxx
@@ -3,6 +3,7 @@
 #include "cmNinjaTargetGenerator.h"
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <functional>
 #include <iterator>
@@ -859,11 +860,22 @@ void cmNinjaTargetGenerator::WriteCompileRule(const std::string& lang,
         this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_RDC_FLAG");
       cudaCompileMode = cmStrCat(cudaCompileMode, rdcFlag, " ");
     }
-    if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
-      const std::string& ptxFlag =
-        this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_PTX_FLAG");
-      cudaCompileMode = cmStrCat(cudaCompileMode, ptxFlag);
-    } else {
+    static std::array<cm::string_view, 4> const compileModes{
+      { "PTX"_s, "CUBIN"_s, "FATBIN"_s, "OPTIX"_s }
+    };
+    bool useNormalCompileMode = true;
+    for (cm::string_view mode : compileModes) {
+      auto propName = cmStrCat("CUDA_", mode, "_COMPILATION");
+      auto defName = cmStrCat("_CMAKE_CUDA_", mode, "_FLAG");
+      if (this->GeneratorTarget->GetPropertyAsBool(propName)) {
+        const std::string& flag =
+          this->Makefile->GetRequiredDefinition(defName);
+        cudaCompileMode = cmStrCat(cudaCompileMode, flag);
+        useNormalCompileMode = false;
+        break;
+      }
+    }
+    if (useNormalCompileMode) {
       const std::string& wholeFlag =
         this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_WHOLE_FLAG");
       cudaCompileMode = cmStrCat(cudaCompileMode, wholeFlag);
@@ -1789,11 +1801,22 @@ void cmNinjaTargetGenerator::ExportObjectCompileCommand(
         this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_RDC_FLAG");
       cudaCompileMode = cmStrCat(cudaCompileMode, rdcFlag, " ");
     }
-    if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
-      const std::string& ptxFlag =
-        this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_PTX_FLAG");
-      cudaCompileMode = cmStrCat(cudaCompileMode, ptxFlag);
-    } else {
+    static std::array<cm::string_view, 4> const compileModes{
+      { "PTX"_s, "CUBIN"_s, "FATBIN"_s, "OPTIX"_s }
+    };
+    bool useNormalCompileMode = true;
+    for (cm::string_view mode : compileModes) {
+      auto propName = cmStrCat("CUDA_", mode, "_COMPILATION");
+      auto defName = cmStrCat("_CMAKE_CUDA_", mode, "_FLAG");
+      if (this->GeneratorTarget->GetPropertyAsBool(propName)) {
+        const std::string& flag =
+          this->Makefile->GetRequiredDefinition(defName);
+        cudaCompileMode = cmStrCat(cudaCompileMode, flag);
+        useNormalCompileMode = false;
+        break;
+      }
+    }
+    if (useNormalCompileMode) {
       const std::string& wholeFlag =
         this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_WHOLE_FLAG");
       cudaCompileMode = cmStrCat(cudaCompileMode, wholeFlag);
diff --git a/Source/cmTarget.cxx b/Source/cmTarget.cxx
index ec87271..f06e26b 100644
--- a/Source/cmTarget.cxx
+++ b/Source/cmTarget.cxx
@@ -1775,6 +1775,9 @@ MAKE_PROP(COMPILE_FEATURES);
 MAKE_PROP(COMPILE_OPTIONS);
 MAKE_PROP(PRECOMPILE_HEADERS);
 MAKE_PROP(PRECOMPILE_HEADERS_REUSE_FROM);
+MAKE_PROP(CUDA_CUBIN_COMPILATION);
+MAKE_PROP(CUDA_FATBIN_COMPILATION);
+MAKE_PROP(CUDA_OPTIX_COMPILATION);
 MAKE_PROP(CUDA_PTX_COMPILATION);
 MAKE_PROP(EXPORT_NAME);
 MAKE_PROP(IMPORTED);
@@ -1910,14 +1913,38 @@ void cmTarget::StoreProperty(const std::string& prop, ValueType value)
                value ? value
                      : std::string{})) { // NOLINT(bugprone-branch-clone)
     /* error was reported by check method */
-  } else if (prop == propCUDA_PTX_COMPILATION &&
-             this->GetType() != cmStateEnums::OBJECT_LIBRARY) {
-    std::ostringstream e;
-    e << "CUDA_PTX_COMPILATION property can only be applied to OBJECT "
-         "targets (\""
-      << this->impl->Name << "\")\n";
-    this->impl->Makefile->IssueMessage(MessageType::FATAL_ERROR, e.str());
-    return;
+  } else if (prop == propCUDA_CUBIN_COMPILATION ||
+             prop == propCUDA_FATBIN_COMPILATION ||
+             prop == propCUDA_OPTIX_COMPILATION ||
+             prop == propCUDA_PTX_COMPILATION) {
+    auto const& compiler =
+      this->impl->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID");
+    auto const& compilerVersion =
+      this->impl->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_VERSION");
+    if (this->GetType() != cmStateEnums::OBJECT_LIBRARY) {
+      auto e =
+        cmStrCat(prop, " property can only be applied to OBJECT targets(",
+                 this->impl->Name, ")\n");
+      this->impl->Makefile->IssueMessage(MessageType::FATAL_ERROR, e);
+      return;
+    }
+    const bool flag_found =
+      (prop == propCUDA_PTX_COMPILATION &&
+       this->impl->Makefile->GetDefinition("_CMAKE_CUDA_PTX_FLAG")) ||
+      (prop == propCUDA_CUBIN_COMPILATION &&
+       this->impl->Makefile->GetDefinition("_CMAKE_CUDA_CUBIN_FLAG")) ||
+      (prop == propCUDA_FATBIN_COMPILATION &&
+       this->impl->Makefile->GetDefinition("_CMAKE_CUDA_FATBIN_FLAG")) ||
+      (prop == propCUDA_OPTIX_COMPILATION &&
+       this->impl->Makefile->GetDefinition("_CMAKE_CUDA_OPTIX_FLAG"));
+    if (flag_found) {
+      this->impl->Properties.SetProperty(prop, value);
+    } else {
+      auto e = cmStrCat(prop, " property is not supported by ", compiler,
+                        "  compiler version ", compilerVersion, ".");
+      this->impl->Makefile->IssueMessage(MessageType::FATAL_ERROR, e);
+      return;
+    }
   } else if (prop == propPRECOMPILE_HEADERS_REUSE_FROM) {
     if (this->GetProperty("PRECOMPILE_HEADERS")) {
       std::ostringstream e;
diff --git a/Source/cmVisualStudio10TargetGenerator.cxx b/Source/cmVisualStudio10TargetGenerator.cxx
index ef0fcf3..8926f9e 100644
--- a/Source/cmVisualStudio10TargetGenerator.cxx
+++ b/Source/cmVisualStudio10TargetGenerator.cxx
@@ -3597,13 +3597,13 @@ bool cmVisualStudio10TargetGenerator::ComputeCudaOptions(
   if (this->GeneratorTarget->GetPropertyAsBool("CUDA_SEPARABLE_COMPILATION")) {
     cudaOptions.AddFlag("GenerateRelocatableDeviceCode", "true");
   }
-  bool notPtx = true;
+  bool notPtxLike = true;
   if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
     cudaOptions.AddFlag("NvccCompilation", "ptx");
     // We drop the %(Extension) component as CMake expects all PTX files
     // to not have the source file extension at all
     cudaOptions.AddFlag("CompileOut", "$(IntDir)%(Filename).ptx");
-    notPtx = false;
+    notPtxLike = false;
 
     if (cmSystemTools::VersionCompare(cmSystemTools::OP_GREATER_EQUAL,
                                       cudaVersion, "9.0") &&
@@ -3618,9 +3618,24 @@ bool cmVisualStudio10TargetGenerator::ComputeCudaOptions(
                           "%(BaseCommandLineTemplate) [CompileOut] [FastMath] "
                           "[Defines] \"%(FullPath)\"");
     }
-  }
-
-  if (notPtx &&
+  } else if (this->GeneratorTarget->GetPropertyAsBool(
+               "CUDA_CUBIN_COMPILATION")) {
+    cudaOptions.AddFlag("NvccCompilation", "cubin");
+    cudaOptions.AddFlag("CompileOut", "$(IntDir)%(Filename).cubin");
+    notPtxLike = false;
+  } else if (this->GeneratorTarget->GetPropertyAsBool(
+               "CUDA_FATBIN_COMPILATION")) {
+    cudaOptions.AddFlag("NvccCompilation", "fatbin");
+    cudaOptions.AddFlag("CompileOut", "$(IntDir)%(Filename).fatbin");
+    notPtxLike = false;
+  } else if (this->GeneratorTarget->GetPropertyAsBool(
+               "CUDA_OPTIX_COMPILATION")) {
+    cudaOptions.AddFlag("NvccCompilation", "optix-ir");
+    cudaOptions.AddFlag("CompileOut", "$(IntDir)%(Filename).optixir");
+    notPtxLike = false;
+  }
+
+  if (notPtxLike &&
       cmSystemTools::VersionCompareGreaterEq(
         "8.0", this->GlobalGenerator->GetPlatformToolsetCudaString())) {
     // Explicitly state that we want this file to be treated as a
diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt
index db08076..aa25c4c 100644
--- a/Tests/CudaOnly/CMakeLists.txt
+++ b/Tests/CudaOnly/CMakeLists.txt
@@ -27,6 +27,9 @@ if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang")
 
   # Only NVCC defines __CUDACC_DEBUG__ when compiling in debug mode.
   add_cuda_test_macro(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag)
+  add_cuda_test_macro(CudaOnly.CUBIN CudaOnlyCUBIN)
+  add_cuda_test_macro(CudaOnly.Fatbin CudaOnlyFatbin)
+  add_cuda_test_macro(CudaOnly.OptixIR CudaOnlyOptixIR)
 endif()
 
 add_cuda_test_macro(CudaOnly.DeviceLTO CudaOnlyDeviceLTO)
diff --git a/Tests/CudaOnly/CUBIN/CMakeLists.txt b/Tests/CudaOnly/CUBIN/CMakeLists.txt
new file mode 100644
index 0000000..464714b
--- /dev/null
+++ b/Tests/CudaOnly/CUBIN/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.18)
+project(CudaCUBIN LANGUAGES CUDA)
+
+
+set(CMAKE_CUDA_ARCHITECTURES all-major)
+
+add_library(CudaCUBIN OBJECT kernelA.cu kernelB.cu kernelC.cu)
+set_property(TARGET CudaCUBIN PROPERTY CUDA_CUBIN_COMPILATION ON)
+set_property(TARGET CudaCUBIN PROPERTY CUDA_ARCHITECTURES native)
+
+add_executable(CudaOnlyCUBIN main.cu)
+target_compile_features(CudaOnlyCUBIN PRIVATE cuda_std_11)
+target_compile_definitions(CudaOnlyCUBIN PRIVATE "CUBIN_FILE_PATHS=\"$<JOIN:$<TARGET_OBJECTS:CudaCUBIN>,~_~>\"")
+
+find_package(CUDAToolkit REQUIRED)
+target_link_libraries(CudaOnlyCUBIN PRIVATE CUDA::cuda_driver)
+
+if(APPLE)
+  # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
+  set_property(TARGET CudaOnlyCUBIN PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/CUBIN/kernelA.cu b/Tests/CudaOnly/CUBIN/kernelA.cu
new file mode 100644
index 0000000..fbe0d26
--- /dev/null
+++ b/Tests/CudaOnly/CUBIN/kernelA.cu
@@ -0,0 +1,7 @@
+
+__global__ void kernelA(float* r, float* x, float* y, float* z, int size)
+{
+  for (int i = threadIdx.x; i < size; i += blockDim.x) {
+    r[i] = x[i] * y[i] + z[i];
+  }
+}
diff --git a/Tests/CudaOnly/CUBIN/kernelB.cu b/Tests/CudaOnly/CUBIN/kernelB.cu
new file mode 100644
index 0000000..7478253
--- /dev/null
+++ b/Tests/CudaOnly/CUBIN/kernelB.cu
@@ -0,0 +1,7 @@
+
+__global__ void kernelB(float* r, float* x, float* y, float* z, int size)
+{
+  for (int i = threadIdx.x; i < size; i += blockDim.x) {
+    r[i] = x[i] * y[i] + z[i];
+  }
+}
diff --git a/Tests/CudaOnly/CUBIN/kernelC.cu b/Tests/CudaOnly/CUBIN/kernelC.cu
new file mode 100644
index 0000000..5f8a0ce
--- /dev/null
+++ b/Tests/CudaOnly/CUBIN/kernelC.cu
@@ -0,0 +1,7 @@
+
+__global__ void kernelC(float* r, float* x, float* y, float* z, int size)
+{
+  for (int i = threadIdx.x; i < size; i += blockDim.x) {
+    r[i] = x[i] * y[i] + z[i];
+  }
+}
diff --git a/Tests/CudaOnly/CUBIN/main.cu b/Tests/CudaOnly/CUBIN/main.cu
new file mode 100644
index 0000000..da5249c
--- /dev/null
+++ b/Tests/CudaOnly/CUBIN/main.cu
@@ -0,0 +1,56 @@
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <cuda.h>
+
+#define GENERATED_HEADER(x) GENERATED_HEADER1(x)
+#define GENERATED_HEADER1(x) <x>
+
+static std::string input_paths = { CUBIN_FILE_PATHS };
+
+int main()
+{
+  const std::string delimiter = "~_~";
+  input_paths += delimiter;
+
+  size_t end = 0;
+  size_t previous_end = 0;
+  std::vector<std::string> actual_paths;
+  while ((end = input_paths.find(delimiter, previous_end)) !=
+         std::string::npos) {
+    actual_paths.emplace_back(
+      input_paths.substr(previous_end, end - previous_end));
+    previous_end = end + 3;
+  }
+
+  cuInit(0);
+  int count = 0;
+  cuDeviceGetCount(&count);
+  if (count == 0) {
+    std::cerr << "No CUDA devices found\n";
+    return 1;
+  }
+
+  CUdevice device;
+  cuDeviceGet(&device, 0);
+
+  CUcontext context;
+  cuCtxCreate(&context, 0, device);
+
+  CUmodule module;
+  for (auto p : actual_paths) {
+    if (p.find(".cubin") == std::string::npos) {
+      std::cout << p << " Doesn't have the .cubin suffix" << p << std::endl;
+      return 1;
+    }
+    std::cout << "trying to load cubin: " << p << std::endl;
+    CUresult result = cuModuleLoad(&module, p.c_str());
+    std::cout << "module pointer: " << module << '\n';
+    if (result != CUDA_SUCCESS || module == nullptr) {
+      std::cerr << "Failed to load the embedded cubin with error: "
+                << static_cast<unsigned int>(result) << '\n';
+      return 1;
+    }
+  }
+}
diff --git a/Tests/CudaOnly/Fatbin/CMakeLists.txt b/Tests/CudaOnly/Fatbin/CMakeLists.txt
new file mode 100644
index 0000000..db0dc22
--- /dev/null
+++ b/Tests/CudaOnly/Fatbin/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.18)
+project(CudaFATBIN LANGUAGES CUDA)
+
+
+set(CMAKE_CUDA_ARCHITECTURES all-major)
+
+add_library(CudaFATBIN OBJECT
+${CMAKE_CURRENT_SOURCE_DIR}/../CUBIN/kernelA.cu
+${CMAKE_CURRENT_SOURCE_DIR}/../CUBIN/kernelB.cu
+${CMAKE_CURRENT_SOURCE_DIR}/../CUBIN/kernelC.cu)
+
+set_property(TARGET CudaFATBIN PROPERTY CUDA_FATBIN_COMPILATION ON)
+
+# Will use `cuModuleLoadFatBinary` to load the fatbinaries
+add_executable(CudaOnlyFatbin main.cu)
+target_compile_features(CudaOnlyFatbin PRIVATE cuda_std_11)
+target_compile_definitions(CudaOnlyFatbin PRIVATE "FATBIN_FILE_PATHS=\"$<JOIN:$<TARGET_OBJECTS:CudaFATBIN>,~_~>\"")
+
+find_package(CUDAToolkit REQUIRED)
+target_link_libraries(CudaOnlyFatbin PRIVATE CUDA::cuda_driver)
+
+if(APPLE)
+  # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
+  set_property(TARGET CudaOnlyFatbin PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/Fatbin/main.cu b/Tests/CudaOnly/Fatbin/main.cu
new file mode 100644
index 0000000..903feee
--- /dev/null
+++ b/Tests/CudaOnly/Fatbin/main.cu
@@ -0,0 +1,56 @@
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <cuda.h>
+
+#define GENERATED_HEADER(x) GENERATED_HEADER1(x)
+#define GENERATED_HEADER1(x) <x>
+
+static std::string input_paths = { FATBIN_FILE_PATHS };
+
+int main()
+{
+  const std::string delimiter = "~_~";
+  input_paths += delimiter;
+
+  size_t end = 0;
+  size_t previous_end = 0;
+  std::vector<std::string> actual_paths;
+  while ((end = input_paths.find(delimiter, previous_end)) !=
+         std::string::npos) {
+    actual_paths.emplace_back(
+      input_paths.substr(previous_end, end - previous_end));
+    previous_end = end + 3;
+  }
+
+  cuInit(0);
+  int count = 0;
+  cuDeviceGetCount(&count);
+  if (count == 0) {
+    std::cerr << "No CUDA devices found\n";
+    return 1;
+  }
+
+  CUdevice device;
+  cuDeviceGet(&device, 0);
+
+  CUcontext context;
+  cuCtxCreate(&context, 0, device);
+
+  CUmodule module;
+  for (auto p : actual_paths) {
+    if (p.find(".fatbin") == std::string::npos) {
+      std::cout << p << " Doesn't have the .fatbin suffix" << p << std::endl;
+      return 1;
+    }
+    std::cout << "trying to load fatbin: " << p << std::endl;
+    CUresult result = cuModuleLoad(&module, p.c_str());
+    std::cout << "module pointer: " << module << '\n';
+    if (result != CUDA_SUCCESS || module == nullptr) {
+      std::cerr << "Failed to load the embedded fatbin with error: "
+                << static_cast<unsigned int>(result) << '\n';
+      return 1;
+    }
+  }
+}
diff --git a/Tests/CudaOnly/OptixIR/CMakeLists.txt b/Tests/CudaOnly/OptixIR/CMakeLists.txt
new file mode 100644
index 0000000..afeabda
--- /dev/null
+++ b/Tests/CudaOnly/OptixIR/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 3.18)
+project(CudaOptix LANGUAGES CUDA)
+
+
+set(CMAKE_CUDA_ARCHITECTURES all-major)
+
+add_library(CudaOptix OBJECT
+  ${CMAKE_CURRENT_SOURCE_DIR}/../CUBIN/kernelA.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/../CUBIN/kernelB.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/../CUBIN/kernelC.cu)
+
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.7.0")
+  set_property(TARGET CudaOptix PROPERTY CUDA_OPTIX_COMPILATION ON)
+endif()
+
+set_property(TARGET CudaOptix PROPERTY CUDA_ARCHITECTURES native)
+
+add_executable(CudaOnlyOptixIR main.cu)
+target_compile_features(CudaOnlyOptixIR PRIVATE cuda_std_11)
+
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.7.0")
+  target_compile_definitions(CudaOnlyOptixIR PRIVATE "OPTIX_FILE_PATHS=\"$<JOIN:$<TARGET_OBJECTS:CudaOptix>,~_~>\"")
+else()
+  target_compile_definitions(CudaOnlyOptixIR PRIVATE "OPTIX_FILE_PATHS=\"NO_OPTIX_SUPPORT\"")
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+target_link_libraries(CudaOnlyOptixIR PRIVATE CUDA::cuda_driver)
+
+if(APPLE)
+  # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
+  set_property(TARGET CudaOnlyOptixIR PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/OptixIR/main.cu b/Tests/CudaOnly/OptixIR/main.cu
new file mode 100644
index 0000000..c79829b
--- /dev/null
+++ b/Tests/CudaOnly/OptixIR/main.cu
@@ -0,0 +1,53 @@
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <cuda.h>
+
+#define GENERATED_HEADER(x) GENERATED_HEADER1(x)
+#define GENERATED_HEADER1(x) <x>
+
+static std::string input_paths = { OPTIX_FILE_PATHS };
+
+int main()
+{
+  if (input_paths == "NO_OPTIX_SUPPORT") {
+    return 0;
+  }
+
+  const std::string delimiter = "~_~";
+  input_paths += delimiter;
+
+  size_t end = 0;
+  size_t previous_end = 0;
+  std::vector<std::string> actual_paths;
+  while ((end = input_paths.find(delimiter, previous_end)) !=
+         std::string::npos) {
+    actual_paths.emplace_back(
+      input_paths.substr(previous_end, end - previous_end));
+    previous_end = end + 3;
+  }
+
+  if (actual_paths.empty()) {
+    std::cerr << "Failed to parse OPTIX_FILE_PATHS" << std::endl;
+    return 1;
+  }
+
+  const std::uint32_t optix_magic_value = 0x7f4e43ed;
+  for (auto p : actual_paths) {
+    if (p.find(".optixir") == std::string::npos) {
+      std::cout << p << " Doesn't have the .optixir suffix" << p << std::endl;
+      return 1;
+    }
+    std::ifstream input(p, std::ios::binary);
+    std::uint32_t value;
+    input.read(reinterpret_cast<char*>(&value), sizeof(value));
+    if (value != optix_magic_value) {
+      std::cerr << p << " Doesn't look like an optix-ir file" << std::endl;
+      return 1;
+    }
+  }
+
+  return 0;
+}