From 0b1cea66cd1f80458f0da579d0182d908874939d Mon Sep 17 00:00:00 2001
From: root <raul@tambre.ee>
Date: Tue, 27 Jul 2021 22:58:03 +0300
Subject: CUDA/Clang: Fix separable compilation in non-root directories with
 Makefiles

The relative paths used when generating the device link rules were wrong
almost everywhere, so separable compilation only worked for targets in
the top-level directory. CudaOnly.SeparateCompilation now builds its
executable in a subdirectory to cover this.

Fixes #22482.
---
 Help/release/3.21.rst                              |  6 ++
 Source/cmMakefileTargetGenerator.cxx               | 24 ++++----
 Tests/CudaOnly/CMakeLists.txt                      |  2 +-
 Tests/CudaOnly/SeparateCompilation/CMakeLists.txt  | 19 +-----
 Tests/CudaOnly/SeparateCompilation/main.cu         | 68 ----------------------
 .../SeparateCompilation/main/CMakeLists.txt        | 18 ++++++
 Tests/CudaOnly/SeparateCompilation/main/main.cu    | 68 ++++++++++++++++++++++
 7 files changed, 108 insertions(+), 97 deletions(-)
 delete mode 100644 Tests/CudaOnly/SeparateCompilation/main.cu
 create mode 100644 Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt
 create mode 100644 Tests/CudaOnly/SeparateCompilation/main/main.cu

diff --git a/Help/release/3.21.rst b/Help/release/3.21.rst
index 3e70552..fc5d6ac 100644
--- a/Help/release/3.21.rst
+++ b/Help/release/3.21.rst
@@ -304,3 +304,9 @@ Changes made since CMake 3.21.0 include the following.
 
 * The :generator:`Visual Studio 17 2022` generator is now based on
   "Visual Studio 2022 Preview 2".  Previously it was based on "Preview 1.1".
+
+3.21.2
+------
+
+* ``CUDA`` targets with :prop_tgt:`CUDA_SEPARABLE_COMPILATION` enabled are now
+  correctly generated in non-root directories.
diff --git a/Source/cmMakefileTargetGenerator.cxx b/Source/cmMakefileTargetGenerator.cxx
index 6d8376c..6324b2e 100644
--- a/Source/cmMakefileTargetGenerator.cxx
+++ b/Source/cmMakefileTargetGenerator.cxx
@@ -1484,14 +1484,18 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
   }
 
   std::vector<std::string> architectures = cmExpandedList(architecturesStr);
+  std::string const& relPath =
+    this->LocalGenerator->GetHomeRelativeOutputPath();
 
   // Ensure there are no duplicates.
   const std::vector<std::string> linkDeps = [&]() -> std::vector<std::string> {
     std::vector<std::string> deps;
     this->AppendTargetDepends(deps, true);
     this->GeneratorTarget->GetLinkDepends(deps, this->GetConfigName(), "CUDA");
-    std::copy(this->Objects.begin(), this->Objects.end(),
-              std::back_inserter(deps));
+
+    for (std::string const& obj : this->Objects) {
+      deps.emplace_back(cmStrCat(relPath, obj));
+    }
 
     std::unordered_set<std::string> depsSet(deps.begin(), deps.end());
     deps.clear();
@@ -1510,7 +1514,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
 
   std::string profiles;
   std::vector<std::string> fatbinaryDepends;
-  std::string registerFile = cmStrCat(objectDir, "cmake_cuda_register.h");
+  std::string const registerFile =
+    cmStrCat(objectDir, "cmake_cuda_register.h");
 
   // Link device code for each architecture.
   for (const std::string& architectureKind : architectures) {
@@ -1518,7 +1523,7 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
     const std::string architecture =
       architectureKind.substr(0, architectureKind.find('-'));
     const std::string cubin =
-      cmStrCat(relObjectDir, "sm_", architecture, ".cubin");
+      cmStrCat(objectDir, "sm_", architecture, ".cubin");
 
     profiles += cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
     fatbinaryDepends.emplace_back(cubin);
@@ -1530,8 +1535,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
     // all architectures the register file will be the same too. Thus
     // generate it only on the first invocation to reduce overhead.
     if (fatbinaryDepends.size() == 1) {
-      std::string registerFileRel =
-        this->LocalGenerator->MaybeRelativeToCurBinDir(registerFile);
+      std::string const registerFileRel =
+        cmStrCat(relPath, relObjectDir, "cmake_cuda_register.h");
       registerFileCmd =
         cmStrCat(" --register-link-binaries=", registerFileRel);
       cleanFiles.push_back(registerFileRel);
@@ -1555,7 +1560,7 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
   const std::string fatbinaryOutput =
     cmStrCat(objectDir, "cmake_cuda_fatbin.h");
   const std::string fatbinaryOutputRel =
-    this->LocalGenerator->MaybeRelativeToCurBinDir(fatbinaryOutput);
+    cmStrCat(relPath, relObjectDir, "cmake_cuda_fatbin.h");
 
   this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr,
                                       fatbinaryOutputRel, fatbinaryDepends,
@@ -1583,9 +1588,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
                                                compileCmd, vars);
 
   commands.emplace_back(compileCmd);
-  this->LocalGenerator->WriteMakeRule(
-    *this->BuildFileStream, nullptr, output,
-    { cmStrCat(relObjectDir, "cmake_cuda_fatbin.h") }, commands, false);
+  this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, output,
+                                      { fatbinaryOutputRel }, commands, false);
 
   // Clean all the possible executable names and symlinks.
   this->CleanFiles.insert(cleanFiles.begin(), cleanFiles.end());
diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt
index fdb7a6e..a3fb409 100644
--- a/Tests/CudaOnly/CMakeLists.txt
+++ b/Tests/CudaOnly/CMakeLists.txt
@@ -15,7 +15,7 @@ add_cuda_test_macro(CudaOnly.ToolkitBeforeLang CudaOnlyToolkitBeforeLang)
 add_cuda_test_macro(CudaOnly.WithDefs CudaOnlyWithDefs)
 add_cuda_test_macro(CudaOnly.CircularLinkLine CudaOnlyCircularLinkLine)
 add_cuda_test_macro(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
-add_cuda_test_macro(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
+add_cuda_test_macro(CudaOnly.SeparateCompilation main/CudaOnlySeparateCompilation)
 
 if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang")
   # Clang doesn't have flags for selecting the runtime.
diff --git a/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt b/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt
index 864ecbf..17069e3 100644
--- a/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt
+++ b/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt
@@ -34,26 +34,9 @@ add_library(CUDASeparateLibB STATIC file4.cu file5.cu)
 target_compile_features(CUDASeparateLibB PRIVATE cuda_std_11)
 target_link_libraries(CUDASeparateLibB PRIVATE CUDASeparateLibA)
 
-add_executable(CudaOnlySeparateCompilation main.cu)
-target_link_libraries(CudaOnlySeparateCompilation
-                      PRIVATE CUDASeparateLibB)
-set_target_properties(CudaOnlySeparateCompilation PROPERTIES CUDA_STANDARD 11)
-set_target_properties(CudaOnlySeparateCompilation PROPERTIES CUDA_STANDARD_REQUIRED TRUE)
-
 set_target_properties(CUDASeparateLibA
                       CUDASeparateLibB
                       PROPERTIES CUDA_SEPARABLE_COMPILATION ON
                       POSITION_INDEPENDENT_CODE ON)
 
-if (CMAKE_GENERATOR MATCHES "^Visual Studio")
-  #Visual Studio CUDA integration will not perform device linking
-  #on a target that itself does not have GenerateRelocatableDeviceCode
-  #enabled.
-  set_target_properties(CudaOnlySeparateCompilation
-                        PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-endif()
-
-if(APPLE)
-  # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
-  set_property(TARGET CudaOnlySeparateCompilation PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-endif()
+add_subdirectory(main)
diff --git a/Tests/CudaOnly/SeparateCompilation/main.cu b/Tests/CudaOnly/SeparateCompilation/main.cu
deleted file mode 100644
index 40dbe5d..0000000
--- a/Tests/CudaOnly/SeparateCompilation/main.cu
+++ /dev/null
@@ -1,68 +0,0 @@
-
-#include <iostream>
-
-#include "file1.h"
-#include "file2.h"
-
-int file4_launch_kernel(int x);
-int file5_launch_kernel(int x);
-
-int choose_cuda_device()
-{
-  int nDevices = 0;
-  cudaError_t err = cudaGetDeviceCount(&nDevices);
-  if (err != cudaSuccess) {
-    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
-              << std::endl;
-    return 1;
-  }
-  for (int i = 0; i < nDevices; ++i) {
-    cudaDeviceProp prop;
-    cudaError_t err = cudaGetDeviceProperties(&prop, i);
-    if (err != cudaSuccess) {
-      std::cerr << "Could not retrieve properties from CUDA device " << i
-                << std::endl;
-      return 1;
-    }
-    if (prop.major >= 3) {
-      err = cudaSetDevice(i);
-      if (err != cudaSuccess) {
-        std::cout << "Could not select CUDA device " << i << std::endl;
-      } else {
-        return 0;
-      }
-    }
-  }
-
-  std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
-            << std::endl;
-
-  return 1;
-}
-
-int main(int argc, char** argv)
-{
-  int ret = choose_cuda_device();
-  if (ret) {
-    return 0;
-  }
-
-  cudaError_t err;
-  file4_launch_kernel(42);
-  err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    std::cerr << "file4_launch_kernel: kernel launch failed: "
-              << cudaGetErrorString(err) << std::endl;
-    return 1;
-  }
-
-  file5_launch_kernel(42);
-  err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    std::cerr << "file5_launch_kernel: kernel launch failed: "
-              << cudaGetErrorString(err) << std::endl;
-    return 1;
-  }
-
-  return 0;
-}
diff --git a/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt b/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt
new file mode 100644
index 0000000..c181078
--- /dev/null
+++ b/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_executable(CudaOnlySeparateCompilation main.cu)
+target_link_libraries(CudaOnlySeparateCompilation PRIVATE CUDASeparateLibB)
+set_target_properties(CudaOnlySeparateCompilation PROPERTIES
+  CUDA_STANDARD 11
+  CUDA_STANDARD_REQUIRED TRUE
+)
+
+if(CMAKE_GENERATOR MATCHES "^Visual Studio")
+  # Visual Studio CUDA integration will not perform device linking
+  # on a target that itself does not have GenerateRelocatableDeviceCode
+  # enabled.
+  set_property(TARGET CudaOnlySeparateCompilation PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+endif()
+
+if(APPLE)
+  # Help the static cuda runtime find the driver (libcuda.dylib) at runtime.
+  set_property(TARGET CudaOnlySeparateCompilation PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/SeparateCompilation/main/main.cu b/Tests/CudaOnly/SeparateCompilation/main/main.cu
new file mode 100644
index 0000000..2b6e8f4
--- /dev/null
+++ b/Tests/CudaOnly/SeparateCompilation/main/main.cu
@@ -0,0 +1,68 @@
+
+#include <iostream>
+
+#include "../file1.h"
+#include "../file2.h"
+
+int file4_launch_kernel(int x);
+int file5_launch_kernel(int x);
+
+int choose_cuda_device()
+{
+  int nDevices = 0;
+  cudaError_t err = cudaGetDeviceCount(&nDevices);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
+              << std::endl;
+    return 1;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    cudaDeviceProp prop;
+    cudaError_t err = cudaGetDeviceProperties(&prop, i);
+    if (err != cudaSuccess) {
+      std::cerr << "Could not retrieve properties from CUDA device " << i
+                << std::endl;
+      return 1;
+    }
+    if (prop.major >= 3) {
+      err = cudaSetDevice(i);
+      if (err != cudaSuccess) {
+        std::cout << "Could not select CUDA device " << i << std::endl;
+      } else {
+        return 0;
+      }
+    }
+  }
+
+  std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
+            << std::endl;
+
+  return 1;
+}
+
+int main(int argc, char** argv)
+{
+  int ret = choose_cuda_device();
+  if (ret) {
+    return 0;
+  }
+
+  cudaError_t err;
+  file4_launch_kernel(42);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cerr << "file4_launch_kernel: kernel launch failed: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  file5_launch_kernel(42);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cerr << "file5_launch_kernel: kernel launch failed: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
-- 
cgit v0.12


From 3975678fcc3928f2a7dcd79fe9b9e9ebf3abe2b2 Mon Sep 17 00:00:00 2001
From: root <raul@tambre.ee>
Date: Tue, 27 Jul 2021 23:38:36 +0300
Subject: CUDA/Clang: Simplify --register-link-binaries logic

Append the cubin to the fatbinary dependencies after the register file
check, so that the check can simply be empty().
With the Makefile generator the option is now at the front instead of being
intermixed with the actual bins.
---
 Source/cmMakefileTargetGenerator.cxx    | 20 ++++++++++----------
 Source/cmNinjaNormalTargetGenerator.cxx | 10 +++++-----
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/Source/cmMakefileTargetGenerator.cxx b/Source/cmMakefileTargetGenerator.cxx
index 6324b2e..98c61fe 100644
--- a/Source/cmMakefileTargetGenerator.cxx
+++ b/Source/cmMakefileTargetGenerator.cxx
@@ -1519,22 +1519,13 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
 
   // Link device code for each architecture.
   for (const std::string& architectureKind : architectures) {
-    // Clang always generates real code, so strip the specifier.
-    const std::string architecture =
-      architectureKind.substr(0, architectureKind.find('-'));
-    const std::string cubin =
-      cmStrCat(objectDir, "sm_", architecture, ".cubin");
-
-    profiles += cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
-    fatbinaryDepends.emplace_back(cubin);
-
     std::string registerFileCmd;
 
     // The generated register file contains macros that when expanded
     // register the device routines. Because the routines are the same for
     // all architectures the register file will be the same too. Thus
     // generate it only on the first invocation to reduce overhead.
-    if (fatbinaryDepends.size() == 1) {
+    if (fatbinaryDepends.empty()) {
       std::string const registerFileRel =
         cmStrCat(relPath, relObjectDir, "cmake_cuda_register.h");
       registerFileCmd =
@@ -1542,6 +1533,15 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
       cleanFiles.push_back(registerFileRel);
     }
 
+    // Clang always generates real code, so strip the specifier.
+    const std::string architecture =
+      architectureKind.substr(0, architectureKind.find('-'));
+    const std::string cubin =
+      cmStrCat(objectDir, "sm_", architecture, ".cubin");
+
+    profiles += cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
+    fatbinaryDepends.emplace_back(cubin);
+
     std::string command = cmStrCat(
       this->Makefile->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINKER"),
       " -arch=sm_", architecture, registerFileCmd, " -o=$@ ",
diff --git a/Source/cmNinjaNormalTargetGenerator.cxx b/Source/cmNinjaNormalTargetGenerator.cxx
index 5a4c652..493bd4a 100644
--- a/Source/cmNinjaNormalTargetGenerator.cxx
+++ b/Source/cmNinjaNormalTargetGenerator.cxx
@@ -753,10 +753,6 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatements(
     const std::string cubin =
       cmStrCat(ninjaOutputDir, "/sm_", architecture, ".cubin");
 
-    fatbinary.Variables["PROFILES"] +=
-      cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
-    fatbinary.ExplicitDeps.emplace_back(cubin);
-
     cmNinjaBuild dlink(this->LanguageLinkerCudaDeviceRule(config));
     dlink.ExplicitDeps = explicitDeps;
     dlink.Outputs = { cubin };
@@ -766,11 +762,15 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatements(
     // the device routines. Because the routines are the same for all
     // architectures the register file will be the same too. Thus generate it
     // only on the first invocation to reduce overhead.
-    if (fatbinary.ExplicitDeps.size() == 1) {
+    if (fatbinary.ExplicitDeps.empty()) {
       dlink.Variables["REGISTER"] = cmStrCat(
         "--register-link-binaries=", ninjaOutputDir, "/cmake_cuda_register.h");
     }
 
+    fatbinary.Variables["PROFILES"] +=
+      cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
+    fatbinary.ExplicitDeps.emplace_back(cubin);
+
     this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), dlink);
   }
 
-- 
cgit v0.12