From 0b1cea66cd1f80458f0da579d0182d908874939d Mon Sep 17 00:00:00 2001
From: root <raul@tambre.ee>
Date: Tue, 27 Jul 2021 22:58:03 +0300
Subject: CUDA/Clang: Fix separable compilation in non-root directories with
 Makefiles

The relative paths used by the device link rule were wrong nearly
everywhere, so only compiling files in the top-level directory worked.
CudaOnly.SeparateCompilation now builds its executable in a subdirectory
to cover this.

Fixes #22482.
---
 Help/release/3.21.rst                              |  6 ++
 Source/cmMakefileTargetGenerator.cxx               | 24 ++++----
 Tests/CudaOnly/CMakeLists.txt                      |  2 +-
 Tests/CudaOnly/SeparateCompilation/CMakeLists.txt  | 19 +-----
 Tests/CudaOnly/SeparateCompilation/main.cu         | 68 ----------------------
 .../SeparateCompilation/main/CMakeLists.txt        | 18 ++++++
 Tests/CudaOnly/SeparateCompilation/main/main.cu    | 68 ++++++++++++++++++++++
 7 files changed, 108 insertions(+), 97 deletions(-)
 delete mode 100644 Tests/CudaOnly/SeparateCompilation/main.cu
 create mode 100644 Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt
 create mode 100644 Tests/CudaOnly/SeparateCompilation/main/main.cu

diff --git a/Help/release/3.21.rst b/Help/release/3.21.rst
index 3e70552..fc5d6ac 100644
--- a/Help/release/3.21.rst
+++ b/Help/release/3.21.rst
@@ -304,3 +304,9 @@ Changes made since CMake 3.21.0 include the following.
 
 * The :generator:`Visual Studio 17 2022` generator is now based on
   "Visual Studio 2022 Preview 2".  Previously it was based on "Preview 1.1".
+
+3.21.2
+------
+
+* ``CUDA`` targets with :prop_tgt:`CUDA_SEPARABLE_COMPILATION` enabled are now
+  correctly generated in non-root directories.
diff --git a/Source/cmMakefileTargetGenerator.cxx b/Source/cmMakefileTargetGenerator.cxx
index 6d8376c..6324b2e 100644
--- a/Source/cmMakefileTargetGenerator.cxx
+++ b/Source/cmMakefileTargetGenerator.cxx
@@ -1484,14 +1484,18 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
   }
 
   std::vector<std::string> architectures = cmExpandedList(architecturesStr);
+  std::string const& relPath =
+    this->LocalGenerator->GetHomeRelativeOutputPath();
 
   // Ensure there are no duplicates.
   const std::vector<std::string> linkDeps = [&]() -> std::vector<std::string> {
     std::vector<std::string> deps;
     this->AppendTargetDepends(deps, true);
     this->GeneratorTarget->GetLinkDepends(deps, this->GetConfigName(), "CUDA");
-    std::copy(this->Objects.begin(), this->Objects.end(),
-              std::back_inserter(deps));
+
+    for (std::string const& obj : this->Objects) {
+      deps.emplace_back(cmStrCat(relPath, obj));
+    }
 
     std::unordered_set<std::string> depsSet(deps.begin(), deps.end());
     deps.clear();
@@ -1510,7 +1514,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
 
   std::string profiles;
   std::vector<std::string> fatbinaryDepends;
-  std::string registerFile = cmStrCat(objectDir, "cmake_cuda_register.h");
+  std::string const registerFile =
+    cmStrCat(objectDir, "cmake_cuda_register.h");
 
   // Link device code for each architecture.
   for (const std::string& architectureKind : architectures) {
@@ -1518,7 +1523,7 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
     const std::string architecture =
       architectureKind.substr(0, architectureKind.find('-'));
     const std::string cubin =
-      cmStrCat(relObjectDir, "sm_", architecture, ".cubin");
+      cmStrCat(objectDir, "sm_", architecture, ".cubin");
 
     profiles += cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
     fatbinaryDepends.emplace_back(cubin);
@@ -1530,8 +1535,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
     // all architectures the register file will be the same too. Thus
     // generate it only on the first invocation to reduce overhead.
     if (fatbinaryDepends.size() == 1) {
-      std::string registerFileRel =
-        this->LocalGenerator->MaybeRelativeToCurBinDir(registerFile);
+      std::string const registerFileRel =
+        cmStrCat(relPath, relObjectDir, "cmake_cuda_register.h");
       registerFileCmd =
         cmStrCat(" --register-link-binaries=", registerFileRel);
       cleanFiles.push_back(registerFileRel);
@@ -1555,7 +1560,7 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
   const std::string fatbinaryOutput =
     cmStrCat(objectDir, "cmake_cuda_fatbin.h");
   const std::string fatbinaryOutputRel =
-    this->LocalGenerator->MaybeRelativeToCurBinDir(fatbinaryOutput);
+    cmStrCat(relPath, relObjectDir, "cmake_cuda_fatbin.h");
 
   this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr,
                                       fatbinaryOutputRel, fatbinaryDepends,
@@ -1583,9 +1588,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule(
                                                compileCmd, vars);
 
   commands.emplace_back(compileCmd);
-  this->LocalGenerator->WriteMakeRule(
-    *this->BuildFileStream, nullptr, output,
-    { cmStrCat(relObjectDir, "cmake_cuda_fatbin.h") }, commands, false);
+  this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, output,
+                                      { fatbinaryOutputRel }, commands, false);
 
   // Clean all the possible executable names and symlinks.
   this->CleanFiles.insert(cleanFiles.begin(), cleanFiles.end());
diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt
index fdb7a6e..a3fb409 100644
--- a/Tests/CudaOnly/CMakeLists.txt
+++ b/Tests/CudaOnly/CMakeLists.txt
@@ -15,7 +15,7 @@ add_cuda_test_macro(CudaOnly.ToolkitBeforeLang CudaOnlyToolkitBeforeLang)
 add_cuda_test_macro(CudaOnly.WithDefs CudaOnlyWithDefs)
 add_cuda_test_macro(CudaOnly.CircularLinkLine CudaOnlyCircularLinkLine)
 add_cuda_test_macro(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
-add_cuda_test_macro(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
+add_cuda_test_macro(CudaOnly.SeparateCompilation main/CudaOnlySeparateCompilation)
 
 if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang")
   # Clang doesn't have flags for selecting the runtime.
diff --git a/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt b/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt
index 864ecbf..17069e3 100644
--- a/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt
+++ b/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt
@@ -34,26 +34,9 @@ add_library(CUDASeparateLibB STATIC file4.cu file5.cu)
 target_compile_features(CUDASeparateLibB PRIVATE cuda_std_11)
 target_link_libraries(CUDASeparateLibB PRIVATE CUDASeparateLibA)
 
-add_executable(CudaOnlySeparateCompilation main.cu)
-target_link_libraries(CudaOnlySeparateCompilation
-                      PRIVATE CUDASeparateLibB)
-set_target_properties(CudaOnlySeparateCompilation PROPERTIES CUDA_STANDARD 11)
-set_target_properties(CudaOnlySeparateCompilation PROPERTIES CUDA_STANDARD_REQUIRED TRUE)
-
 set_target_properties(CUDASeparateLibA
                       CUDASeparateLibB
                       PROPERTIES CUDA_SEPARABLE_COMPILATION ON
                       POSITION_INDEPENDENT_CODE ON)
 
-if (CMAKE_GENERATOR MATCHES "^Visual Studio")
-  #Visual Studio CUDA integration will not perform device linking
-  #on a target that itself does not have GenerateRelocatableDeviceCode
-  #enabled.
-  set_target_properties(CudaOnlySeparateCompilation
-                        PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-endif()
-
-if(APPLE)
-  # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
-  set_property(TARGET CudaOnlySeparateCompilation PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-endif()
+add_subdirectory(main)
diff --git a/Tests/CudaOnly/SeparateCompilation/main.cu b/Tests/CudaOnly/SeparateCompilation/main.cu
deleted file mode 100644
index 40dbe5d..0000000
--- a/Tests/CudaOnly/SeparateCompilation/main.cu
+++ /dev/null
@@ -1,68 +0,0 @@
-
-#include <iostream>
-
-#include "file1.h"
-#include "file2.h"
-
-int file4_launch_kernel(int x);
-int file5_launch_kernel(int x);
-
-int choose_cuda_device()
-{
-  int nDevices = 0;
-  cudaError_t err = cudaGetDeviceCount(&nDevices);
-  if (err != cudaSuccess) {
-    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
-              << std::endl;
-    return 1;
-  }
-  for (int i = 0; i < nDevices; ++i) {
-    cudaDeviceProp prop;
-    cudaError_t err = cudaGetDeviceProperties(&prop, i);
-    if (err != cudaSuccess) {
-      std::cerr << "Could not retrieve properties from CUDA device " << i
-                << std::endl;
-      return 1;
-    }
-    if (prop.major >= 3) {
-      err = cudaSetDevice(i);
-      if (err != cudaSuccess) {
-        std::cout << "Could not select CUDA device " << i << std::endl;
-      } else {
-        return 0;
-      }
-    }
-  }
-
-  std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
-            << std::endl;
-
-  return 1;
-}
-
-int main(int argc, char** argv)
-{
-  int ret = choose_cuda_device();
-  if (ret) {
-    return 0;
-  }
-
-  cudaError_t err;
-  file4_launch_kernel(42);
-  err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    std::cerr << "file4_launch_kernel: kernel launch failed: "
-              << cudaGetErrorString(err) << std::endl;
-    return 1;
-  }
-
-  file5_launch_kernel(42);
-  err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    std::cerr << "file5_launch_kernel: kernel launch failed: "
-              << cudaGetErrorString(err) << std::endl;
-    return 1;
-  }
-
-  return 0;
-}
diff --git a/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt b/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt
new file mode 100644
index 0000000..c181078
--- /dev/null
+++ b/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_executable(CudaOnlySeparateCompilation main.cu)
+target_link_libraries(CudaOnlySeparateCompilation PRIVATE CUDASeparateLibB)
+set_target_properties(CudaOnlySeparateCompilation PROPERTIES
+  CUDA_STANDARD 11
+  CUDA_STANDARD_REQUIRED TRUE
+)
+
+if(CMAKE_GENERATOR MATCHES "^Visual Studio")
+  # Visual Studio CUDA integration will not perform device linking
+  # on a target that itself does not have GenerateRelocatableDeviceCode
+  # enabled.
+  set_property(TARGET CudaOnlySeparateCompilation PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+endif()
+
+if(APPLE)
+  # Help the static cuda runtime find the driver (libcuda.dylib) at runtime.
+  set_property(TARGET CudaOnlySeparateCompilation PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/SeparateCompilation/main/main.cu b/Tests/CudaOnly/SeparateCompilation/main/main.cu
new file mode 100644
index 0000000..2b6e8f4
--- /dev/null
+++ b/Tests/CudaOnly/SeparateCompilation/main/main.cu
@@ -0,0 +1,68 @@
+
+#include <iostream>
+
+#include "../file1.h"
+#include "../file2.h"
+
+int file4_launch_kernel(int x);
+int file5_launch_kernel(int x);
+
+int choose_cuda_device() // returns 0 once a device is set, else 1
+{
+  int nDevices = 0;
+  cudaError_t err = cudaGetDeviceCount(&nDevices);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
+              << std::endl;
+    return 1;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    cudaDeviceProp prop;
+    cudaError_t err = cudaGetDeviceProperties(&prop, i); // shadows outer err
+    if (err != cudaSuccess) {
+      std::cerr << "Could not retrieve properties from CUDA device " << i
+                << std::endl;
+      return 1;
+    }
+    if (prop.major >= 3) { // require compute major version >= 3
+      err = cudaSetDevice(i);
+      if (err != cudaSuccess) { // on failure, try the next device
+        std::cout << "Could not select CUDA device " << i << std::endl;
+      } else {
+        return 0; // first device that can be selected wins
+      }
+    }
+  }
+
+  std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
+            << std::endl;
+
+  return 1;
+}
+
+int main(int argc, char** argv) // returns 1 if a kernel launch check fails
+{
+  int ret = choose_cuda_device();
+  if (ret) {
+    return 0; // no device selected: exit 0 without launching
+  }
+
+  cudaError_t err;
+  file4_launch_kernel(42);
+  err = cudaGetLastError(); // check the file4 call for a CUDA error
+  if (err != cudaSuccess) {
+    std::cerr << "file4_launch_kernel: kernel launch failed: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  file5_launch_kernel(42);
+  err = cudaGetLastError(); // check the file5 call for a CUDA error
+  if (err != cudaSuccess) {
+    std::cerr << "file5_launch_kernel: kernel launch failed: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
-- 
cgit v0.12