From 41eab150a8ef42bbebff18ff84652e9da1ef4e75 Mon Sep 17 00:00:00 2001
From: Robert Maynard <robert.maynard@kitware.com>
Date: Tue, 27 Mar 2018 14:59:34 -0400
Subject: CUDA: Pass more link libraries to device linking

Previously we dropped non-target items from the device link line because
nvcc rejects paths to shared library files, and only with target items
do we know the kind of library.  However, this also prevents projects
from linking to system-provided libraries like `cublas_device` that
contain device code.

Fix this by passing more link items to device linking.  Items that are
not file paths, such as `-lfoo`, can simply be passed unconditionally.
Items that are targets known to be shared libraries can still be
skipped.  Items that are paths to library files can be passed directly
if they end in `.a`.  Otherwise, pass them using `-Xnvlink` to bypass
nvcc's front-end.  The nvlink tool knows to ignore shared library files.

Issue: #16317
---
 Source/cmLinkLineDeviceComputer.cxx                | 54 +++++++--------
 Source/cmNinjaNormalTargetGenerator.cxx            |  2 +-
 Tests/CudaOnly/CMakeLists.txt                      |  1 +
 .../LinkSystemDeviceLibraries/CMakeLists.txt       | 15 +++++
 Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu   | 77 ++++++++++++++++++++++
 5 files changed, 118 insertions(+), 31 deletions(-)
 create mode 100644 Tests/CudaOnly/LinkSystemDeviceLibraries/CMakeLists.txt
 create mode 100644 Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu
diff --git a/Source/cmLinkLineDeviceComputer.cxx b/Source/cmLinkLineDeviceComputer.cxx
index 3beeae3..557fa41 100644
--- a/Source/cmLinkLineDeviceComputer.cxx
+++ b/Source/cmLinkLineDeviceComputer.cxx
@@ -3,9 +3,9 @@
 
 #include "cmLinkLineDeviceComputer.h"
 
-#include <set>
 #include <sstream>
 
+#include "cmAlgorithms.h"
 #include "cmComputeLinkInformation.h"
 #include "cmGeneratorTarget.h"
 #include "cmGlobalNinjaGenerator.h"
@@ -32,38 +32,32 @@ std::string cmLinkLineDeviceComputer::ComputeLinkLibraries(
   ItemVector const& items = cli.GetItems();
   std::string config = cli.GetConfig();
   for (auto const& item : items) {
-    if (!item.Target) {
-      continue;
-    }
-
-    bool skippable = false;
-    switch (item.Target->GetType()) {
-      case cmStateEnums::SHARED_LIBRARY:
-      case cmStateEnums::MODULE_LIBRARY:
-      case cmStateEnums::INTERFACE_LIBRARY:
-        skippable = true;
-        break;
-      case cmStateEnums::STATIC_LIBRARY:
-        // If a static library is resolving its device linking, it should
-        // be removed for other device linking
-        skippable =
-          item.Target->GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS");
-        break;
-      default:
-        break;
-    }
-
-    if (skippable) {
-      continue;
-    }
-
-    std::set<std::string> langs;
-    item.Target->GetLanguages(langs, config);
-    if (langs.count("CUDA") == 0) {
-      continue;
+    if (item.Target) {
+      bool skip = false;
+      switch (item.Target->GetType()) {
+        case cmStateEnums::MODULE_LIBRARY:
+        case cmStateEnums::INTERFACE_LIBRARY:
+          skip = true;
+          break;
+        case cmStateEnums::STATIC_LIBRARY:
+          skip = item.Target->GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS");
+          break;
+        default:
+          break;
+      }
+      if (skip) {
+        continue;
+      }
     }
 
     if (item.IsPath) {
+      // nvcc understands absolute paths to libraries ending in '.a' should
+      // be passed to nvlink.  Other extensions like '.so' or '.dylib' are
+      // rejected by the nvcc front-end even though nvlink knows to ignore
+      // them.  Bypass the front-end via '-Xnvlink'.
+      if (!cmHasLiteralSuffix(item.Value, ".a")) {
+        fout << "-Xnvlink ";
+      }
       fout << this->ConvertToOutputFormat(
         this->ConvertToLinkReference(item.Value));
     } else {
diff --git a/Source/cmNinjaNormalTargetGenerator.cxx b/Source/cmNinjaNormalTargetGenerator.cxx
index f1fb2d2..52e3677 100644
--- a/Source/cmNinjaNormalTargetGenerator.cxx
+++ b/Source/cmNinjaNormalTargetGenerator.cxx
@@ -187,7 +187,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule(bool useResponseFile)
     std::string responseFlag;
     if (!useResponseFile) {
       vars.Objects = "$in";
-      vars.LinkLibraries = "$LINK_LIBRARIES";
+      vars.LinkLibraries = "$LINK_PATH $LINK_LIBRARIES";
     } else {
       std::string cmakeVarLang = "CMAKE_";
       cmakeVarLang += this->TargetLinkLanguage;
diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt
index 5ad6e6b..565baca 100644
--- a/Tests/CudaOnly/CMakeLists.txt
+++ b/Tests/CudaOnly/CMakeLists.txt
@@ -2,6 +2,7 @@
 ADD_TEST_MACRO(CudaOnly.EnableStandard CudaOnlyEnableStandard)
 ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX)
 ADD_TEST_MACRO(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag)
+ADD_TEST_MACRO(CudaOnly.LinkSystemDeviceLibraries CudaOnlyLinkSystemDeviceLibraries)
 ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
 ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
 ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs)
diff --git a/Tests/CudaOnly/LinkSystemDeviceLibraries/CMakeLists.txt b/Tests/CudaOnly/LinkSystemDeviceLibraries/CMakeLists.txt
new file mode 100644
index 0000000..62be1e6
--- /dev/null
+++ b/Tests/CudaOnly/LinkSystemDeviceLibraries/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.8)
+project(CudaOnlyLinkSystemDeviceLibraries CUDA)
+
+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_35,code=compute_35 -gencode arch=compute_35,code=sm_35")
+set(CMAKE_CUDA_STANDARD 11)
+
+add_executable(CudaOnlyLinkSystemDeviceLibraries main.cu)
+set_target_properties( CudaOnlyLinkSystemDeviceLibraries
+                       PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries( CudaOnlyLinkSystemDeviceLibraries PRIVATE cublas_device)
+
+if(APPLE)
+  # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
+  set_property(TARGET CudaOnlyLinkSystemDeviceLibraries PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu b/Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu
new file mode 100644
index 0000000..7eecec1
--- /dev/null
+++ b/Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu
@@ -0,0 +1,77 @@
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+__global__ void deviceCublasSgemm(int n, float alpha, float beta,
+                                  const float* d_A, const float* d_B,
+                                  float* d_C)
+{
+  cublasHandle_t cnpHandle;
+  cublasStatus_t status = cublasCreate(&cnpHandle);
+
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    return;
+  }
+
+  // Call function defined in the cublas_device system static library.
+  // This way we can verify that we properly pass system libraries to the
+  // device link line
+  status = cublasSgemm(cnpHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha,
+                       d_A, n, d_B, n, &beta, d_C, n);
+
+  cublasDestroy(cnpHandle);
+}
+
+int choose_cuda_device()
+{
+  int nDevices = 0;
+  cudaError_t err = cudaGetDeviceCount(&nDevices);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
+              << std::endl;
+    return 1;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    cudaDeviceProp prop;
+    cudaError_t err = cudaGetDeviceProperties(&prop, i);
+    if (err != cudaSuccess) {
+      std::cerr << "Could not retrieve properties from CUDA device " << i
+                << std::endl;
+      return 1;
+    }
+
+    if (prop.major > 3 || (prop.major == 3 && prop.minor >= 5)) {
+      err = cudaSetDevice(i);
+      if (err != cudaSuccess) {
+        std::cout << "Could not select CUDA device " << i << std::endl;
+      } else {
+        return 0;
+      }
+    }
+  }
+
+  std::cout << "Could not find a CUDA enabled card supporting compute >=3.5"
+            << std::endl;
+  return 1;
+}
+
+int main(int argc, char** argv)
+{
+  int ret = choose_cuda_device();
+  if (ret) {
+    return 0;
+  }
+
+  // initial values that will make sure that the cublasSgemm won't actually
+  // do any work
+  int n = 0;
+  float alpha = 1;
+  float beta = 1;
+  float* d_A = nullptr;
+  float* d_B = nullptr;
+  float* d_C = nullptr;
+  deviceCublasSgemm<<<1, 1>>>(n, alpha, beta, d_A, d_B, d_C);
+
+  return 0;
+}
-- 
cgit v0.12