5 files changed, 125 insertions, 0 deletions
diff --git a/Tests/CudaOnly/DeviceLTO/CMakeLists.txt b/Tests/CudaOnly/DeviceLTO/CMakeLists.txt
new file mode 100644
index 0000000..653b35d
--- /dev/null
+++ b/Tests/CudaOnly/DeviceLTO/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.18)
+project(DeviceLTO LANGUAGES CUDA)
+
+# Goal:
+#  * verify that CUDA sources compile and link with device LTO enabled;
+#  * verify that the device LTO requirements of a library are propagated
+#    to the final device link line of the executable using it.
+
+# Both targets use separable compilation and target every architecture in
+# CMAKE_CUDA_ARCHITECTURES_ALL.
+
+# Static library holding launch_kernel() and the device code it needs.
+add_library(CUDA_dlto STATIC file1.cu file2.cu file3.cu)
+set_target_properties(CUDA_dlto PROPERTIES
+  POSITION_INDEPENDENT_CODE ON
+  CUDA_SEPARABLE_COMPILATION ON
+  CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_ALL}")
+
+# Test driver; main.cu calls launch_kernel() from the library.
+add_executable(CudaOnlyDeviceLTO main.cu)
+set_target_properties(CudaOnlyDeviceLTO PROPERTIES
+  CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_ALL}"
+  CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries(CudaOnlyDeviceLTO PRIVATE CUDA_dlto)
+
+# Enable device LTO on the library only when the CUDA toolchain supports IPO.
+include(CheckIPOSupported)
+check_ipo_supported(RESULT cuda_ipo_ok LANGUAGES CUDA)
+if(cuda_ipo_ok)
+  set_property(TARGET CUDA_dlto PROPERTY INTERPROCEDURAL_OPTIMIZATION ON)
+
+  # Building non-LTO variants (i.e. virtual architectures) together with LTO
+  # ones makes the linker warn about missing device LTO for the virtual
+  # architectures.  Pass -w to the device link step of the executable so
+  # that these warnings are ignored.
+  target_link_options(CudaOnlyDeviceLTO PRIVATE "$<DEVICE_LINK:-w>")
+endif()
diff --git a/Tests/CudaOnly/DeviceLTO/file1.cu b/Tests/CudaOnly/DeviceLTO/file1.cu
new file mode 100644
index 0000000..703927c
--- /dev/null
+++ b/Tests/CudaOnly/DeviceLTO/file1.cu
@@ -0,0 +1,17 @@
+#if defined(_WIN32) // Windows: decorate launch_kernel() with dllexport
+#  define EXPORT __declspec(dllexport)
+#else // elsewhere EXPORT expands to nothing
+#  define EXPORT
+#endif
+
+extern __device__ int file2_func(int); // defined in file2.cu
+__global__ void kernel(int x)
+{
+  (void)file2_func(x); // device call into file2.cu; return value is not used
+}
+
+EXPORT int launch_kernel(int x) // host entry point, called from main.cu
+{
+  kernel<<<dim3(1), dim3(1)>>>(x); // one block containing one thread
+  return x;                        // hand the argument back unchanged
+}
diff --git a/Tests/CudaOnly/DeviceLTO/file2.cu b/Tests/CudaOnly/DeviceLTO/file2.cu
new file mode 100644
index 0000000..73d6468
--- /dev/null
+++ b/Tests/CudaOnly/DeviceLTO/file2.cu
@@ -0,0 +1,5 @@
+extern __device__ int file3_func(int); // defined in file3.cu
+__device__ int file2_func(int x)
+{
+  return file3_func(x) + x; // device call into file3.cu, plus x itself
+}
diff --git a/Tests/CudaOnly/DeviceLTO/file3.cu b/Tests/CudaOnly/DeviceLTO/file3.cu
new file mode 100644
index 0000000..235ac06
--- /dev/null
+++ b/Tests/CudaOnly/DeviceLTO/file3.cu
@@ -0,0 +1,4 @@
+__device__ int file3_func(int x) // called by file2_func() in file2.cu
+{
+  return (x * x) * x; // cube of x
+}
diff --git a/Tests/CudaOnly/DeviceLTO/main.cu b/Tests/CudaOnly/DeviceLTO/main.cu
new file mode 100644
index 0000000..8ef4873
--- /dev/null
+++ b/Tests/CudaOnly/DeviceLTO/main.cu
@@ -0,0 +1,62 @@
+#include <iostream>
+
+#include "cuda.h"
+
+// launch_kernel() is defined in file1.cu, part of the CUDA_dlto library.
+#if defined(_WIN32)
+#  define IMPORT __declspec(dllimport)
+#else
+#  define IMPORT
+#endif
+IMPORT int launch_kernel(int x);
+
+// Select the first CUDA device that cudaSetDevice() accepts.
+// Returns 0 once a device has been selected and 1 on any failure.
+int choose_cuda_device()
+{
+  int deviceCount = 0;
+  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
+              << std::endl;
+    return 1;
+  }
+
+  for (int dev = 0; dev < deviceCount; ++dev) {
+    cudaDeviceProp props;
+    if (cudaGetDeviceProperties(&props, dev) != cudaSuccess) {
+      std::cerr << "Could not retrieve properties from CUDA device " << dev
+                << std::endl;
+      return 1;
+    }
+    std::cout << "prop.major: " << props.major << std::endl;
+
+    if (cudaSetDevice(dev) == cudaSuccess) {
+      return 0;
+    }
+    // Not fatal: report it and try the next device.
+    std::cout << "Could not select CUDA device " << dev << std::endl;
+  }
+
+  std::cout << "Could not find a CUDA enabled card" << std::endl;
+  return 1;
+}
+
+int main()
+{
+  if (choose_cuda_device() != 0) {
+    return 0; // no usable CUDA device: nothing to verify, so do not fail
+  }
+
+  // Kernel launches are asynchronous, so also wait for the kernel to finish.
+  launch_kernel(1);
+  cudaError_t err = cudaGetLastError(); // errors raised by the launch itself
+  err = (err == cudaSuccess) ? cudaDeviceSynchronize() : err; // while running
+  if (err != cudaSuccess) {
+    std::cerr << "launch_kernel: kernel launch should have passed.\n "
+                 "Error message: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  return 0;
+}