CUDA: Add abstraction for cuda runtime selection

Fixes #17559. Replace our hard-coded default of cudart=static with a first-class abstraction to select the runtime library from an enumeration of logical names.
author: Robert Maynard <robert.maynard@kitware.com> 2019-11-29 18:51:32 (GMT)
committer: Robert Maynard <robert.maynard@kitware.com> 2020-01-27 21:02:26 (GMT)
commit: 0d0145138fe7cd60edc7f0b97e860e9a4fae1555 (patch)
tree: c013d23f71ec3e8b0e1ccbb632d3cbb0a560d91d /Tests/CudaOnly/StaticRuntimePlusToolkit
parent: 4dbc9dfc7a1458878a26e1f0cec1a382e14bf48a (diff)
download: CMake-0d0145138fe7cd60edc7f0b97e860e9a4fae1555.zip
CMake-0d0145138fe7cd60edc7f0b97e860e9a4fae1555.tar.gz
CMake-0d0145138fe7cd60edc7f0b97e860e9a4fae1555.tar.bz2
7 files changed, 209 insertions, 0 deletions
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt b/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt
new file mode 100644
index 0000000..97ac229
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.15)
+project(StaticRuntimePlusToolkit CUDA)
+
+# Goal for this example:
+# Validate that CUDA targets can link components of the CUDA toolkit while
+# also choosing the CUDA runtime library through CUDA_RUNTIME_LIBRARY.
+find_package(CUDAToolkit REQUIRED)
+# Object files with the cuRAND and NPP checks, reused by every library below.
+add_library(Common OBJECT curand.cu nppif.cu)
+target_link_libraries(Common PRIVATE CUDA::toolkit)
+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# Static runtime linked by hand (CUDA::cudart_static) + shared toolkit libraries
+add_library(SharedToolkit SHARED shared.cu)
+target_link_libraries(SharedToolkit PRIVATE Common CUDA::curand CUDA::nppif )
+set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)
+
+# Static runtime (CUDA_RUNTIME_LIBRARY left unset) + static toolkit libraries
+add_library(StaticToolkit SHARED static.cu)
+target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
+
+# Static runtime (CUDA_RUNTIME_LIBRARY Static) + mixed toolkit libraries
+add_library(MixedToolkit SHARED mixed.cu)
+target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
+set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Static)
+# Executable that links all three library variants.
+add_executable(CudaOnlyStaticRuntimePlusToolkit main.cu)
+target_link_libraries(CudaOnlyStaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu
new file mode 100644
index 0000000..95872f0
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu
@@ -0,0 +1,59 @@
+// Comes from:
+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
+
+/*
+ * This program uses the host CURAND API to generate 100
+ * pseudorandom floats.
+ */
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* CUDA_CALL / CURAND_CALL: run a CUDA runtime or cuRAND call. On failure,
+ * print the location, set `status` to EXIT_FAILURE and `goto cleanup`. */
+#define CUDA_CALL(x)                                                          \
+  do {                                                                        \
+    if ((x) != cudaSuccess) {                                                 \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      status = EXIT_FAILURE;                                                  \
+      goto cleanup;                                                           \
+    }                                                                         \
+  } while (0)
+#define CURAND_CALL(x)                                                        \
+  do {                                                                        \
+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      status = EXIT_FAILURE;                                                  \
+      goto cleanup;                                                           \
+    }                                                                         \
+  } while (0)
+
+/* Generate 100 floats with cuRAND; returns EXIT_SUCCESS or EXIT_FAILURE. */
+int curand_main()
+{
+  const size_t n = 100;
+  int status = EXIT_SUCCESS;
+  curandGenerator_t gen = NULL;
+  float* devData = NULL;
+  float* hostData = (float*)calloc(n, sizeof(float));
+  if (hostData == NULL) {
+    return EXIT_FAILURE; /* nothing else has been acquired yet */
+  }
+  /* Any failure below jumps to cleanup, which frees what was acquired. */
+  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
+  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
+  CUDA_CALL(
+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
+cleanup:
+  if (gen != NULL && curandDestroyGenerator(gen) != CURAND_STATUS_SUCCESS) {
+    status = EXIT_FAILURE;
+  }
+  if (devData != NULL && cudaFree(devData) != cudaSuccess) {
+    status = EXIT_FAILURE;
+  }
+  free(hostData);
+  return status;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu
new file mode 100644
index 0000000..5a09f8e
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu
@@ -0,0 +1,11 @@
+// Each *_version() returns non-zero when both of its toolkit checks passed
+// (see mixed.cu, shared.cu and static.cu).
+int shared_version();
+int static_version();
+int mixed_version();
+
+int main()
+{
+  // Exit code 0 (success) only if every library variant passed.
+  return (mixed_version() && shared_version() && static_version()) ? 0 : 1;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu
new file mode 100644
index 0000000..a05140d
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu
@@ -0,0 +1,8 @@
+// Returns 1 when curand_main() and nppif_main() both return 0, otherwise 0.
+int curand_main();
+int nppif_main();
+int mixed_version()
+{
+  // nppif_main() only runs after curand_main() returned 0.
+  return (curand_main() != 0 || nppif_main() != 0) ? 0 : 1;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu
new file mode 100644
index 0000000..2871090
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu
@@ -0,0 +1,86 @@
+// Comes from
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
+
+#include <cstdio>
+#include <iostream>
+
+#include <assert.h>
+#include <cuda_runtime_api.h>
+#include <nppi_filtering_functions.h>
+
+// Reports a failed CUDA call (with its source line) on stderr; true if OK.
+static bool nppif_cuda_ok(cudaError_t err, int line)
+{
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", line);
+  }
+  return err == cudaSuccess;
+}
+
+// Runs an 8-bit unsigned single-channel 1D row convolution with NPP on a
+// constant image and checks every filtered pixel on the host. Returns 0 on
+// success, 1 on any CUDA/NPP error or mismatch; buffers are always released.
+int nppif_main()
+{
+  const int simgrows = 32;
+  const int simgcols = 32;
+  const int nMaskSize = 3;
+  const int pixval = 1;
+  const int nDivisor = 1;
+  const Npp32s nAnchor = 2;
+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
+  NppiSize oROI;
+  oROI.width = simgcols - nMaskSize;
+  oROI.height = simgrows;
+  Npp8u* d_pSrc = NULL;
+  Npp8u* d_pDst = NULL;
+  Npp32s* d_pKernel = NULL;
+  Npp8u* h_imgres = NULL;
+  const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
+  const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
+  const int dimgpix = oROI.width * oROI.height;
+  const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
+  const int nDstStep = oROI.width * sizeof(d_pDst[0]);
+  const size_t kernelsize = nMaskSize * sizeof(d_pKernel[0]);
+
+  // Allocate the device buffers, set the source image to pixval, clear the
+  // destination and upload the kernel; && stops at the first failing call.
+  bool ok = nppif_cuda_ok(cudaMalloc((void**)&d_pSrc, simgsize), __LINE__) &&
+    nppif_cuda_ok(cudaMalloc((void**)&d_pDst, dimgsize), __LINE__) &&
+    nppif_cuda_ok(cudaMalloc((void**)&d_pKernel, kernelsize), __LINE__) &&
+    nppif_cuda_ok(cudaMemset(d_pSrc, pixval, simgsize), __LINE__) &&
+    nppif_cuda_ok(cudaMemset(d_pDst, 0, dimgsize), __LINE__) &&
+    nppif_cuda_ok(cudaMemcpy(d_pKernel, h_pKernel, kernelsize,
+                             cudaMemcpyHostToDevice),
+                  __LINE__);
+
+  if (ok) {
+    // Checked at run time: assert() would be compiled out under NDEBUG.
+    const NppStatus ret =
+      nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI,
+                           d_pKernel, nMaskSize, nAnchor, nDivisor);
+    if (ret != NPP_NO_ERROR) {
+      fprintf(stderr, "NPP error %d\n", static_cast<int>(ret));
+      ok = false;
+    }
+  }
+  if (ok) {
+    h_imgres = new Npp8u[dimgpix];
+    ok = nppif_cuda_ok(cudaMemcpy(h_imgres, d_pDst, dimgsize,
+                                  cudaMemcpyDeviceToHost),
+                       __LINE__);
+  }
+  // test for filtering
+  for (int i = 0; ok && i < dimgpix; i++) {
+    if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
+      ok = false;
+    }
+  }
+  // Release everything; delete[] and cudaFree accept null pointers.
+  delete[] h_imgres;
+  ok = nppif_cuda_ok(cudaFree(d_pKernel), __LINE__) && ok;
+  ok = nppif_cuda_ok(cudaFree(d_pDst), __LINE__) && ok;
+  ok = nppif_cuda_ok(cudaFree(d_pSrc), __LINE__) && ok;
+  return ok ? 0 : 1;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu
new file mode 100644
index 0000000..9967b66
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu
@@ -0,0 +1,8 @@
+// Returns 1 when curand_main() and nppif_main() both return 0, otherwise 0.
+int curand_main();
+int nppif_main();
+int shared_version()
+{
+  // nppif_main() only runs after curand_main() returned 0.
+  return (curand_main() != 0 || nppif_main() != 0) ? 0 : 1;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu
new file mode 100644
index 0000000..ca7eb4c
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu
@@ -0,0 +1,8 @@
+// Returns 1 when curand_main() and nppif_main() both return 0, otherwise 0.
+int curand_main();
+int nppif_main();
+int static_version()
+{
+  // nppif_main() only runs after curand_main() returned 0.
+  return (curand_main() != 0 || nppif_main() != 0) ? 0 : 1;
+}
author	Robert Maynard <robert.maynard@kitware.com>	2019-11-29 18:51:32 (GMT)
committer	Robert Maynard <robert.maynard@kitware.com>	2020-01-27 21:02:26 (GMT)
commit	0d0145138fe7cd60edc7f0b97e860e9a4fae1555 (patch)
tree	c013d23f71ec3e8b0e1ccbb632d3cbb0a560d91d /Tests/CudaOnly/StaticRuntimePlusToolkit
parent	4dbc9dfc7a1458878a26e1f0cec1a382e14bf48a (diff)
download	CMake-0d0145138fe7cd60edc7f0b97e860e9a4fae1555.zip CMake-0d0145138fe7cd60edc7f0b97e860e9a4fae1555.tar.gz CMake-0d0145138fe7cd60edc7f0b97e860e9a4fae1555.tar.bz2