summaryrefslogtreecommitdiffstats
path: root/Tests/CudaOnly/SharedRuntimePlusToolkit
diff options
context:
space:
mode:
Diffstat (limited to 'Tests/CudaOnly/SharedRuntimePlusToolkit')
-rw-r--r--Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt42
-rw-r--r--Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu65
-rw-r--r--Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu23
-rw-r--r--Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu16
-rw-r--r--Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu92
-rw-r--r--Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu16
-rw-r--r--Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu16
7 files changed, 270 insertions, 0 deletions
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt b/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt
new file mode 100644
index 0000000..03fba22
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt
@@ -0,0 +1,42 @@
+cmake_minimum_required(VERSION 3.15)
+project(SharedRuntimePlusToolkit CUDA)
+
+#Goal for this example:
+# Validate that with c++ we can use some components of the CUDA toolkit, and
+# specify the cuda runtime
+find_package(CUDAToolkit REQUIRED)
+
+add_library(Common OBJECT curand.cu nppif.cu)
+target_link_libraries(Common PRIVATE CUDA::toolkit)
+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+#shared runtime with shared toolkit libraries
+add_library(SharedToolkit SHARED shared.cu)
+target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
+set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)
+
+# The CUDA only ships the shared version of the toolkit libraries
+# on windows
+if(NOT WIN32)
+ #shared runtime with static toolkit libraries
+ add_library(StaticToolkit SHARED static.cu)
+ target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
+ set_target_properties(StaticToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
+
+ #static runtime with mixed toolkit libraries
+ add_library(MixedToolkit SHARED mixed.cu)
+ target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
+ set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
+endif()
+
+add_executable(CudaOnlySharedRuntimePlusToolkit main.cu)
+target_link_libraries(CudaOnlySharedRuntimePlusToolkit PRIVATE SharedToolkit
+ $<TARGET_NAME_IF_EXISTS:StaticToolkit>
+ $<TARGET_NAME_IF_EXISTS:MixedToolkit>)
+
+if(UNIX)
+ # Help the shared cuda runtime find libcudart as it is not located
+ # in a default system searched location
+ set_property(TARGET CudaOnlySharedRuntimePlusToolkit PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu
new file mode 100644
index 0000000..fdd7b53
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu
@@ -0,0 +1,65 @@
+// Comes from:
+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
+
+#ifdef _WIN32
+# define EXPORT __declspec(dllexport)
+#else
+# define EXPORT
+#endif
+
+/*
+ * This program uses the host CURAND API to generate 100
+ * pseudorandom floats.
+ */
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CUDA_CALL(x) \
+ do { \
+ if ((x) != cudaSuccess) { \
+ printf("Error at %s:%d\n", __FILE__, __LINE__); \
+ return EXIT_FAILURE; \
+ } \
+ } while (0)
+#define CURAND_CALL(x) \
+ do { \
+ if ((x) != CURAND_STATUS_SUCCESS) { \
+ printf("Error at %s:%d\n", __FILE__, __LINE__); \
+ return EXIT_FAILURE; \
+ } \
+ } while (0)
+
+EXPORT int curand_main()
+{
+ size_t n = 100;
+ size_t i;
+ curandGenerator_t gen;
+ float *devData, *hostData;
+
+ /* Allocate n floats on host */
+ hostData = (float*)calloc(n, sizeof(float));
+
+ /* Allocate n floats on device */
+ CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
+
+ /* Create pseudo-random number generator */
+ CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+
+ /* Set seed */
+ CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
+
+ /* Generate n floats on device */
+ CURAND_CALL(curandGenerateUniform(gen, devData, n));
+
+ /* Copy device memory to host */
+ CUDA_CALL(
+ cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
+
+ /* Cleanup */
+ CURAND_CALL(curandDestroyGenerator(gen));
+ CUDA_CALL(cudaFree(devData));
+ free(hostData);
+ return EXIT_SUCCESS;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu
new file mode 100644
index 0000000..2a4da22
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu
@@ -0,0 +1,23 @@
+
+#ifdef _WIN32
+# define IMPORT __declspec(dllimport)
+IMPORT int shared_version();
+int static_version()
+{
+ return 0;
+}
+int mixed_version()
+{
+ return 0;
+}
+#else
+int shared_version();
+int static_version();
+int mixed_version();
+#endif
+
+int main()
+{
+ return mixed_version() == 0 && shared_version() == 0 &&
+ static_version() == 0;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu
new file mode 100644
index 0000000..6de6886
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu
@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+# define IMPORT __declspec(dllimport)
+# define EXPORT __declspec(dllexport)
+#else
+# define IMPORT
+# define EXPORT
+#endif
+
+IMPORT int curand_main();
+IMPORT int nppif_main();
+
+EXPORT int mixed_version()
+{
+ return curand_main() == 0 && nppif_main() == 0;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu
new file mode 100644
index 0000000..ac5341c
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu
@@ -0,0 +1,92 @@
+// Comes from
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
+
+#ifdef _WIN32
+# define EXPORT __declspec(dllexport)
+#else
+# define EXPORT
+#endif
+
+#include <cstdio>
+#include <iostream>
+
+#include <assert.h>
+#include <cuda_runtime_api.h>
+#include <nppi_filtering_functions.h>
+
+EXPORT int nppif_main()
+{
+ /**
+ * 8-bit unsigned single-channel 1D row convolution.
+ */
+ const int simgrows = 32;
+ const int simgcols = 32;
+ Npp8u *d_pSrc, *d_pDst;
+ const int nMaskSize = 3;
+ NppiSize oROI;
+ oROI.width = simgcols - nMaskSize;
+ oROI.height = simgrows;
+ const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
+ const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
+ const int simgpix = simgrows * simgcols;
+ const int dimgpix = oROI.width * oROI.height;
+ const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
+ const int nDstStep = oROI.width * sizeof(d_pDst[0]);
+ const int pixval = 1;
+ const int nDivisor = 1;
+ const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
+ Npp32s* d_pKernel;
+ const Npp32s nAnchor = 2;
+ cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
+ return 1;
+ }
+ err = cudaMalloc((void**)&d_pDst, dimgsize);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
+ return 1;
+ }
+ err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
+ if (err != cudaSuccess) {
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
+ return 1;
+ }
+ // set image to pixval initially
+ err = cudaMemset(d_pSrc, pixval, simgsize);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
+ return 1;
+ }
+ err = cudaMemset(d_pDst, 0, dimgsize);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
+ return 1;
+ }
+ err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
+ cudaMemcpyHostToDevice);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
+ return 1;
+ }
+ // copy src to dst
+ NppStatus ret =
+ nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
+ nMaskSize, nAnchor, nDivisor);
+ assert(ret == NPP_NO_ERROR);
+ Npp8u* h_imgres = new Npp8u[dimgpix];
+ err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "Cuda error %d\n", __LINE__);
+ return 1;
+ }
+ // test for filtering
+ for (int i = 0; i < dimgpix; i++) {
+ if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
+ fprintf(stderr, "h_imgres at index %d failed to match\n", i);
+ return 1;
+ }
+ }
+
+ return 0;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu
new file mode 100644
index 0000000..f3c3dbc
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu
@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+# define IMPORT __declspec(dllimport)
+# define EXPORT __declspec(dllexport)
+#else
+# define IMPORT
+# define EXPORT
+#endif
+
+int curand_main();
+int nppif_main();
+
+EXPORT int shared_version()
+{
+ return curand_main() == 0 && nppif_main() == 0;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu
new file mode 100644
index 0000000..6932fa3
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu
@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+# define IMPORT __declspec(dllimport)
+# define EXPORT __declspec(dllexport)
+#else
+# define IMPORT
+# define EXPORT
+#endif
+
+IMPORT int curand_main();
+IMPORT int nppif_main();
+
+EXPORT int static_version()
+{
+ return curand_main() == 0 && nppif_main() == 0;
+}