26 files changed, 755 insertions, 17 deletions
diff --git a/Tests/Cuda/Complex/CMakeLists.txt b/Tests/Cuda/Complex/CMakeLists.txt
index d3d4b7c..08d1e16 100644
--- a/Tests/Cuda/Complex/CMakeLists.txt
+++ b/Tests/Cuda/Complex/CMakeLists.txt
@@ -22,18 +22,11 @@ set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
 set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
 
 add_library(CudaComplexCppBase SHARED dynamic.cpp)
-add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu)
-set_target_properties(CudaComplexSeperableLib
-                       PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-set_target_properties( CudaComplexSeperableLib
-                       PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
 add_library(CudaComplexSharedLib SHARED dynamic.cu)
 target_link_libraries(CudaComplexSharedLib PUBLIC CudaComplexCppBase)
 
+add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu)
 add_library(CudaComplexMixedLib SHARED mixed.cpp mixed.cu)
-set_target_properties(CudaComplexMixedLib
-                       PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 target_link_libraries(CudaComplexMixedLib
                       PUBLIC CudaComplexSharedLib
                       PRIVATE CudaComplexSeperableLib)
@@ -41,7 +34,27 @@ target_link_libraries(CudaComplexMixedLib
 add_executable(CudaComplex main.cpp)
 target_link_libraries(CudaComplex PUBLIC CudaComplexMixedLib)
 
+
+set_target_properties(CudaComplexMixedLib
+                      CudaComplexSeperableLib
+                      PROPERTIES
+                        POSITION_INDEPENDENT_CODE  ON
+                        CUDA_SEPARABLE_COMPILATION ON
+                      )
+set_target_properties(CudaComplexMixedLib
+                      CudaComplexSharedLib
+                      PROPERTIES
+                        CUDA_RUNTIME_LIBRARY shared
+                      )
+
+
 if(APPLE)
   # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
   set_property(TARGET CudaComplex PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 endif()
+
+if(UNIX)
+  # Help the shared cuda runtime find libcudart as it is not located
+  # in a default system searched location
+  set_property(TARGET CudaComplexMixedLib PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/Cuda/Complex/dynamic.cu b/Tests/Cuda/Complex/dynamic.cu
index 9da8853..7f2f2b5 100644
--- a/Tests/Cuda/Complex/dynamic.cu
+++ b/Tests/Cuda/Complex/dynamic.cu
@@ -54,17 +54,20 @@ EXPORT int choose_cuda_device()
   return 1;
 }
 
-EXPORT void cuda_dynamic_lib_func()
+EXPORT bool cuda_dynamic_lib_func()
 {
-  DetermineIfValidCudaDevice<<<1, 1>>>();
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
-    std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: "
+    std::cerr << "DetermineIfValidCudaDevice [Per Launch] failed: "
               << cudaGetErrorString(err) << std::endl;
+    return false;
   }
+  DetermineIfValidCudaDevice<<<1, 1>>>();
   err = cudaDeviceSynchronize();
   if (err != cudaSuccess) {
-    std::cerr << "DetermineIfValidCudaDevice [ASYNC] failed: "
+    std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: "
               << cudaGetErrorString(cudaGetLastError()) << std::endl;
+    return false;
   }
+  return true;
 }
diff --git a/Tests/Cuda/Complex/main.cpp b/Tests/Cuda/Complex/main.cpp
index 6ca5952..da09b44 100644
--- a/Tests/Cuda/Complex/main.cpp
+++ b/Tests/Cuda/Complex/main.cpp
@@ -22,5 +22,6 @@ int main(int argc, char** argv)
 
   int r1 = call_cuda_seperable_code(42);
   int r2 = mixed_launch_kernel(42);
+
   return (r1 == 42 || r2 == 42) ? 1 : 0;
 }
diff --git a/Tests/Cuda/Complex/mixed.cu b/Tests/Cuda/Complex/mixed.cu
index 5b85aec..76119ad 100644
--- a/Tests/Cuda/Complex/mixed.cu
+++ b/Tests/Cuda/Complex/mixed.cu
@@ -15,7 +15,7 @@
 result_type __device__ file1_func(int x);
 result_type_dynamic __device__ file2_func(int x);
 
-IMPORT void __host__ cuda_dynamic_lib_func();
+IMPORT bool __host__ cuda_dynamic_lib_func();
 
 static __global__ void mixed_kernel(result_type* r, int x)
 {
@@ -25,7 +25,9 @@ static __global__ void mixed_kernel(result_type* r, int x)
 
 EXPORT int mixed_launch_kernel(int x)
 {
-  cuda_dynamic_lib_func();
+  if (!cuda_dynamic_lib_func()) {
+    return x;
+  }
 
   result_type* r;
   cudaError_t err = cudaMallocManaged(&r, sizeof(result_type));
diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt
index a0575cd..cc1ee1a 100644
--- a/Tests/CudaOnly/CMakeLists.txt
+++ b/Tests/CudaOnly/CMakeLists.txt
@@ -5,10 +5,21 @@ ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX)
 ADD_TEST_MACRO(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag)
 ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
 ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
+ADD_TEST_MACRO(CudaOnly.SharedRuntimePlusToolkit CudaOnlySharedRuntimePlusToolkit)
 ADD_TEST_MACRO(CudaOnly.Standard98 CudaOnlyStandard98)
 ADD_TEST_MACRO(CudaOnly.Toolkit CudaOnlyToolkit)
 ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs)
 
+# The CUDA only ships the shared version of the toolkit libraries
+# on windows
+if(NOT WIN32)
+  ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit)
+endif()
+
+if(MSVC)
+  ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB)
+endif()
+
 add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
   ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
   --build-and-test
@@ -20,6 +31,14 @@ add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
   --test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
   )
 
-if(MSVC)
-  ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB)
-endif()
+add_test(NAME CudaOnly.RuntimeControls COMMAND
+  ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
+  --build-and-test
+  "${CMAKE_CURRENT_SOURCE_DIR}/RuntimeControls/"
+  "${CMAKE_CURRENT_BINARY_DIR}/RuntimeControls/"
+  --build-two-config
+    ${build_generator_args}
+  --build-project RuntimeControls
+  --build-options ${build_options}
+  --test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
+  )
diff --git a/Tests/CudaOnly/RuntimeControls/CMakeLists.txt b/Tests/CudaOnly/RuntimeControls/CMakeLists.txt
new file mode 100644
index 0000000..8b58fec
--- /dev/null
+++ b/Tests/CudaOnly/RuntimeControls/CMakeLists.txt
@@ -0,0 +1,60 @@
+cmake_minimum_required(VERSION 3.7)
+project (RuntimeControls CUDA)
+
+# Find nm and dumpbin
+if(CMAKE_NM)
+  set(dump_command ${CMAKE_NM})
+  set(dump_args -g)
+else()
+  include(GetPrerequisites)
+  message(STATUS "calling list_prerequisites to find dumpbin")
+  list_prerequisites("${CMAKE_COMMAND}" 0 0 0)
+  if(gp_dumpbin)
+    set(dump_command ${gp_dumpbin})
+    set(dump_args /ARCHIVEMEMBERS)
+  endif()
+endif()
+
+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=[compute_30]")
+
+set(CMAKE_CUDA_STANDARD 11)
+set(CMAKE_CUDA_RUNTIME_LIBRARY static)
+
+if(NOT "x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
+  add_library(UsesNoCudaRT SHARED file1.cu)
+  set_target_properties(UsesNoCudaRT PROPERTIES CUDA_RUNTIME_LIBRARY none)
+endif()
+
+add_library(UsesStaticCudaRT SHARED file2.cu)
+
+add_executable(CudaOnlyRuntimeControls main.cu)
+set_target_properties(CudaOnlyRuntimeControls PROPERTIES CUDA_RUNTIME_LIBRARY shared)
+
+target_link_libraries(CudaOnlyRuntimeControls PRIVATE $<TARGET_NAME_IF_EXISTS:UsesNoCudaRT> UsesStaticCudaRT)
+
+
+if(dump_command)
+  if(TARGET UsesNoCudaRT)
+    add_custom_command(TARGET UsesNoCudaRT POST_BUILD
+    COMMAND ${CMAKE_COMMAND}
+    -DDUMP_COMMAND=${dump_command}
+    -DDUMP_ARGS=${dump_args}
+    -DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesNoCudaRT>
+    -P ${CMAKE_CURRENT_SOURCE_DIR}/no_runtime.cmake
+    )
+  endif()
+  add_custom_command(TARGET UsesStaticCudaRT POST_BUILD
+    COMMAND ${CMAKE_COMMAND}
+    -DDUMP_COMMAND=${dump_command}
+    -DDUMP_ARGS=${dump_args}
+    -DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesStaticCudaRT>
+    -P ${CMAKE_CURRENT_SOURCE_DIR}/uses_static_runtime.cmake
+  )
+  string(REPLACE ";" "|" dirs "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
+  add_custom_command(TARGET CudaOnlyRuntimeControls POST_BUILD
+    COMMAND ${CMAKE_COMMAND}
+    -DEXEC_PATH=$<TARGET_FILE:CudaOnlyRuntimeControls>
+    -DEXTRA_LIB_DIRS="${dirs}"
+    -P ${CMAKE_CURRENT_SOURCE_DIR}/verify_runtime.cmake
+  )
+endif()
diff --git a/Tests/CudaOnly/RuntimeControls/file1.cu b/Tests/CudaOnly/RuntimeControls/file1.cu
new file mode 100644
index 0000000..28beb5e
--- /dev/null
+++ b/Tests/CudaOnly/RuntimeControls/file1.cu
@@ -0,0 +1,18 @@
+
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+void __global__ file1_kernel(int x, int& r)
+{
+  r = -x;
+}
+
+EXPORT int file1_launch_kernel(int x)
+{
+  int r = 0;
+  file1_kernel<<<1, 1>>>(x, r);
+  return r;
+}
diff --git a/Tests/CudaOnly/RuntimeControls/file2.cu b/Tests/CudaOnly/RuntimeControls/file2.cu
new file mode 100644
index 0000000..ff68a70
--- /dev/null
+++ b/Tests/CudaOnly/RuntimeControls/file2.cu
@@ -0,0 +1,18 @@
+
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+void __global__ file2_kernel(int x, int& r)
+{
+  r = -x;
+}
+
+EXPORT int file2_launch_kernel(int x)
+{
+  int r = 0;
+  file2_kernel<<<1, 1>>>(x, r);
+  return r;
+}
diff --git a/Tests/CudaOnly/RuntimeControls/main.cu b/Tests/CudaOnly/RuntimeControls/main.cu
new file mode 100644
index 0000000..0be22af
--- /dev/null
+++ b/Tests/CudaOnly/RuntimeControls/main.cu
@@ -0,0 +1,81 @@
+
+#include <iostream>
+
+#include "cuda.h"
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#else
+#  define IMPORT
+#endif
+
+#ifndef _WIN32
+IMPORT int file1_launch_kernel(int x);
+#endif
+
+IMPORT int file2_launch_kernel(int x);
+
+int choose_cuda_device()
+{
+  int nDevices = 0;
+  cudaError_t err = cudaGetDeviceCount(&nDevices);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
+              << std::endl;
+    return 1;
+  }
+  for (int i = 0; i < nDevices; ++i) {
+    cudaDeviceProp prop;
+    cudaError_t err = cudaGetDeviceProperties(&prop, i);
+    if (err != cudaSuccess) {
+      std::cerr << "Could not retrieve properties from CUDA device " << i
+                << std::endl;
+      return 1;
+    }
+    std::cout << "prop.major: " << prop.major << std::endl;
+    if (prop.major >= 3) {
+      err = cudaSetDevice(i);
+      if (err != cudaSuccess) {
+        std::cout << "Could not select CUDA device " << i << std::endl;
+      } else {
+        return 0;
+      }
+    }
+  }
+
+  std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
+            << std::endl;
+
+  return 1;
+}
+
+int main(int argc, char** argv)
+{
+  int ret = choose_cuda_device();
+  if (ret) {
+    return 0;
+  }
+
+  cudaError_t err;
+#ifndef _WIN32
+  file1_launch_kernel(1);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cerr << "file1_launch_kernel: kernel launch should have passed.\n "
+                 "Error message: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+#endif
+
+  file2_launch_kernel(1);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cerr << "file2_launch_kernel: kernel launch should have passed.\n "
+                 "Error message: "
+              << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/Tests/CudaOnly/RuntimeControls/no_runtime.cmake b/Tests/CudaOnly/RuntimeControls/no_runtime.cmake
new file mode 100644
index 0000000..55f28cc
--- /dev/null
+++ b/Tests/CudaOnly/RuntimeControls/no_runtime.cmake
@@ -0,0 +1,14 @@
+execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
+  RESULT_VARIABLE RESULT
+  OUTPUT_VARIABLE OUTPUT
+  ERROR_VARIABLE ERROR
+)
+
+if(NOT "${RESULT}" STREQUAL "0")
+  message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
+endif()
+
+if(NOT "${OUTPUT}" MATCHES "(__cuda)")
+  message(FATAL_ERROR
+  "not missing cuda device symbols, static runtime linking was used.")
+endif()
diff --git a/Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake b/Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake
new file mode 100644
index 0000000..b372fea
--- /dev/null
+++ b/Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake
@@ -0,0 +1,14 @@
+execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
+  RESULT_VARIABLE RESULT
+  OUTPUT_VARIABLE OUTPUT
+  ERROR_VARIABLE ERROR
+)
+
+if(NOT "${RESULT}" STREQUAL "0")
+  message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
+endif()
+
+if("${OUTPUT}" MATCHES "__cuda")
+  message(FATAL_ERROR
+    "missing cuda device symbols, static runtime linking was not used.")
+endif()
diff --git a/Tests/CudaOnly/RuntimeControls/verify_runtime.cmake b/Tests/CudaOnly/RuntimeControls/verify_runtime.cmake
new file mode 100644
index 0000000..b313dac
--- /dev/null
+++ b/Tests/CudaOnly/RuntimeControls/verify_runtime.cmake
@@ -0,0 +1,16 @@
+
+string(REPLACE "|" ";" dirs "${EXTRA_LIB_DIRS}")
+file(GET_RUNTIME_DEPENDENCIES
+  RESOLVED_DEPENDENCIES_VAR resolved_libs
+  UNRESOLVED_DEPENDENCIES_VAR unresolved_libs
+  DIRECTORIES ${dirs}
+  EXECUTABLES ${EXEC_PATH}
+  )
+
+list(FILTER resolved_libs INCLUDE REGEX ".*cudart.*")
+list(LENGTH resolved_libs has_cudart)
+
+if(has_cudart EQUAL 0)
+  message(FATAL_ERROR
+    "missing cudart shared library from runtime dependency output.")
+endif()
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt b/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt
new file mode 100644
index 0000000..03fba22
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt
@@ -0,0 +1,42 @@
+cmake_minimum_required(VERSION 3.15)
+project(SharedRuntimePlusToolkit CUDA)
+
+#Goal for this example:
+# Validate that with c++ we can use some components of the CUDA toolkit, and
+# specify the cuda runtime
+find_package(CUDAToolkit REQUIRED)
+
+add_library(Common OBJECT curand.cu nppif.cu)
+target_link_libraries(Common PRIVATE CUDA::toolkit)
+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+#shared runtime with shared toolkit libraries
+add_library(SharedToolkit SHARED shared.cu)
+target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
+set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)
+
+# The CUDA only ships the shared version of the toolkit libraries
+# on windows
+if(NOT WIN32)
+  #shared runtime with static toolkit libraries
+  add_library(StaticToolkit SHARED static.cu)
+  target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
+  set_target_properties(StaticToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
+
+  #static runtime with mixed toolkit libraries
+  add_library(MixedToolkit SHARED mixed.cu)
+  target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
+  set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
+endif()
+
+add_executable(CudaOnlySharedRuntimePlusToolkit main.cu)
+target_link_libraries(CudaOnlySharedRuntimePlusToolkit PRIVATE SharedToolkit
+                      $<TARGET_NAME_IF_EXISTS:StaticToolkit>
+                      $<TARGET_NAME_IF_EXISTS:MixedToolkit>)
+
+if(UNIX)
+  # Help the shared cuda runtime find libcudart as it is not located
+  # in a default system searched location
+  set_property(TARGET CudaOnlySharedRuntimePlusToolkit PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu
new file mode 100644
index 0000000..fdd7b53
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu
@@ -0,0 +1,65 @@
+// Comes from:
+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
+
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+/*
+ * This program uses the host CURAND API to generate 100
+ * pseudorandom floats.
+ */
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CUDA_CALL(x)                                                          \
+  do {                                                                        \
+    if ((x) != cudaSuccess) {                                                 \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+#define CURAND_CALL(x)                                                        \
+  do {                                                                        \
+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+
+EXPORT int curand_main()
+{
+  size_t n = 100;
+  size_t i;
+  curandGenerator_t gen;
+  float *devData, *hostData;
+
+  /* Allocate n floats on host */
+  hostData = (float*)calloc(n, sizeof(float));
+
+  /* Allocate n floats on device */
+  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
+
+  /* Create pseudo-random number generator */
+  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+
+  /* Set seed */
+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
+
+  /* Generate n floats on device */
+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
+
+  /* Copy device memory to host */
+  CUDA_CALL(
+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
+
+  /* Cleanup */
+  CURAND_CALL(curandDestroyGenerator(gen));
+  CUDA_CALL(cudaFree(devData));
+  free(hostData);
+  return EXIT_SUCCESS;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu
new file mode 100644
index 0000000..2a4da22
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu
@@ -0,0 +1,23 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+IMPORT int shared_version();
+int static_version()
+{
+  return 0;
+}
+int mixed_version()
+{
+  return 0;
+}
+#else
+int shared_version();
+int static_version();
+int mixed_version();
+#endif
+
+int main()
+{
+  return mixed_version() == 0 && shared_version() == 0 &&
+    static_version() == 0;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu
new file mode 100644
index 0000000..6de6886
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu
@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+IMPORT int curand_main();
+IMPORT int nppif_main();
+
+EXPORT int mixed_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu
new file mode 100644
index 0000000..ac5341c
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu
@@ -0,0 +1,92 @@
+// Comes from
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
+
+#ifdef _WIN32
+#  define EXPORT __declspec(dllexport)
+#else
+#  define EXPORT
+#endif
+
+#include <cstdio>
+#include <iostream>
+
+#include <assert.h>
+#include <cuda_runtime_api.h>
+#include <nppi_filtering_functions.h>
+
+EXPORT int nppif_main()
+{
+  /**
+   * 8-bit unsigned single-channel 1D row convolution.
+   */
+  const int simgrows = 32;
+  const int simgcols = 32;
+  Npp8u *d_pSrc, *d_pDst;
+  const int nMaskSize = 3;
+  NppiSize oROI;
+  oROI.width = simgcols - nMaskSize;
+  oROI.height = simgrows;
+  const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
+  const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
+  const int simgpix = simgrows * simgcols;
+  const int dimgpix = oROI.width * oROI.height;
+  const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
+  const int nDstStep = oROI.width * sizeof(d_pDst[0]);
+  const int pixval = 1;
+  const int nDivisor = 1;
+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
+  Npp32s* d_pKernel;
+  const Npp32s nAnchor = 2;
+  cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&d_pDst, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // set image to pixval initially
+  err = cudaMemset(d_pSrc, pixval, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemset(d_pDst, 0, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
+                   cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // copy src to dst
+  NppStatus ret =
+    nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
+                         nMaskSize, nAnchor, nDivisor);
+  assert(ret == NPP_NO_ERROR);
+  Npp8u* h_imgres = new Npp8u[dimgpix];
+  err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // test for filtering
+  for (int i = 0; i < dimgpix; i++) {
+    if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
+      return 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu
new file mode 100644
index 0000000..f3c3dbc
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu
@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+int curand_main();
+int nppif_main();
+
+EXPORT int shared_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}
diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu
new file mode 100644
index 0000000..6932fa3
--- /dev/null
+++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu
@@ -0,0 +1,16 @@
+
+#ifdef _WIN32
+#  define IMPORT __declspec(dllimport)
+#  define EXPORT __declspec(dllexport)
+#else
+#  define IMPORT
+#  define EXPORT
+#endif
+
+IMPORT int curand_main();
+IMPORT int nppif_main();
+
+EXPORT int static_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt b/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt
new file mode 100644
index 0000000..97ac229
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.15)
+project(StaticRuntimePlusToolkit CUDA)
+
+#Goal for this example:
+# Validate that with cuda we can use some components of the CUDA toolkit, and
+# specify the cuda runtime
+find_package(CUDAToolkit REQUIRED)
+
+add_library(Common OBJECT curand.cu nppif.cu)
+target_link_libraries(Common PRIVATE CUDA::toolkit)
+set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+#static runtime with shared toolkit libraries
+add_library(SharedToolkit SHARED shared.cu)
+target_link_libraries(SharedToolkit PRIVATE Common CUDA::curand CUDA::nppif )
+set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
+target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)
+
+#static runtime with static toolkit libraries
+add_library(StaticToolkit SHARED static.cu)
+target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
+
+#static runtime with mixed toolkit libraries
+add_library(MixedToolkit SHARED mixed.cu)
+target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
+set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Static)
+
+add_executable(CudaOnlyStaticRuntimePlusToolkit main.cu)
+target_link_libraries(CudaOnlyStaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu
new file mode 100644
index 0000000..95872f0
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu
@@ -0,0 +1,59 @@
+// Comes from:
+// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
+
+/*
+ * This program uses the host CURAND API to generate 100
+ * pseudorandom floats.
+ */
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CUDA_CALL(x)                                                          \
+  do {                                                                        \
+    if ((x) != cudaSuccess) {                                                 \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+#define CURAND_CALL(x)                                                        \
+  do {                                                                        \
+    if ((x) != CURAND_STATUS_SUCCESS) {                                       \
+      printf("Error at %s:%d\n", __FILE__, __LINE__);                         \
+      return EXIT_FAILURE;                                                    \
+    }                                                                         \
+  } while (0)
+
+int curand_main()
+{
+  size_t n = 100;
+  size_t i;
+  curandGenerator_t gen;
+  float *devData, *hostData;
+
+  /* Allocate n floats on host */
+  hostData = (float*)calloc(n, sizeof(float));
+
+  /* Allocate n floats on device */
+  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
+
+  /* Create pseudo-random number generator */
+  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+
+  /* Set seed */
+  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
+
+  /* Generate n floats on device */
+  CURAND_CALL(curandGenerateUniform(gen, devData, n));
+
+  /* Copy device memory to host */
+  CUDA_CALL(
+    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
+
+  /* Cleanup */
+  CURAND_CALL(curandDestroyGenerator(gen));
+  CUDA_CALL(cudaFree(devData));
+  free(hostData);
+  return EXIT_SUCCESS;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu
new file mode 100644
index 0000000..5a09f8e
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu
@@ -0,0 +1,11 @@
+
+
+int shared_version();
+int static_version();
+int mixed_version();
+
+int main()
+{
+  return mixed_version() == 0 && shared_version() == 0 &&
+    static_version() == 0;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu
new file mode 100644
index 0000000..a05140d
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu
@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+int mixed_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu
new file mode 100644
index 0000000..2871090
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu
@@ -0,0 +1,86 @@
+// Comes from
+// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
+
+#include <cstdio>
+#include <iostream>
+
+#include <assert.h>
+#include <cuda_runtime_api.h>
+#include <nppi_filtering_functions.h>
+
+int nppif_main()
+{
+  /**
+   * 8-bit unsigned single-channel 1D row convolution.
+   */
+  const int simgrows = 32;
+  const int simgcols = 32;
+  Npp8u *d_pSrc, *d_pDst;
+  const int nMaskSize = 3;
+  NppiSize oROI;
+  oROI.width = simgcols - nMaskSize;
+  oROI.height = simgrows;
+  const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
+  const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
+  const int simgpix = simgrows * simgcols;
+  const int dimgpix = oROI.width * oROI.height;
+  const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
+  const int nDstStep = oROI.width * sizeof(d_pDst[0]);
+  const int pixval = 1;
+  const int nDivisor = 1;
+  const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
+  Npp32s* d_pKernel;
+  const Npp32s nAnchor = 2;
+  cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&d_pDst, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // set image to pixval initially
+  err = cudaMemset(d_pSrc, pixval, simgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemset(d_pDst, 0, dimgsize);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
+                   cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // copy src to dst
+  NppStatus ret =
+    nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
+                         nMaskSize, nAnchor, nDivisor);
+  assert(ret == NPP_NO_ERROR);
+  Npp8u* h_imgres = new Npp8u[dimgpix];
+  err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "Cuda error %d\n", __LINE__);
+    return 1;
+  }
+  // test for filtering
+  for (int i = 0; i < dimgpix; i++) {
+    if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
+      fprintf(stderr, "h_imgres at index %d failed to match\n", i);
+      return 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu
new file mode 100644
index 0000000..9967b66
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu
@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+int shared_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}
diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu
new file mode 100644
index 0000000..ca7eb4c
--- /dev/null
+++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu
@@ -0,0 +1,8 @@
+
+int curand_main();
+int nppif_main();
+
+int static_version()
+{
+  return curand_main() == 0 && nppif_main() == 0;
+}