CUDA: Add abstraction for cuda runtime selection

Fixes #17559
Replace our hard-coded default of cudart=static with a first-class abstraction to select the runtime library from an enumeration of logical names.
This commit is contained in:
Robert Maynard
2019-11-29 13:51:32 -05:00
parent 4dbc9dfc7a
commit 0d0145138f
41 changed files with 911 additions and 51 deletions

View File

@@ -173,6 +173,7 @@ Properties on Targets
/prop_tgt/CUDA_PTX_COMPILATION
/prop_tgt/CUDA_SEPARABLE_COMPILATION
/prop_tgt/CUDA_RESOLVE_DEVICE_SYMBOLS
/prop_tgt/CUDA_RUNTIME_LIBRARY
/prop_tgt/CUDA_EXTENSIONS
/prop_tgt/CUDA_STANDARD
/prop_tgt/CUDA_STANDARD_REQUIRED

View File

@@ -372,6 +372,7 @@ Variables that Control the Build
/variable/CMAKE_CTEST_ARGUMENTS
/variable/CMAKE_CUDA_SEPARABLE_COMPILATION
/variable/CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS
/variable/CMAKE_CUDA_RUNTIME_LIBRARY
/variable/CMAKE_DEBUG_POSTFIX
/variable/CMAKE_DISABLE_PRECOMPILE_HEADERS
/variable/CMAKE_ENABLE_EXPORTS

View File

@@ -0,0 +1,9 @@
``None``
Link with ``-cudart=none`` or equivalent flag(s) to use no CUDA
runtime library.
``Shared``
Link with ``-cudart=shared`` or equivalent flag(s) to use a
dynamically-linked CUDA runtime library.
``Static``
Link with ``-cudart=static`` or equivalent flag(s) to use a
statically-linked CUDA runtime library.

View File

@@ -0,0 +1,21 @@
CUDA_RUNTIME_LIBRARY
--------------------
Select the CUDA runtime library for use by compilers targeting the CUDA language.
The allowed case insensitive values are:
.. include:: CUDA_RUNTIME_LIBRARY-VALUES.txt
Contents of ``CUDA_RUNTIME_LIBRARY`` may use
:manual:`generator expressions <cmake-generator-expressions(7)>`.
If this property is not set then CMake uses the default value
``Static`` to select the CUDA runtime library.
.. note::
This property has effect only when the ``CUDA`` language is enabled. To
control the CUDA runtime linking when only using the CUDA SDK with the
``C`` or ``C++`` language we recommend using the :module:`FindCUDAToolkit`
module.

View File

@@ -0,0 +1,7 @@
cuda-runtime-library
--------------------
* The :variable:`CMAKE_CUDA_RUNTIME_LIBRARY` variable and
:prop_tgt:`CUDA_RUNTIME_LIBRARY` target property were introduced to
select the CUDA runtime library used when linking targets that
use CUDA.

View File

@@ -0,0 +1,24 @@
CMAKE_CUDA_RUNTIME_LIBRARY
--------------------------
Select the CUDA runtime library for use by compilers targeting the MSVC ABI.
This variable is used to initialize the :prop_tgt:`CUDA_RUNTIME_LIBRARY`
property on all targets as they are created.
The allowed case insensitive values are:
.. include:: ../prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt
Contents of ``CMAKE_CUDA_RUNTIME_LIBRARY`` may use
:manual:`generator expressions <cmake-generator-expressions(7)>`.
If this variable is not set then the :prop_tgt:`CUDA_RUNTIME_LIBRARY` target
property will not be set automatically. If that property is not set then
CMake uses the default value ``Static`` to select the CUDA runtime library.
.. note::
This property has effect only when the ``CUDA`` language is enabled. To
control the CUDA runtime linking when only using the CUDA SDK with the
``C`` or ``C++`` language we recommend using the :module:`FindCUDAToolkit`
module.

View File

@@ -67,6 +67,17 @@ else()
set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}")
endif()
# Remove the following libraries from CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES and
# CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES
#
# - cudart
# - cudart_static
# - cudadevrt
#
# These are controlled by CMAKE_CUDA_RUNTIME_LIBRARY
list(REMOVE_ITEM CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES cudart cudart_static cudadevrt)
list(REMOVE_ITEM CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES cudart cudart_static cudadevrt)
# Re-configure to save learned information.
configure_file(
${CMAKE_ROOT}/Modules/CMakeCUDACompiler.cmake.in

View File

@@ -43,6 +43,11 @@ endif()
set(CMAKE_SHARED_LIBRARY_CREATE_CUDA_FLAGS -shared)
set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA -isystem=)
set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC")
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "cudadevrt;cudart_static")
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED "cudadevrt;cudart")
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE "")
if("x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "")
set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "")

View File

@@ -69,6 +69,11 @@ else()
endif()
unset(_cmp0092)
set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC")
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "cudadevrt;cudart_static")
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED "cudadevrt;cudart")
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE "")
string(APPEND CMAKE_CUDA_FLAGS_INIT " ${PLATFORM_DEFINES_CUDA} -D_WINDOWS -Xcompiler=\"${_W3}${_FLAGS_CXX}\"")
string(APPEND CMAKE_CUDA_FLAGS_DEBUG_INIT " -Xcompiler=\"${_MDd}-Zi -Ob0 -Od ${_RTC1}\"")
string(APPEND CMAKE_CUDA_FLAGS_RELEASE_INIT " -Xcompiler=\"${_MD}-O2 -Ob2\" -DNDEBUG")

View File

@@ -10,6 +10,7 @@
#include "cmAlgorithms.h"
#include "cmComputeLinkDepends.h"
#include "cmGeneratorExpression.h"
#include "cmGeneratorTarget.h"
#include "cmGlobalGenerator.h"
#include "cmListFileCache.h"
@@ -573,6 +574,15 @@ void cmComputeLinkInformation::AddImplicitLinkInfo()
cmGeneratorTarget::LinkClosure const* lc =
this->Target->GetLinkClosure(this->Config);
for (std::string const& li : lc->Languages) {
if (li == "CUDA") {
// These need to go before the other implicit link information
// as they could require symbols from those other library
// Currently restricted to CUDA as it is the only language
// we have documented runtime behavior controls for
this->AddRuntimeLinkLibrary(li);
}
// Skip those of the linker language. They are implicit.
if (li != this->LinkLanguage) {
this->AddImplicitLinkInfo(li);
@@ -580,6 +590,39 @@ void cmComputeLinkInformation::AddImplicitLinkInfo()
}
}
void cmComputeLinkInformation::AddRuntimeLinkLibrary(std::string const& lang)
{ // Add the lang runtime library flags. This is activated by the presence
// of a default selection whether or not it is overridden by a property.
std::string defaultVar =
cmStrCat("CMAKE_", lang, "_RUNTIME_LIBRARY_DEFAULT");
const char* langRuntimeLibraryDefault =
this->Makefile->GetDefinition(defaultVar);
if (langRuntimeLibraryDefault && *langRuntimeLibraryDefault) {
const char* runtimeLibraryValue =
this->Target->GetProperty(cmStrCat(lang, "_RUNTIME_LIBRARY"));
if (!runtimeLibraryValue) {
runtimeLibraryValue = langRuntimeLibraryDefault;
}
std::string runtimeLibrary =
cmSystemTools::UpperCase(cmGeneratorExpression::Evaluate(
runtimeLibraryValue, this->Target->GetLocalGenerator(), this->Config,
this->Target));
if (!runtimeLibrary.empty()) {
if (const char* runtimeLinkOptions = this->Makefile->GetDefinition(
"CMAKE_" + lang + "_RUNTIME_LIBRARY_LINK_OPTIONS_" +
runtimeLibrary)) {
std::vector<std::string> libsVec = cmExpandedList(runtimeLinkOptions);
for (std::string const& i : libsVec) {
if (!cmContains(this->ImplicitLinkLibs, i)) {
this->AddItem(i, nullptr);
}
}
}
}
}
}
void cmComputeLinkInformation::AddImplicitLinkInfo(std::string const& lang)
{
// Add libraries for this language that are not implied by the

View File

@@ -172,6 +172,7 @@ private:
void LoadImplicitLinkInfo();
void AddImplicitLinkInfo();
void AddImplicitLinkInfo(std::string const& lang);
void AddRuntimeLinkLibrary(std::string const& lang);
std::set<std::string> ImplicitLinkDirs;
std::set<std::string> ImplicitLinkLibs;

View File

@@ -358,6 +358,7 @@ cmTarget::cmTarget(std::string const& name, cmStateEnums::TargetType type,
initProp("CUDA_COMPILER_LAUNCHER");
initProp("CUDA_SEPARABLE_COMPILATION");
initProp("CUDA_RESOLVE_DEVICE_SYMBOLS");
initProp("CUDA_RUNTIME_LIBRARY");
initProp("LINK_SEARCH_START_STATIC");
initProp("LINK_SEARCH_END_STATIC");
initProp("Swift_LANGUAGE_VERSION");

View File

@@ -3636,18 +3636,7 @@ bool cmVisualStudio10TargetGenerator::ComputeLinkOptions(
this->AddLibraries(cli, libVec, vsTargetVec, config);
if (cmContains(linkClosure->Languages, "CUDA") &&
this->CudaOptions[config] != nullptr) {
switch (this->CudaOptions[config]->GetCudaRuntime()) {
case cmVisualStudioGeneratorOptions::CudaRuntimeStatic:
libVec.push_back("cudadevrt.lib");
libVec.push_back("cudart_static.lib");
break;
case cmVisualStudioGeneratorOptions::CudaRuntimeShared:
libVec.push_back("cudadevrt.lib");
libVec.push_back("cudart.lib");
break;
case cmVisualStudioGeneratorOptions::CudaRuntimeNone:
break;
}
this->CudaOptions[config]->FixCudaRuntime(this->GeneratorTarget);
}
std::string standardLibsVar =
cmStrCat("CMAKE_", linkLanguage, "_STANDARD_LIBRARIES");

View File

@@ -3,6 +3,8 @@
#include <cm/iterator>
#include "cmAlgorithms.h"
#include "cmGeneratorExpression.h"
#include "cmGeneratorTarget.h"
#include "cmLocalVisualStudioGenerator.h"
#include "cmOutputConverter.h"
#include "cmSystemTools.h"
@@ -149,25 +151,33 @@ bool cmVisualStudioGeneratorOptions::UsingSBCS() const
return false;
}
cmVisualStudioGeneratorOptions::CudaRuntime
cmVisualStudioGeneratorOptions::GetCudaRuntime() const
void cmVisualStudioGeneratorOptions::FixCudaRuntime(cmGeneratorTarget* target)
{
std::map<std::string, FlagValue>::const_iterator i =
this->FlagMap.find("CudaRuntime");
if (i != this->FlagMap.end() && i->second.size() == 1) {
std::string const& cudaRuntime = i->second[0];
if (cudaRuntime == "Static") {
return CudaRuntimeStatic;
}
if (cudaRuntime == "Shared") {
return CudaRuntimeShared;
}
if (cudaRuntime == "None") {
return CudaRuntimeNone;
if (i == this->FlagMap.end()) {
// User didn't provide am override so get the property value
const char* runtimeLibraryValue =
target->GetProperty("CUDA_RUNTIME_LIBRARY");
if (runtimeLibraryValue) {
std::string cudaRuntime =
cmSystemTools::UpperCase(cmGeneratorExpression::Evaluate(
runtimeLibraryValue, this->LocalGenerator, this->Configuration,
target));
if (cudaRuntime == "STATIC") {
this->AddFlag("CudaRuntime", "Static");
}
if (cudaRuntime == "SHARED") {
this->AddFlag("CudaRuntime", "Shared");
}
if (cudaRuntime == "NONE") {
this->AddFlag("CudaRuntime", "None");
}
} else {
// nvcc default is static
this->AddFlag("CudaRuntime", "Static");
}
}
// nvcc default is static
return CudaRuntimeStatic;
}
void cmVisualStudioGeneratorOptions::FixCudaCodeGeneration()

View File

@@ -13,6 +13,7 @@
#include "cmIDEOptions.h"
class cmLocalVisualStudioGenerator;
class cmGeneratorTarget;
using cmVS7FlagTable = cmIDEFlagTable;
@@ -61,15 +62,8 @@ public:
bool UsingUnicode() const;
bool UsingSBCS() const;
enum CudaRuntime
{
CudaRuntimeStatic,
CudaRuntimeShared,
CudaRuntimeNone
};
CudaRuntime GetCudaRuntime() const;
void FixCudaCodeGeneration();
void FixCudaRuntime(cmGeneratorTarget* target);
void FixManifestUACFlags();

View File

@@ -22,18 +22,11 @@ set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
add_library(CudaComplexCppBase SHARED dynamic.cpp)
add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu)
set_target_properties(CudaComplexSeperableLib
PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties( CudaComplexSeperableLib
PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(CudaComplexSharedLib SHARED dynamic.cu)
target_link_libraries(CudaComplexSharedLib PUBLIC CudaComplexCppBase)
add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu)
add_library(CudaComplexMixedLib SHARED mixed.cpp mixed.cu)
set_target_properties(CudaComplexMixedLib
PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(CudaComplexMixedLib
PUBLIC CudaComplexSharedLib
PRIVATE CudaComplexSeperableLib)
@@ -41,7 +34,27 @@ target_link_libraries(CudaComplexMixedLib
add_executable(CudaComplex main.cpp)
target_link_libraries(CudaComplex PUBLIC CudaComplexMixedLib)
set_target_properties(CudaComplexMixedLib
CudaComplexSeperableLib
PROPERTIES
POSITION_INDEPENDENT_CODE ON
CUDA_SEPARABLE_COMPILATION ON
)
set_target_properties(CudaComplexMixedLib
CudaComplexSharedLib
PROPERTIES
CUDA_RUNTIME_LIBRARY shared
)
if(APPLE)
# Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
set_property(TARGET CudaComplex PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
endif()
if(UNIX)
# Help the shared cuda runtime find libcudart as it is not located
# in a default system searched location
set_property(TARGET CudaComplexMixedLib PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
endif()

View File

@@ -54,17 +54,20 @@ EXPORT int choose_cuda_device()
return 1;
}
EXPORT void cuda_dynamic_lib_func()
EXPORT bool cuda_dynamic_lib_func()
{
DetermineIfValidCudaDevice<<<1, 1>>>();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: "
std::cerr << "DetermineIfValidCudaDevice [Per Launch] failed: "
<< cudaGetErrorString(err) << std::endl;
return false;
}
DetermineIfValidCudaDevice<<<1, 1>>>();
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
std::cerr << "DetermineIfValidCudaDevice [ASYNC] failed: "
std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: "
<< cudaGetErrorString(cudaGetLastError()) << std::endl;
return false;
}
return true;
}

View File

@@ -22,5 +22,6 @@ int main(int argc, char** argv)
int r1 = call_cuda_seperable_code(42);
int r2 = mixed_launch_kernel(42);
return (r1 == 42 || r2 == 42) ? 1 : 0;
}

View File

@@ -15,7 +15,7 @@
result_type __device__ file1_func(int x);
result_type_dynamic __device__ file2_func(int x);
IMPORT void __host__ cuda_dynamic_lib_func();
IMPORT bool __host__ cuda_dynamic_lib_func();
static __global__ void mixed_kernel(result_type* r, int x)
{
@@ -25,7 +25,9 @@ static __global__ void mixed_kernel(result_type* r, int x)
EXPORT int mixed_launch_kernel(int x)
{
cuda_dynamic_lib_func();
if (!cuda_dynamic_lib_func()) {
return x;
}
result_type* r;
cudaError_t err = cudaMallocManaged(&r, sizeof(result_type));

View File

@@ -5,10 +5,21 @@ ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX)
ADD_TEST_MACRO(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag)
ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
ADD_TEST_MACRO(CudaOnly.SharedRuntimePlusToolkit CudaOnlySharedRuntimePlusToolkit)
ADD_TEST_MACRO(CudaOnly.Standard98 CudaOnlyStandard98)
ADD_TEST_MACRO(CudaOnly.Toolkit CudaOnlyToolkit)
ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs)
# The CUDA only ships the shared version of the toolkit libraries
# on windows
if(NOT WIN32)
ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit)
endif()
if(MSVC)
ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB)
endif()
add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
--build-and-test
@@ -20,6 +31,14 @@ add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
--test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
)
if(MSVC)
ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB)
endif()
add_test(NAME CudaOnly.RuntimeControls COMMAND
${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
--build-and-test
"${CMAKE_CURRENT_SOURCE_DIR}/RuntimeControls/"
"${CMAKE_CURRENT_BINARY_DIR}/RuntimeControls/"
--build-two-config
${build_generator_args}
--build-project RuntimeControls
--build-options ${build_options}
--test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
)

View File

@@ -0,0 +1,60 @@
cmake_minimum_required(VERSION 3.7)
project (RuntimeControls CUDA)
# Find nm and dumpbin
if(CMAKE_NM)
set(dump_command ${CMAKE_NM})
set(dump_args -g)
else()
include(GetPrerequisites)
message(STATUS "calling list_prerequisites to find dumpbin")
list_prerequisites("${CMAKE_COMMAND}" 0 0 0)
if(gp_dumpbin)
set(dump_command ${gp_dumpbin})
set(dump_args /ARCHIVEMEMBERS)
endif()
endif()
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=[compute_30]")
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_RUNTIME_LIBRARY static)
if(NOT "x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
add_library(UsesNoCudaRT SHARED file1.cu)
set_target_properties(UsesNoCudaRT PROPERTIES CUDA_RUNTIME_LIBRARY none)
endif()
add_library(UsesStaticCudaRT SHARED file2.cu)
add_executable(CudaOnlyRuntimeControls main.cu)
set_target_properties(CudaOnlyRuntimeControls PROPERTIES CUDA_RUNTIME_LIBRARY shared)
target_link_libraries(CudaOnlyRuntimeControls PRIVATE $<TARGET_NAME_IF_EXISTS:UsesNoCudaRT> UsesStaticCudaRT)
if(dump_command)
if(TARGET UsesNoCudaRT)
add_custom_command(TARGET UsesNoCudaRT POST_BUILD
COMMAND ${CMAKE_COMMAND}
-DDUMP_COMMAND=${dump_command}
-DDUMP_ARGS=${dump_args}
-DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesNoCudaRT>
-P ${CMAKE_CURRENT_SOURCE_DIR}/no_runtime.cmake
)
endif()
add_custom_command(TARGET UsesStaticCudaRT POST_BUILD
COMMAND ${CMAKE_COMMAND}
-DDUMP_COMMAND=${dump_command}
-DDUMP_ARGS=${dump_args}
-DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesStaticCudaRT>
-P ${CMAKE_CURRENT_SOURCE_DIR}/uses_static_runtime.cmake
)
string(REPLACE ";" "|" dirs "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
add_custom_command(TARGET CudaOnlyRuntimeControls POST_BUILD
COMMAND ${CMAKE_COMMAND}
-DEXEC_PATH=$<TARGET_FILE:CudaOnlyRuntimeControls>
-DEXTRA_LIB_DIRS="${dirs}"
-P ${CMAKE_CURRENT_SOURCE_DIR}/verify_runtime.cmake
)
endif()

View File

@@ -0,0 +1,18 @@
#ifdef _WIN32
# define EXPORT __declspec(dllexport)
#else
# define EXPORT
#endif
void __global__ file1_kernel(int x, int& r)
{
r = -x;
}
EXPORT int file1_launch_kernel(int x)
{
int r = 0;
file1_kernel<<<1, 1>>>(x, r);
return r;
}

View File

@@ -0,0 +1,18 @@
#ifdef _WIN32
# define EXPORT __declspec(dllexport)
#else
# define EXPORT
#endif
void __global__ file2_kernel(int x, int& r)
{
r = -x;
}
EXPORT int file2_launch_kernel(int x)
{
int r = 0;
file2_kernel<<<1, 1>>>(x, r);
return r;
}

View File

@@ -0,0 +1,81 @@
#include <iostream>
#include "cuda.h"
#ifdef _WIN32
# define IMPORT __declspec(dllimport)
#else
# define IMPORT
#endif
#ifndef _WIN32
IMPORT int file1_launch_kernel(int x);
#endif
IMPORT int file2_launch_kernel(int x);
int choose_cuda_device()
{
int nDevices = 0;
cudaError_t err = cudaGetDeviceCount(&nDevices);
if (err != cudaSuccess) {
std::cerr << "Failed to retrieve the number of CUDA enabled devices"
<< std::endl;
return 1;
}
for (int i = 0; i < nDevices; ++i) {
cudaDeviceProp prop;
cudaError_t err = cudaGetDeviceProperties(&prop, i);
if (err != cudaSuccess) {
std::cerr << "Could not retrieve properties from CUDA device " << i
<< std::endl;
return 1;
}
std::cout << "prop.major: " << prop.major << std::endl;
if (prop.major >= 3) {
err = cudaSetDevice(i);
if (err != cudaSuccess) {
std::cout << "Could not select CUDA device " << i << std::endl;
} else {
return 0;
}
}
}
std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
<< std::endl;
return 1;
}
int main(int argc, char** argv)
{
int ret = choose_cuda_device();
if (ret) {
return 0;
}
cudaError_t err;
#ifndef _WIN32
file1_launch_kernel(1);
err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "file1_launch_kernel: kernel launch should have passed.\n "
"Error message: "
<< cudaGetErrorString(err) << std::endl;
return 1;
}
#endif
file2_launch_kernel(1);
err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "file2_launch_kernel: kernel launch should have passed.\n "
"Error message: "
<< cudaGetErrorString(err) << std::endl;
return 1;
}
return 0;
}

View File

@@ -0,0 +1,14 @@
execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
RESULT_VARIABLE RESULT
OUTPUT_VARIABLE OUTPUT
ERROR_VARIABLE ERROR
)
if(NOT "${RESULT}" STREQUAL "0")
message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
endif()
if(NOT "${OUTPUT}" MATCHES "(__cuda)")
message(FATAL_ERROR
"not missing cuda device symbols, static runtime linking was used.")
endif()

View File

@@ -0,0 +1,14 @@
execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
RESULT_VARIABLE RESULT
OUTPUT_VARIABLE OUTPUT
ERROR_VARIABLE ERROR
)
if(NOT "${RESULT}" STREQUAL "0")
message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
endif()
if("${OUTPUT}" MATCHES "__cuda")
message(FATAL_ERROR
"missing cuda device symbols, static runtime linking was not used.")
endif()

View File

@@ -0,0 +1,16 @@
string(REPLACE "|" ";" dirs "${EXTRA_LIB_DIRS}")
file(GET_RUNTIME_DEPENDENCIES
RESOLVED_DEPENDENCIES_VAR resolved_libs
UNRESOLVED_DEPENDENCIES_VAR unresolved_libs
DIRECTORIES ${dirs}
EXECUTABLES ${EXEC_PATH}
)
list(FILTER resolved_libs INCLUDE REGEX ".*cudart.*")
list(LENGTH resolved_libs has_cudart)
if(has_cudart EQUAL 0)
message(FATAL_ERROR
"missing cudart shared library from runtime dependency output.")
endif()

View File

@@ -0,0 +1,42 @@
cmake_minimum_required(VERSION 3.15)
project(SharedRuntimePlusToolkit CUDA)
#Goal for this example:
# Validate that with c++ we can use some components of the CUDA toolkit, and
# specify the cuda runtime
find_package(CUDAToolkit REQUIRED)
add_library(Common OBJECT curand.cu nppif.cu)
target_link_libraries(Common PRIVATE CUDA::toolkit)
set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
#shared runtime with shared toolkit libraries
add_library(SharedToolkit SHARED shared.cu)
target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)
# The CUDA only ships the shared version of the toolkit libraries
# on windows
if(NOT WIN32)
#shared runtime with static toolkit libraries
add_library(StaticToolkit SHARED static.cu)
target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
set_target_properties(StaticToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
#static runtime with mixed toolkit libraries
add_library(MixedToolkit SHARED mixed.cu)
target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
endif()
add_executable(CudaOnlySharedRuntimePlusToolkit main.cu)
target_link_libraries(CudaOnlySharedRuntimePlusToolkit PRIVATE SharedToolkit
$<TARGET_NAME_IF_EXISTS:StaticToolkit>
$<TARGET_NAME_IF_EXISTS:MixedToolkit>)
if(UNIX)
# Help the shared cuda runtime find libcudart as it is not located
# in a default system searched location
set_property(TARGET CudaOnlySharedRuntimePlusToolkit PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
endif()

View File

@@ -0,0 +1,65 @@
// Comes from:
// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
#ifdef _WIN32
# define EXPORT __declspec(dllexport)
#else
# define EXPORT
#endif
/*
* This program uses the host CURAND API to generate 100
* pseudorandom floats.
*/
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
#define CUDA_CALL(x) \
do { \
if ((x) != cudaSuccess) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
#define CURAND_CALL(x) \
do { \
if ((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
EXPORT int curand_main()
{
size_t n = 100;
size_t i;
curandGenerator_t gen;
float *devData, *hostData;
/* Allocate n floats on host */
hostData = (float*)calloc(n, sizeof(float));
/* Allocate n floats on device */
CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
/* Create pseudo-random number generator */
CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
/* Set seed */
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
/* Generate n floats on device */
CURAND_CALL(curandGenerateUniform(gen, devData, n));
/* Copy device memory to host */
CUDA_CALL(
cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
/* Cleanup */
CURAND_CALL(curandDestroyGenerator(gen));
CUDA_CALL(cudaFree(devData));
free(hostData);
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,23 @@
#ifdef _WIN32
# define IMPORT __declspec(dllimport)
IMPORT int shared_version();
int static_version()
{
return 0;
}
int mixed_version()
{
return 0;
}
#else
int shared_version();
int static_version();
int mixed_version();
#endif
int main()
{
return mixed_version() == 0 && shared_version() == 0 &&
static_version() == 0;
}

View File

@@ -0,0 +1,16 @@
#ifdef _WIN32
# define IMPORT __declspec(dllimport)
# define EXPORT __declspec(dllexport)
#else
# define IMPORT
# define EXPORT
#endif
IMPORT int curand_main();
IMPORT int nppif_main();
EXPORT int mixed_version()
{
return curand_main() == 0 && nppif_main() == 0;
}

View File

@@ -0,0 +1,92 @@
// Comes from
// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
#ifdef _WIN32
# define EXPORT __declspec(dllexport)
#else
# define EXPORT
#endif
#include <cstdio>
#include <iostream>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <nppi_filtering_functions.h>
EXPORT int nppif_main()
{
/**
* 8-bit unsigned single-channel 1D row convolution.
*/
const int simgrows = 32;
const int simgcols = 32;
Npp8u *d_pSrc, *d_pDst;
const int nMaskSize = 3;
NppiSize oROI;
oROI.width = simgcols - nMaskSize;
oROI.height = simgrows;
const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
const int simgpix = simgrows * simgcols;
const int dimgpix = oROI.width * oROI.height;
const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
const int nDstStep = oROI.width * sizeof(d_pDst[0]);
const int pixval = 1;
const int nDivisor = 1;
const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
Npp32s* d_pKernel;
const Npp32s nAnchor = 2;
cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMalloc((void**)&d_pDst, dimgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// set image to pixval initially
err = cudaMemset(d_pSrc, pixval, simgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMemset(d_pDst, 0, dimgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// copy src to dst
NppStatus ret =
nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
nMaskSize, nAnchor, nDivisor);
assert(ret == NPP_NO_ERROR);
Npp8u* h_imgres = new Npp8u[dimgpix];
err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// test for filtering
for (int i = 0; i < dimgpix; i++) {
if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
fprintf(stderr, "h_imgres at index %d failed to match\n", i);
return 1;
}
}
return 0;
}

View File

@@ -0,0 +1,16 @@
#ifdef _WIN32
# define IMPORT __declspec(dllimport)
# define EXPORT __declspec(dllexport)
#else
# define IMPORT
# define EXPORT
#endif
int curand_main();
int nppif_main();
EXPORT int shared_version()
{
return curand_main() == 0 && nppif_main() == 0;
}

View File

@@ -0,0 +1,16 @@
#ifdef _WIN32
# define IMPORT __declspec(dllimport)
# define EXPORT __declspec(dllexport)
#else
# define IMPORT
# define EXPORT
#endif
IMPORT int curand_main();
IMPORT int nppif_main();
EXPORT int static_version()
{
return curand_main() == 0 && nppif_main() == 0;
}

View File

@@ -0,0 +1,29 @@
cmake_minimum_required(VERSION 3.15)
project(StaticRuntimePlusToolkit CUDA)
#Goal for this example:
# Validate that with cuda we can use some components of the CUDA toolkit, and
# specify the cuda runtime
find_package(CUDAToolkit REQUIRED)
add_library(Common OBJECT curand.cu nppif.cu)
target_link_libraries(Common PRIVATE CUDA::toolkit)
set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
#static runtime with shared toolkit libraries
add_library(SharedToolkit SHARED shared.cu)
target_link_libraries(SharedToolkit PRIVATE Common CUDA::curand CUDA::nppif )
set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)
#static runtime with static toolkit libraries
add_library(StaticToolkit SHARED static.cu)
target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
#static runtime with mixed toolkit libraries
add_library(MixedToolkit SHARED mixed.cu)
target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Static)
add_executable(CudaOnlyStaticRuntimePlusToolkit main.cu)
target_link_libraries(CudaOnlyStaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)

View File

@@ -0,0 +1,59 @@
// Comes from:
// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
/*
* This program uses the host CURAND API to generate 100
* pseudorandom floats.
*/
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
#define CUDA_CALL(x) \
do { \
if ((x) != cudaSuccess) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
#define CURAND_CALL(x) \
do { \
if ((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
int curand_main()
{
size_t n = 100;
size_t i;
curandGenerator_t gen;
float *devData, *hostData;
/* Allocate n floats on host */
hostData = (float*)calloc(n, sizeof(float));
/* Allocate n floats on device */
CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
/* Create pseudo-random number generator */
CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
/* Set seed */
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
/* Generate n floats on device */
CURAND_CALL(curandGenerateUniform(gen, devData, n));
/* Copy device memory to host */
CUDA_CALL(
cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
/* Cleanup */
CURAND_CALL(curandDestroyGenerator(gen));
CUDA_CALL(cudaFree(devData));
free(hostData);
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,11 @@
int shared_version();
int static_version();
int mixed_version();
int main()
{
return mixed_version() == 0 && shared_version() == 0 &&
static_version() == 0;
}

View File

@@ -0,0 +1,8 @@
int curand_main();
int nppif_main();
int mixed_version()
{
return curand_main() == 0 && nppif_main() == 0;
}

View File

@@ -0,0 +1,86 @@
// Comes from
// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
#include <cstdio>
#include <iostream>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <nppi_filtering_functions.h>
int nppif_main()
{
/**
* 8-bit unsigned single-channel 1D row convolution.
*/
const int simgrows = 32;
const int simgcols = 32;
Npp8u *d_pSrc, *d_pDst;
const int nMaskSize = 3;
NppiSize oROI;
oROI.width = simgcols - nMaskSize;
oROI.height = simgrows;
const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
const int simgpix = simgrows * simgcols;
const int dimgpix = oROI.width * oROI.height;
const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
const int nDstStep = oROI.width * sizeof(d_pDst[0]);
const int pixval = 1;
const int nDivisor = 1;
const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
Npp32s* d_pKernel;
const Npp32s nAnchor = 2;
cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMalloc((void**)&d_pDst, dimgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// set image to pixval initially
err = cudaMemset(d_pSrc, pixval, simgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMemset(d_pDst, 0, dimgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// copy src to dst
NppStatus ret =
nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
nMaskSize, nAnchor, nDivisor);
assert(ret == NPP_NO_ERROR);
Npp8u* h_imgres = new Npp8u[dimgpix];
err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// test for filtering
for (int i = 0; i < dimgpix; i++) {
if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
fprintf(stderr, "h_imgres at index %d failed to match\n", i);
return 1;
}
}
return 0;
}

View File

@@ -0,0 +1,8 @@
int curand_main();
int nppif_main();
int shared_version()
{
return curand_main() == 0 && nppif_main() == 0;
}

View File

@@ -0,0 +1,8 @@
int curand_main();
int nppif_main();
int static_version()
{
return curand_main() == 0 && nppif_main() == 0;
}