mirror of
https://github.com/Kitware/CMake.git
synced 2025-12-31 19:00:54 -06:00
CUDA: Add abstraction for cuda runtime selection
Fixes #17559 Replace our hard-coded default of cudart=static with a first-class abstraction to select the runtime library from an enumeration of logical names.
This commit is contained in:
@@ -173,6 +173,7 @@ Properties on Targets
|
||||
/prop_tgt/CUDA_PTX_COMPILATION
|
||||
/prop_tgt/CUDA_SEPARABLE_COMPILATION
|
||||
/prop_tgt/CUDA_RESOLVE_DEVICE_SYMBOLS
|
||||
/prop_tgt/CUDA_RUNTIME_LIBRARY
|
||||
/prop_tgt/CUDA_EXTENSIONS
|
||||
/prop_tgt/CUDA_STANDARD
|
||||
/prop_tgt/CUDA_STANDARD_REQUIRED
|
||||
|
||||
@@ -372,6 +372,7 @@ Variables that Control the Build
|
||||
/variable/CMAKE_CTEST_ARGUMENTS
|
||||
/variable/CMAKE_CUDA_SEPARABLE_COMPILATION
|
||||
/variable/CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS
|
||||
/variable/CMAKE_CUDA_RUNTIME_LIBRARY
|
||||
/variable/CMAKE_DEBUG_POSTFIX
|
||||
/variable/CMAKE_DISABLE_PRECOMPILE_HEADERS
|
||||
/variable/CMAKE_ENABLE_EXPORTS
|
||||
|
||||
9
Help/prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt
Normal file
9
Help/prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
``None``
|
||||
Link with ``-cudart=none`` or equivalent flag(s) to use no CUDA
|
||||
runtime library.
|
||||
``Shared``
|
||||
Link with ``-cudart=shared`` or equivalent flag(s) to use a
|
||||
dynamically-linked CUDA runtime library.
|
||||
``Static``
|
||||
Link with ``-cudart=static`` or equivalent flag(s) to use a
|
||||
statically-linked CUDA runtime library.
|
||||
21
Help/prop_tgt/CUDA_RUNTIME_LIBRARY.rst
Normal file
21
Help/prop_tgt/CUDA_RUNTIME_LIBRARY.rst
Normal file
@@ -0,0 +1,21 @@
|
||||
CUDA_RUNTIME_LIBRARY
|
||||
--------------------
|
||||
|
||||
Select the CUDA runtime library for use by compilers targeting the CUDA language.
|
||||
|
||||
The allowed case insensitive values are:
|
||||
|
||||
.. include:: CUDA_RUNTIME_LIBRARY-VALUES.txt
|
||||
|
||||
Contents of ``CUDA_RUNTIME_LIBRARY`` may use
|
||||
:manual:`generator expressions <cmake-generator-expressions(7)>`.
|
||||
|
||||
If this property is not set then CMake uses the default value
|
||||
``Static`` to select the CUDA runtime library.
|
||||
|
||||
.. note::
|
||||
|
||||
This property has effect only when the ``CUDA`` language is enabled. To
|
||||
control the CUDA runtime linking when only using the CUDA SDK with the
|
||||
``C`` or ``C++`` language we recommend using the :module:`FindCUDAToolkit`
|
||||
module.
|
||||
7
Help/release/dev/cuda-runtime-library.rst
Normal file
7
Help/release/dev/cuda-runtime-library.rst
Normal file
@@ -0,0 +1,7 @@
|
||||
cuda-runtime-library
|
||||
--------------------
|
||||
|
||||
* The :variable:`CMAKE_CUDA_RUNTIME_LIBRARY` variable and
|
||||
:prop_tgt:`CUDA_RUNTIME_LIBRARY` target property were introduced to
|
||||
select the CUDA runtime library used when linking targets that
|
||||
use CUDA.
|
||||
24
Help/variable/CMAKE_CUDA_RUNTIME_LIBRARY.rst
Normal file
24
Help/variable/CMAKE_CUDA_RUNTIME_LIBRARY.rst
Normal file
@@ -0,0 +1,24 @@
|
||||
CMAKE_CUDA_RUNTIME_LIBRARY
|
||||
--------------------------
|
||||
|
||||
Select the CUDA runtime library for use by compilers targeting the MSVC ABI.
|
||||
This variable is used to initialize the :prop_tgt:`CUDA_RUNTIME_LIBRARY`
|
||||
property on all targets as they are created.
|
||||
|
||||
The allowed case insensitive values are:
|
||||
|
||||
.. include:: ../prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt
|
||||
|
||||
Contents of ``CMAKE_CUDA_RUNTIME_LIBRARY`` may use
|
||||
:manual:`generator expressions <cmake-generator-expressions(7)>`.
|
||||
|
||||
If this variable is not set then the :prop_tgt:`CUDA_RUNTIME_LIBRARY` target
|
||||
property will not be set automatically. If that property is not set then
|
||||
CMake uses the default value ``Static`` to select the CUDA runtime library.
|
||||
|
||||
.. note::
|
||||
|
||||
This property has effect only when the ``CUDA`` language is enabled. To
|
||||
control the CUDA runtime linking when only using the CUDA SDK with the
|
||||
``C`` or ``C++`` language we recommend using the :module:`FindCUDAToolkit`
|
||||
module.
|
||||
@@ -67,6 +67,17 @@ else()
|
||||
set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}")
|
||||
endif()
|
||||
|
||||
# Remove the following libraries from CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES and
|
||||
# CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES
|
||||
#
|
||||
# - cudart
|
||||
# - cudart_static
|
||||
# - cudadevrt
|
||||
#
|
||||
# These are controlled by CMAKE_CUDA_RUNTIME_LIBRARY
|
||||
list(REMOVE_ITEM CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES cudart cudart_static cudadevrt)
|
||||
list(REMOVE_ITEM CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES cudart cudart_static cudadevrt)
|
||||
|
||||
# Re-configure to save learned information.
|
||||
configure_file(
|
||||
${CMAKE_ROOT}/Modules/CMakeCUDACompiler.cmake.in
|
||||
|
||||
@@ -43,6 +43,11 @@ endif()
|
||||
set(CMAKE_SHARED_LIBRARY_CREATE_CUDA_FLAGS -shared)
|
||||
set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA -isystem=)
|
||||
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC")
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "cudadevrt;cudart_static")
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED "cudadevrt;cudart")
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE "")
|
||||
|
||||
if("x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
|
||||
set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "")
|
||||
set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "")
|
||||
|
||||
@@ -69,6 +69,11 @@ else()
|
||||
endif()
|
||||
unset(_cmp0092)
|
||||
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC")
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "cudadevrt;cudart_static")
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED "cudadevrt;cudart")
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE "")
|
||||
|
||||
string(APPEND CMAKE_CUDA_FLAGS_INIT " ${PLATFORM_DEFINES_CUDA} -D_WINDOWS -Xcompiler=\"${_W3}${_FLAGS_CXX}\"")
|
||||
string(APPEND CMAKE_CUDA_FLAGS_DEBUG_INIT " -Xcompiler=\"${_MDd}-Zi -Ob0 -Od ${_RTC1}\"")
|
||||
string(APPEND CMAKE_CUDA_FLAGS_RELEASE_INIT " -Xcompiler=\"${_MD}-O2 -Ob2\" -DNDEBUG")
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include "cmAlgorithms.h"
|
||||
#include "cmComputeLinkDepends.h"
|
||||
#include "cmGeneratorExpression.h"
|
||||
#include "cmGeneratorTarget.h"
|
||||
#include "cmGlobalGenerator.h"
|
||||
#include "cmListFileCache.h"
|
||||
@@ -573,6 +574,15 @@ void cmComputeLinkInformation::AddImplicitLinkInfo()
|
||||
cmGeneratorTarget::LinkClosure const* lc =
|
||||
this->Target->GetLinkClosure(this->Config);
|
||||
for (std::string const& li : lc->Languages) {
|
||||
|
||||
if (li == "CUDA") {
|
||||
// These need to go before the other implicit link information
|
||||
// as they could require symbols from those other library
|
||||
// Currently restricted to CUDA as it is the only language
|
||||
// we have documented runtime behavior controls for
|
||||
this->AddRuntimeLinkLibrary(li);
|
||||
}
|
||||
|
||||
// Skip those of the linker language. They are implicit.
|
||||
if (li != this->LinkLanguage) {
|
||||
this->AddImplicitLinkInfo(li);
|
||||
@@ -580,6 +590,39 @@ void cmComputeLinkInformation::AddImplicitLinkInfo()
|
||||
}
|
||||
}
|
||||
|
||||
void cmComputeLinkInformation::AddRuntimeLinkLibrary(std::string const& lang)
|
||||
{ // Add the lang runtime library flags. This is activated by the presence
|
||||
// of a default selection whether or not it is overridden by a property.
|
||||
std::string defaultVar =
|
||||
cmStrCat("CMAKE_", lang, "_RUNTIME_LIBRARY_DEFAULT");
|
||||
const char* langRuntimeLibraryDefault =
|
||||
this->Makefile->GetDefinition(defaultVar);
|
||||
if (langRuntimeLibraryDefault && *langRuntimeLibraryDefault) {
|
||||
const char* runtimeLibraryValue =
|
||||
this->Target->GetProperty(cmStrCat(lang, "_RUNTIME_LIBRARY"));
|
||||
if (!runtimeLibraryValue) {
|
||||
runtimeLibraryValue = langRuntimeLibraryDefault;
|
||||
}
|
||||
|
||||
std::string runtimeLibrary =
|
||||
cmSystemTools::UpperCase(cmGeneratorExpression::Evaluate(
|
||||
runtimeLibraryValue, this->Target->GetLocalGenerator(), this->Config,
|
||||
this->Target));
|
||||
if (!runtimeLibrary.empty()) {
|
||||
if (const char* runtimeLinkOptions = this->Makefile->GetDefinition(
|
||||
"CMAKE_" + lang + "_RUNTIME_LIBRARY_LINK_OPTIONS_" +
|
||||
runtimeLibrary)) {
|
||||
std::vector<std::string> libsVec = cmExpandedList(runtimeLinkOptions);
|
||||
for (std::string const& i : libsVec) {
|
||||
if (!cmContains(this->ImplicitLinkLibs, i)) {
|
||||
this->AddItem(i, nullptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cmComputeLinkInformation::AddImplicitLinkInfo(std::string const& lang)
|
||||
{
|
||||
// Add libraries for this language that are not implied by the
|
||||
|
||||
@@ -172,6 +172,7 @@ private:
|
||||
void LoadImplicitLinkInfo();
|
||||
void AddImplicitLinkInfo();
|
||||
void AddImplicitLinkInfo(std::string const& lang);
|
||||
void AddRuntimeLinkLibrary(std::string const& lang);
|
||||
std::set<std::string> ImplicitLinkDirs;
|
||||
std::set<std::string> ImplicitLinkLibs;
|
||||
|
||||
|
||||
@@ -358,6 +358,7 @@ cmTarget::cmTarget(std::string const& name, cmStateEnums::TargetType type,
|
||||
initProp("CUDA_COMPILER_LAUNCHER");
|
||||
initProp("CUDA_SEPARABLE_COMPILATION");
|
||||
initProp("CUDA_RESOLVE_DEVICE_SYMBOLS");
|
||||
initProp("CUDA_RUNTIME_LIBRARY");
|
||||
initProp("LINK_SEARCH_START_STATIC");
|
||||
initProp("LINK_SEARCH_END_STATIC");
|
||||
initProp("Swift_LANGUAGE_VERSION");
|
||||
|
||||
@@ -3636,18 +3636,7 @@ bool cmVisualStudio10TargetGenerator::ComputeLinkOptions(
|
||||
this->AddLibraries(cli, libVec, vsTargetVec, config);
|
||||
if (cmContains(linkClosure->Languages, "CUDA") &&
|
||||
this->CudaOptions[config] != nullptr) {
|
||||
switch (this->CudaOptions[config]->GetCudaRuntime()) {
|
||||
case cmVisualStudioGeneratorOptions::CudaRuntimeStatic:
|
||||
libVec.push_back("cudadevrt.lib");
|
||||
libVec.push_back("cudart_static.lib");
|
||||
break;
|
||||
case cmVisualStudioGeneratorOptions::CudaRuntimeShared:
|
||||
libVec.push_back("cudadevrt.lib");
|
||||
libVec.push_back("cudart.lib");
|
||||
break;
|
||||
case cmVisualStudioGeneratorOptions::CudaRuntimeNone:
|
||||
break;
|
||||
}
|
||||
this->CudaOptions[config]->FixCudaRuntime(this->GeneratorTarget);
|
||||
}
|
||||
std::string standardLibsVar =
|
||||
cmStrCat("CMAKE_", linkLanguage, "_STANDARD_LIBRARIES");
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
#include <cm/iterator>
|
||||
|
||||
#include "cmAlgorithms.h"
|
||||
#include "cmGeneratorExpression.h"
|
||||
#include "cmGeneratorTarget.h"
|
||||
#include "cmLocalVisualStudioGenerator.h"
|
||||
#include "cmOutputConverter.h"
|
||||
#include "cmSystemTools.h"
|
||||
@@ -149,25 +151,33 @@ bool cmVisualStudioGeneratorOptions::UsingSBCS() const
|
||||
return false;
|
||||
}
|
||||
|
||||
cmVisualStudioGeneratorOptions::CudaRuntime
|
||||
cmVisualStudioGeneratorOptions::GetCudaRuntime() const
|
||||
void cmVisualStudioGeneratorOptions::FixCudaRuntime(cmGeneratorTarget* target)
|
||||
{
|
||||
std::map<std::string, FlagValue>::const_iterator i =
|
||||
this->FlagMap.find("CudaRuntime");
|
||||
if (i != this->FlagMap.end() && i->second.size() == 1) {
|
||||
std::string const& cudaRuntime = i->second[0];
|
||||
if (cudaRuntime == "Static") {
|
||||
return CudaRuntimeStatic;
|
||||
}
|
||||
if (cudaRuntime == "Shared") {
|
||||
return CudaRuntimeShared;
|
||||
}
|
||||
if (cudaRuntime == "None") {
|
||||
return CudaRuntimeNone;
|
||||
if (i == this->FlagMap.end()) {
|
||||
// User didn't provide am override so get the property value
|
||||
const char* runtimeLibraryValue =
|
||||
target->GetProperty("CUDA_RUNTIME_LIBRARY");
|
||||
if (runtimeLibraryValue) {
|
||||
std::string cudaRuntime =
|
||||
cmSystemTools::UpperCase(cmGeneratorExpression::Evaluate(
|
||||
runtimeLibraryValue, this->LocalGenerator, this->Configuration,
|
||||
target));
|
||||
if (cudaRuntime == "STATIC") {
|
||||
this->AddFlag("CudaRuntime", "Static");
|
||||
}
|
||||
if (cudaRuntime == "SHARED") {
|
||||
this->AddFlag("CudaRuntime", "Shared");
|
||||
}
|
||||
if (cudaRuntime == "NONE") {
|
||||
this->AddFlag("CudaRuntime", "None");
|
||||
}
|
||||
} else {
|
||||
// nvcc default is static
|
||||
this->AddFlag("CudaRuntime", "Static");
|
||||
}
|
||||
}
|
||||
// nvcc default is static
|
||||
return CudaRuntimeStatic;
|
||||
}
|
||||
|
||||
void cmVisualStudioGeneratorOptions::FixCudaCodeGeneration()
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "cmIDEOptions.h"
|
||||
|
||||
class cmLocalVisualStudioGenerator;
|
||||
class cmGeneratorTarget;
|
||||
|
||||
using cmVS7FlagTable = cmIDEFlagTable;
|
||||
|
||||
@@ -61,15 +62,8 @@ public:
|
||||
bool UsingUnicode() const;
|
||||
bool UsingSBCS() const;
|
||||
|
||||
enum CudaRuntime
|
||||
{
|
||||
CudaRuntimeStatic,
|
||||
CudaRuntimeShared,
|
||||
CudaRuntimeNone
|
||||
};
|
||||
CudaRuntime GetCudaRuntime() const;
|
||||
|
||||
void FixCudaCodeGeneration();
|
||||
void FixCudaRuntime(cmGeneratorTarget* target);
|
||||
|
||||
void FixManifestUACFlags();
|
||||
|
||||
|
||||
@@ -22,18 +22,11 @@ set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
|
||||
|
||||
add_library(CudaComplexCppBase SHARED dynamic.cpp)
|
||||
add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu)
|
||||
set_target_properties(CudaComplexSeperableLib
|
||||
PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
set_target_properties( CudaComplexSeperableLib
|
||||
PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
add_library(CudaComplexSharedLib SHARED dynamic.cu)
|
||||
target_link_libraries(CudaComplexSharedLib PUBLIC CudaComplexCppBase)
|
||||
|
||||
add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu)
|
||||
add_library(CudaComplexMixedLib SHARED mixed.cpp mixed.cu)
|
||||
set_target_properties(CudaComplexMixedLib
|
||||
PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
target_link_libraries(CudaComplexMixedLib
|
||||
PUBLIC CudaComplexSharedLib
|
||||
PRIVATE CudaComplexSeperableLib)
|
||||
@@ -41,7 +34,27 @@ target_link_libraries(CudaComplexMixedLib
|
||||
add_executable(CudaComplex main.cpp)
|
||||
target_link_libraries(CudaComplex PUBLIC CudaComplexMixedLib)
|
||||
|
||||
|
||||
set_target_properties(CudaComplexMixedLib
|
||||
CudaComplexSeperableLib
|
||||
PROPERTIES
|
||||
POSITION_INDEPENDENT_CODE ON
|
||||
CUDA_SEPARABLE_COMPILATION ON
|
||||
)
|
||||
set_target_properties(CudaComplexMixedLib
|
||||
CudaComplexSharedLib
|
||||
PROPERTIES
|
||||
CUDA_RUNTIME_LIBRARY shared
|
||||
)
|
||||
|
||||
|
||||
if(APPLE)
|
||||
# Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
|
||||
set_property(TARGET CudaComplex PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
|
||||
endif()
|
||||
|
||||
if(UNIX)
|
||||
# Help the shared cuda runtime find libcudart as it is not located
|
||||
# in a default system searched location
|
||||
set_property(TARGET CudaComplexMixedLib PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
|
||||
endif()
|
||||
|
||||
@@ -54,17 +54,20 @@ EXPORT int choose_cuda_device()
|
||||
return 1;
|
||||
}
|
||||
|
||||
EXPORT void cuda_dynamic_lib_func()
|
||||
EXPORT bool cuda_dynamic_lib_func()
|
||||
{
|
||||
DetermineIfValidCudaDevice<<<1, 1>>>();
|
||||
cudaError_t err = cudaGetLastError();
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: "
|
||||
std::cerr << "DetermineIfValidCudaDevice [Per Launch] failed: "
|
||||
<< cudaGetErrorString(err) << std::endl;
|
||||
return false;
|
||||
}
|
||||
DetermineIfValidCudaDevice<<<1, 1>>>();
|
||||
err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "DetermineIfValidCudaDevice [ASYNC] failed: "
|
||||
std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: "
|
||||
<< cudaGetErrorString(cudaGetLastError()) << std::endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -22,5 +22,6 @@ int main(int argc, char** argv)
|
||||
|
||||
int r1 = call_cuda_seperable_code(42);
|
||||
int r2 = mixed_launch_kernel(42);
|
||||
|
||||
return (r1 == 42 || r2 == 42) ? 1 : 0;
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
result_type __device__ file1_func(int x);
|
||||
result_type_dynamic __device__ file2_func(int x);
|
||||
|
||||
IMPORT void __host__ cuda_dynamic_lib_func();
|
||||
IMPORT bool __host__ cuda_dynamic_lib_func();
|
||||
|
||||
static __global__ void mixed_kernel(result_type* r, int x)
|
||||
{
|
||||
@@ -25,7 +25,9 @@ static __global__ void mixed_kernel(result_type* r, int x)
|
||||
|
||||
EXPORT int mixed_launch_kernel(int x)
|
||||
{
|
||||
cuda_dynamic_lib_func();
|
||||
if (!cuda_dynamic_lib_func()) {
|
||||
return x;
|
||||
}
|
||||
|
||||
result_type* r;
|
||||
cudaError_t err = cudaMallocManaged(&r, sizeof(result_type));
|
||||
|
||||
@@ -5,10 +5,21 @@ ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX)
|
||||
ADD_TEST_MACRO(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag)
|
||||
ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
|
||||
ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
|
||||
ADD_TEST_MACRO(CudaOnly.SharedRuntimePlusToolkit CudaOnlySharedRuntimePlusToolkit)
|
||||
ADD_TEST_MACRO(CudaOnly.Standard98 CudaOnlyStandard98)
|
||||
ADD_TEST_MACRO(CudaOnly.Toolkit CudaOnlyToolkit)
|
||||
ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs)
|
||||
|
||||
# The CUDA only ships the shared version of the toolkit libraries
|
||||
# on windows
|
||||
if(NOT WIN32)
|
||||
ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit)
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB)
|
||||
endif()
|
||||
|
||||
add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
|
||||
${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
|
||||
--build-and-test
|
||||
@@ -20,6 +31,14 @@ add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND
|
||||
--test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
|
||||
)
|
||||
|
||||
if(MSVC)
|
||||
ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB)
|
||||
endif()
|
||||
add_test(NAME CudaOnly.RuntimeControls COMMAND
|
||||
${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION>
|
||||
--build-and-test
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/RuntimeControls/"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/RuntimeControls/"
|
||||
--build-two-config
|
||||
${build_generator_args}
|
||||
--build-project RuntimeControls
|
||||
--build-options ${build_options}
|
||||
--test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION>
|
||||
)
|
||||
|
||||
60
Tests/CudaOnly/RuntimeControls/CMakeLists.txt
Normal file
60
Tests/CudaOnly/RuntimeControls/CMakeLists.txt
Normal file
@@ -0,0 +1,60 @@
|
||||
cmake_minimum_required(VERSION 3.7)
|
||||
project (RuntimeControls CUDA)
|
||||
|
||||
# Find nm and dumpbin
|
||||
if(CMAKE_NM)
|
||||
set(dump_command ${CMAKE_NM})
|
||||
set(dump_args -g)
|
||||
else()
|
||||
include(GetPrerequisites)
|
||||
message(STATUS "calling list_prerequisites to find dumpbin")
|
||||
list_prerequisites("${CMAKE_COMMAND}" 0 0 0)
|
||||
if(gp_dumpbin)
|
||||
set(dump_command ${gp_dumpbin})
|
||||
set(dump_args /ARCHIVEMEMBERS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=[compute_30]")
|
||||
|
||||
set(CMAKE_CUDA_STANDARD 11)
|
||||
set(CMAKE_CUDA_RUNTIME_LIBRARY static)
|
||||
|
||||
if(NOT "x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC")
|
||||
add_library(UsesNoCudaRT SHARED file1.cu)
|
||||
set_target_properties(UsesNoCudaRT PROPERTIES CUDA_RUNTIME_LIBRARY none)
|
||||
endif()
|
||||
|
||||
add_library(UsesStaticCudaRT SHARED file2.cu)
|
||||
|
||||
add_executable(CudaOnlyRuntimeControls main.cu)
|
||||
set_target_properties(CudaOnlyRuntimeControls PROPERTIES CUDA_RUNTIME_LIBRARY shared)
|
||||
|
||||
target_link_libraries(CudaOnlyRuntimeControls PRIVATE $<TARGET_NAME_IF_EXISTS:UsesNoCudaRT> UsesStaticCudaRT)
|
||||
|
||||
|
||||
if(dump_command)
|
||||
if(TARGET UsesNoCudaRT)
|
||||
add_custom_command(TARGET UsesNoCudaRT POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND}
|
||||
-DDUMP_COMMAND=${dump_command}
|
||||
-DDUMP_ARGS=${dump_args}
|
||||
-DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesNoCudaRT>
|
||||
-P ${CMAKE_CURRENT_SOURCE_DIR}/no_runtime.cmake
|
||||
)
|
||||
endif()
|
||||
add_custom_command(TARGET UsesStaticCudaRT POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND}
|
||||
-DDUMP_COMMAND=${dump_command}
|
||||
-DDUMP_ARGS=${dump_args}
|
||||
-DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesStaticCudaRT>
|
||||
-P ${CMAKE_CURRENT_SOURCE_DIR}/uses_static_runtime.cmake
|
||||
)
|
||||
string(REPLACE ";" "|" dirs "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
|
||||
add_custom_command(TARGET CudaOnlyRuntimeControls POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND}
|
||||
-DEXEC_PATH=$<TARGET_FILE:CudaOnlyRuntimeControls>
|
||||
-DEXTRA_LIB_DIRS="${dirs}"
|
||||
-P ${CMAKE_CURRENT_SOURCE_DIR}/verify_runtime.cmake
|
||||
)
|
||||
endif()
|
||||
18
Tests/CudaOnly/RuntimeControls/file1.cu
Normal file
18
Tests/CudaOnly/RuntimeControls/file1.cu
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
void __global__ file1_kernel(int x, int& r)
|
||||
{
|
||||
r = -x;
|
||||
}
|
||||
|
||||
EXPORT int file1_launch_kernel(int x)
|
||||
{
|
||||
int r = 0;
|
||||
file1_kernel<<<1, 1>>>(x, r);
|
||||
return r;
|
||||
}
|
||||
18
Tests/CudaOnly/RuntimeControls/file2.cu
Normal file
18
Tests/CudaOnly/RuntimeControls/file2.cu
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
void __global__ file2_kernel(int x, int& r)
|
||||
{
|
||||
r = -x;
|
||||
}
|
||||
|
||||
EXPORT int file2_launch_kernel(int x)
|
||||
{
|
||||
int r = 0;
|
||||
file2_kernel<<<1, 1>>>(x, r);
|
||||
return r;
|
||||
}
|
||||
81
Tests/CudaOnly/RuntimeControls/main.cu
Normal file
81
Tests/CudaOnly/RuntimeControls/main.cu
Normal file
@@ -0,0 +1,81 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cuda.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
# define IMPORT __declspec(dllimport)
|
||||
#else
|
||||
# define IMPORT
|
||||
#endif
|
||||
|
||||
#ifndef _WIN32
|
||||
IMPORT int file1_launch_kernel(int x);
|
||||
#endif
|
||||
|
||||
IMPORT int file2_launch_kernel(int x);
|
||||
|
||||
int choose_cuda_device()
|
||||
{
|
||||
int nDevices = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&nDevices);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Failed to retrieve the number of CUDA enabled devices"
|
||||
<< std::endl;
|
||||
return 1;
|
||||
}
|
||||
for (int i = 0; i < nDevices; ++i) {
|
||||
cudaDeviceProp prop;
|
||||
cudaError_t err = cudaGetDeviceProperties(&prop, i);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Could not retrieve properties from CUDA device " << i
|
||||
<< std::endl;
|
||||
return 1;
|
||||
}
|
||||
std::cout << "prop.major: " << prop.major << std::endl;
|
||||
if (prop.major >= 3) {
|
||||
err = cudaSetDevice(i);
|
||||
if (err != cudaSuccess) {
|
||||
std::cout << "Could not select CUDA device " << i << std::endl;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
|
||||
<< std::endl;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int ret = choose_cuda_device();
|
||||
if (ret) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
cudaError_t err;
|
||||
#ifndef _WIN32
|
||||
file1_launch_kernel(1);
|
||||
err = cudaGetLastError();
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "file1_launch_kernel: kernel launch should have passed.\n "
|
||||
"Error message: "
|
||||
<< cudaGetErrorString(err) << std::endl;
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
file2_launch_kernel(1);
|
||||
err = cudaGetLastError();
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "file2_launch_kernel: kernel launch should have passed.\n "
|
||||
"Error message: "
|
||||
<< cudaGetErrorString(err) << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
14
Tests/CudaOnly/RuntimeControls/no_runtime.cmake
Normal file
14
Tests/CudaOnly/RuntimeControls/no_runtime.cmake
Normal file
@@ -0,0 +1,14 @@
|
||||
execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
|
||||
RESULT_VARIABLE RESULT
|
||||
OUTPUT_VARIABLE OUTPUT
|
||||
ERROR_VARIABLE ERROR
|
||||
)
|
||||
|
||||
if(NOT "${RESULT}" STREQUAL "0")
|
||||
message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
|
||||
endif()
|
||||
|
||||
if(NOT "${OUTPUT}" MATCHES "(__cuda)")
|
||||
message(FATAL_ERROR
|
||||
"not missing cuda device symbols, static runtime linking was used.")
|
||||
endif()
|
||||
14
Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake
Normal file
14
Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake
Normal file
@@ -0,0 +1,14 @@
|
||||
execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
|
||||
RESULT_VARIABLE RESULT
|
||||
OUTPUT_VARIABLE OUTPUT
|
||||
ERROR_VARIABLE ERROR
|
||||
)
|
||||
|
||||
if(NOT "${RESULT}" STREQUAL "0")
|
||||
message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
|
||||
endif()
|
||||
|
||||
if("${OUTPUT}" MATCHES "__cuda")
|
||||
message(FATAL_ERROR
|
||||
"missing cuda device symbols, static runtime linking was not used.")
|
||||
endif()
|
||||
16
Tests/CudaOnly/RuntimeControls/verify_runtime.cmake
Normal file
16
Tests/CudaOnly/RuntimeControls/verify_runtime.cmake
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
string(REPLACE "|" ";" dirs "${EXTRA_LIB_DIRS}")
|
||||
file(GET_RUNTIME_DEPENDENCIES
|
||||
RESOLVED_DEPENDENCIES_VAR resolved_libs
|
||||
UNRESOLVED_DEPENDENCIES_VAR unresolved_libs
|
||||
DIRECTORIES ${dirs}
|
||||
EXECUTABLES ${EXEC_PATH}
|
||||
)
|
||||
|
||||
list(FILTER resolved_libs INCLUDE REGEX ".*cudart.*")
|
||||
list(LENGTH resolved_libs has_cudart)
|
||||
|
||||
if(has_cudart EQUAL 0)
|
||||
message(FATAL_ERROR
|
||||
"missing cudart shared library from runtime dependency output.")
|
||||
endif()
|
||||
42
Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt
Normal file
42
Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt
Normal file
@@ -0,0 +1,42 @@
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
project(SharedRuntimePlusToolkit CUDA)
|
||||
|
||||
#Goal for this example:
|
||||
# Validate that with c++ we can use some components of the CUDA toolkit, and
|
||||
# specify the cuda runtime
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
|
||||
add_library(Common OBJECT curand.cu nppif.cu)
|
||||
target_link_libraries(Common PRIVATE CUDA::toolkit)
|
||||
set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
#shared runtime with shared toolkit libraries
|
||||
add_library(SharedToolkit SHARED shared.cu)
|
||||
target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
|
||||
set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
|
||||
target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)
|
||||
|
||||
# The CUDA only ships the shared version of the toolkit libraries
|
||||
# on windows
|
||||
if(NOT WIN32)
|
||||
#shared runtime with static toolkit libraries
|
||||
add_library(StaticToolkit SHARED static.cu)
|
||||
target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
|
||||
set_target_properties(StaticToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
|
||||
|
||||
#static runtime with mixed toolkit libraries
|
||||
add_library(MixedToolkit SHARED mixed.cu)
|
||||
target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
|
||||
set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared)
|
||||
endif()
|
||||
|
||||
add_executable(CudaOnlySharedRuntimePlusToolkit main.cu)
|
||||
target_link_libraries(CudaOnlySharedRuntimePlusToolkit PRIVATE SharedToolkit
|
||||
$<TARGET_NAME_IF_EXISTS:StaticToolkit>
|
||||
$<TARGET_NAME_IF_EXISTS:MixedToolkit>)
|
||||
|
||||
if(UNIX)
|
||||
# Help the shared cuda runtime find libcudart as it is not located
|
||||
# in a default system searched location
|
||||
set_property(TARGET CudaOnlySharedRuntimePlusToolkit PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
|
||||
endif()
|
||||
65
Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu
Normal file
65
Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu
Normal file
@@ -0,0 +1,65 @@
|
||||
// Comes from:
|
||||
// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
|
||||
|
||||
#ifdef _WIN32
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This program uses the host CURAND API to generate 100
|
||||
* pseudorandom floats.
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <curand.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define CUDA_CALL(x) \
|
||||
do { \
|
||||
if ((x) != cudaSuccess) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
return EXIT_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
#define CURAND_CALL(x) \
|
||||
do { \
|
||||
if ((x) != CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
return EXIT_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
EXPORT int curand_main()
|
||||
{
|
||||
size_t n = 100;
|
||||
size_t i;
|
||||
curandGenerator_t gen;
|
||||
float *devData, *hostData;
|
||||
|
||||
/* Allocate n floats on host */
|
||||
hostData = (float*)calloc(n, sizeof(float));
|
||||
|
||||
/* Allocate n floats on device */
|
||||
CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
|
||||
|
||||
/* Create pseudo-random number generator */
|
||||
CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
|
||||
|
||||
/* Set seed */
|
||||
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
|
||||
|
||||
/* Generate n floats on device */
|
||||
CURAND_CALL(curandGenerateUniform(gen, devData, n));
|
||||
|
||||
/* Copy device memory to host */
|
||||
CUDA_CALL(
|
||||
cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Cleanup */
|
||||
CURAND_CALL(curandDestroyGenerator(gen));
|
||||
CUDA_CALL(cudaFree(devData));
|
||||
free(hostData);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
23
Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu
Normal file
23
Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu
Normal file
@@ -0,0 +1,23 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define IMPORT __declspec(dllimport)
|
||||
IMPORT int shared_version();
|
||||
int static_version()
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
int mixed_version()
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
int shared_version();
|
||||
int static_version();
|
||||
int mixed_version();
|
||||
#endif
|
||||
|
||||
int main()
|
||||
{
|
||||
return mixed_version() == 0 && shared_version() == 0 &&
|
||||
static_version() == 0;
|
||||
}
|
||||
16
Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu
Normal file
16
Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define IMPORT __declspec(dllimport)
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define IMPORT
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
IMPORT int curand_main();
|
||||
IMPORT int nppif_main();
|
||||
|
||||
EXPORT int mixed_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
||||
92
Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu
Normal file
92
Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu
Normal file
@@ -0,0 +1,92 @@
|
||||
// Comes from
|
||||
// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
|
||||
|
||||
#ifdef _WIN32
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
|
||||
#include <assert.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <nppi_filtering_functions.h>
|
||||
|
||||
EXPORT int nppif_main()
|
||||
{
|
||||
/**
|
||||
* 8-bit unsigned single-channel 1D row convolution.
|
||||
*/
|
||||
const int simgrows = 32;
|
||||
const int simgcols = 32;
|
||||
Npp8u *d_pSrc, *d_pDst;
|
||||
const int nMaskSize = 3;
|
||||
NppiSize oROI;
|
||||
oROI.width = simgcols - nMaskSize;
|
||||
oROI.height = simgrows;
|
||||
const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
|
||||
const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
|
||||
const int simgpix = simgrows * simgcols;
|
||||
const int dimgpix = oROI.width * oROI.height;
|
||||
const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
|
||||
const int nDstStep = oROI.width * sizeof(d_pDst[0]);
|
||||
const int pixval = 1;
|
||||
const int nDivisor = 1;
|
||||
const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
|
||||
Npp32s* d_pKernel;
|
||||
const Npp32s nAnchor = 2;
|
||||
cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMalloc((void**)&d_pDst, dimgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// set image to pixval initially
|
||||
err = cudaMemset(d_pSrc, pixval, simgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemset(d_pDst, 0, dimgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
|
||||
cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// copy src to dst
|
||||
NppStatus ret =
|
||||
nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
|
||||
nMaskSize, nAnchor, nDivisor);
|
||||
assert(ret == NPP_NO_ERROR);
|
||||
Npp8u* h_imgres = new Npp8u[dimgpix];
|
||||
err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// test for filtering
|
||||
for (int i = 0; i < dimgpix; i++) {
|
||||
if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
|
||||
fprintf(stderr, "h_imgres at index %d failed to match\n", i);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
16
Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu
Normal file
16
Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define IMPORT __declspec(dllimport)
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define IMPORT
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
int curand_main();
|
||||
int nppif_main();
|
||||
|
||||
EXPORT int shared_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
||||
16
Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu
Normal file
16
Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define IMPORT __declspec(dllimport)
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define IMPORT
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
IMPORT int curand_main();
|
||||
IMPORT int nppif_main();
|
||||
|
||||
EXPORT int static_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
||||
29
Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt
Normal file
29
Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
project(StaticRuntimePlusToolkit CUDA)
|
||||
|
||||
#Goal for this example:
|
||||
# Validate that with cuda we can use some components of the CUDA toolkit, and
|
||||
# specify the cuda runtime
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
|
||||
add_library(Common OBJECT curand.cu nppif.cu)
|
||||
target_link_libraries(Common PRIVATE CUDA::toolkit)
|
||||
set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
#static runtime with shared toolkit libraries
|
||||
add_library(SharedToolkit SHARED shared.cu)
|
||||
target_link_libraries(SharedToolkit PRIVATE Common CUDA::curand CUDA::nppif )
|
||||
set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none)
|
||||
target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)
|
||||
|
||||
#static runtime with static toolkit libraries
|
||||
add_library(StaticToolkit SHARED static.cu)
|
||||
target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
|
||||
|
||||
#static runtime with mixed toolkit libraries
|
||||
add_library(MixedToolkit SHARED mixed.cu)
|
||||
target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
|
||||
set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Static)
|
||||
|
||||
add_executable(CudaOnlyStaticRuntimePlusToolkit main.cu)
|
||||
target_link_libraries(CudaOnlyStaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)
|
||||
59
Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu
Normal file
59
Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu
Normal file
@@ -0,0 +1,59 @@
|
||||
// Comes from:
|
||||
// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
|
||||
|
||||
/*
|
||||
* This program uses the host CURAND API to generate 100
|
||||
* pseudorandom floats.
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <curand.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define CUDA_CALL(x) \
|
||||
do { \
|
||||
if ((x) != cudaSuccess) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
return EXIT_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
#define CURAND_CALL(x) \
|
||||
do { \
|
||||
if ((x) != CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
return EXIT_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int curand_main()
|
||||
{
|
||||
size_t n = 100;
|
||||
size_t i;
|
||||
curandGenerator_t gen;
|
||||
float *devData, *hostData;
|
||||
|
||||
/* Allocate n floats on host */
|
||||
hostData = (float*)calloc(n, sizeof(float));
|
||||
|
||||
/* Allocate n floats on device */
|
||||
CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
|
||||
|
||||
/* Create pseudo-random number generator */
|
||||
CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
|
||||
|
||||
/* Set seed */
|
||||
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
|
||||
|
||||
/* Generate n floats on device */
|
||||
CURAND_CALL(curandGenerateUniform(gen, devData, n));
|
||||
|
||||
/* Copy device memory to host */
|
||||
CUDA_CALL(
|
||||
cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Cleanup */
|
||||
CURAND_CALL(curandDestroyGenerator(gen));
|
||||
CUDA_CALL(cudaFree(devData));
|
||||
free(hostData);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
11
Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu
Normal file
11
Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu
Normal file
@@ -0,0 +1,11 @@
|
||||
|
||||
|
||||
int shared_version();
|
||||
int static_version();
|
||||
int mixed_version();
|
||||
|
||||
int main()
|
||||
{
|
||||
return mixed_version() == 0 && shared_version() == 0 &&
|
||||
static_version() == 0;
|
||||
}
|
||||
8
Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu
Normal file
8
Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
int curand_main();
|
||||
int nppif_main();
|
||||
|
||||
int mixed_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
||||
86
Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu
Normal file
86
Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu
Normal file
@@ -0,0 +1,86 @@
|
||||
// Comes from
|
||||
// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
|
||||
#include <assert.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <nppi_filtering_functions.h>
|
||||
|
||||
int nppif_main()
|
||||
{
|
||||
/**
|
||||
* 8-bit unsigned single-channel 1D row convolution.
|
||||
*/
|
||||
const int simgrows = 32;
|
||||
const int simgcols = 32;
|
||||
Npp8u *d_pSrc, *d_pDst;
|
||||
const int nMaskSize = 3;
|
||||
NppiSize oROI;
|
||||
oROI.width = simgcols - nMaskSize;
|
||||
oROI.height = simgrows;
|
||||
const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
|
||||
const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
|
||||
const int simgpix = simgrows * simgcols;
|
||||
const int dimgpix = oROI.width * oROI.height;
|
||||
const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
|
||||
const int nDstStep = oROI.width * sizeof(d_pDst[0]);
|
||||
const int pixval = 1;
|
||||
const int nDivisor = 1;
|
||||
const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
|
||||
Npp32s* d_pKernel;
|
||||
const Npp32s nAnchor = 2;
|
||||
cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMalloc((void**)&d_pDst, dimgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// set image to pixval initially
|
||||
err = cudaMemset(d_pSrc, pixval, simgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemset(d_pDst, 0, dimgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
|
||||
cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// copy src to dst
|
||||
NppStatus ret =
|
||||
nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
|
||||
nMaskSize, nAnchor, nDivisor);
|
||||
assert(ret == NPP_NO_ERROR);
|
||||
Npp8u* h_imgres = new Npp8u[dimgpix];
|
||||
err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// test for filtering
|
||||
for (int i = 0; i < dimgpix; i++) {
|
||||
if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
|
||||
fprintf(stderr, "h_imgres at index %d failed to match\n", i);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
8
Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu
Normal file
8
Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
int curand_main();
|
||||
int nppif_main();
|
||||
|
||||
int shared_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
||||
8
Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu
Normal file
8
Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
int curand_main();
|
||||
int nppif_main();
|
||||
|
||||
int static_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
||||
Reference in New Issue
Block a user