mirror of
https://github.com/Kitware/CMake.git
synced 2026-01-11 16:32:14 -06:00
CUDA: Allow both CUDA_SEPARABLE_COMPILATION and CUDA_PTX_COMPILATION
The target properties `CUDA_SEPARABLE_COMPILATION` and `CUDA_PTX_COMPILATION` now aren't mutually exclusive and can now be used together on the same target.
This commit is contained in:
5
Help/release/dev/cuda-ptx-separable-compilation.rst
Normal file
5
Help/release/dev/cuda-ptx-separable-compilation.rst
Normal file
@@ -0,0 +1,5 @@
|
||||
cuda-ptx-separable-compilation
|
||||
------------------------------
|
||||
|
||||
* ``CUDA`` targets can now enable both :prop_tgt:`CUDA_SEPARABLE_COMPILATION` and
|
||||
:prop_tgt:`CUDA_PTX_COMPILATION`.
|
||||
@@ -160,22 +160,9 @@ if(NOT DEFINED CMAKE_CUDA_ARCHIVE_FINISH)
|
||||
set(CMAKE_CUDA_ARCHIVE_FINISH "<CMAKE_RANLIB> <TARGET>")
|
||||
endif()
|
||||
|
||||
#Specify how to compile when ptx has been requested
|
||||
if(NOT CMAKE_CUDA_COMPILE_PTX_COMPILATION)
|
||||
set(CMAKE_CUDA_COMPILE_PTX_COMPILATION
|
||||
"<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} ${_CMAKE_CUDA_PTX_FLAG} <SOURCE> -o <OBJECT>")
|
||||
endif()
|
||||
|
||||
#Specify how to compile when separable compilation has been requested
|
||||
if(NOT CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION)
|
||||
set(CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION
|
||||
"<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} ${_CMAKE_CUDA_DEVICE_CODE} <SOURCE> -o <OBJECT>")
|
||||
endif()
|
||||
|
||||
#Specify how to compile when whole compilation has been requested
|
||||
if(NOT CMAKE_CUDA_COMPILE_WHOLE_COMPILATION)
|
||||
set(CMAKE_CUDA_COMPILE_WHOLE_COMPILATION
|
||||
"<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -c <SOURCE> -o <OBJECT>")
|
||||
if(NOT CMAKE_CUDA_COMPILE_OBJECT)
|
||||
set(CMAKE_CUDA_COMPILE_OBJECT
|
||||
"<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} <CUDA_COMPILE_MODE> <SOURCE> -o <OBJECT>")
|
||||
endif()
|
||||
|
||||
# compile a cu file into an executable
|
||||
|
||||
@@ -18,8 +18,9 @@ __compiler_clang_cxx_standards(CUDA)
|
||||
|
||||
set(CMAKE_CUDA_COMPILER_HAS_DEVICE_LINK_PHASE TRUE)
|
||||
set(_CMAKE_COMPILE_AS_CUDA_FLAG "-x cuda")
|
||||
set(_CMAKE_CUDA_WHOLE_FLAG "-c")
|
||||
set(_CMAKE_CUDA_RDC_FLAG "-fgpu-rdc")
|
||||
set(_CMAKE_CUDA_PTX_FLAG "--cuda-device-only -S")
|
||||
set(_CMAKE_CUDA_DEVICE_CODE "-fgpu-rdc -c")
|
||||
|
||||
# RulePlaceholderExpander expands crosscompile variables like sysroot and target only for CMAKE_<LANG>_COMPILER. Override the default.
|
||||
set(CMAKE_CUDA_LINK_EXECUTABLE "<CMAKE_CUDA_COMPILER> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>${__IMPLICIT_LINKS}")
|
||||
|
||||
@@ -5,8 +5,9 @@ set(CMAKE_CUDA_VERBOSE_FLAG "-v")
|
||||
set(CMAKE_CUDA_VERBOSE_COMPILE_FLAG "-Xcompiler=-v")
|
||||
|
||||
set(_CMAKE_COMPILE_AS_CUDA_FLAG "-x cu")
|
||||
set(_CMAKE_CUDA_WHOLE_FLAG "-c")
|
||||
set(_CMAKE_CUDA_RDC_FLAG "-rdc=true")
|
||||
set(_CMAKE_CUDA_PTX_FLAG "-ptx")
|
||||
set(_CMAKE_CUDA_DEVICE_CODE "-dc")
|
||||
|
||||
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2.89)
|
||||
# The -forward-unknown-to-host-compiler flag was only
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
include(Platform/Windows-MSVC)
|
||||
|
||||
set(CMAKE_CUDA_COMPILE_PTX_COMPILATION
|
||||
"<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -ptx <SOURCE> -o <OBJECT> -Xcompiler=-Fd<TARGET_COMPILE_PDB>,-FS")
|
||||
set(CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION
|
||||
"<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -dc <SOURCE> -o <OBJECT> -Xcompiler=-Fd<TARGET_COMPILE_PDB>,-FS")
|
||||
set(CMAKE_CUDA_COMPILE_WHOLE_COMPILATION
|
||||
"<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -c <SOURCE> -o <OBJECT> -Xcompiler=-Fd<TARGET_COMPILE_PDB>,-FS")
|
||||
set(CMAKE_CUDA_COMPILE_OBJECT
|
||||
"<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} <CUDA_COMPILE_MODE> <SOURCE> -o <OBJECT> -Xcompiler=-Fd<TARGET_COMPILE_PDB>,-FS")
|
||||
|
||||
set(__IMPLICIT_LINKS)
|
||||
foreach(dir ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES})
|
||||
|
||||
@@ -897,28 +897,31 @@ void cmMakefileTargetGenerator::WriteObjectRuleFiles(
|
||||
|
||||
// Construct the compile rules.
|
||||
{
|
||||
std::vector<std::string> compileCommands;
|
||||
std::string cudaCompileMode;
|
||||
if (lang == "CUDA") {
|
||||
std::string cmdVar;
|
||||
if (this->GeneratorTarget->GetPropertyAsBool(
|
||||
"CUDA_SEPARABLE_COMPILATION")) {
|
||||
cmdVar = "CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION";
|
||||
} else if (this->GeneratorTarget->GetPropertyAsBool(
|
||||
"CUDA_PTX_COMPILATION")) {
|
||||
cmdVar = "CMAKE_CUDA_COMPILE_PTX_COMPILATION";
|
||||
} else {
|
||||
cmdVar = "CMAKE_CUDA_COMPILE_WHOLE_COMPILATION";
|
||||
const std::string& rdcFlag =
|
||||
this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_RDC_FLAG");
|
||||
cudaCompileMode = cmStrCat(cudaCompileMode, rdcFlag, " ");
|
||||
}
|
||||
const std::string& compileRule =
|
||||
this->Makefile->GetRequiredDefinition(cmdVar);
|
||||
cmExpandList(compileRule, compileCommands);
|
||||
} else {
|
||||
const std::string cmdVar = "CMAKE_" + lang + "_COMPILE_OBJECT";
|
||||
const std::string& compileRule =
|
||||
this->Makefile->GetRequiredDefinition(cmdVar);
|
||||
cmExpandList(compileRule, compileCommands);
|
||||
if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
|
||||
const std::string& ptxFlag =
|
||||
this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_PTX_FLAG");
|
||||
cudaCompileMode = cmStrCat(cudaCompileMode, ptxFlag);
|
||||
} else {
|
||||
const std::string& wholeFlag =
|
||||
this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_WHOLE_FLAG");
|
||||
cudaCompileMode = cmStrCat(cudaCompileMode, wholeFlag);
|
||||
}
|
||||
vars.CudaCompileMode = cudaCompileMode.c_str();
|
||||
}
|
||||
|
||||
std::vector<std::string> compileCommands;
|
||||
const std::string& compileRule = this->Makefile->GetRequiredDefinition(
|
||||
"CMAKE_" + lang + "_COMPILE_OBJECT");
|
||||
cmExpandList(compileRule, compileCommands);
|
||||
|
||||
if (this->GeneratorTarget->GetPropertyAsBool("EXPORT_COMPILE_COMMANDS") &&
|
||||
lang_can_export_cmds && compileCommands.size() == 1) {
|
||||
std::string compileCommand = compileCommands[0];
|
||||
|
||||
@@ -605,6 +605,7 @@ void cmNinjaTargetGenerator::WriteCompileRule(const std::string& lang,
|
||||
vars.TargetCompilePDB = "$TARGET_COMPILE_PDB";
|
||||
vars.ObjectDir = "$OBJECT_DIR";
|
||||
vars.ObjectFileDir = "$OBJECT_FILE_DIR";
|
||||
vars.CudaCompileMode = "$CUDA_COMPILE_MODE";
|
||||
vars.ISPCHeader = "$ISPC_HEADER_FILE";
|
||||
|
||||
cmMakefile* mf = this->GetMakefile();
|
||||
@@ -815,27 +816,32 @@ void cmNinjaTargetGenerator::WriteCompileRule(const std::string& lang,
|
||||
vars.Flags = flags.c_str();
|
||||
vars.DependencyFile = rule.DepFile.c_str();
|
||||
|
||||
// Rule for compiling object file.
|
||||
std::vector<std::string> compileCmds;
|
||||
std::string cudaCompileMode;
|
||||
if (lang == "CUDA") {
|
||||
std::string cmdVar;
|
||||
if (this->GeneratorTarget->GetPropertyAsBool(
|
||||
"CUDA_SEPARABLE_COMPILATION")) {
|
||||
cmdVar = "CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION";
|
||||
} else if (this->GeneratorTarget->GetPropertyAsBool(
|
||||
"CUDA_PTX_COMPILATION")) {
|
||||
cmdVar = "CMAKE_CUDA_COMPILE_PTX_COMPILATION";
|
||||
} else {
|
||||
cmdVar = "CMAKE_CUDA_COMPILE_WHOLE_COMPILATION";
|
||||
const std::string& rdcFlag =
|
||||
this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_RDC_FLAG");
|
||||
cudaCompileMode = cmStrCat(cudaCompileMode, rdcFlag, " ");
|
||||
}
|
||||
const std::string& compileCmd = mf->GetRequiredDefinition(cmdVar);
|
||||
cmExpandList(compileCmd, compileCmds);
|
||||
} else {
|
||||
const std::string cmdVar = cmStrCat("CMAKE_", lang, "_COMPILE_OBJECT");
|
||||
const std::string& compileCmd = mf->GetRequiredDefinition(cmdVar);
|
||||
cmExpandList(compileCmd, compileCmds);
|
||||
if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
|
||||
const std::string& ptxFlag =
|
||||
this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_PTX_FLAG");
|
||||
cudaCompileMode = cmStrCat(cudaCompileMode, ptxFlag);
|
||||
} else {
|
||||
const std::string& wholeFlag =
|
||||
this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_WHOLE_FLAG");
|
||||
cudaCompileMode = cmStrCat(cudaCompileMode, wholeFlag);
|
||||
}
|
||||
vars.CudaCompileMode = cudaCompileMode.c_str();
|
||||
}
|
||||
|
||||
// Rule for compiling object file.
|
||||
std::vector<std::string> compileCmds;
|
||||
const std::string cmdVar = cmStrCat("CMAKE_", lang, "_COMPILE_OBJECT");
|
||||
const std::string& compileCmd = mf->GetRequiredDefinition(cmdVar);
|
||||
cmExpandList(compileCmd, compileCmds);
|
||||
|
||||
// See if we need to use a compiler launcher like ccache or distcc
|
||||
std::string compilerLauncher;
|
||||
if (!compileCmds.empty() &&
|
||||
|
||||
@@ -85,6 +85,11 @@ std::string cmRulePlaceholderExpander::ExpandRuleVariable(
|
||||
return replaceValues.ObjectsQuoted;
|
||||
}
|
||||
}
|
||||
if (replaceValues.CudaCompileMode) {
|
||||
if (variable == "CUDA_COMPILE_MODE") {
|
||||
return replaceValues.CudaCompileMode;
|
||||
}
|
||||
}
|
||||
if (replaceValues.AIXExports) {
|
||||
if (variable == "AIX_EXPORTS") {
|
||||
return replaceValues.AIXExports;
|
||||
|
||||
@@ -65,6 +65,7 @@ public:
|
||||
const char* SwiftOutputFileMap = nullptr;
|
||||
const char* SwiftSources = nullptr;
|
||||
const char* ISPCHeader = nullptr;
|
||||
const char* CudaCompileMode = nullptr;
|
||||
const char* Fatbinary = nullptr;
|
||||
const char* RegisterFile = nullptr;
|
||||
const char* Launcher = nullptr;
|
||||
|
||||
@@ -3211,18 +3211,17 @@ bool cmVisualStudio10TargetGenerator::ComputeCudaOptions(
|
||||
// the default to not have any extension
|
||||
cudaOptions.AddFlag("CompileOut", "$(IntDir)%(Filename).obj");
|
||||
|
||||
bool notPtx = true;
|
||||
if (this->GeneratorTarget->GetPropertyAsBool("CUDA_SEPARABLE_COMPILATION")) {
|
||||
cudaOptions.AddFlag("GenerateRelocatableDeviceCode", "true");
|
||||
} else if (this->GeneratorTarget->GetPropertyAsBool(
|
||||
"CUDA_PTX_COMPILATION")) {
|
||||
}
|
||||
bool notPtx = true;
|
||||
if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
|
||||
cudaOptions.AddFlag("NvccCompilation", "ptx");
|
||||
// We drop the %(Extension) component as CMake expects all PTX files
|
||||
// to not have the source file extension at all
|
||||
cudaOptions.AddFlag("CompileOut", "$(IntDir)%(Filename).ptx");
|
||||
notPtx = false;
|
||||
}
|
||||
|
||||
if (notPtx &&
|
||||
cmSystemTools::VersionCompareGreaterEq(
|
||||
"8.0", this->GlobalGenerator->GetPlatformToolsetCudaString())) {
|
||||
|
||||
@@ -16,6 +16,7 @@ add_cuda_test_macro(CudaOnly.WithDefs CudaOnlyWithDefs)
|
||||
add_cuda_test_macro(CudaOnly.CircularLinkLine CudaOnlyCircularLinkLine)
|
||||
add_cuda_test_macro(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
|
||||
add_cuda_test_macro(CudaOnly.SeparateCompilation main/CudaOnlySeparateCompilation)
|
||||
add_cuda_test_macro(CudaOnly.SeparateCompilationPTX CudaOnlySeparateCompilationPTX)
|
||||
|
||||
if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang")
|
||||
# Clang doesn't have flags for selecting the runtime.
|
||||
|
||||
@@ -56,7 +56,7 @@ add_custom_command(
|
||||
"-DBIN_TO_C_COMMAND=${bin_to_c}"
|
||||
"-DOBJECTS=$<TARGET_OBJECTS:CudaPTX>"
|
||||
"-DOUTPUT=${output_file}"
|
||||
-P ${CMAKE_CURRENT_SOURCE_DIR}/bin2c_wrapper.cmake
|
||||
-P ${CMAKE_CURRENT_SOURCE_DIR}/../utils/bin2c_wrapper.cmake
|
||||
VERBATIM
|
||||
DEPENDS $<TARGET_OBJECTS:CudaPTX>
|
||||
COMMENT "Converting Object files to a C header"
|
||||
|
||||
51
Tests/CudaOnly/SeparateCompilationPTX/CMakeLists.txt
Normal file
51
Tests/CudaOnly/SeparateCompilationPTX/CMakeLists.txt
Normal file
@@ -0,0 +1,51 @@
|
||||
cmake_minimum_required(VERSION 3.19)
|
||||
project (SeparateCompPTX CUDA)
|
||||
|
||||
#Goal for this example:
|
||||
# How to generate PTX files with RDC enabled
|
||||
|
||||
# PTX can be compiled only for a single virtual architecture at a time
|
||||
list(POP_FRONT CMAKE_CUDA_ARCHITECTURES temp)
|
||||
set(CMAKE_CUDA_ARCHITECTURES ${temp})
|
||||
string(APPEND CMAKE_CUDA_ARCHITECTURES "-virtual")
|
||||
|
||||
add_library(CudaPTX OBJECT kernels.cu)
|
||||
set_property(TARGET CudaPTX PROPERTY CUDA_PTX_COMPILATION ON)
|
||||
set_property(TARGET CudaPTX PROPERTY CUDA_SEPARABLE_COMPILATION ON)
|
||||
|
||||
|
||||
set(output_file ${CMAKE_CURRENT_BINARY_DIR}/embedded_objs.h)
|
||||
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
find_program(bin_to_c
|
||||
NAMES bin2c
|
||||
PATHS ${CUDAToolkit_BIN_DIR}
|
||||
)
|
||||
if(NOT bin_to_c)
|
||||
message(FATAL_ERROR
|
||||
"bin2c not found:\n"
|
||||
" CUDAToolkit_BIN_DIR='${CUDAToolkit_BIN_DIR}'\n"
|
||||
)
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${output_file}"
|
||||
COMMAND ${CMAKE_COMMAND}
|
||||
"-DBIN_TO_C_COMMAND=${bin_to_c}"
|
||||
"-DOBJECTS=$<TARGET_OBJECTS:CudaPTX>"
|
||||
"-DOUTPUT=${output_file}"
|
||||
-P ${CMAKE_CURRENT_SOURCE_DIR}/../utils/bin2c_wrapper.cmake
|
||||
VERBATIM
|
||||
DEPENDS $<TARGET_OBJECTS:CudaPTX>
|
||||
COMMENT "Converting Object files to a C header"
|
||||
)
|
||||
|
||||
add_executable(CudaOnlySeparateCompilationPTX main.cu ${output_file})
|
||||
target_compile_features(CudaOnlySeparateCompilationPTX PRIVATE cuda_std_11)
|
||||
target_include_directories(CudaOnlySeparateCompilationPTX PRIVATE
|
||||
${CMAKE_CURRENT_BINARY_DIR} )
|
||||
target_link_libraries(CudaOnlySeparateCompilationPTX PRIVATE CUDA::cuda_driver)
|
||||
if(APPLE)
|
||||
# Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
|
||||
set_property(TARGET CudaOnlySeparateCompilationPTX PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
|
||||
endif()
|
||||
14
Tests/CudaOnly/SeparateCompilationPTX/kernels.cu
Normal file
14
Tests/CudaOnly/SeparateCompilationPTX/kernels.cu
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
__global__ void kernelA(float* r, float* x, float* y, float* z, int size)
|
||||
{
|
||||
for (int i = threadIdx.x; i < size; i += blockDim.x) {
|
||||
r[i] = x[i] * y[i] + z[i];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernelB(float* r, float* x, float* y, float* z, int size)
|
||||
{
|
||||
for (int i = threadIdx.x; i < size; i += blockDim.x) {
|
||||
r[i] = x[i] * y[i] + z[i];
|
||||
}
|
||||
}
|
||||
30
Tests/CudaOnly/SeparateCompilationPTX/main.cu
Normal file
30
Tests/CudaOnly/SeparateCompilationPTX/main.cu
Normal file
@@ -0,0 +1,30 @@
|
||||
#include <iostream>
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#include "embedded_objs.h"
|
||||
|
||||
int main()
|
||||
{
|
||||
cuInit(0);
|
||||
int count = 0;
|
||||
cuDeviceGetCount(&count);
|
||||
if (count == 0) {
|
||||
std::cerr << "No CUDA devices found\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
CUdevice device;
|
||||
cuDeviceGet(&device, 0);
|
||||
|
||||
CUcontext context;
|
||||
cuCtxCreate(&context, 0, device);
|
||||
|
||||
CUmodule module;
|
||||
cuModuleLoadData(&module, kernels);
|
||||
if (module == nullptr) {
|
||||
std::cerr << "Failed to load the embedded ptx" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
std::cout << module << std::endl;
|
||||
}
|
||||
Reference in New Issue
Block a user