From 0b1cea66cd1f80458f0da579d0182d908874939d Mon Sep 17 00:00:00 2001 From: root Date: Tue, 27 Jul 2021 22:58:03 +0300 Subject: [PATCH 1/2] CUDA/Clang: Fix separable compilation in non-root directories with Makefiles The relative paths were wrong almost everywhere, so compiling only worked for files in the top-level directory. I've modified CudaOnly.SeparateCompilation to cover this. Fixes #22482. --- Help/release/3.21.rst | 6 +++++ Source/cmMakefileTargetGenerator.cxx | 24 +++++++++++-------- Tests/CudaOnly/CMakeLists.txt | 2 +- .../SeparateCompilation/CMakeLists.txt | 19 +-------------- .../SeparateCompilation/main/CMakeLists.txt | 18 ++++++++++++++ .../SeparateCompilation/{ => main}/main.cu | 4 ++-- 6 files changed, 42 insertions(+), 31 deletions(-) create mode 100644 Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt rename Tests/CudaOnly/SeparateCompilation/{ => main}/main.cu (97%) diff --git a/Help/release/3.21.rst b/Help/release/3.21.rst index 3e705529fe..fc5d6ace2f 100644 --- a/Help/release/3.21.rst +++ b/Help/release/3.21.rst @@ -304,3 +304,9 @@ Changes made since CMake 3.21.0 include the following. * The :generator:`Visual Studio 17 2022` generator is now based on "Visual Studio 2022 Preview 2". Previously it was based on "Preview 1.1". + +3.21.2 +------ + +* ``CUDA`` targets with :prop_tgt:`CUDA_SEPARABLE_COMPILATION` enabled are now + correctly generated in non-root directories. diff --git a/Source/cmMakefileTargetGenerator.cxx b/Source/cmMakefileTargetGenerator.cxx index 6d8376c678..6324b2ef83 100644 --- a/Source/cmMakefileTargetGenerator.cxx +++ b/Source/cmMakefileTargetGenerator.cxx @@ -1484,14 +1484,18 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule( } std::vector architectures = cmExpandedList(architecturesStr); + std::string const& relPath = + this->LocalGenerator->GetHomeRelativeOutputPath(); // Ensure there are no duplicates. 
const std::vector linkDeps = [&]() -> std::vector { std::vector deps; this->AppendTargetDepends(deps, true); this->GeneratorTarget->GetLinkDepends(deps, this->GetConfigName(), "CUDA"); - std::copy(this->Objects.begin(), this->Objects.end(), - std::back_inserter(deps)); + + for (std::string const& obj : this->Objects) { + deps.emplace_back(cmStrCat(relPath, obj)); + } std::unordered_set depsSet(deps.begin(), deps.end()); deps.clear(); @@ -1510,7 +1514,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule( std::string profiles; std::vector fatbinaryDepends; - std::string registerFile = cmStrCat(objectDir, "cmake_cuda_register.h"); + std::string const registerFile = + cmStrCat(objectDir, "cmake_cuda_register.h"); // Link device code for each architecture. for (const std::string& architectureKind : architectures) { @@ -1518,7 +1523,7 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule( const std::string architecture = architectureKind.substr(0, architectureKind.find('-')); const std::string cubin = - cmStrCat(relObjectDir, "sm_", architecture, ".cubin"); + cmStrCat(objectDir, "sm_", architecture, ".cubin"); profiles += cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin); fatbinaryDepends.emplace_back(cubin); @@ -1530,8 +1535,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule( // all architectures the register file will be the same too. Thus // generate it only on the first invocation to reduce overhead. 
if (fatbinaryDepends.size() == 1) { - std::string registerFileRel = - this->LocalGenerator->MaybeRelativeToCurBinDir(registerFile); + std::string const registerFileRel = + cmStrCat(relPath, relObjectDir, "cmake_cuda_register.h"); registerFileCmd = cmStrCat(" --register-link-binaries=", registerFileRel); cleanFiles.push_back(registerFileRel); @@ -1555,7 +1560,7 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule( const std::string fatbinaryOutput = cmStrCat(objectDir, "cmake_cuda_fatbin.h"); const std::string fatbinaryOutputRel = - this->LocalGenerator->MaybeRelativeToCurBinDir(fatbinaryOutput); + cmStrCat(relPath, relObjectDir, "cmake_cuda_fatbin.h"); this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, fatbinaryOutputRel, fatbinaryDepends, @@ -1583,9 +1588,8 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule( compileCmd, vars); commands.emplace_back(compileCmd); - this->LocalGenerator->WriteMakeRule( - *this->BuildFileStream, nullptr, output, - { cmStrCat(relObjectDir, "cmake_cuda_fatbin.h") }, commands, false); + this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, output, + { fatbinaryOutputRel }, commands, false); // Clean all the possible executable names and symlinks. 
this->CleanFiles.insert(cleanFiles.begin(), cleanFiles.end()); diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt index fdb7a6eda8..a3fb409b91 100644 --- a/Tests/CudaOnly/CMakeLists.txt +++ b/Tests/CudaOnly/CMakeLists.txt @@ -15,7 +15,7 @@ add_cuda_test_macro(CudaOnly.ToolkitBeforeLang CudaOnlyToolkitBeforeLang) add_cuda_test_macro(CudaOnly.WithDefs CudaOnlyWithDefs) add_cuda_test_macro(CudaOnly.CircularLinkLine CudaOnlyCircularLinkLine) add_cuda_test_macro(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols) -add_cuda_test_macro(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation) +add_cuda_test_macro(CudaOnly.SeparateCompilation main/CudaOnlySeparateCompilation) if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang") # Clang doesn't have flags for selecting the runtime. diff --git a/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt b/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt index 864ecbfb5d..17069e3107 100644 --- a/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt +++ b/Tests/CudaOnly/SeparateCompilation/CMakeLists.txt @@ -34,26 +34,9 @@ add_library(CUDASeparateLibB STATIC file4.cu file5.cu) target_compile_features(CUDASeparateLibB PRIVATE cuda_std_11) target_link_libraries(CUDASeparateLibB PRIVATE CUDASeparateLibA) -add_executable(CudaOnlySeparateCompilation main.cu) -target_link_libraries(CudaOnlySeparateCompilation - PRIVATE CUDASeparateLibB) -set_target_properties(CudaOnlySeparateCompilation PROPERTIES CUDA_STANDARD 11) -set_target_properties(CudaOnlySeparateCompilation PROPERTIES CUDA_STANDARD_REQUIRED TRUE) - set_target_properties(CUDASeparateLibA CUDASeparateLibB PROPERTIES CUDA_SEPARABLE_COMPILATION ON POSITION_INDEPENDENT_CODE ON) -if (CMAKE_GENERATOR MATCHES "^Visual Studio") - #Visual Studio CUDA integration will not perform device linking - #on a target that itself does not have GenerateRelocatableDeviceCode - #enabled. 
- set_target_properties(CudaOnlySeparateCompilation - PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -endif() - -if(APPLE) - # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime. - set_property(TARGET CudaOnlySeparateCompilation PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) -endif() +add_subdirectory(main) diff --git a/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt b/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt new file mode 100644 index 0000000000..c1810786c9 --- /dev/null +++ b/Tests/CudaOnly/SeparateCompilation/main/CMakeLists.txt @@ -0,0 +1,18 @@ +add_executable(CudaOnlySeparateCompilation main.cu) +target_link_libraries(CudaOnlySeparateCompilation PRIVATE CUDASeparateLibB) +set_target_properties(CudaOnlySeparateCompilation PROPERTIES + CUDA_STANDARD 11 + CUDA_STANDARD_REQUIRED TRUE +) + +if(CMAKE_GENERATOR MATCHES "^Visual Studio") + # Visual Studio CUDA integration will not perform device linking + # on a target that itself does not have GenerateRelocatableDeviceCode + # enabled. + set_property(TARGET CudaOnlySeparateCompilation PROPERTY CUDA_SEPARABLE_COMPILATION ON) +endif() + +if(APPLE) + # Help the static cuda runtime find the driver (libcuda.dylib) at runtime. 
+ set_property(TARGET CudaOnlySeparateCompilation PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +endif() diff --git a/Tests/CudaOnly/SeparateCompilation/main.cu b/Tests/CudaOnly/SeparateCompilation/main/main.cu similarity index 97% rename from Tests/CudaOnly/SeparateCompilation/main.cu rename to Tests/CudaOnly/SeparateCompilation/main/main.cu index 40dbe5d7e2..2b6e8f4cb8 100644 --- a/Tests/CudaOnly/SeparateCompilation/main.cu +++ b/Tests/CudaOnly/SeparateCompilation/main/main.cu @@ -1,8 +1,8 @@ #include -#include "file1.h" -#include "file2.h" +#include "../file1.h" +#include "../file2.h" int file4_launch_kernel(int x); int file5_launch_kernel(int x); From 3975678fcc3928f2a7dcd79fe9b9e9ebf3abe2b2 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 27 Jul 2021 23:38:36 +0300 Subject: [PATCH 2/2] CUDA/Clang: Simplify --register-link-binaries logic Move the logic for appending cubin afterwards, so the check can simply be empty(). With the Makefile generator the option is now at the front instead of being intermixed with the actual bins. --- Source/cmMakefileTargetGenerator.cxx | 28 ++++++++++++------------- Source/cmNinjaNormalTargetGenerator.cxx | 10 ++++----- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/Source/cmMakefileTargetGenerator.cxx b/Source/cmMakefileTargetGenerator.cxx index 6324b2ef83..98c61fe8be 100644 --- a/Source/cmMakefileTargetGenerator.cxx +++ b/Source/cmMakefileTargetGenerator.cxx @@ -1519,6 +1519,20 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule( // Link device code for each architecture. for (const std::string& architectureKind : architectures) { + std::string registerFileCmd; + + // The generated register file contains macros that when expanded + // register the device routines. Because the routines are the same for + // all architectures the register file will be the same too. Thus + // generate it only on the first invocation to reduce overhead. 
+ if (fatbinaryDepends.empty()) { + std::string const registerFileRel = + cmStrCat(relPath, relObjectDir, "cmake_cuda_register.h"); + registerFileCmd = + cmStrCat(" --register-link-binaries=", registerFileRel); + cleanFiles.push_back(registerFileRel); + } + // Clang always generates real code, so strip the specifier. const std::string architecture = architectureKind.substr(0, architectureKind.find('-')); @@ -1528,20 +1542,6 @@ void cmMakefileTargetGenerator::WriteDeviceLinkRule( profiles += cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin); fatbinaryDepends.emplace_back(cubin); - std::string registerFileCmd; - - // The generated register file contains macros that when expanded - // register the device routines. Because the routines are the same for - // all architectures the register file will be the same too. Thus - // generate it only on the first invocation to reduce overhead. - if (fatbinaryDepends.size() == 1) { - std::string const registerFileRel = - cmStrCat(relPath, relObjectDir, "cmake_cuda_register.h"); - registerFileCmd = - cmStrCat(" --register-link-binaries=", registerFileRel); - cleanFiles.push_back(registerFileRel); - } - std::string command = cmStrCat( this->Makefile->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINKER"), " -arch=sm_", architecture, registerFileCmd, " -o=$@ ", diff --git a/Source/cmNinjaNormalTargetGenerator.cxx b/Source/cmNinjaNormalTargetGenerator.cxx index 5a4c6521d8..493bd4ac85 100644 --- a/Source/cmNinjaNormalTargetGenerator.cxx +++ b/Source/cmNinjaNormalTargetGenerator.cxx @@ -753,10 +753,6 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatements( const std::string cubin = cmStrCat(ninjaOutputDir, "/sm_", architecture, ".cubin"); - fatbinary.Variables["PROFILES"] += - cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin); - fatbinary.ExplicitDeps.emplace_back(cubin); - cmNinjaBuild dlink(this->LanguageLinkerCudaDeviceRule(config)); dlink.ExplicitDeps = explicitDeps; dlink.Outputs = { cubin }; @@ -766,11 
+762,15 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatements( // the device routines. Because the routines are the same for all // architectures the register file will be the same too. Thus generate it // only on the first invocation to reduce overhead. - if (fatbinary.ExplicitDeps.size() == 1) { + if (fatbinary.ExplicitDeps.empty()) { dlink.Variables["REGISTER"] = cmStrCat( "--register-link-binaries=", ninjaOutputDir, "/cmake_cuda_register.h"); } + fatbinary.Variables["PROFILES"] += + cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin); + fatbinary.ExplicitDeps.emplace_back(cubin); + this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), dlink); }