diff options
author | Raul Tambre <raul@tambre.ee> | 2020-09-05 16:40:02 (GMT) |
---|---|---|
committer | Brad King <brad.king@kitware.com> | 2020-09-24 19:19:54 (GMT) |
commit | c63fe018353cf6afb30980c4cac7493be7cd0a82 (patch) | |
tree | 68d2daf0cd8ab91a9feaa49392607c6cfecd2ac4 /Source | |
parent | c98ec731f90eb0180c89108b7d2e42263b66d1ed (diff) | |
download | CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.zip CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.tar.gz CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.tar.bz2 |
CUDA: Clang separable compilation
For NVCC the compiler takes care of device linking when passed the "-dlink"
flag.
Clang doesn't support such magic and requires the buildsystem to do the work
that NVCC does behind the scenes.
The implementation is based on Bazel's device linking documentation:
https://github.com/tensorflow/tensorflow/blob/7cabcdf073abad8c46e9dda62bb8fa4682d2061e/third_party/nccl/build_defs.bzl.tpl#L259
Closes: #20726
Diffstat (limited to 'Source')
-rw-r--r-- | Source/cmLocalGenerator.cxx | 11 | ||||
-rw-r--r-- | Source/cmLocalGenerator.h | 2 | ||||
-rw-r--r-- | Source/cmMakefileExecutableTargetGenerator.cxx | 53 | ||||
-rw-r--r-- | Source/cmMakefileExecutableTargetGenerator.h | 4 | ||||
-rw-r--r-- | Source/cmMakefileLibraryTargetGenerator.cxx | 73 | ||||
-rw-r--r-- | Source/cmMakefileLibraryTargetGenerator.h | 5 | ||||
-rw-r--r-- | Source/cmMakefileTargetGenerator.cxx | 133 | ||||
-rw-r--r-- | Source/cmMakefileTargetGenerator.h | 7 | ||||
-rw-r--r-- | Source/cmNinjaNormalTargetGenerator.cxx | 224 | ||||
-rw-r--r-- | Source/cmNinjaNormalTargetGenerator.h | 15 | ||||
-rw-r--r-- | Source/cmNinjaTargetGenerator.cxx | 9 | ||||
-rw-r--r-- | Source/cmNinjaTargetGenerator.h | 3 | ||||
-rw-r--r-- | Source/cmRulePlaceholderExpander.cxx | 10 | ||||
-rw-r--r-- | Source/cmRulePlaceholderExpander.h | 2 |
14 files changed, 453 insertions, 98 deletions
diff --git a/Source/cmLocalGenerator.cxx b/Source/cmLocalGenerator.cxx index 47931b0..4e6010c 100644 --- a/Source/cmLocalGenerator.cxx +++ b/Source/cmLocalGenerator.cxx @@ -1955,17 +1955,6 @@ void cmLocalGenerator::AddLanguageFlags(std::string& flags, } else if (lang == "CUDA") { target->AddCUDAArchitectureFlags(flags); target->AddCUDAToolkitFlags(flags); - - if (compiler == "Clang") { - bool separable = target->GetPropertyAsBool("CUDA_SEPARABLE_COMPILATION"); - - if (separable) { - this->Makefile->IssueMessage( - MessageType::FATAL_ERROR, - "CUDA_SEPARABLE_COMPILATION isn't supported on Clang. " - "See CMake issue #20726."); - } - } } else if (lang == "ISPC") { target->AddISPCTargetFlags(flags); } diff --git a/Source/cmLocalGenerator.h b/Source/cmLocalGenerator.h index fad6136..22d3599 100644 --- a/Source/cmLocalGenerator.h +++ b/Source/cmLocalGenerator.h @@ -446,7 +446,7 @@ public: void GetTargetCompileFlags(cmGeneratorTarget* target, std::string const& config, std::string const& lang, std::string& flags, - std::string const& arch = std::string()); + std::string const& arch); std::vector<BT<std::string>> GetTargetCompileFlags( cmGeneratorTarget* target, std::string const& config, std::string const& lang, std::string const& arch = std::string()); diff --git a/Source/cmMakefileExecutableTargetGenerator.cxx b/Source/cmMakefileExecutableTargetGenerator.cxx index 9b5c6e6..871878c 100644 --- a/Source/cmMakefileExecutableTargetGenerator.cxx +++ b/Source/cmMakefileExecutableTargetGenerator.cxx @@ -91,19 +91,12 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule( std::vector<std::string> commands; - // Get the language to use for linking this library. - std::string linkLanguage = "CUDA"; + // Get the name of the device object to generate. std::string const& objExt = this->Makefile->GetSafeDefinition("CMAKE_CUDA_OUTPUT_EXTENSION"); - - // Build list of dependencies. - std::vector<std::string> depends; - this->AppendLinkDepends(depends, linkLanguage); - - // Get the name of the device object to generate. - std::string const targetOutputReal = + std::string const targetOutput = this->GeneratorTarget->ObjectDirectory + "cmake_device_link" + objExt; - this->DeviceLinkObject = targetOutputReal; + this->DeviceLinkObject = targetOutput; this->NumberOfProgressActions++; if (!this->NoRuleMessages) { @@ -111,7 +104,7 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule( this->MakeEchoProgress(progress); // Add the link message. std::string buildEcho = - cmStrCat("Linking ", linkLanguage, " device code ", + cmStrCat("Linking CUDA device code ", this->LocalGenerator->ConvertToOutputFormat( this->LocalGenerator->MaybeConvertToRelativePath( this->LocalGenerator->GetCurrentBinaryDirectory(), @@ -121,6 +114,29 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule( commands, buildEcho, cmLocalUnixMakefileGenerator3::EchoLink, &progress); } + if (this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID") == "Clang") { + this->WriteDeviceLinkRule(commands, targetOutput); + } else { + this->WriteNvidiaDeviceExecutableRule(relink, commands, targetOutput); + } + + // Write the main driver rule to build everything in this target. + this->WriteTargetDriverRule(targetOutput, relink); +#else + static_cast<void>(relink); +#endif +} + +void cmMakefileExecutableTargetGenerator::WriteNvidiaDeviceExecutableRule( + bool relink, std::vector<std::string>& commands, + const std::string& targetOutput) +{ + const std::string linkLanguage = "CUDA"; + + // Build list of dependencies. + std::vector<std::string> depends; + this->AppendLinkDepends(depends, linkLanguage); + // Build a list of compiler flags and linker flags. std::string langFlags; std::string linkFlags; @@ -136,7 +152,7 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule( // may need to be cleaned. std::vector<std::string> exeCleanFiles; exeCleanFiles.push_back(this->LocalGenerator->MaybeConvertToRelativePath( - this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutputReal)); + this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutput)); // Determine whether a link script will be used. bool useLinkScript = this->GlobalGenerator->GetUseLinkScript(); @@ -195,7 +211,7 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule( : cmOutputConverter::SHELL; std::string target = this->LocalGenerator->ConvertToOutputFormat( this->LocalGenerator->MaybeConvertToRelativePath( - this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutputReal), + this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutput), output); std::string targetFullPathCompilePDB = @@ -226,7 +242,7 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule( this->LocalGenerator->CreateRulePlaceholderExpander()); // Expand placeholders in the commands. - rulePlaceholderExpander->SetTargetImpLib(targetOutputReal); + rulePlaceholderExpander->SetTargetImpLib(targetOutput); for (std::string& real_link_command : real_link_commands) { real_link_command = cmStrCat(launcher, real_link_command); rulePlaceholderExpander->ExpandRuleVariables(this->LocalGenerator, @@ -255,17 +271,10 @@ void cmMakefileExecutableTargetGenerator::WriteDeviceExecutableRule( // Write the build rule. this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, - targetOutputReal, depends, commands, - false); - - // Write the main driver rule to build everything in this target. - this->WriteTargetDriverRule(targetOutputReal, relink); + targetOutput, depends, commands, false); // Clean all the possible executable names and symlinks. this->CleanFiles.insert(exeCleanFiles.begin(), exeCleanFiles.end()); -#else - static_cast<void>(relink); -#endif } void cmMakefileExecutableTargetGenerator::WriteExecutableRule(bool relink) diff --git a/Source/cmMakefileExecutableTargetGenerator.h b/Source/cmMakefileExecutableTargetGenerator.h index 782692a..520f577 100644 --- a/Source/cmMakefileExecutableTargetGenerator.h +++ b/Source/cmMakefileExecutableTargetGenerator.h @@ -5,6 +5,7 @@ #include "cmConfigure.h" // IWYU pragma: keep #include <string> +#include <vector> #include "cmMakefileTargetGenerator.h" @@ -23,6 +24,9 @@ public: protected: virtual void WriteExecutableRule(bool relink); virtual void WriteDeviceExecutableRule(bool relink); + virtual void WriteNvidiaDeviceExecutableRule( + bool relink, std::vector<std::string>& commands, + const std::string& targetOutput); private: std::string DeviceLinkObject; diff --git a/Source/cmMakefileLibraryTargetGenerator.cxx b/Source/cmMakefileLibraryTargetGenerator.cxx index 1c25fc4..b32ea6a 100644 --- a/Source/cmMakefileLibraryTargetGenerator.cxx +++ b/Source/cmMakefileLibraryTargetGenerator.cxx @@ -129,8 +129,7 @@ void cmMakefileLibraryTargetGenerator::WriteStaticLibraryRules() const bool requiresDeviceLinking = requireDeviceLinking( *this->GeneratorTarget, *this->LocalGenerator, this->GetConfigName()); if (requiresDeviceLinking) { - std::string linkRuleVar = "CMAKE_CUDA_DEVICE_LINK_LIBRARY"; - this->WriteDeviceLibraryRules(linkRuleVar, false); + this->WriteDeviceLibraryRules("CMAKE_CUDA_DEVICE_LINK_LIBRARY", false); } std::string linkLanguage = @@ -156,8 +155,7 @@ void cmMakefileLibraryTargetGenerator::WriteSharedLibraryRules(bool relink) const bool requiresDeviceLinking = requireDeviceLinking( *this->GeneratorTarget, *this->LocalGenerator, this->GetConfigName()); if (requiresDeviceLinking) { - std::string linkRuleVar = "CMAKE_CUDA_DEVICE_LINK_LIBRARY"; - this->WriteDeviceLibraryRules(linkRuleVar, relink); + this->WriteDeviceLibraryRules("CMAKE_CUDA_DEVICE_LINK_LIBRARY", relink); } } @@ -191,8 +189,7 @@ void cmMakefileLibraryTargetGenerator::WriteModuleLibraryRules(bool relink) const bool requiresDeviceLinking = requireDeviceLinking( *this->GeneratorTarget, *this->LocalGenerator, this->GetConfigName()); if (requiresDeviceLinking) { - std::string linkRuleVar = "CMAKE_CUDA_DEVICE_LINK_LIBRARY"; - this->WriteDeviceLibraryRules(linkRuleVar, relink); + this->WriteDeviceLibraryRules("CMAKE_CUDA_DEVICE_LINK_LIBRARY", relink); } } @@ -239,29 +236,13 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules( // TODO: Merge the methods that call this method to avoid // code duplication. std::vector<std::string> commands; - - // Get the language to use for linking this library. - std::string linkLanguage = "CUDA"; std::string const objExt = this->Makefile->GetSafeDefinition("CMAKE_CUDA_OUTPUT_EXTENSION"); - // Build list of dependencies. - std::vector<std::string> depends; - this->AppendLinkDepends(depends, linkLanguage); - - // Add language-specific flags. - std::string langFlags; - this->LocalGenerator->AddLanguageFlagsForLinking( - langFlags, this->GeneratorTarget, linkLanguage, this->GetConfigName()); - - // Create set of linking flags. - std::string linkFlags; - this->GetDeviceLinkFlags(linkFlags, linkLanguage); - // Get the name of the device object to generate. - std::string const targetOutputReal = + std::string const targetOutput = this->GeneratorTarget->ObjectDirectory + "cmake_device_link" + objExt; - this->DeviceLinkObject = targetOutputReal; + this->DeviceLinkObject = targetOutput; this->NumberOfProgressActions++; if (!this->NoRuleMessages) { @@ -269,7 +250,7 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules( this->MakeEchoProgress(progress); // Add the link message. std::string buildEcho = - cmStrCat("Linking ", linkLanguage, " device code ", + cmStrCat("Linking CUDA device code ", this->LocalGenerator->ConvertToOutputFormat( this->LocalGenerator->MaybeConvertToRelativePath( this->LocalGenerator->GetCurrentBinaryDirectory(), @@ -278,10 +259,41 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules( this->LocalGenerator->AppendEcho( commands, buildEcho, cmLocalUnixMakefileGenerator3::EchoLink, &progress); } + + if (this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID") == "Clang") { + this->WriteDeviceLinkRule(commands, targetOutput); + } else { + this->WriteNvidiaDeviceLibraryRules(linkRuleVar, relink, commands, + targetOutput); + } + + // Write the main driver rule to build everything in this target. + this->WriteTargetDriverRule(targetOutput, relink); +} + +void cmMakefileLibraryTargetGenerator::WriteNvidiaDeviceLibraryRules( + const std::string& linkRuleVar, bool relink, + std::vector<std::string>& commands, const std::string& targetOutput) +{ + std::string linkLanguage = "CUDA"; + + // Build list of dependencies. + std::vector<std::string> depends; + this->AppendLinkDepends(depends, linkLanguage); + + // Add language-specific flags. + std::string langFlags; + this->LocalGenerator->AddLanguageFlagsForLinking( + langFlags, this->GeneratorTarget, linkLanguage, this->GetConfigName()); + + // Create set of linking flags. + std::string linkFlags; + this->GetDeviceLinkFlags(linkFlags, linkLanguage); + // Clean files associated with this library. std::set<std::string> libCleanFiles; libCleanFiles.insert(this->LocalGenerator->MaybeConvertToRelativePath( - this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutputReal)); + this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutput)); // Determine whether a link script will be used. bool useLinkScript = this->GlobalGenerator->GetUseLinkScript(); @@ -335,7 +347,7 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules( std::string target = this->LocalGenerator->ConvertToOutputFormat( this->LocalGenerator->MaybeConvertToRelativePath( - this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutputReal), + this->LocalGenerator->GetCurrentBinaryDirectory(), targetOutput), output); std::string targetFullPathCompilePDB = @@ -364,7 +376,7 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules( this->LocalGenerator->CreateRulePlaceholderExpander()); // Construct the main link rule and expand placeholders. - rulePlaceholderExpander->SetTargetImpLib(targetOutputReal); + rulePlaceholderExpander->SetTargetImpLib(targetOutput); std::string linkRule = this->GetLinkRule(linkRuleVar); cmExpandList(linkRule, real_link_commands); @@ -399,14 +411,11 @@ void cmMakefileLibraryTargetGenerator::WriteDeviceLibraryRules( commands1.clear(); // Compute the list of outputs. - std::vector<std::string> outputs(1, targetOutputReal); + std::vector<std::string> outputs(1, targetOutput); // Write the build rule. this->WriteMakeRule(*this->BuildFileStream, nullptr, outputs, depends, commands, false); - - // Write the main driver rule to build everything in this target. - this->WriteTargetDriverRule(targetOutputReal, relink); #else static_cast<void>(linkRuleVar); static_cast<void>(relink); diff --git a/Source/cmMakefileLibraryTargetGenerator.h b/Source/cmMakefileLibraryTargetGenerator.h index 6a38e18..cc989e7 100644 --- a/Source/cmMakefileLibraryTargetGenerator.h +++ b/Source/cmMakefileLibraryTargetGenerator.h @@ -5,6 +5,7 @@ #include "cmConfigure.h" // IWYU pragma: keep #include <string> +#include <vector> #include "cmMakefileTargetGenerator.h" @@ -27,6 +28,10 @@ protected: void WriteModuleLibraryRules(bool relink); void WriteDeviceLibraryRules(const std::string& linkRule, bool relink); + void WriteNvidiaDeviceLibraryRules(const std::string& linkRuleVar, + bool relink, + std::vector<std::string>& commands, + const std::string& targetOutput); void WriteLibraryRules(const std::string& linkRule, const std::string& extraFlags, bool relink); // MacOSX Framework support methods diff --git a/Source/cmMakefileTargetGenerator.cxx b/Source/cmMakefileTargetGenerator.cxx index e1fe0e5..5f97d86 100644 --- a/Source/cmMakefileTargetGenerator.cxx +++ b/Source/cmMakefileTargetGenerator.cxx @@ -2,10 +2,13 @@ file Copyright.txt or https://cmake.org/licensing for details. */ #include "cmMakefileTargetGenerator.h" +#include <algorithm> #include <cassert> #include <cstdio> +#include <iterator> #include <sstream> #include <unordered_map> +#include <unordered_set> #include <utility> #include <cm/memory> @@ -25,6 +28,7 @@ #include "cmMakefileExecutableTargetGenerator.h" #include "cmMakefileLibraryTargetGenerator.h" #include "cmMakefileUtilityTargetGenerator.h" +#include "cmMessageType.h" #include "cmOutputConverter.h" #include "cmPolicies.h" #include "cmProperty.h" @@ -1323,6 +1327,130 @@ void cmMakefileTargetGenerator::WriteObjectDependRules( } } +void cmMakefileTargetGenerator::WriteDeviceLinkRule( + std::vector<std::string>& commands, const std::string& output) +{ + std::string architecturesStr = + this->GeneratorTarget->GetSafeProperty("CUDA_ARCHITECTURES"); + + if (cmIsOff(architecturesStr)) { + this->Makefile->IssueMessage(MessageType::FATAL_ERROR, + "CUDA_SEPARABLE_COMPILATION on Clang " + "requires CUDA_ARCHITECTURES to be set."); + return; + } + + std::vector<std::string> architectures = cmExpandedList(architecturesStr); + + // Ensure there are no duplicates. + const std::vector<std::string> linkDeps = [&]() -> std::vector<std::string> { + std::vector<std::string> deps; + this->AppendTargetDepends(deps, true); + this->GeneratorTarget->GetLinkDepends(deps, this->GetConfigName(), "CUDA"); + std::copy(this->Objects.begin(), this->Objects.end(), + std::back_inserter(deps)); + + std::unordered_set<std::string> depsSet(deps.begin(), deps.end()); + deps.clear(); + std::copy(depsSet.begin(), depsSet.end(), std::back_inserter(deps)); + return deps; + }(); + + const std::string objectDir = this->GeneratorTarget->ObjectDirectory; + const std::string relObjectDir = + this->LocalGenerator->MaybeConvertToRelativePath( + this->LocalGenerator->GetCurrentBinaryDirectory(), objectDir); + + // Construct a list of files associated with this executable that + // may need to be cleaned. + std::vector<std::string> cleanFiles; + cleanFiles.push_back(this->LocalGenerator->MaybeConvertToRelativePath( + this->LocalGenerator->GetCurrentBinaryDirectory(), output)); + + std::string profiles; + std::vector<std::string> fatbinaryDepends; + std::string registerFile = cmStrCat(objectDir, "cmake_cuda_register.h"); + + // Link device code for each architecture. + for (const std::string& architectureKind : architectures) { + // Clang always generates real code, so strip the specifier. + const std::string architecture = + architectureKind.substr(0, architectureKind.find('-')); + const std::string cubin = + cmStrCat(relObjectDir, "sm_", architecture, ".cubin"); + + profiles += cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin); + fatbinaryDepends.emplace_back(cubin); + + std::string registerFileCmd; + + // The generated register file contains macros that when expanded register + // the device routines. Because the routines are the same for all + // architectures the register file will be the same too. Thus generate it + // only on the first invocation to reduce overhead. + if (fatbinaryDepends.size() == 1) { + std::string registerFileRel = + this->LocalGenerator->MaybeConvertToRelativePath( + this->LocalGenerator->GetCurrentBinaryDirectory(), registerFile); + registerFileCmd = + cmStrCat(" --register-link-binaries=", registerFileRel); + cleanFiles.push_back(registerFileRel); + } + + std::string command = cmStrCat( + this->Makefile->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINKER"), + " -arch=sm_", architecture, registerFileCmd, " -o=$@ ", + cmJoin(linkDeps, " ")); + + this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, cubin, + linkDeps, { command }, false); + } + + // Combine all architectures into a single fatbinary. + const std::string fatbinaryCommand = + cmStrCat(this->Makefile->GetRequiredDefinition("CMAKE_CUDA_FATBINARY"), + " -64 -cmdline=--compile-only -compress-all -link " + "--embedded-fatbin=$@", + profiles); + const std::string fatbinaryOutput = + cmStrCat(objectDir, "cmake_cuda_fatbin.h"); + const std::string fatbinaryOutputRel = + this->LocalGenerator->MaybeConvertToRelativePath( + this->LocalGenerator->GetCurrentBinaryDirectory(), fatbinaryOutput); + + this->LocalGenerator->WriteMakeRule(*this->BuildFileStream, nullptr, + fatbinaryOutputRel, fatbinaryDepends, + { fatbinaryCommand }, false); + + // Compile the stub that registers the kernels and contains the fatbinaries. + cmRulePlaceholderExpander::RuleVariables vars; + vars.CMTargetName = this->GetGeneratorTarget()->GetName().c_str(); + vars.CMTargetType = + cmState::GetTargetTypeName(this->GetGeneratorTarget()->GetType()).c_str(); + + vars.Language = "CUDA"; + vars.Object = output.c_str(); + vars.Fatbinary = fatbinaryOutput.c_str(); + vars.RegisterFile = registerFile.c_str(); + + std::string flags = this->GetFlags("CUDA", this->GetConfigName()); + vars.Flags = flags.c_str(); + + std::string compileCmd = this->GetLinkRule("CMAKE_CUDA_DEVICE_LINK_COMPILE"); + std::unique_ptr<cmRulePlaceholderExpander> rulePlaceholderExpander( + this->LocalGenerator->CreateRulePlaceholderExpander()); + rulePlaceholderExpander->ExpandRuleVariables(this->LocalGenerator, + compileCmd, vars); + + commands.emplace_back(compileCmd); + this->LocalGenerator->WriteMakeRule( + *this->BuildFileStream, nullptr, output, + { cmStrCat(relObjectDir, "cmake_cuda_fatbin.h") }, commands, false); + + // Clean all the possible executable names and symlinks. + this->CleanFiles.insert(cleanFiles.begin(), cleanFiles.end()); +} + void cmMakefileTargetGenerator::GenerateCustomRuleFile( cmCustomCommandGenerator const& ccg) { @@ -1579,10 +1707,11 @@ void cmMakefileTargetGenerator::WriteTargetDriverRule( } void cmMakefileTargetGenerator::AppendTargetDepends( - std::vector<std::string>& depends) + std::vector<std::string>& depends, bool ignoreType) { // Static libraries never depend on anything for linking. - if (this->GeneratorTarget->GetType() == cmStateEnums::STATIC_LIBRARY) { + if (this->GeneratorTarget->GetType() == cmStateEnums::STATIC_LIBRARY && + !ignoreType) { return; } diff --git a/Source/cmMakefileTargetGenerator.h b/Source/cmMakefileTargetGenerator.h index 1740d54..cb804e0 100644 --- a/Source/cmMakefileTargetGenerator.h +++ b/Source/cmMakefileTargetGenerator.h @@ -104,6 +104,10 @@ protected: void WriteObjectDependRules(cmSourceFile const& source, std::vector<std::string>& depends); + // CUDA device linking. + void WriteDeviceLinkRule(std::vector<std::string>& commands, + const std::string& output); + // write the build rule for a custom command void GenerateCustomRuleFile(cmCustomCommandGenerator const& ccg); @@ -127,7 +131,8 @@ protected: void DriveCustomCommands(std::vector<std::string>& depends); // append intertarget dependencies - void AppendTargetDepends(std::vector<std::string>& depends); + void AppendTargetDepends(std::vector<std::string>& depends, + bool ignoreType = false); // Append object file dependencies. void AppendObjectDepends(std::vector<std::string>& depends); diff --git a/Source/cmNinjaNormalTargetGenerator.cxx b/Source/cmNinjaNormalTargetGenerator.cxx index 210b36e..ccb959b 100644 --- a/Source/cmNinjaNormalTargetGenerator.cxx +++ b/Source/cmNinjaNormalTargetGenerator.cxx @@ -8,6 +8,7 @@ #include <map> #include <set> #include <sstream> +#include <unordered_set> #include <utility> #include <cm/memory> @@ -25,6 +26,7 @@ #include "cmLocalGenerator.h" #include "cmLocalNinjaGenerator.h" #include "cmMakefile.h" +#include "cmMessageType.h" #include "cmNinjaLinkLineDeviceComputer.h" #include "cmNinjaTypes.h" #include "cmOSXBundleGenerator.h" @@ -178,6 +180,33 @@ std::string cmNinjaNormalTargetGenerator::LanguageLinkerDeviceRule( "_", config); } +std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaDeviceRule( + const std::string& config) const +{ + return cmStrCat( + this->TargetLinkLanguage(config), "_DEVICE_LINK__", + cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()), + '_', config); +} + +std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaDeviceCompileRule( + const std::string& config) const +{ + return cmStrCat( + this->TargetLinkLanguage(config), "_DEVICE_LINK_COMPILE__", + cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()), + '_', config); +} + +std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaFatbinaryRule( + const std::string& config) const +{ + return cmStrCat( + this->TargetLinkLanguage(config), "_FATBINARY__", + cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()), + '_', config); +} + struct cmNinjaRemoveNoOpCommands { bool operator()(std::string const& cmd) @@ -186,7 +215,7 @@ struct cmNinjaRemoveNoOpCommands } }; -void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule( +void cmNinjaNormalTargetGenerator::WriteNvidiaDeviceLinkRule( bool useResponseFile, const std::string& config) { cmNinjaRule rule(this->LanguageLinkerDeviceRule(config)); @@ -272,6 +301,55 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule( } } +void cmNinjaNormalTargetGenerator::WriteDeviceLinkRules( + const std::string& config) +{ + const cmMakefile* mf = this->GetMakefile(); + + cmNinjaRule rule(LanguageLinkerCudaDeviceRule(config)); + rule.Command = this->GetLocalGenerator()->BuildCommandLine( + { cmStrCat(mf->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINKER"), + " -arch=$ARCH $REGISTER -o=$out $in") }); + rule.Comment = "Rule for CUDA device linking."; + rule.Description = "Linking CUDA $out"; + this->GetGlobalGenerator()->AddRule(rule); + + cmRulePlaceholderExpander::RuleVariables vars; + vars.CMTargetName = this->GetGeneratorTarget()->GetName().c_str(); + vars.CMTargetType = + cmState::GetTargetTypeName(this->GetGeneratorTarget()->GetType()).c_str(); + + vars.Language = "CUDA"; + vars.Object = "$out"; + vars.Fatbinary = "$FATBIN"; + vars.RegisterFile = "$REGISTER"; + + std::string flags = this->GetFlags("CUDA", config); + vars.Flags = flags.c_str(); + + std::string compileCmd = this->GetMakefile()->GetRequiredDefinition( + "CMAKE_CUDA_DEVICE_LINK_COMPILE"); + std::unique_ptr<cmRulePlaceholderExpander> rulePlaceholderExpander( + this->GetLocalGenerator()->CreateRulePlaceholderExpander()); + rulePlaceholderExpander->ExpandRuleVariables(this->GetLocalGenerator(), + compileCmd, vars); + + rule.Name = LanguageLinkerCudaDeviceCompileRule(config); + rule.Command = this->GetLocalGenerator()->BuildCommandLine({ compileCmd }); + rule.Comment = "Rule for compiling CUDA device stubs."; + rule.Description = "Compiling CUDA device stub $out"; + this->GetGlobalGenerator()->AddRule(rule); + + rule.Name = LanguageLinkerCudaFatbinaryRule(config); + rule.Command = this->GetLocalGenerator()->BuildCommandLine( + { cmStrCat(mf->GetRequiredDefinition("CMAKE_CUDA_FATBINARY"), + " -64 -cmdline=--compile-only -compress-all -link " + "--embedded-fatbin=$out $PROFILES") }); + rule.Comment = "Rule for CUDA fatbinaries."; + rule.Description = "Creating fatbinary $out"; + this->GetGlobalGenerator()->AddRule(rule); +} + void cmNinjaNormalTargetGenerator::WriteLinkRule(bool useResponseFile, const std::string& config) { @@ -586,7 +664,6 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( // First and very important step is to make sure while inside this // step our link language is set to CUDA - std::string cudaLinkLanguage = "CUDA"; std::string const& objExt = this->Makefile->GetSafeDefinition("CMAKE_CUDA_OUTPUT_EXTENSION"); @@ -598,6 +675,118 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( std::string targetOutputReal = ConvertToNinjaPath(targetOutputDir + "cmake_device_link" + objExt); + if (firstForConfig) { + globalGen->GetByproductsForCleanTarget(config).push_back(targetOutputReal); + } + this->DeviceLinkObject = targetOutputReal; + + // Write comments. + cmGlobalNinjaGenerator::WriteDivider(this->GetCommonFileStream()); + this->GetCommonFileStream() + << "# Device Link build statements for " + << cmState::GetTargetTypeName(genTarget->GetType()) << " target " + << this->GetTargetName() << "\n\n"; + + if (this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID") == "Clang") { + std::string architecturesStr = + this->GeneratorTarget->GetSafeProperty("CUDA_ARCHITECTURES"); + + if (cmIsOff(architecturesStr)) { + this->Makefile->IssueMessage(MessageType::FATAL_ERROR, + "CUDA_SEPARABLE_COMPILATION on Clang " + "requires CUDA_ARCHITECTURES to be set."); + return; + } + + this->WriteDeviceLinkRules(config); + this->WriteDeviceLinkStatements(config, cmExpandedList(architecturesStr), + targetOutputReal); + } else { + this->WriteNvidiaDeviceLinkStatement(config, fileConfig, targetOutputDir, + targetOutputReal); + } +} + +void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatements( + const std::string& config, const std::vector<std::string>& architectures, + const std::string& output) +{ + // Ensure there are no duplicates. + const cmNinjaDeps explicitDeps = [&]() -> std::vector<std::string> { + std::unordered_set<std::string> depsSet; + const cmNinjaDeps linkDeps = + this->ComputeLinkDeps(this->TargetLinkLanguage(config), config, true); + const cmNinjaDeps objects = this->GetObjects(config); + depsSet.insert(linkDeps.begin(), linkDeps.end()); + depsSet.insert(objects.begin(), objects.end()); + + std::vector<std::string> deps; + std::copy(depsSet.begin(), depsSet.end(), std::back_inserter(deps)); + return deps; + }(); + + const std::string objectDir = + cmStrCat(this->GeneratorTarget->GetSupportDirectory(), + this->GetGlobalGenerator()->ConfigDirectory(config)); + const std::string ninjaOutputDir = this->ConvertToNinjaPath(objectDir); + + cmNinjaBuild fatbinary(LanguageLinkerCudaFatbinaryRule(config)); + + // Link device code for each architecture. + for (const std::string& architectureKind : architectures) { + // Clang always generates real code, so strip the specifier. + const std::string architecture = + architectureKind.substr(0, architectureKind.find('-')); + const std::string cubin = + cmStrCat(ninjaOutputDir, "/sm_", architecture, ".cubin"); + + fatbinary.Variables["PROFILES"] += + cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin); + fatbinary.ExplicitDeps.emplace_back(cubin); + + cmNinjaBuild dlink(LanguageLinkerCudaDeviceRule(config)); + dlink.ExplicitDeps = explicitDeps; + dlink.Outputs = { cubin }; + dlink.Variables["ARCH"] = cmStrCat("sm_", architecture); + + // The generated register file contains macros that when expanded register + // the device routines. Because the routines are the same for all + // architectures the register file will be the same too. Thus generate it + // only on the first invocation to reduce overhead. + if (fatbinary.ExplicitDeps.size() == 1) { + dlink.Variables["REGISTER"] = cmStrCat( + "--register-link-binaries=", ninjaOutputDir, "/cmake_cuda_register.h"); + } + + this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), dlink); + } + + // Combine all architectures into a single fatbinary. + fatbinary.Outputs = { cmStrCat(ninjaOutputDir, "/cmake_cuda_fatbin.h") }; + this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), + fatbinary); + + // Compile the stub that registers the kernels and contains the fatbinaries. + cmNinjaBuild dcompile(LanguageLinkerCudaDeviceCompileRule(config)); + dcompile.Outputs = { output }; + dcompile.ExplicitDeps = { cmStrCat(ninjaOutputDir, "/cmake_cuda_fatbin.h") }; + dcompile.Variables["FATBIN"] = + this->GetLocalGenerator()->ConvertToOutputFormat( + cmStrCat(objectDir, "/cmake_cuda_fatbin.h"), cmOutputConverter::SHELL); + dcompile.Variables["REGISTER"] = + this->GetLocalGenerator()->ConvertToOutputFormat( + cmStrCat(objectDir, "/cmake_cuda_register.h"), cmOutputConverter::SHELL); + this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), + dcompile); +} + +void cmNinjaNormalTargetGenerator::WriteNvidiaDeviceLinkStatement( + const std::string& config, const std::string& fileConfig, + const std::string& outputDir, const std::string& output) +{ + cmGeneratorTarget* genTarget = this->GetGeneratorTarget(); + cmGlobalNinjaGenerator* globalGen = this->GetGlobalGenerator(); + std::string targetOutputImplib = ConvertToNinjaPath( genTarget->GetFullPath(config, cmStateEnums::ImportLibraryArtifact)); @@ -606,8 +795,8 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( cmStrCat(this->GetLocalGenerator()->GetTargetDirectory(genTarget), globalGen->ConfigDirectory(fileConfig), "/"); targetOutputFileConfigDir = - globalGen->ExpandCFGIntDir(targetOutputDir, fileConfig); - if (targetOutputDir == targetOutputFileConfigDir) { + globalGen->ExpandCFGIntDir(outputDir, fileConfig); + if (outputDir == targetOutputFileConfigDir) { return; } @@ -623,27 +812,15 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( } } - if (firstForConfig) { - globalGen->GetByproductsForCleanTarget(config).push_back(targetOutputReal); - } - this->DeviceLinkObject = targetOutputReal; - - // Write comments. - cmGlobalNinjaGenerator::WriteDivider(this->GetCommonFileStream()); - const cmStateEnums::TargetType targetType = genTarget->GetType(); - this->GetCommonFileStream() << "# Device Link build statements for " - << cmState::GetTargetTypeName(targetType) - << " target " << this->GetTargetName() << "\n\n"; - // Compute the comment. cmNinjaBuild build(this->LanguageLinkerDeviceRule(config)); build.Comment = - cmStrCat("Link the ", this->GetVisibleTypeName(), ' ', targetOutputReal); + cmStrCat("Link the ", this->GetVisibleTypeName(), ' ', output); cmNinjaVars& vars = build.Variables; // Compute outputs. - build.Outputs.push_back(targetOutputReal); + build.Outputs.push_back(output); // Compute specific libraries to link with. build.ExplicitDeps = this->GetObjects(config); build.ImplicitDeps = @@ -659,7 +836,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( cmLocalNinjaGenerator& localGen = *this->GetLocalGenerator(); vars["TARGET_FILE"] = - localGen.ConvertToOutputFormat(targetOutputReal, cmOutputConverter::SHELL); + localGen.ConvertToOutputFormat(output, cmOutputConverter::SHELL); std::unique_ptr<cmLinkLineComputer> linkLineComputer( new cmNinjaLinkLineDeviceComputer( @@ -683,8 +860,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( // Compute language specific link flags. std::string langFlags; - localGen.AddLanguageFlagsForLinking(langFlags, genTarget, cudaLinkLanguage, - config); + localGen.AddLanguageFlagsForLinking(langFlags, genTarget, "CUDA", config); vars["LANGUAGE_COMPILE_FLAGS"] = langFlags; auto const tgtNames = this->TargetNames(config); @@ -692,7 +868,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( vars["SONAME_FLAG"] = this->GetMakefile()->GetSONameFlag(this->TargetLinkLanguage(config)); vars["SONAME"] = tgtNames.SharedObject; - if (targetType == cmStateEnums::SHARED_LIBRARY) { + if (genTarget->GetType() == cmStateEnums::SHARED_LIBRARY) { std::string install_dir = this->GetGeneratorTarget()->GetInstallNameDirForBuildTree(config); if (!install_dir.empty()) { @@ -731,7 +907,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( // do not check if the user has explicitly forced a response file. int const commandLineLengthLimit = static_cast<int>(cmSystemTools::CalculateCommandLineLengthLimit()) - - globalGen->GetRuleCmdLength(this->LanguageLinkerDeviceRule(config)); + globalGen->GetRuleCmdLength(build.Rule); build.RspFile = this->ConvertToNinjaPath( cmStrCat("CMakeFiles/", genTarget->GetName(), @@ -746,7 +922,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( bool usedResponseFile = false; globalGen->WriteBuild(this->GetCommonFileStream(), build, commandLineLengthLimit, &usedResponseFile); - this->WriteDeviceLinkRule(usedResponseFile, config); + this->WriteNvidiaDeviceLinkRule(usedResponseFile, config); } void cmNinjaNormalTargetGenerator::WriteLinkStatement( diff --git a/Source/cmNinjaNormalTargetGenerator.h b/Source/cmNinjaNormalTargetGenerator.h index 25e40d0..ffc405c 100644 --- a/Source/cmNinjaNormalTargetGenerator.h +++ b/Source/cmNinjaNormalTargetGenerator.h @@ -21,18 +21,31 @@ public: private: std::string LanguageLinkerRule(const std::string& config) const; std::string LanguageLinkerDeviceRule(const std::string& config) const; + std::string LanguageLinkerCudaDeviceRule(const std::string& config) const; + std::string LanguageLinkerCudaDeviceCompileRule( + const std::string& config) const; + std::string LanguageLinkerCudaFatbinaryRule(const std::string& config) const; const char* GetVisibleTypeName() const; void WriteLanguagesRules(const std::string& config); void WriteLinkRule(bool useResponseFile, const std::string& config); - void WriteDeviceLinkRule(bool useResponseFile, const std::string& config); + void WriteDeviceLinkRules(const std::string& config); + void WriteNvidiaDeviceLinkRule(bool useResponseFile, + const std::string& config); void WriteLinkStatement(const std::string& config, const std::string& fileConfig, bool firstForConfig); void WriteDeviceLinkStatement(const std::string& config, const std::string& fileConfig, bool firstForConfig); + void WriteDeviceLinkStatements(const std::string& config, + const std::vector<std::string>& architectures, + const std::string& output); + void WriteNvidiaDeviceLinkStatement(const std::string& config, + const std::string& fileConfig, + const std::string& outputDir, + const std::string& output); void WriteObjectLibStatement(const std::string& config); diff --git a/Source/cmNinjaTargetGenerator.cxx b/Source/cmNinjaTargetGenerator.cxx index accdcf1..04d84a0 100644 --- a/Source/cmNinjaTargetGenerator.cxx +++ b/Source/cmNinjaTargetGenerator.cxx @@ -346,11 +346,13 @@ std::string cmNinjaTargetGenerator::ComputeIncludes( } cmNinjaDeps cmNinjaTargetGenerator::ComputeLinkDeps( - const std::string& linkLanguage, const std::string& config) const + const std::string& linkLanguage, const std::string& config, + bool ignoreType) const { // Static libraries never depend on other targets for linking. - if (this->GeneratorTarget->GetType() == cmStateEnums::STATIC_LIBRARY || - this->GeneratorTarget->GetType() == cmStateEnums::OBJECT_LIBRARY) { + if (!ignoreType && + (this->GeneratorTarget->GetType() == cmStateEnums::STATIC_LIBRARY || + this->GeneratorTarget->GetType() == cmStateEnums::OBJECT_LIBRARY)) { return cmNinjaDeps(); } @@ -1009,6 +1011,7 @@ void cmNinjaTargetGenerator::WriteObjectBuildStatements( { std::vector<cmSourceFile const*> objectSources; this->GeneratorTarget->GetObjectSources(objectSources, config); + for (cmSourceFile const* sf : objectSources) { this->WriteObjectBuildStatement(sf, config, fileConfig, firstForConfig); } diff --git a/Source/cmNinjaTargetGenerator.h b/Source/cmNinjaTargetGenerator.h index 9d9ce60..a27c9b4 100644 --- a/Source/cmNinjaTargetGenerator.h +++ b/Source/cmNinjaTargetGenerator.h @@ -113,7 +113,8 @@ protected: /// @return the list of link dependency for the given target @a target. cmNinjaDeps ComputeLinkDeps(const std::string& linkLanguage, - const std::string& config) const; + const std::string& config, + bool ignoreType = false) const; /// @return the source file path for the given @a source. std::string GetSourceFilePath(cmSourceFile const* source) const; diff --git a/Source/cmRulePlaceholderExpander.cxx b/Source/cmRulePlaceholderExpander.cxx index 6f40ec6..f5f9c67 100644 --- a/Source/cmRulePlaceholderExpander.cxx +++ b/Source/cmRulePlaceholderExpander.cxx @@ -141,6 +141,16 @@ std::string cmRulePlaceholderExpander::ExpandRuleVariable( return replaceValues.DependencyFile; } } + if (replaceValues.Fatbinary) { + if (variable == "FATBINARY") { + return replaceValues.Fatbinary; + } + } + if (replaceValues.RegisterFile) { + if (variable == "REGISTER_FILE") { + return replaceValues.RegisterFile; + } + } if (replaceValues.Target) { if (variable == "TARGET_QUOTED") { diff --git a/Source/cmRulePlaceholderExpander.h b/Source/cmRulePlaceholderExpander.h index dfce8bb..c8d107d 100644 --- a/Source/cmRulePlaceholderExpander.h +++ b/Source/cmRulePlaceholderExpander.h @@ -64,6 +64,8 @@ public: const char* SwiftOutputFileMap; const char* SwiftSources; const char* ISPCHeader; + const char* Fatbinary; + const char* RegisterFile; }; // Expand rule variables in CMake of the type found in language rules |