summary | refs | log | tree | commit | diff | stats
path: root/Source/cmNinjaNormalTargetGenerator.cxx
diff options
context:
space:
mode:
author: Raul Tambre <raul@tambre.ee> 2020-09-05 16:40:02 (GMT)
committer: Brad King <brad.king@kitware.com> 2020-09-24 19:19:54 (GMT)
commit: c63fe018353cf6afb30980c4cac7493be7cd0a82 (patch)
tree: 68d2daf0cd8ab91a9feaa49392607c6cfecd2ac4 /Source/cmNinjaNormalTargetGenerator.cxx
parent: c98ec731f90eb0180c89108b7d2e42263b66d1ed (diff)
download: CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.zip
CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.tar.gz
CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.tar.bz2
CUDA: Clang separable compilation
For NVCC the compiler takes care of device linking when passed the "-dlink" flag. Clang doesn't support such magic and requires the buildsystem to do the work that NVCC does behind the scenes.

The implementation is based on Bazel's device linking documentation:
https://github.com/tensorflow/tensorflow/blob/7cabcdf073abad8c46e9dda62bb8fa4682d2061e/third_party/nccl/build_defs.bzl.tpl#L259

Closes: #20726
Diffstat (limited to 'Source/cmNinjaNormalTargetGenerator.cxx')
-rw-r--r-- Source/cmNinjaNormalTargetGenerator.cxx | 224
1 files changed, 200 insertions, 24 deletions
diff --git a/Source/cmNinjaNormalTargetGenerator.cxx b/Source/cmNinjaNormalTargetGenerator.cxx
index 210b36e..ccb959b 100644
--- a/Source/cmNinjaNormalTargetGenerator.cxx
+++ b/Source/cmNinjaNormalTargetGenerator.cxx
@@ -8,6 +8,7 @@
#include <map>
#include <set>
#include <sstream>
+#include <unordered_set>
#include <utility>
#include <cm/memory>
@@ -25,6 +26,7 @@
#include "cmLocalGenerator.h"
#include "cmLocalNinjaGenerator.h"
#include "cmMakefile.h"
+#include "cmMessageType.h"
#include "cmNinjaLinkLineDeviceComputer.h"
#include "cmNinjaTypes.h"
#include "cmOSXBundleGenerator.h"
@@ -178,6 +180,33 @@ std::string cmNinjaNormalTargetGenerator::LanguageLinkerDeviceRule(
"_", config);
}
+std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaDeviceRule(
+ const std::string& config) const
+{
+ return cmStrCat(
+ this->TargetLinkLanguage(config), "_DEVICE_LINK__",
+ cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()),
+ '_', config);
+}
+
+std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaDeviceCompileRule(
+ const std::string& config) const
+{
+ return cmStrCat(
+ this->TargetLinkLanguage(config), "_DEVICE_LINK_COMPILE__",
+ cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()),
+ '_', config);
+}
+
+std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaFatbinaryRule(
+ const std::string& config) const
+{
+ return cmStrCat(
+ this->TargetLinkLanguage(config), "_FATBINARY__",
+ cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()),
+ '_', config);
+}
+
struct cmNinjaRemoveNoOpCommands
{
bool operator()(std::string const& cmd)
@@ -186,7 +215,7 @@ struct cmNinjaRemoveNoOpCommands
}
};
-void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule(
+void cmNinjaNormalTargetGenerator::WriteNvidiaDeviceLinkRule(
bool useResponseFile, const std::string& config)
{
cmNinjaRule rule(this->LanguageLinkerDeviceRule(config));
@@ -272,6 +301,55 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule(
}
}
+void cmNinjaNormalTargetGenerator::WriteDeviceLinkRules(
+ const std::string& config)
+{
+ const cmMakefile* mf = this->GetMakefile();
+
+ cmNinjaRule rule(LanguageLinkerCudaDeviceRule(config));
+ rule.Command = this->GetLocalGenerator()->BuildCommandLine(
+ { cmStrCat(mf->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINKER"),
+ " -arch=$ARCH $REGISTER -o=$out $in") });
+ rule.Comment = "Rule for CUDA device linking.";
+ rule.Description = "Linking CUDA $out";
+ this->GetGlobalGenerator()->AddRule(rule);
+
+ cmRulePlaceholderExpander::RuleVariables vars;
+ vars.CMTargetName = this->GetGeneratorTarget()->GetName().c_str();
+ vars.CMTargetType =
+ cmState::GetTargetTypeName(this->GetGeneratorTarget()->GetType()).c_str();
+
+ vars.Language = "CUDA";
+ vars.Object = "$out";
+ vars.Fatbinary = "$FATBIN";
+ vars.RegisterFile = "$REGISTER";
+
+ std::string flags = this->GetFlags("CUDA", config);
+ vars.Flags = flags.c_str();
+
+ std::string compileCmd = this->GetMakefile()->GetRequiredDefinition(
+ "CMAKE_CUDA_DEVICE_LINK_COMPILE");
+ std::unique_ptr<cmRulePlaceholderExpander> rulePlaceholderExpander(
+ this->GetLocalGenerator()->CreateRulePlaceholderExpander());
+ rulePlaceholderExpander->ExpandRuleVariables(this->GetLocalGenerator(),
+ compileCmd, vars);
+
+ rule.Name = LanguageLinkerCudaDeviceCompileRule(config);
+ rule.Command = this->GetLocalGenerator()->BuildCommandLine({ compileCmd });
+ rule.Comment = "Rule for compiling CUDA device stubs.";
+ rule.Description = "Compiling CUDA device stub $out";
+ this->GetGlobalGenerator()->AddRule(rule);
+
+ rule.Name = LanguageLinkerCudaFatbinaryRule(config);
+ rule.Command = this->GetLocalGenerator()->BuildCommandLine(
+ { cmStrCat(mf->GetRequiredDefinition("CMAKE_CUDA_FATBINARY"),
+ " -64 -cmdline=--compile-only -compress-all -link "
+ "--embedded-fatbin=$out $PROFILES") });
+ rule.Comment = "Rule for CUDA fatbinaries.";
+ rule.Description = "Creating fatbinary $out";
+ this->GetGlobalGenerator()->AddRule(rule);
+}
+
void cmNinjaNormalTargetGenerator::WriteLinkRule(bool useResponseFile,
const std::string& config)
{
@@ -586,7 +664,6 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
// First and very important step is to make sure while inside this
// step our link language is set to CUDA
- std::string cudaLinkLanguage = "CUDA";
std::string const& objExt =
this->Makefile->GetSafeDefinition("CMAKE_CUDA_OUTPUT_EXTENSION");
@@ -598,6 +675,118 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
std::string targetOutputReal =
ConvertToNinjaPath(targetOutputDir + "cmake_device_link" + objExt);
+ if (firstForConfig) {
+ globalGen->GetByproductsForCleanTarget(config).push_back(targetOutputReal);
+ }
+ this->DeviceLinkObject = targetOutputReal;
+
+ // Write comments.
+ cmGlobalNinjaGenerator::WriteDivider(this->GetCommonFileStream());
+ this->GetCommonFileStream()
+ << "# Device Link build statements for "
+ << cmState::GetTargetTypeName(genTarget->GetType()) << " target "
+ << this->GetTargetName() << "\n\n";
+
+ if (this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID") == "Clang") {
+ std::string architecturesStr =
+ this->GeneratorTarget->GetSafeProperty("CUDA_ARCHITECTURES");
+
+ if (cmIsOff(architecturesStr)) {
+ this->Makefile->IssueMessage(MessageType::FATAL_ERROR,
+ "CUDA_SEPARABLE_COMPILATION on Clang "
+ "requires CUDA_ARCHITECTURES to be set.");
+ return;
+ }
+
+ this->WriteDeviceLinkRules(config);
+ this->WriteDeviceLinkStatements(config, cmExpandedList(architecturesStr),
+ targetOutputReal);
+ } else {
+ this->WriteNvidiaDeviceLinkStatement(config, fileConfig, targetOutputDir,
+ targetOutputReal);
+ }
+}
+
+void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatements(
+ const std::string& config, const std::vector<std::string>& architectures,
+ const std::string& output)
+{
+ // Ensure there are no duplicates.
+ const cmNinjaDeps explicitDeps = [&]() -> std::vector<std::string> {
+ std::unordered_set<std::string> depsSet;
+ const cmNinjaDeps linkDeps =
+ this->ComputeLinkDeps(this->TargetLinkLanguage(config), config, true);
+ const cmNinjaDeps objects = this->GetObjects(config);
+ depsSet.insert(linkDeps.begin(), linkDeps.end());
+ depsSet.insert(objects.begin(), objects.end());
+
+ std::vector<std::string> deps;
+ std::copy(depsSet.begin(), depsSet.end(), std::back_inserter(deps));
+ return deps;
+ }();
+
+ const std::string objectDir =
+ cmStrCat(this->GeneratorTarget->GetSupportDirectory(),
+ this->GetGlobalGenerator()->ConfigDirectory(config));
+ const std::string ninjaOutputDir = this->ConvertToNinjaPath(objectDir);
+
+ cmNinjaBuild fatbinary(LanguageLinkerCudaFatbinaryRule(config));
+
+ // Link device code for each architecture.
+ for (const std::string& architectureKind : architectures) {
+ // Clang always generates real code, so strip the specifier.
+ const std::string architecture =
+ architectureKind.substr(0, architectureKind.find('-'));
+ const std::string cubin =
+ cmStrCat(ninjaOutputDir, "/sm_", architecture, ".cubin");
+
+ fatbinary.Variables["PROFILES"] +=
+ cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin);
+ fatbinary.ExplicitDeps.emplace_back(cubin);
+
+ cmNinjaBuild dlink(LanguageLinkerCudaDeviceRule(config));
+ dlink.ExplicitDeps = explicitDeps;
+ dlink.Outputs = { cubin };
+ dlink.Variables["ARCH"] = cmStrCat("sm_", architecture);
+
+ // The generated register file contains macros that when expanded register
+ // the device routines. Because the routines are the same for all
+ // architectures the register file will be the same too. Thus generate it
+ // only on the first invocation to reduce overhead.
+ if (fatbinary.ExplicitDeps.size() == 1) {
+ dlink.Variables["REGISTER"] = cmStrCat(
+ "--register-link-binaries=", ninjaOutputDir, "/cmake_cuda_register.h");
+ }
+
+ this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), dlink);
+ }
+
+ // Combine all architectures into a single fatbinary.
+ fatbinary.Outputs = { cmStrCat(ninjaOutputDir, "/cmake_cuda_fatbin.h") };
+ this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(),
+ fatbinary);
+
+ // Compile the stub that registers the kernels and contains the fatbinaries.
+ cmNinjaBuild dcompile(LanguageLinkerCudaDeviceCompileRule(config));
+ dcompile.Outputs = { output };
+ dcompile.ExplicitDeps = { cmStrCat(ninjaOutputDir, "/cmake_cuda_fatbin.h") };
+ dcompile.Variables["FATBIN"] =
+ this->GetLocalGenerator()->ConvertToOutputFormat(
+ cmStrCat(objectDir, "/cmake_cuda_fatbin.h"), cmOutputConverter::SHELL);
+ dcompile.Variables["REGISTER"] =
+ this->GetLocalGenerator()->ConvertToOutputFormat(
+ cmStrCat(objectDir, "/cmake_cuda_register.h"), cmOutputConverter::SHELL);
+ this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(),
+ dcompile);
+}
+
+void cmNinjaNormalTargetGenerator::WriteNvidiaDeviceLinkStatement(
+ const std::string& config, const std::string& fileConfig,
+ const std::string& outputDir, const std::string& output)
+{
+ cmGeneratorTarget* genTarget = this->GetGeneratorTarget();
+ cmGlobalNinjaGenerator* globalGen = this->GetGlobalGenerator();
+
std::string targetOutputImplib = ConvertToNinjaPath(
genTarget->GetFullPath(config, cmStateEnums::ImportLibraryArtifact));
@@ -606,8 +795,8 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
cmStrCat(this->GetLocalGenerator()->GetTargetDirectory(genTarget),
globalGen->ConfigDirectory(fileConfig), "/");
targetOutputFileConfigDir =
- globalGen->ExpandCFGIntDir(targetOutputDir, fileConfig);
- if (targetOutputDir == targetOutputFileConfigDir) {
+ globalGen->ExpandCFGIntDir(outputDir, fileConfig);
+ if (outputDir == targetOutputFileConfigDir) {
return;
}
@@ -623,27 +812,15 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
}
}
- if (firstForConfig) {
- globalGen->GetByproductsForCleanTarget(config).push_back(targetOutputReal);
- }
- this->DeviceLinkObject = targetOutputReal;
-
- // Write comments.
- cmGlobalNinjaGenerator::WriteDivider(this->GetCommonFileStream());
- const cmStateEnums::TargetType targetType = genTarget->GetType();
- this->GetCommonFileStream() << "# Device Link build statements for "
- << cmState::GetTargetTypeName(targetType)
- << " target " << this->GetTargetName() << "\n\n";
-
// Compute the comment.
cmNinjaBuild build(this->LanguageLinkerDeviceRule(config));
build.Comment =
- cmStrCat("Link the ", this->GetVisibleTypeName(), ' ', targetOutputReal);
+ cmStrCat("Link the ", this->GetVisibleTypeName(), ' ', output);
cmNinjaVars& vars = build.Variables;
// Compute outputs.
- build.Outputs.push_back(targetOutputReal);
+ build.Outputs.push_back(output);
// Compute specific libraries to link with.
build.ExplicitDeps = this->GetObjects(config);
build.ImplicitDeps =
@@ -659,7 +836,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
cmLocalNinjaGenerator& localGen = *this->GetLocalGenerator();
vars["TARGET_FILE"] =
- localGen.ConvertToOutputFormat(targetOutputReal, cmOutputConverter::SHELL);
+ localGen.ConvertToOutputFormat(output, cmOutputConverter::SHELL);
std::unique_ptr<cmLinkLineComputer> linkLineComputer(
new cmNinjaLinkLineDeviceComputer(
@@ -683,8 +860,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
// Compute language specific link flags.
std::string langFlags;
- localGen.AddLanguageFlagsForLinking(langFlags, genTarget, cudaLinkLanguage,
- config);
+ localGen.AddLanguageFlagsForLinking(langFlags, genTarget, "CUDA", config);
vars["LANGUAGE_COMPILE_FLAGS"] = langFlags;
auto const tgtNames = this->TargetNames(config);
@@ -692,7 +868,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
vars["SONAME_FLAG"] =
this->GetMakefile()->GetSONameFlag(this->TargetLinkLanguage(config));
vars["SONAME"] = tgtNames.SharedObject;
- if (targetType == cmStateEnums::SHARED_LIBRARY) {
+ if (genTarget->GetType() == cmStateEnums::SHARED_LIBRARY) {
std::string install_dir =
this->GetGeneratorTarget()->GetInstallNameDirForBuildTree(config);
if (!install_dir.empty()) {
@@ -731,7 +907,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
// do not check if the user has explicitly forced a response file.
int const commandLineLengthLimit =
static_cast<int>(cmSystemTools::CalculateCommandLineLengthLimit()) -
- globalGen->GetRuleCmdLength(this->LanguageLinkerDeviceRule(config));
+ globalGen->GetRuleCmdLength(build.Rule);
build.RspFile = this->ConvertToNinjaPath(
cmStrCat("CMakeFiles/", genTarget->GetName(),
@@ -746,7 +922,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement(
bool usedResponseFile = false;
globalGen->WriteBuild(this->GetCommonFileStream(), build,
commandLineLengthLimit, &usedResponseFile);
- this->WriteDeviceLinkRule(usedResponseFile, config);
+ this->WriteNvidiaDeviceLinkRule(usedResponseFile, config);
}
void cmNinjaNormalTargetGenerator::WriteLinkStatement(