diff options
author | Raul Tambre <raul@tambre.ee> | 2020-09-05 16:40:02 (GMT) |
---|---|---|
committer | Brad King <brad.king@kitware.com> | 2020-09-24 19:19:54 (GMT) |
commit | c63fe018353cf6afb30980c4cac7493be7cd0a82 (patch) | |
tree | 68d2daf0cd8ab91a9feaa49392607c6cfecd2ac4 /Source/cmNinjaNormalTargetGenerator.cxx | |
parent | c98ec731f90eb0180c89108b7d2e42263b66d1ed (diff) | |
download | CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.zip CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.tar.gz CMake-c63fe018353cf6afb30980c4cac7493be7cd0a82.tar.bz2 |
CUDA: Clang separable compilation
For NVCC the compiler takes care of device linking when passed the "-dlink"
flag.
Clang doesn't support such magic and requires the buildsystem to do the work
that NVCC does behind the scenes.
The implementation is based on the device linking steps documented in TensorFlow's Bazel build rules for NCCL:
https://github.com/tensorflow/tensorflow/blob/7cabcdf073abad8c46e9dda62bb8fa4682d2061e/third_party/nccl/build_defs.bzl.tpl#L259
Closes: #20726
Diffstat (limited to 'Source/cmNinjaNormalTargetGenerator.cxx')
-rw-r--r-- | Source/cmNinjaNormalTargetGenerator.cxx | 224 |
1 file changed, 200 insertions, 24 deletions
diff --git a/Source/cmNinjaNormalTargetGenerator.cxx b/Source/cmNinjaNormalTargetGenerator.cxx index 210b36e..ccb959b 100644 --- a/Source/cmNinjaNormalTargetGenerator.cxx +++ b/Source/cmNinjaNormalTargetGenerator.cxx @@ -8,6 +8,7 @@ #include <map> #include <set> #include <sstream> +#include <unordered_set> #include <utility> #include <cm/memory> @@ -25,6 +26,7 @@ #include "cmLocalGenerator.h" #include "cmLocalNinjaGenerator.h" #include "cmMakefile.h" +#include "cmMessageType.h" #include "cmNinjaLinkLineDeviceComputer.h" #include "cmNinjaTypes.h" #include "cmOSXBundleGenerator.h" @@ -178,6 +180,33 @@ std::string cmNinjaNormalTargetGenerator::LanguageLinkerDeviceRule( "_", config); } +std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaDeviceRule( + const std::string& config) const +{ + return cmStrCat( + this->TargetLinkLanguage(config), "_DEVICE_LINK__", + cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()), + '_', config); +} + +std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaDeviceCompileRule( + const std::string& config) const +{ + return cmStrCat( + this->TargetLinkLanguage(config), "_DEVICE_LINK_COMPILE__", + cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()), + '_', config); +} + +std::string cmNinjaNormalTargetGenerator::LanguageLinkerCudaFatbinaryRule( + const std::string& config) const +{ + return cmStrCat( + this->TargetLinkLanguage(config), "_FATBINARY__", + cmGlobalNinjaGenerator::EncodeRuleName(this->GeneratorTarget->GetName()), + '_', config); +} + struct cmNinjaRemoveNoOpCommands { bool operator()(std::string const& cmd) @@ -186,7 +215,7 @@ struct cmNinjaRemoveNoOpCommands } }; -void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule( +void cmNinjaNormalTargetGenerator::WriteNvidiaDeviceLinkRule( bool useResponseFile, const std::string& config) { cmNinjaRule rule(this->LanguageLinkerDeviceRule(config)); @@ -272,6 +301,55 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkRule( } } +void 
cmNinjaNormalTargetGenerator::WriteDeviceLinkRules( + const std::string& config) +{ + const cmMakefile* mf = this->GetMakefile(); + + cmNinjaRule rule(LanguageLinkerCudaDeviceRule(config)); + rule.Command = this->GetLocalGenerator()->BuildCommandLine( + { cmStrCat(mf->GetRequiredDefinition("CMAKE_CUDA_DEVICE_LINKER"), + " -arch=$ARCH $REGISTER -o=$out $in") }); + rule.Comment = "Rule for CUDA device linking."; + rule.Description = "Linking CUDA $out"; + this->GetGlobalGenerator()->AddRule(rule); + + cmRulePlaceholderExpander::RuleVariables vars; + vars.CMTargetName = this->GetGeneratorTarget()->GetName().c_str(); + vars.CMTargetType = + cmState::GetTargetTypeName(this->GetGeneratorTarget()->GetType()).c_str(); + + vars.Language = "CUDA"; + vars.Object = "$out"; + vars.Fatbinary = "$FATBIN"; + vars.RegisterFile = "$REGISTER"; + + std::string flags = this->GetFlags("CUDA", config); + vars.Flags = flags.c_str(); + + std::string compileCmd = this->GetMakefile()->GetRequiredDefinition( + "CMAKE_CUDA_DEVICE_LINK_COMPILE"); + std::unique_ptr<cmRulePlaceholderExpander> rulePlaceholderExpander( + this->GetLocalGenerator()->CreateRulePlaceholderExpander()); + rulePlaceholderExpander->ExpandRuleVariables(this->GetLocalGenerator(), + compileCmd, vars); + + rule.Name = LanguageLinkerCudaDeviceCompileRule(config); + rule.Command = this->GetLocalGenerator()->BuildCommandLine({ compileCmd }); + rule.Comment = "Rule for compiling CUDA device stubs."; + rule.Description = "Compiling CUDA device stub $out"; + this->GetGlobalGenerator()->AddRule(rule); + + rule.Name = LanguageLinkerCudaFatbinaryRule(config); + rule.Command = this->GetLocalGenerator()->BuildCommandLine( + { cmStrCat(mf->GetRequiredDefinition("CMAKE_CUDA_FATBINARY"), + " -64 -cmdline=--compile-only -compress-all -link " + "--embedded-fatbin=$out $PROFILES") }); + rule.Comment = "Rule for CUDA fatbinaries."; + rule.Description = "Creating fatbinary $out"; + this->GetGlobalGenerator()->AddRule(rule); +} + void 
cmNinjaNormalTargetGenerator::WriteLinkRule(bool useResponseFile, const std::string& config) { @@ -586,7 +664,6 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( // First and very important step is to make sure while inside this // step our link language is set to CUDA - std::string cudaLinkLanguage = "CUDA"; std::string const& objExt = this->Makefile->GetSafeDefinition("CMAKE_CUDA_OUTPUT_EXTENSION"); @@ -598,6 +675,118 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( std::string targetOutputReal = ConvertToNinjaPath(targetOutputDir + "cmake_device_link" + objExt); + if (firstForConfig) { + globalGen->GetByproductsForCleanTarget(config).push_back(targetOutputReal); + } + this->DeviceLinkObject = targetOutputReal; + + // Write comments. + cmGlobalNinjaGenerator::WriteDivider(this->GetCommonFileStream()); + this->GetCommonFileStream() + << "# Device Link build statements for " + << cmState::GetTargetTypeName(genTarget->GetType()) << " target " + << this->GetTargetName() << "\n\n"; + + if (this->Makefile->GetSafeDefinition("CMAKE_CUDA_COMPILER_ID") == "Clang") { + std::string architecturesStr = + this->GeneratorTarget->GetSafeProperty("CUDA_ARCHITECTURES"); + + if (cmIsOff(architecturesStr)) { + this->Makefile->IssueMessage(MessageType::FATAL_ERROR, + "CUDA_SEPARABLE_COMPILATION on Clang " + "requires CUDA_ARCHITECTURES to be set."); + return; + } + + this->WriteDeviceLinkRules(config); + this->WriteDeviceLinkStatements(config, cmExpandedList(architecturesStr), + targetOutputReal); + } else { + this->WriteNvidiaDeviceLinkStatement(config, fileConfig, targetOutputDir, + targetOutputReal); + } +} + +void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatements( + const std::string& config, const std::vector<std::string>& architectures, + const std::string& output) +{ + // Ensure there are no duplicates. 
+ const cmNinjaDeps explicitDeps = [&]() -> std::vector<std::string> { + std::unordered_set<std::string> depsSet; + const cmNinjaDeps linkDeps = + this->ComputeLinkDeps(this->TargetLinkLanguage(config), config, true); + const cmNinjaDeps objects = this->GetObjects(config); + depsSet.insert(linkDeps.begin(), linkDeps.end()); + depsSet.insert(objects.begin(), objects.end()); + + std::vector<std::string> deps; + std::copy(depsSet.begin(), depsSet.end(), std::back_inserter(deps)); + return deps; + }(); + + const std::string objectDir = + cmStrCat(this->GeneratorTarget->GetSupportDirectory(), + this->GetGlobalGenerator()->ConfigDirectory(config)); + const std::string ninjaOutputDir = this->ConvertToNinjaPath(objectDir); + + cmNinjaBuild fatbinary(LanguageLinkerCudaFatbinaryRule(config)); + + // Link device code for each architecture. + for (const std::string& architectureKind : architectures) { + // Clang always generates real code, so strip the specifier. + const std::string architecture = + architectureKind.substr(0, architectureKind.find('-')); + const std::string cubin = + cmStrCat(ninjaOutputDir, "/sm_", architecture, ".cubin"); + + fatbinary.Variables["PROFILES"] += + cmStrCat(" -im=profile=sm_", architecture, ",file=", cubin); + fatbinary.ExplicitDeps.emplace_back(cubin); + + cmNinjaBuild dlink(LanguageLinkerCudaDeviceRule(config)); + dlink.ExplicitDeps = explicitDeps; + dlink.Outputs = { cubin }; + dlink.Variables["ARCH"] = cmStrCat("sm_", architecture); + + // The generated register file contains macros that when expanded register + // the device routines. Because the routines are the same for all + // architectures the register file will be the same too. Thus generate it + // only on the first invocation to reduce overhead. 
+ if (fatbinary.ExplicitDeps.size() == 1) { + dlink.Variables["REGISTER"] = cmStrCat( + "--register-link-binaries=", ninjaOutputDir, "/cmake_cuda_register.h"); + } + + this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), dlink); + } + + // Combine all architectures into a single fatbinary. + fatbinary.Outputs = { cmStrCat(ninjaOutputDir, "/cmake_cuda_fatbin.h") }; + this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), + fatbinary); + + // Compile the stub that registers the kernels and contains the fatbinaries. + cmNinjaBuild dcompile(LanguageLinkerCudaDeviceCompileRule(config)); + dcompile.Outputs = { output }; + dcompile.ExplicitDeps = { cmStrCat(ninjaOutputDir, "/cmake_cuda_fatbin.h") }; + dcompile.Variables["FATBIN"] = + this->GetLocalGenerator()->ConvertToOutputFormat( + cmStrCat(objectDir, "/cmake_cuda_fatbin.h"), cmOutputConverter::SHELL); + dcompile.Variables["REGISTER"] = + this->GetLocalGenerator()->ConvertToOutputFormat( + cmStrCat(objectDir, "/cmake_cuda_register.h"), cmOutputConverter::SHELL); + this->GetGlobalGenerator()->WriteBuild(this->GetCommonFileStream(), + dcompile); +} + +void cmNinjaNormalTargetGenerator::WriteNvidiaDeviceLinkStatement( + const std::string& config, const std::string& fileConfig, + const std::string& outputDir, const std::string& output) +{ + cmGeneratorTarget* genTarget = this->GetGeneratorTarget(); + cmGlobalNinjaGenerator* globalGen = this->GetGlobalGenerator(); + std::string targetOutputImplib = ConvertToNinjaPath( genTarget->GetFullPath(config, cmStateEnums::ImportLibraryArtifact)); @@ -606,8 +795,8 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( cmStrCat(this->GetLocalGenerator()->GetTargetDirectory(genTarget), globalGen->ConfigDirectory(fileConfig), "/"); targetOutputFileConfigDir = - globalGen->ExpandCFGIntDir(targetOutputDir, fileConfig); - if (targetOutputDir == targetOutputFileConfigDir) { + globalGen->ExpandCFGIntDir(outputDir, fileConfig); + if (outputDir == 
targetOutputFileConfigDir) { return; } @@ -623,27 +812,15 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( } } - if (firstForConfig) { - globalGen->GetByproductsForCleanTarget(config).push_back(targetOutputReal); - } - this->DeviceLinkObject = targetOutputReal; - - // Write comments. - cmGlobalNinjaGenerator::WriteDivider(this->GetCommonFileStream()); - const cmStateEnums::TargetType targetType = genTarget->GetType(); - this->GetCommonFileStream() << "# Device Link build statements for " - << cmState::GetTargetTypeName(targetType) - << " target " << this->GetTargetName() << "\n\n"; - // Compute the comment. cmNinjaBuild build(this->LanguageLinkerDeviceRule(config)); build.Comment = - cmStrCat("Link the ", this->GetVisibleTypeName(), ' ', targetOutputReal); + cmStrCat("Link the ", this->GetVisibleTypeName(), ' ', output); cmNinjaVars& vars = build.Variables; // Compute outputs. - build.Outputs.push_back(targetOutputReal); + build.Outputs.push_back(output); // Compute specific libraries to link with. build.ExplicitDeps = this->GetObjects(config); build.ImplicitDeps = @@ -659,7 +836,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( cmLocalNinjaGenerator& localGen = *this->GetLocalGenerator(); vars["TARGET_FILE"] = - localGen.ConvertToOutputFormat(targetOutputReal, cmOutputConverter::SHELL); + localGen.ConvertToOutputFormat(output, cmOutputConverter::SHELL); std::unique_ptr<cmLinkLineComputer> linkLineComputer( new cmNinjaLinkLineDeviceComputer( @@ -683,8 +860,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( // Compute language specific link flags. 
std::string langFlags; - localGen.AddLanguageFlagsForLinking(langFlags, genTarget, cudaLinkLanguage, - config); + localGen.AddLanguageFlagsForLinking(langFlags, genTarget, "CUDA", config); vars["LANGUAGE_COMPILE_FLAGS"] = langFlags; auto const tgtNames = this->TargetNames(config); @@ -692,7 +868,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( vars["SONAME_FLAG"] = this->GetMakefile()->GetSONameFlag(this->TargetLinkLanguage(config)); vars["SONAME"] = tgtNames.SharedObject; - if (targetType == cmStateEnums::SHARED_LIBRARY) { + if (genTarget->GetType() == cmStateEnums::SHARED_LIBRARY) { std::string install_dir = this->GetGeneratorTarget()->GetInstallNameDirForBuildTree(config); if (!install_dir.empty()) { @@ -731,7 +907,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( // do not check if the user has explicitly forced a response file. int const commandLineLengthLimit = static_cast<int>(cmSystemTools::CalculateCommandLineLengthLimit()) - - globalGen->GetRuleCmdLength(this->LanguageLinkerDeviceRule(config)); + globalGen->GetRuleCmdLength(build.Rule); build.RspFile = this->ConvertToNinjaPath( cmStrCat("CMakeFiles/", genTarget->GetName(), @@ -746,7 +922,7 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement( bool usedResponseFile = false; globalGen->WriteBuild(this->GetCommonFileStream(), build, commandLineLengthLimit, &usedResponseFile); - this->WriteDeviceLinkRule(usedResponseFile, config); + this->WriteNvidiaDeviceLinkRule(usedResponseFile, config); } void cmNinjaNormalTargetGenerator::WriteLinkStatement( |