2019-12-22 22:42:04 +00:00
|
|
|
|
use crate::back::write::{
|
|
|
|
|
self, save_temp_bitcode, to_llvm_opt_settings, with_llvm_pmb, DiagnosticHandlers,
|
|
|
|
|
};
|
2019-02-17 18:58:58 +00:00
|
|
|
|
use crate::llvm::archive_ro::ArchiveRO;
|
2019-12-22 22:42:04 +00:00
|
|
|
|
use crate::llvm::{self, False, True};
|
|
|
|
|
use crate::{LlvmCodegenBackend, ModuleLlvm};
|
|
|
|
|
use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule, ThinShared};
|
|
|
|
|
use rustc_codegen_ssa::back::symbol_export;
|
|
|
|
|
use rustc_codegen_ssa::back::write::{CodegenContext, FatLTOInput, ModuleConfig};
|
|
|
|
|
use rustc_codegen_ssa::traits::*;
|
2020-04-23 18:45:55 +00:00
|
|
|
|
use rustc_codegen_ssa::{looks_like_rust_object_file, ModuleCodegen, ModuleKind};
|
2019-12-22 22:42:04 +00:00
|
|
|
|
use rustc_data_structures::fx::{FxHashMap, FxHashSet};
|
|
|
|
|
use rustc_errors::{FatalError, Handler};
|
2020-01-05 01:37:57 +00:00
|
|
|
|
use rustc_hir::def_id::LOCAL_CRATE;
|
2020-03-29 15:19:48 +00:00
|
|
|
|
use rustc_middle::bug;
|
|
|
|
|
use rustc_middle::dep_graph::WorkProduct;
|
|
|
|
|
use rustc_middle::middle::exported_symbols::SymbolExportLevel;
|
2019-12-22 22:42:04 +00:00
|
|
|
|
use rustc_session::cgu_reuse_tracker::CguReuse;
|
2020-05-01 22:30:23 +00:00
|
|
|
|
use rustc_session::config::{self, CrateType, Lto};
|
2020-08-05 11:35:53 +00:00
|
|
|
|
use tracing::{debug, info};
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to inserting foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
|
2018-08-17 14:07:23 +00:00
|
|
|
|
use std::ffi::{CStr, CString};
|
2019-11-29 15:04:40 +00:00
|
|
|
|
use std::fs::File;
|
|
|
|
|
use std::io;
|
|
|
|
|
use std::mem;
|
|
|
|
|
use std::path::Path;
|
2017-12-16 16:20:54 +00:00
|
|
|
|
use std::ptr;
|
2017-07-23 15:14:38 +00:00
|
|
|
|
use std::slice;
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
use std::sync::Arc;
|
2014-07-31 13:05:08 +00:00
|
|
|
|
|
2019-11-29 15:04:40 +00:00
|
|
|
|
/// File name under which information about past ThinLTO imports is recorded.
/// NOTE(review): presumably resolved relative to the incremental compilation
/// session directory; confirm against the code that reads/writes this file.
///
/// We keep track of past LTO imports that were used to produce the current set
|
|
|
|
|
/// of compiled object files that we might choose to reuse during this
|
|
|
|
|
/// compilation session.
|
|
|
|
|
pub const THIN_LTO_IMPORTS_INCR_COMP_FILE_NAME: &str = "thin-lto-past-imports.bin";
|
|
|
|
|
|
2020-05-01 22:30:23 +00:00
|
|
|
|
pub fn crate_type_allows_lto(crate_type: CrateType) -> bool {
|
2016-11-30 15:03:42 +00:00
|
|
|
|
match crate_type {
|
2020-05-01 22:30:23 +00:00
|
|
|
|
CrateType::Executable | CrateType::Staticlib | CrateType::Cdylib => true,
|
|
|
|
|
CrateType::Dylib | CrateType::Rlib | CrateType::ProcMacro => false,
|
2016-11-30 15:03:42 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-22 22:42:04 +00:00
|
|
|
|
fn prepare_lto(
|
|
|
|
|
cgcx: &CodegenContext<LlvmCodegenBackend>,
|
|
|
|
|
diag_handler: &Handler,
|
|
|
|
|
) -> Result<(Vec<CString>, Vec<(SerializedModule<ModuleBuffer>, CString)>), FatalError> {
|
2018-01-16 23:02:31 +00:00
|
|
|
|
let export_threshold = match cgcx.lto {
|
|
|
|
|
// We're just doing LTO for our one crate
|
|
|
|
|
Lto::ThinLocal => SymbolExportLevel::Rust,
|
|
|
|
|
|
|
|
|
|
// We're doing LTO for the entire crate graph
|
2019-12-22 22:42:04 +00:00
|
|
|
|
Lto::Fat | Lto::Thin => symbol_export::crates_export_threshold(&cgcx.crate_types),
|
2018-01-16 23:02:31 +00:00
|
|
|
|
|
|
|
|
|
Lto::No => panic!("didn't request LTO but we're doing LTO"),
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
};
|
2016-11-30 15:03:42 +00:00
|
|
|
|
|
2018-02-27 16:52:07 +00:00
|
|
|
|
let symbol_filter = &|&(ref name, level): &(String, SymbolExportLevel)| {
|
2017-09-13 20:22:20 +00:00
|
|
|
|
if level.is_below_threshold(export_threshold) {
|
2019-10-18 07:10:13 +00:00
|
|
|
|
Some(CString::new(name.as_str()).unwrap())
|
2016-11-30 15:03:42 +00:00
|
|
|
|
} else {
|
|
|
|
|
None
|
|
|
|
|
}
|
|
|
|
|
};
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let exported_symbols = cgcx.exported_symbols.as_ref().expect("needs exported symbols for LTO");
|
2020-07-07 15:12:44 +00:00
|
|
|
|
let mut symbols_below_threshold = {
|
|
|
|
|
let _timer = cgcx.prof.generic_activity("LLVM_lto_generate_symbols_below_threshold");
|
2019-12-22 22:42:04 +00:00
|
|
|
|
exported_symbols[&LOCAL_CRATE].iter().filter_map(symbol_filter).collect::<Vec<CString>>()
|
2019-09-27 12:04:36 +00:00
|
|
|
|
};
|
2020-07-07 15:12:44 +00:00
|
|
|
|
info!("{} symbols to preserve in this crate", symbols_below_threshold.len());
|
2016-11-30 15:03:42 +00:00
|
|
|
|
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
// If we're performing LTO for the entire crate graph, then for each of our
|
|
|
|
|
// upstream dependencies, find the corresponding rlib and load the bitcode
|
|
|
|
|
// from the archive.
|
|
|
|
|
//
|
|
|
|
|
// We save off all the bytecode and LLVM module ids for later processing
|
|
|
|
|
// with either fat or thin LTO
|
2017-07-23 15:14:38 +00:00
|
|
|
|
let mut upstream_modules = Vec::new();
|
2018-01-16 23:02:31 +00:00
|
|
|
|
if cgcx.lto != Lto::ThinLocal {
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
if cgcx.opts.cg.prefer_dynamic {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
diag_handler
|
|
|
|
|
.struct_err("cannot prefer dynamic linking when performing LTO")
|
|
|
|
|
.note(
|
|
|
|
|
"only 'staticlib', 'bin', and 'cdylib' outputs are \
|
|
|
|
|
supported with LTO",
|
|
|
|
|
)
|
|
|
|
|
.emit();
|
|
|
|
|
return Err(FatalError);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Make sure we actually can run LTO
|
|
|
|
|
for crate_type in cgcx.crate_types.iter() {
|
|
|
|
|
if !crate_type_allows_lto(*crate_type) {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let e = diag_handler.fatal(
|
|
|
|
|
"lto can only be run for executables, cdylibs and \
|
|
|
|
|
static library outputs",
|
|
|
|
|
);
|
|
|
|
|
return Err(e);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
2014-09-17 23:18:12 +00:00
|
|
|
|
}
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to insdering foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
for &(cnum, ref path) in cgcx.each_linked_rlib_for_lto.iter() {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let exported_symbols =
|
|
|
|
|
cgcx.exported_symbols.as_ref().expect("needs exported symbols for LTO");
|
2019-09-27 12:04:36 +00:00
|
|
|
|
{
|
2020-07-07 15:12:44 +00:00
|
|
|
|
let _timer =
|
|
|
|
|
cgcx.prof.generic_activity("LLVM_lto_generate_symbols_below_threshold");
|
|
|
|
|
symbols_below_threshold
|
|
|
|
|
.extend(exported_symbols[&cnum].iter().filter_map(symbol_filter));
|
2019-09-27 12:04:36 +00:00
|
|
|
|
}
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
|
|
|
|
|
let archive = ArchiveRO::open(&path).expect("wanted an rlib");
|
2020-04-23 18:45:55 +00:00
|
|
|
|
let obj_files = archive
|
2019-12-22 22:42:04 +00:00
|
|
|
|
.iter()
|
|
|
|
|
.filter_map(|child| child.ok().and_then(|c| c.name().map(|name| (name, c))))
|
2020-04-23 18:45:55 +00:00
|
|
|
|
.filter(|&(name, _)| looks_like_rust_object_file(name));
|
|
|
|
|
for (name, child) in obj_files {
|
|
|
|
|
info!("adding bitcode from {}", name);
|
|
|
|
|
match get_bitcode_slice_from_object_data(child.data()) {
|
|
|
|
|
Ok(data) => {
|
|
|
|
|
let module = SerializedModule::FromRlib(data.to_vec());
|
|
|
|
|
upstream_modules.push((module, CString::new(name).unwrap()));
|
|
|
|
|
}
|
|
|
|
|
Err(msg) => return Err(diag_handler.fatal(&msg)),
|
|
|
|
|
}
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2017-07-23 15:14:38 +00:00
|
|
|
|
|
2020-07-07 15:12:44 +00:00
|
|
|
|
Ok((symbols_below_threshold, upstream_modules))
|
2018-12-03 19:45:03 +00:00
|
|
|
|
}
|
|
|
|
|
|
2020-04-23 18:45:55 +00:00
|
|
|
|
fn get_bitcode_slice_from_object_data(obj: &[u8]) -> Result<&[u8], String> {
|
|
|
|
|
let mut len = 0;
|
|
|
|
|
let data =
|
|
|
|
|
unsafe { llvm::LLVMRustGetBitcodeSliceFromObjectData(obj.as_ptr(), obj.len(), &mut len) };
|
|
|
|
|
if !data.is_null() {
|
|
|
|
|
assert!(len != 0);
|
|
|
|
|
let bc = unsafe { slice::from_raw_parts(data, len) };
|
|
|
|
|
|
|
|
|
|
// `bc` must be a sub-slice of `obj`.
|
|
|
|
|
assert!(obj.as_ptr() <= bc.as_ptr());
|
|
|
|
|
assert!(bc[bc.len()..bc.len()].as_ptr() <= obj[obj.len()..obj.len()].as_ptr());
|
|
|
|
|
|
|
|
|
|
Ok(bc)
|
|
|
|
|
} else {
|
|
|
|
|
assert!(len == 0);
|
|
|
|
|
let msg = llvm::last_error().unwrap_or_else(|| "unknown LLVM error".to_string());
|
|
|
|
|
Err(format!("failed to get bitcode from object file for LTO ({})", msg))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-03 19:45:03 +00:00
|
|
|
|
/// Performs fat LTO by merging all modules into a single one and returning it
|
|
|
|
|
/// for further optimization.
|
2019-12-22 22:42:04 +00:00
|
|
|
|
pub(crate) fn run_fat(
|
|
|
|
|
cgcx: &CodegenContext<LlvmCodegenBackend>,
|
|
|
|
|
modules: Vec<FatLTOInput<LlvmCodegenBackend>>,
|
|
|
|
|
cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
|
|
|
|
|
) -> Result<LtoModuleCodegen<LlvmCodegenBackend>, FatalError> {
|
2018-12-03 19:45:03 +00:00
|
|
|
|
let diag_handler = cgcx.create_diag_handler();
|
2020-07-07 15:12:44 +00:00
|
|
|
|
let (symbols_below_threshold, upstream_modules) = prepare_lto(cgcx, &diag_handler)?;
|
|
|
|
|
let symbols_below_threshold =
|
|
|
|
|
symbols_below_threshold.iter().map(|c| c.as_ptr()).collect::<Vec<_>>();
|
|
|
|
|
fat_lto(
|
|
|
|
|
cgcx,
|
|
|
|
|
&diag_handler,
|
|
|
|
|
modules,
|
|
|
|
|
cached_modules,
|
|
|
|
|
upstream_modules,
|
|
|
|
|
&symbols_below_threshold,
|
|
|
|
|
)
|
2018-12-03 19:45:03 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Performs thin LTO by performing necessary global analysis and returning two
|
|
|
|
|
/// lists, one of the modules that need optimization and another for modules that
|
|
|
|
|
/// can simply be copied over from the incr. comp. cache.
|
2019-12-22 22:42:04 +00:00
|
|
|
|
pub(crate) fn run_thin(
|
|
|
|
|
cgcx: &CodegenContext<LlvmCodegenBackend>,
|
|
|
|
|
modules: Vec<(String, ThinBuffer)>,
|
|
|
|
|
cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
|
|
|
|
|
) -> Result<(Vec<LtoModuleCodegen<LlvmCodegenBackend>>, Vec<WorkProduct>), FatalError> {
|
2018-12-03 19:45:03 +00:00
|
|
|
|
let diag_handler = cgcx.create_diag_handler();
|
2020-07-07 15:12:44 +00:00
|
|
|
|
let (symbols_below_threshold, upstream_modules) = prepare_lto(cgcx, &diag_handler)?;
|
|
|
|
|
let symbols_below_threshold =
|
|
|
|
|
symbols_below_threshold.iter().map(|c| c.as_ptr()).collect::<Vec<_>>();
|
2019-02-01 14:15:43 +00:00
|
|
|
|
if cgcx.opts.cg.linker_plugin_lto.enabled() {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
unreachable!(
|
|
|
|
|
"We should never reach this case if the LTO step \
|
|
|
|
|
is deferred to the linker"
|
|
|
|
|
);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
2020-07-07 15:12:44 +00:00
|
|
|
|
thin_lto(
|
|
|
|
|
cgcx,
|
|
|
|
|
&diag_handler,
|
|
|
|
|
modules,
|
|
|
|
|
upstream_modules,
|
|
|
|
|
cached_modules,
|
|
|
|
|
&symbols_below_threshold,
|
|
|
|
|
)
|
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
|
|
|
|
|
2019-12-22 22:42:04 +00:00
|
|
|
|
pub(crate) fn prepare_thin(module: ModuleCodegen<ModuleLlvm>) -> (String, ThinBuffer) {
|
2018-12-04 15:24:20 +00:00
|
|
|
|
let name = module.name.clone();
|
|
|
|
|
let buffer = ThinBuffer::new(module.module_llvm.llmod());
|
|
|
|
|
(name, buffer)
|
|
|
|
|
}
|
|
|
|
|
|
2019-12-22 22:42:04 +00:00
|
|
|
|
fn fat_lto(
|
|
|
|
|
cgcx: &CodegenContext<LlvmCodegenBackend>,
|
|
|
|
|
diag_handler: &Handler,
|
|
|
|
|
modules: Vec<FatLTOInput<LlvmCodegenBackend>>,
|
|
|
|
|
cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
|
|
|
|
|
mut serialized_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>,
|
2020-07-07 15:12:44 +00:00
|
|
|
|
symbols_below_threshold: &[*const libc::c_char],
|
2019-12-22 22:42:04 +00:00
|
|
|
|
) -> Result<LtoModuleCodegen<LlvmCodegenBackend>, FatalError> {
|
2019-09-27 12:04:36 +00:00
|
|
|
|
let _timer = cgcx.prof.generic_activity("LLVM_fat_lto_build_monolithic_module");
|
2017-07-23 15:14:38 +00:00
|
|
|
|
info!("going for a fat lto");
|
|
|
|
|
|
2019-08-27 19:25:35 +00:00
|
|
|
|
// Sort out all our lists of incoming modules into two lists.
|
|
|
|
|
//
|
|
|
|
|
// * `serialized_modules` (also and argument to this function) contains all
|
|
|
|
|
// modules that are serialized in-memory.
|
|
|
|
|
// * `in_memory` contains modules which are already parsed and in-memory,
|
|
|
|
|
// such as from multi-CGU builds.
|
|
|
|
|
//
|
|
|
|
|
// All of `cached_modules` (cached from previous incremental builds) can
|
|
|
|
|
// immediately go onto the `serialized_modules` modules list and then we can
|
|
|
|
|
// split the `modules` array into these two lists.
|
|
|
|
|
let mut in_memory = Vec::new();
|
|
|
|
|
serialized_modules.extend(cached_modules.into_iter().map(|(buffer, wp)| {
|
|
|
|
|
info!("pushing cached module {:?}", wp.cgu_name);
|
|
|
|
|
(buffer, CString::new(wp.cgu_name).unwrap())
|
|
|
|
|
}));
|
|
|
|
|
for module in modules {
|
|
|
|
|
match module {
|
|
|
|
|
FatLTOInput::InMemory(m) => in_memory.push(m),
|
|
|
|
|
FatLTOInput::Serialized { name, buffer } => {
|
|
|
|
|
info!("pushing serialized module {:?}", name);
|
|
|
|
|
let buffer = SerializedModule::Local(buffer);
|
|
|
|
|
serialized_modules.push((buffer, CString::new(name).unwrap()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2017-07-23 15:14:38 +00:00
|
|
|
|
// Find the "costliest" module and merge everything into that codegen unit.
|
|
|
|
|
// All the other modules will be serialized and reparsed into the new
|
|
|
|
|
// context, so this hopefully avoids serializing and parsing the largest
|
|
|
|
|
// codegen unit.
|
|
|
|
|
//
|
|
|
|
|
// Additionally use a regular module as the base here to ensure that various
|
|
|
|
|
// file copy operations in the backend work correctly. The only other kind
|
|
|
|
|
// of module here should be an allocator one, and if your crate is smaller
|
|
|
|
|
// than the allocator module then the size doesn't really matter anyway.
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let costliest_module = in_memory
|
|
|
|
|
.iter()
|
2017-07-23 15:14:38 +00:00
|
|
|
|
.enumerate()
|
|
|
|
|
.filter(|&(_, module)| module.kind == ModuleKind::Regular)
|
|
|
|
|
.map(|(i, module)| {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let cost = unsafe { llvm::LLVMRustModuleCost(module.module_llvm.llmod()) };
|
2017-07-23 15:14:38 +00:00
|
|
|
|
(cost, i)
|
|
|
|
|
})
|
2019-02-11 15:46:04 +00:00
|
|
|
|
.max();
|
|
|
|
|
|
|
|
|
|
// If we found a costliest module, we're good to go. Otherwise all our
|
|
|
|
|
// inputs were serialized which could happen in the case, for example, that
|
|
|
|
|
// all our inputs were incrementally reread from the cache and we're just
|
|
|
|
|
// re-executing the LTO passes. If that's the case deserialize the first
|
|
|
|
|
// module and create a linker with it.
|
|
|
|
|
let module: ModuleCodegen<ModuleLlvm> = match costliest_module {
|
2019-08-27 19:25:35 +00:00
|
|
|
|
Some((_cost, i)) => in_memory.remove(i),
|
2019-02-11 15:46:04 +00:00
|
|
|
|
None => {
|
2020-02-28 13:20:33 +00:00
|
|
|
|
assert!(!serialized_modules.is_empty(), "must have at least one serialized module");
|
2019-08-27 19:25:35 +00:00
|
|
|
|
let (buffer, name) = serialized_modules.remove(0);
|
|
|
|
|
info!("no in-memory regular modules to choose from, parsing {:?}", name);
|
2019-02-11 15:46:04 +00:00
|
|
|
|
ModuleCodegen {
|
2019-08-27 19:25:35 +00:00
|
|
|
|
module_llvm: ModuleLlvm::parse(cgcx, &name, buffer.data(), diag_handler)?,
|
|
|
|
|
name: name.into_string().unwrap(),
|
2019-02-11 15:46:04 +00:00
|
|
|
|
kind: ModuleKind::Regular,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
2017-07-23 15:14:38 +00:00
|
|
|
|
let mut serialized_bitcode = Vec::new();
|
2018-06-27 14:57:25 +00:00
|
|
|
|
{
|
2018-07-17 23:20:51 +00:00
|
|
|
|
let (llcx, llmod) = {
|
2018-08-20 15:13:01 +00:00
|
|
|
|
let llvm = &module.module_llvm;
|
2018-07-17 23:20:51 +00:00
|
|
|
|
(&llvm.llcx, llvm.llmod())
|
|
|
|
|
};
|
2018-08-14 15:55:22 +00:00
|
|
|
|
info!("using {:?} as a base module", module.name);
|
2018-06-27 14:57:25 +00:00
|
|
|
|
|
2018-07-17 23:20:51 +00:00
|
|
|
|
// The linking steps below may produce errors and diagnostics within LLVM
|
|
|
|
|
// which we'd like to handle and print, so set up our diagnostic handlers
|
|
|
|
|
// (which get unregistered when they go out of scope below).
|
|
|
|
|
let _handler = DiagnosticHandlers::new(cgcx, diag_handler, llcx);
|
|
|
|
|
|
2018-06-27 14:57:25 +00:00
|
|
|
|
// For all other modules we codegened we'll need to link them into our own
|
|
|
|
|
// bitcode. All modules were codegened in their own LLVM context, however,
|
|
|
|
|
// and we want to move everything to the same LLVM context. Currently the
|
|
|
|
|
// way we know of to do that is to serialize them to a string and them parse
|
|
|
|
|
// them later. Not great but hey, that's why it's "fat" LTO, right?
|
2019-08-27 19:25:35 +00:00
|
|
|
|
for module in in_memory {
|
|
|
|
|
let buffer = ModuleBuffer::new(module.module_llvm.llmod());
|
|
|
|
|
let llmod_id = CString::new(&module.name[..]).unwrap();
|
|
|
|
|
serialized_modules.push((SerializedModule::Local(buffer), llmod_id));
|
|
|
|
|
}
|
2019-08-08 17:51:52 +00:00
|
|
|
|
// Sort the modules to ensure we produce deterministic results.
|
2019-08-27 19:25:35 +00:00
|
|
|
|
serialized_modules.sort_by(|module1, module2| module1.1.cmp(&module2.1));
|
2017-07-23 15:14:38 +00:00
|
|
|
|
|
2018-06-27 14:57:25 +00:00
|
|
|
|
// For all serialized bitcode files we parse them and link them in as we did
|
|
|
|
|
// above, this is all mostly handled in C++. Like above, though, we don't
|
|
|
|
|
// know much about the memory management here so we err on the side of being
|
|
|
|
|
// save and persist everything with the original module.
|
|
|
|
|
let mut linker = Linker::new(llmod);
|
|
|
|
|
for (bc_decoded, name) in serialized_modules {
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let _timer = cgcx
|
|
|
|
|
.prof
|
|
|
|
|
.generic_activity_with_arg("LLVM_fat_lto_link_module", format!("{:?}", name));
|
2018-06-27 14:57:25 +00:00
|
|
|
|
info!("linking {:?}", name);
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let data = bc_decoded.data();
|
|
|
|
|
linker.add(&data).map_err(|()| {
|
|
|
|
|
let msg = format!("failed to load bc of {:?}", name);
|
|
|
|
|
write::llvm_err(&diag_handler, &msg)
|
2018-06-27 14:57:25 +00:00
|
|
|
|
})?;
|
|
|
|
|
serialized_bitcode.push(bc_decoded);
|
|
|
|
|
}
|
|
|
|
|
drop(linker);
|
2018-10-23 15:01:35 +00:00
|
|
|
|
save_temp_bitcode(&cgcx, &module, "lto.input");
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to insdering foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
|
2020-07-07 15:12:44 +00:00
|
|
|
|
// Internalize everything below threshold to help strip out more modules and such.
|
2013-12-11 07:27:15 +00:00
|
|
|
|
unsafe {
|
2020-07-07 15:12:44 +00:00
|
|
|
|
let ptr = symbols_below_threshold.as_ptr();
|
2019-12-22 22:42:04 +00:00
|
|
|
|
llvm::LLVMRustRunRestrictionPass(
|
|
|
|
|
llmod,
|
|
|
|
|
ptr as *const *const libc::c_char,
|
2020-07-07 15:12:44 +00:00
|
|
|
|
symbols_below_threshold.len() as libc::size_t,
|
2019-12-22 22:42:04 +00:00
|
|
|
|
);
|
2018-10-23 15:01:35 +00:00
|
|
|
|
save_temp_bitcode(&cgcx, &module, "lto.after-restriction");
|
2013-12-11 07:27:15 +00:00
|
|
|
|
}
|
2018-06-27 14:57:25 +00:00
|
|
|
|
|
|
|
|
|
if cgcx.no_landing_pads {
|
|
|
|
|
unsafe {
|
|
|
|
|
llvm::LLVMRustMarkAllFunctionsNounwind(llmod);
|
|
|
|
|
}
|
2018-10-23 15:01:35 +00:00
|
|
|
|
save_temp_bitcode(&cgcx, &module, "lto.after-nounwind");
|
2018-06-27 14:57:25 +00:00
|
|
|
|
}
|
2013-12-11 07:27:15 +00:00
|
|
|
|
}
|
|
|
|
|
|
2019-12-22 22:42:04 +00:00
|
|
|
|
Ok(LtoModuleCodegen::Fat { module: Some(module), _serialized_bitcode: serialized_bitcode })
|
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
|
|
|
|
|
2018-07-17 11:26:22 +00:00
|
|
|
|
struct Linker<'a>(&'a mut llvm::Linker<'a>);
|
2018-02-12 16:38:46 +00:00
|
|
|
|
|
2018-07-17 11:26:22 +00:00
|
|
|
|
impl Linker<'a> {
|
|
|
|
|
fn new(llmod: &'a llvm::Module) -> Self {
|
2018-02-12 16:38:46 +00:00
|
|
|
|
unsafe { Linker(llvm::LLVMRustLinkerNew(llmod)) }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn add(&mut self, bytecode: &[u8]) -> Result<(), ()> {
|
|
|
|
|
unsafe {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
if llvm::LLVMRustLinkerAdd(
|
|
|
|
|
self.0,
|
|
|
|
|
bytecode.as_ptr() as *const libc::c_char,
|
|
|
|
|
bytecode.len(),
|
|
|
|
|
) {
|
2018-02-12 16:38:46 +00:00
|
|
|
|
Ok(())
|
|
|
|
|
} else {
|
|
|
|
|
Err(())
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-07-17 11:26:22 +00:00
|
|
|
|
impl Drop for Linker<'a> {
|
2018-02-12 16:38:46 +00:00
|
|
|
|
fn drop(&mut self) {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
unsafe {
|
|
|
|
|
llvm::LLVMRustLinkerFree(&mut *(self.0 as *mut _));
|
|
|
|
|
}
|
2018-02-12 16:38:46 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
/// Prepare "thin" LTO to get run on these modules.
|
|
|
|
|
///
|
|
|
|
|
/// The general structure of ThinLTO is quite different from the structure of
|
|
|
|
|
/// "fat" LTO above. With "fat" LTO all LLVM modules in question are merged into
|
|
|
|
|
/// one giant LLVM module, and then we run more optimization passes over this
|
|
|
|
|
/// big module after internalizing most symbols. Thin LTO, on the other hand,
|
|
|
|
|
/// avoids this large bottleneck through more targeted optimization.
|
|
|
|
|
///
|
|
|
|
|
/// At a high level Thin LTO looks like:
|
|
|
|
|
///
|
|
|
|
|
/// 1. Prepare a "summary" of each LLVM module in question which describes
|
|
|
|
|
/// the values inside, cost of the values, etc.
|
|
|
|
|
/// 2. Merge the summaries of all modules in question into one "index"
|
|
|
|
|
/// 3. Perform some global analysis on this index
|
|
|
|
|
/// 4. For each module, use the index and analysis calculated previously to
|
|
|
|
|
/// perform local transformations on the module, for example inlining
|
|
|
|
|
/// small functions from other modules.
|
|
|
|
|
/// 5. Run thin-specific optimization passes over each module, and then code
|
|
|
|
|
/// generate everything at the end.
|
|
|
|
|
///
|
|
|
|
|
/// The summary for each module is intended to be quite cheap, and the global
|
|
|
|
|
/// index is relatively quite cheap to create as well. As a result, the goal of
|
|
|
|
|
/// ThinLTO is to reduce the bottleneck on LTO and enable LTO to be used in more
|
|
|
|
|
/// situations. For example one cheap optimization is that we can parallelize
|
|
|
|
|
/// all codegen modules, easily making use of all the cores on a machine.
|
|
|
|
|
///
|
|
|
|
|
/// With all that in mind, the function here is designed specifically for just
|
|
|
|
|
/// calculating the *index* for ThinLTO. This index will then be shared amongst
|
2018-05-08 13:10:16 +00:00
|
|
|
|
/// all of the `LtoModuleCodegen` units returned below and destroyed once
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
/// they all go out of scope.
|
2019-12-22 22:42:04 +00:00
|
|
|
|
fn thin_lto(
|
|
|
|
|
cgcx: &CodegenContext<LlvmCodegenBackend>,
|
|
|
|
|
diag_handler: &Handler,
|
|
|
|
|
modules: Vec<(String, ThinBuffer)>,
|
|
|
|
|
serialized_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>,
|
|
|
|
|
cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
|
2020-07-07 15:12:44 +00:00
|
|
|
|
symbols_below_threshold: &[*const libc::c_char],
|
2019-12-22 22:42:04 +00:00
|
|
|
|
) -> Result<(Vec<LtoModuleCodegen<LlvmCodegenBackend>>, Vec<WorkProduct>), FatalError> {
|
2019-09-27 12:04:36 +00:00
|
|
|
|
let _timer = cgcx.prof.generic_activity("LLVM_thin_lto_global_analysis");
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
unsafe {
|
|
|
|
|
info!("going for that thin, thin LTO");
|
|
|
|
|
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let green_modules: FxHashMap<_, _> =
|
|
|
|
|
cached_modules.iter().map(|&(_, ref wp)| (wp.cgu_name.clone(), wp.clone())).collect();
|
2018-08-31 13:18:08 +00:00
|
|
|
|
|
2018-10-06 09:45:11 +00:00
|
|
|
|
let full_scope_len = modules.len() + serialized_modules.len() + cached_modules.len();
|
|
|
|
|
let mut thin_buffers = Vec::with_capacity(modules.len());
|
|
|
|
|
let mut module_names = Vec::with_capacity(full_scope_len);
|
|
|
|
|
let mut thin_modules = Vec::with_capacity(full_scope_len);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
|
2018-12-04 15:24:20 +00:00
|
|
|
|
for (i, (name, buffer)) in modules.into_iter().enumerate() {
|
|
|
|
|
info!("local module: {} - {}", i, name);
|
|
|
|
|
let cname = CString::new(name.clone()).unwrap();
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
thin_modules.push(llvm::ThinLTOModule {
|
2018-12-04 15:24:20 +00:00
|
|
|
|
identifier: cname.as_ptr(),
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
data: buffer.data().as_ptr(),
|
|
|
|
|
len: buffer.data().len(),
|
|
|
|
|
});
|
|
|
|
|
thin_buffers.push(buffer);
|
2018-12-04 15:24:20 +00:00
|
|
|
|
module_names.push(cname);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// FIXME: All upstream crates are deserialized internally in the
|
|
|
|
|
// function below to extract their summary and modules. Note that
|
|
|
|
|
// unlike the loop above we *must* decode and/or read something
|
|
|
|
|
// here as these are all just serialized files on disk. An
|
|
|
|
|
// improvement, however, to make here would be to store the
|
|
|
|
|
// module summary separately from the actual module itself. Right
|
|
|
|
|
// now this is stored in one large bitcode file, and the entire
|
|
|
|
|
// file is deflate-compressed. We could try to bypass some of the
|
|
|
|
|
// decompression by storing the index uncompressed and only
|
|
|
|
|
// lazily decompressing the bytecode if necessary.
|
|
|
|
|
//
|
|
|
|
|
// Note that truly taking advantage of this optimization will
|
|
|
|
|
// likely be further down the road. We'd have to implement
|
|
|
|
|
// incremental ThinLTO first where we could actually avoid
|
|
|
|
|
// looking at upstream modules entirely sometimes (the contents,
|
|
|
|
|
// we must always unconditionally look at the index).
|
2018-10-06 09:45:11 +00:00
|
|
|
|
let mut serialized = Vec::with_capacity(serialized_modules.len() + cached_modules.len());
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let cached_modules =
|
|
|
|
|
cached_modules.into_iter().map(|(sm, wp)| (sm, CString::new(wp.cgu_name).unwrap()));
|
2018-08-31 13:18:08 +00:00
|
|
|
|
|
|
|
|
|
for (module, name) in serialized_modules.into_iter().chain(cached_modules) {
|
|
|
|
|
info!("upstream or cached module {:?}", name);
|
2018-08-20 15:13:01 +00:00
|
|
|
|
thin_modules.push(llvm::ThinLTOModule {
|
|
|
|
|
identifier: name.as_ptr(),
|
|
|
|
|
data: module.data().as_ptr(),
|
|
|
|
|
len: module.data().len(),
|
|
|
|
|
});
|
|
|
|
|
serialized.push(module);
|
|
|
|
|
module_names.push(name);
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-31 13:18:08 +00:00
|
|
|
|
// Sanity check
|
|
|
|
|
assert_eq!(thin_modules.len(), module_names.len());
|
|
|
|
|
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
// Delegate to the C++ bindings to create some data here. Once this is a
|
|
|
|
|
// tried-and-true interface we may wish to try to upstream some of this
|
|
|
|
|
// to LLVM itself, right now we reimplement a lot of what they do
|
|
|
|
|
// upstream...
|
|
|
|
|
let data = llvm::LLVMRustCreateThinLTOData(
|
|
|
|
|
thin_modules.as_ptr(),
|
|
|
|
|
thin_modules.len() as u32,
|
2020-07-07 15:12:44 +00:00
|
|
|
|
symbols_below_threshold.as_ptr(),
|
|
|
|
|
symbols_below_threshold.len() as u32,
|
2019-12-22 22:42:04 +00:00
|
|
|
|
)
|
|
|
|
|
.ok_or_else(|| write::llvm_err(&diag_handler, "failed to prepare thin LTO context"))?;
|
2018-07-17 13:43:49 +00:00
|
|
|
|
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
info!("thin LTO data created");
|
|
|
|
|
|
2019-11-29 15:04:40 +00:00
|
|
|
|
let (import_map_path, prev_import_map, curr_import_map) =
|
2019-12-22 22:42:04 +00:00
|
|
|
|
if let Some(ref incr_comp_session_dir) = cgcx.incr_comp_session_dir {
|
|
|
|
|
let path = incr_comp_session_dir.join(THIN_LTO_IMPORTS_INCR_COMP_FILE_NAME);
|
|
|
|
|
// If previous imports have been deleted, or we get an IO error
|
|
|
|
|
// reading the file storing them, then we'll just use `None` as the
|
|
|
|
|
// prev_import_map, which will force the code to be recompiled.
|
2020-04-15 16:28:01 +00:00
|
|
|
|
let prev = if path.exists() {
|
|
|
|
|
ThinLTOImportMaps::load_from_file(&path).ok()
|
|
|
|
|
} else {
|
|
|
|
|
None
|
|
|
|
|
};
|
|
|
|
|
let curr = ThinLTOImportMaps::from_thin_lto_data(data);
|
2019-12-22 22:42:04 +00:00
|
|
|
|
(Some(path), prev, curr)
|
2019-11-29 15:04:40 +00:00
|
|
|
|
} else {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
// If we don't compile incrementally, we don't need to load the
|
|
|
|
|
// import data from LLVM.
|
|
|
|
|
assert!(green_modules.is_empty());
|
2020-04-15 16:28:01 +00:00
|
|
|
|
let curr = ThinLTOImportMaps::default();
|
2019-12-22 22:42:04 +00:00
|
|
|
|
(None, None, curr)
|
2019-11-29 15:04:40 +00:00
|
|
|
|
};
|
2018-09-03 10:42:27 +00:00
|
|
|
|
info!("thin LTO import map loaded");
|
|
|
|
|
|
|
|
|
|
let data = ThinData(data);
|
|
|
|
|
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
// Throw our data in an `Arc` as we'll be sharing it across threads. We
|
|
|
|
|
// also put all memory referenced by the C++ data (buffers, ids, etc)
|
|
|
|
|
// into the arc as well. After this we'll create a thin module
|
2018-05-08 13:10:16 +00:00
|
|
|
|
// codegen per module in this data.
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
let shared = Arc::new(ThinShared {
|
|
|
|
|
data,
|
|
|
|
|
thin_buffers,
|
|
|
|
|
serialized_modules: serialized,
|
|
|
|
|
module_names,
|
|
|
|
|
});
|
2018-08-31 13:18:08 +00:00
|
|
|
|
|
|
|
|
|
let mut copy_jobs = vec![];
|
|
|
|
|
let mut opt_jobs = vec![];
|
|
|
|
|
|
2018-09-03 10:42:27 +00:00
|
|
|
|
info!("checking which modules can be-reused and which have to be re-optimized.");
|
2018-08-31 13:18:08 +00:00
|
|
|
|
for (module_index, module_name) in shared.module_names.iter().enumerate() {
|
|
|
|
|
let module_name = module_name_to_str(module_name);
|
|
|
|
|
|
2019-11-29 15:04:40 +00:00
|
|
|
|
// If (1.) the module hasn't changed, and (2.) none of the modules
|
2020-04-17 20:04:59 +00:00
|
|
|
|
// it imports from have changed, *and* (3.) the import and export
|
|
|
|
|
// sets themselves have not changed from the previous compile when
|
|
|
|
|
// it was last ThinLTO'ed, then we can re-use the post-ThinLTO
|
|
|
|
|
// version of the module. Otherwise, freshly perform LTO
|
|
|
|
|
// optimization.
|
2020-04-15 16:28:01 +00:00
|
|
|
|
//
|
|
|
|
|
// (Note that globally, the export set is just the inverse of the
|
|
|
|
|
// import set.)
|
2019-11-29 15:04:40 +00:00
|
|
|
|
//
|
2020-04-20 14:33:27 +00:00
|
|
|
|
// For further justification of why the above is necessary and sufficient,
|
|
|
|
|
// see the LLVM blog post on ThinLTO:
|
|
|
|
|
//
|
|
|
|
|
// http://blog.llvm.org/2016/06/thinlto-scalable-and-incremental-lto.html
|
|
|
|
|
//
|
|
|
|
|
// which states the following:
|
|
|
|
|
//
|
|
|
|
|
// ```quote
|
|
|
|
|
// any particular ThinLTO backend must be redone iff:
|
|
|
|
|
//
|
|
|
|
|
// 1. The corresponding (primary) module’s bitcode changed
|
|
|
|
|
// 2. The list of imports into or exports from the module changed
|
|
|
|
|
// 3. The bitcode for any module being imported from has changed
|
|
|
|
|
// 4. Any global analysis result affecting either the primary module
|
|
|
|
|
// or anything it imports has changed.
|
|
|
|
|
// ```
|
|
|
|
|
//
|
2019-11-29 15:04:40 +00:00
|
|
|
|
// This strategy means we can always save the computed imports as
|
|
|
|
|
// canon: when we reuse the post-ThinLTO version, condition (3.)
|
2020-03-06 11:13:55 +00:00
|
|
|
|
// ensures that the current import set is the same as the previous
|
2019-11-29 15:04:40 +00:00
|
|
|
|
// one. (And of course, when we don't reuse the post-ThinLTO
|
|
|
|
|
// version, the current import set *is* the correct one, since we
|
|
|
|
|
// are doing the ThinLTO in this current compilation cycle.)
|
|
|
|
|
//
|
2020-04-15 16:28:01 +00:00
|
|
|
|
// For more discussion, see rust-lang/rust#59535 (where the import
|
|
|
|
|
// issue was discovered) and rust-lang/rust#69798 (where the
|
|
|
|
|
// analogous export issue was discovered).
|
2019-11-29 15:04:40 +00:00
|
|
|
|
if let (Some(prev_import_map), true) =
|
|
|
|
|
(prev_import_map.as_ref(), green_modules.contains_key(module_name))
|
|
|
|
|
{
|
|
|
|
|
assert!(cgcx.incr_comp_session_dir.is_some());
|
|
|
|
|
|
2020-04-15 16:28:01 +00:00
|
|
|
|
let prev_imports = prev_import_map.imports_of(module_name);
|
|
|
|
|
let curr_imports = curr_import_map.imports_of(module_name);
|
|
|
|
|
let prev_exports = prev_import_map.exports_of(module_name);
|
|
|
|
|
let curr_exports = curr_import_map.exports_of(module_name);
|
2019-11-29 15:04:40 +00:00
|
|
|
|
let imports_all_green = curr_imports
|
2018-09-03 10:42:27 +00:00
|
|
|
|
.iter()
|
|
|
|
|
.all(|imported_module| green_modules.contains_key(imported_module));
|
2020-04-14 13:47:03 +00:00
|
|
|
|
if imports_all_green
|
|
|
|
|
&& equivalent_as_sets(prev_imports, curr_imports)
|
|
|
|
|
&& equivalent_as_sets(prev_exports, curr_exports)
|
|
|
|
|
{
|
2018-08-31 13:18:08 +00:00
|
|
|
|
let work_product = green_modules[module_name].clone();
|
|
|
|
|
copy_jobs.push(work_product);
|
2018-09-03 10:42:27 +00:00
|
|
|
|
info!(" - {}: re-used", module_name);
|
2019-11-29 15:04:40 +00:00
|
|
|
|
assert!(cgcx.incr_comp_session_dir.is_some());
|
2019-12-22 22:42:04 +00:00
|
|
|
|
cgcx.cgu_reuse_tracker.set_actual_reuse(module_name, CguReuse::PostLto);
|
|
|
|
|
continue;
|
2018-08-31 13:18:08 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-03 10:42:27 +00:00
|
|
|
|
info!(" - {}: re-compiled", module_name);
|
2018-08-31 13:18:08 +00:00
|
|
|
|
opt_jobs.push(LtoModuleCodegen::Thin(ThinModule {
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
shared: shared.clone(),
|
2018-08-31 13:18:08 +00:00
|
|
|
|
idx: module_index,
|
|
|
|
|
}));
|
|
|
|
|
}
|
|
|
|
|
|
2020-03-06 11:13:55 +00:00
|
|
|
|
// Save the current ThinLTO import information for the next compilation
|
2019-11-29 15:04:40 +00:00
|
|
|
|
// session, overwriting the previous serialized imports (if any).
|
|
|
|
|
if let Some(path) = import_map_path {
|
|
|
|
|
if let Err(err) = curr_import_map.save_to_file(&path) {
|
|
|
|
|
let msg = format!("Error while writing ThinLTO import data: {}", err);
|
|
|
|
|
return Err(write::llvm_err(&diag_handler, &msg));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-31 13:18:08 +00:00
|
|
|
|
Ok((opt_jobs, copy_jobs))
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-29 15:04:40 +00:00
|
|
|
|
/// Given two slices, each with no repeated elements, returns `true` if and
/// only if the two slices have the same contents when considered as sets
/// (i.e. when element order is disregarded).
///
/// The "no repeated elements" precondition is what makes the length check
/// below a valid shortcut; it is not verified here.
fn equivalent_as_sets(a: &[String], b: &[String]) -> bool {
    // Cheap path: unequal lengths means they cannot possibly be set equivalent.
    if a.len() != b.len() {
        return false;
    }
    // Fast path: before building new things, check if inputs are equivalent as is.
    if a == b {
        return true;
    }
    // Slow path: general set comparison.
    let a: FxHashSet<&str> = a.iter().map(String::as_str).collect();
    let b: FxHashSet<&str> = b.iter().map(String::as_str).collect();
    a == b
}
|
|
|
|
|
|
2019-12-22 22:42:04 +00:00
|
|
|
|
pub(crate) fn run_pass_manager(
|
|
|
|
|
cgcx: &CodegenContext<LlvmCodegenBackend>,
|
|
|
|
|
module: &ModuleCodegen<ModuleLlvm>,
|
|
|
|
|
config: &ModuleConfig,
|
|
|
|
|
thin: bool,
|
|
|
|
|
) {
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let _timer = cgcx.prof.extra_verbose_generic_activity("LLVM_lto_optimize", &module.name[..]);
|
|
|
|
|
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to insdering foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
// Now we have one massive module inside of llmod. Time to run the
|
|
|
|
|
// LTO-specific optimization passes that LLVM provides.
|
|
|
|
|
//
|
|
|
|
|
// This code is based off the code found in llvm's LTO code generator:
|
|
|
|
|
// tools/lto/LTOCodeGenerator.cpp
|
|
|
|
|
debug!("running the pass manager");
|
|
|
|
|
unsafe {
|
2020-01-05 18:16:58 +00:00
|
|
|
|
if write::should_use_new_llvm_pass_manager(config) {
|
|
|
|
|
let opt_stage = if thin { llvm::OptStage::ThinLTO } else { llvm::OptStage::FatLTO };
|
|
|
|
|
let opt_level = config.opt_level.unwrap_or(config::OptLevel::No);
|
|
|
|
|
// See comment below for why this is necessary.
|
|
|
|
|
let opt_level = if let config::OptLevel::No = opt_level {
|
|
|
|
|
config::OptLevel::Less
|
|
|
|
|
} else {
|
|
|
|
|
opt_level
|
|
|
|
|
};
|
2020-02-11 21:37:16 +00:00
|
|
|
|
write::optimize_with_new_llvm_pass_manager(cgcx, module, config, opt_level, opt_stage);
|
2020-01-05 18:16:58 +00:00
|
|
|
|
debug!("lto done");
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to insdering foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
let pm = llvm::LLVMCreatePassManager();
|
2019-11-29 03:31:09 +00:00
|
|
|
|
llvm::LLVMAddAnalysisPasses(module.module_llvm.tm, pm);
|
2018-05-30 20:48:20 +00:00
|
|
|
|
|
2018-06-12 19:05:37 +00:00
|
|
|
|
if config.verify_llvm_ir {
|
2019-10-05 07:48:14 +00:00
|
|
|
|
let pass = llvm::LLVMRustFindAndCreatePass("verify\0".as_ptr().cast());
|
2018-07-12 15:00:49 +00:00
|
|
|
|
llvm::LLVMRustAddPass(pm, pass.unwrap());
|
2018-05-30 20:48:20 +00:00
|
|
|
|
}
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to insdering foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
|
2017-10-11 18:19:59 +00:00
|
|
|
|
// When optimizing for LTO we don't actually pass in `-O0`, but we force
|
|
|
|
|
// it to always happen at least with `-O1`.
|
|
|
|
|
//
|
|
|
|
|
// With ThinLTO we mess around a lot with symbol visibility in a way
|
|
|
|
|
// that will actually cause linking failures if we optimize at O0 which
|
|
|
|
|
// notable is lacking in dead code elimination. To ensure we at least
|
|
|
|
|
// get some optimizations and correctly link we forcibly switch to `-O1`
|
|
|
|
|
// to get dead code elimination.
|
|
|
|
|
//
|
|
|
|
|
// Note that in general this shouldn't matter too much as you typically
|
|
|
|
|
// only turn on ThinLTO when you're compiling with optimizations
|
|
|
|
|
// otherwise.
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let opt_level = config
|
|
|
|
|
.opt_level
|
|
|
|
|
.map(|x| to_llvm_opt_settings(x).0)
|
2018-10-23 15:01:35 +00:00
|
|
|
|
.unwrap_or(llvm::CodeGenOptLevel::None);
|
2017-10-11 18:19:59 +00:00
|
|
|
|
let opt_level = match opt_level {
|
|
|
|
|
llvm::CodeGenOptLevel::None => llvm::CodeGenOptLevel::Less,
|
|
|
|
|
level => level,
|
|
|
|
|
};
|
2018-10-23 15:01:35 +00:00
|
|
|
|
with_llvm_pmb(module.module_llvm.llmod(), config, opt_level, false, &mut |b| {
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
if thin {
|
2018-11-05 13:52:08 +00:00
|
|
|
|
llvm::LLVMRustPassManagerBuilderPopulateThinLTOPassManager(b, pm);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
} else {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
llvm::LLVMPassManagerBuilderPopulateLTOPassManager(
|
|
|
|
|
b, pm, /* Internalize = */ False, /* RunInliner = */ True,
|
|
|
|
|
);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
2015-07-22 23:22:51 +00:00
|
|
|
|
});
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to insdering foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
|
2018-11-02 12:22:48 +00:00
|
|
|
|
// We always generate bitcode through ThinLTOBuffers,
|
|
|
|
|
// which do not support anonymous globals
|
|
|
|
|
if config.bitcode_needed() {
|
2019-10-05 07:48:14 +00:00
|
|
|
|
let pass = llvm::LLVMRustFindAndCreatePass("name-anon-globals\0".as_ptr().cast());
|
2018-11-02 12:22:48 +00:00
|
|
|
|
llvm::LLVMRustAddPass(pm, pass.unwrap());
|
|
|
|
|
}
|
|
|
|
|
|
2018-06-12 19:05:37 +00:00
|
|
|
|
if config.verify_llvm_ir {
|
2019-10-05 07:48:14 +00:00
|
|
|
|
let pass = llvm::LLVMRustFindAndCreatePass("verify\0".as_ptr().cast());
|
2018-07-12 15:00:49 +00:00
|
|
|
|
llvm::LLVMRustAddPass(pm, pass.unwrap());
|
2018-05-30 20:48:20 +00:00
|
|
|
|
}
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to insdering foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
|
2020-02-07 14:01:23 +00:00
|
|
|
|
llvm::LLVMRunPassManager(pm, module.module_llvm.llmod());
|
Implement LTO
This commit implements LTO for rust leveraging LLVM's passes. What this means
is:
* When compiling an rlib, in addition to insdering foo.o into the archive, also
insert foo.bc (the LLVM bytecode) of the optimized module.
* When the compiler detects the -Z lto option, it will attempt to perform LTO on
a staticlib or binary output. The compiler will emit an error if a dylib or
rlib output is being generated.
* The actual act of performing LTO is as follows:
1. Force all upstream libraries to have an rlib version available.
2. Load the bytecode of each upstream library from the rlib.
3. Link all this bytecode into the current LLVM module (just using llvm
apis)
4. Run an internalization pass which internalizes all symbols except those
found reachable for the local crate of compilation.
5. Run the LLVM LTO pass manager over this entire module
6a. If assembling an archive, then add all upstream rlibs into the output
archive. This ignores all of the object/bitcode/metadata files rust
generated and placed inside the rlibs.
6b. If linking a binary, create copies of all upstream rlibs, remove the
rust-generated object-file, and then link everything as usual.
As I have explained in #10741, this process is excruciatingly slow, so this is
*not* turned on by default, and it is also why I have decided to hide it behind
a -Z flag for now. The good news is that the binary sizes are about as small as
they can be as a result of LTO, so it's definitely working.
Closes #10741
Closes #10740
2013-12-03 07:19:29 +00:00
|
|
|
|
|
|
|
|
|
llvm::LLVMDisposePassManager(pm);
|
|
|
|
|
}
|
|
|
|
|
debug!("lto done");
|
|
|
|
|
}
|
2014-07-31 13:05:08 +00:00
|
|
|
|
|
2018-07-17 13:08:25 +00:00
|
|
|
|
pub struct ModuleBuffer(&'static mut llvm::ModuleBuffer);
|
2017-07-23 15:14:38 +00:00
|
|
|
|
|
|
|
|
|
unsafe impl Send for ModuleBuffer {}
|
|
|
|
|
unsafe impl Sync for ModuleBuffer {}
|
|
|
|
|
|
|
|
|
|
impl ModuleBuffer {
|
2018-06-27 14:57:25 +00:00
|
|
|
|
pub fn new(m: &llvm::Module) -> ModuleBuffer {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
ModuleBuffer(unsafe { llvm::LLVMRustModuleBufferCreate(m) })
|
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
2018-10-23 15:01:35 +00:00
|
|
|
|
}
|
2017-07-23 15:14:38 +00:00
|
|
|
|
|
2018-10-23 15:01:35 +00:00
|
|
|
|
impl ModuleBufferMethods for ModuleBuffer {
|
|
|
|
|
fn data(&self) -> &[u8] {
|
2017-07-23 15:14:38 +00:00
|
|
|
|
unsafe {
|
|
|
|
|
let ptr = llvm::LLVMRustModuleBufferPtr(self.0);
|
|
|
|
|
let len = llvm::LLVMRustModuleBufferLen(self.0);
|
|
|
|
|
slice::from_raw_parts(ptr, len)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Drop for ModuleBuffer {
|
|
|
|
|
fn drop(&mut self) {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
unsafe {
|
|
|
|
|
llvm::LLVMRustModuleBufferFree(&mut *(self.0 as *mut _));
|
|
|
|
|
}
|
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
2014-07-31 13:05:08 +00:00
|
|
|
|
}
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
|
2018-10-23 15:01:35 +00:00
|
|
|
|
pub struct ThinData(&'static mut llvm::ThinLTOData);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
|
|
|
|
|
unsafe impl Send for ThinData {}
|
|
|
|
|
unsafe impl Sync for ThinData {}
|
|
|
|
|
|
|
|
|
|
impl Drop for ThinData {
|
|
|
|
|
fn drop(&mut self) {
|
|
|
|
|
unsafe {
|
2018-07-17 13:43:49 +00:00
|
|
|
|
llvm::LLVMRustFreeThinLTOData(&mut *(self.0 as *mut _));
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-07-17 13:31:09 +00:00
|
|
|
|
/// Owning handle to an LLVM `ThinLTOBuffer`.
///
/// Values are created by `ThinBuffer::new` through `LLVMRustThinLTOBufferCreate`,
/// and the `Drop` impl releases the buffer through `LLVMRustThinLTOBufferFree`.
pub struct ThinBuffer(&'static mut llvm::ThinLTOBuffer);
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In another mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
|
|
|
|
|
// NOTE(review): hand-written `Send` — the compiler cannot check this for the wrapped
// `llvm::ThinLTOBuffer`; presumably the buffer is not tied to the thread that
// created it. Confirm against the LLVM bindings.
unsafe impl Send for ThinBuffer {}
|
|
|
|
|
// NOTE(review): hand-written `Sync` — the only `&self` access in this file section is
// `data`, which reads the buffer's pointer and length through FFI; presumably that
// is safe to do concurrently. Confirm against the LLVM bindings.
unsafe impl Sync for ThinBuffer {}
|
|
|
|
|
|
|
|
|
|
impl ThinBuffer {
|
2018-06-27 14:57:25 +00:00
|
|
|
|
pub fn new(m: &llvm::Module) -> ThinBuffer {
|
2017-10-20 01:44:33 +00:00
|
|
|
|
unsafe {
|
|
|
|
|
let buffer = llvm::LLVMRustThinLTOBufferCreate(m);
|
|
|
|
|
ThinBuffer(buffer)
|
|
|
|
|
}
|
|
|
|
|
}
|
2018-10-23 15:01:35 +00:00
|
|
|
|
}
|
2017-10-20 01:44:33 +00:00
|
|
|
|
|
2018-10-23 15:01:35 +00:00
|
|
|
|
impl ThinBufferMethods for ThinBuffer {
|
|
|
|
|
fn data(&self) -> &[u8] {
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
unsafe {
|
|
|
|
|
let ptr = llvm::LLVMRustThinLTOBufferPtr(self.0) as *const _;
|
|
|
|
|
let len = llvm::LLVMRustThinLTOBufferLen(self.0);
|
|
|
|
|
slice::from_raw_parts(ptr, len)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Drop for ThinBuffer {
|
|
|
|
|
fn drop(&mut self) {
|
|
|
|
|
unsafe {
|
2018-07-17 13:31:09 +00:00
|
|
|
|
llvm::LLVMRustThinLTOBufferFree(&mut *(self.0 as *mut _));
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-10-23 15:01:35 +00:00
|
|
|
|
pub unsafe fn optimize_thin_module(
|
|
|
|
|
thin_module: &mut ThinModule<LlvmCodegenBackend>,
|
|
|
|
|
cgcx: &CodegenContext<LlvmCodegenBackend>,
|
|
|
|
|
) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
|
|
|
|
|
let diag_handler = cgcx.create_diag_handler();
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let tm = (cgcx.tm_factory.0)().map_err(|e| write::llvm_err(&diag_handler, &e))?;
|
2018-10-23 15:01:35 +00:00
|
|
|
|
|
|
|
|
|
// Right now the implementation we've got only works over serialized
|
|
|
|
|
// modules, so we create a fresh new LLVM context and parse the module
|
|
|
|
|
// into that context. One day, however, we may do this for upstream
|
|
|
|
|
// crates but for locally codegened modules we may be able to reuse
|
|
|
|
|
// that LLVM Context and Module.
|
|
|
|
|
let llcx = llvm::LLVMRustContextCreate(cgcx.fewer_names);
|
2019-02-11 15:46:04 +00:00
|
|
|
|
let llmod_raw = parse_module(
|
2018-10-23 15:01:35 +00:00
|
|
|
|
llcx,
|
2019-02-11 15:46:04 +00:00
|
|
|
|
&thin_module.shared.module_names[thin_module.idx],
|
|
|
|
|
thin_module.data(),
|
|
|
|
|
&diag_handler,
|
|
|
|
|
)? as *const _;
|
2018-10-23 15:01:35 +00:00
|
|
|
|
let module = ModuleCodegen {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
module_llvm: ModuleLlvm { llmod_raw, llcx, tm },
|
2018-10-23 15:01:35 +00:00
|
|
|
|
name: thin_module.name().to_string(),
|
|
|
|
|
kind: ModuleKind::Regular,
|
|
|
|
|
};
|
|
|
|
|
{
|
2020-06-26 01:52:41 +00:00
|
|
|
|
let target = &*module.module_llvm.tm;
|
2018-10-23 15:01:35 +00:00
|
|
|
|
let llmod = module.module_llvm.llmod();
|
|
|
|
|
save_temp_bitcode(&cgcx, &module, "thin-lto-input");
|
|
|
|
|
|
|
|
|
|
// Before we do much else find the "main" `DICompileUnit` that we'll be
|
|
|
|
|
// using below. If we find more than one though then rustc has changed
|
|
|
|
|
// in a way we're not ready for, so generate an ICE by returning
|
|
|
|
|
// an error.
|
|
|
|
|
let mut cu1 = ptr::null_mut();
|
|
|
|
|
let mut cu2 = ptr::null_mut();
|
|
|
|
|
llvm::LLVMRustThinLTOGetDICompileUnit(llmod, &mut cu1, &mut cu2);
|
|
|
|
|
if !cu2.is_null() {
|
|
|
|
|
let msg = "multiple source DICompileUnits found";
|
2019-12-22 22:42:04 +00:00
|
|
|
|
return Err(write::llvm_err(&diag_handler, msg));
|
2018-10-23 15:01:35 +00:00
|
|
|
|
}
|
2017-12-16 16:20:54 +00:00
|
|
|
|
|
2018-10-23 15:01:35 +00:00
|
|
|
|
// Like with "fat" LTO, get some better optimizations if landing pads
|
|
|
|
|
// are disabled by removing all landing pads.
|
|
|
|
|
if cgcx.no_landing_pads {
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let _timer = cgcx
|
|
|
|
|
.prof
|
|
|
|
|
.generic_activity_with_arg("LLVM_thin_lto_remove_landing_pads", thin_module.name());
|
2018-10-23 15:01:35 +00:00
|
|
|
|
llvm::LLVMRustMarkAllFunctionsNounwind(llmod);
|
|
|
|
|
save_temp_bitcode(&cgcx, &module, "thin-lto-after-nounwind");
|
|
|
|
|
}
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
|
2018-10-23 15:01:35 +00:00
|
|
|
|
// Up next comes the per-module local analyses that we do for Thin LTO.
|
|
|
|
|
// Each of these functions is basically copied from the LLVM
|
|
|
|
|
// implementation and then tailored to suit this implementation. Ideally
|
|
|
|
|
// each of these would be supported by upstream LLVM but that's perhaps
|
|
|
|
|
// a patch for another day!
|
|
|
|
|
//
|
|
|
|
|
// You can find some more comments about these functions in the LLVM
|
|
|
|
|
// bindings we've got (currently `PassWrapper.cpp`)
|
2019-09-27 12:04:36 +00:00
|
|
|
|
{
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let _timer =
|
|
|
|
|
cgcx.prof.generic_activity_with_arg("LLVM_thin_lto_rename", thin_module.name());
|
2020-06-26 01:52:41 +00:00
|
|
|
|
if !llvm::LLVMRustPrepareThinLTORename(thin_module.shared.data.0, llmod, target) {
|
2019-09-27 12:04:36 +00:00
|
|
|
|
let msg = "failed to prepare thin LTO module";
|
2019-12-22 22:42:04 +00:00
|
|
|
|
return Err(write::llvm_err(&diag_handler, msg));
|
2019-09-27 12:04:36 +00:00
|
|
|
|
}
|
|
|
|
|
save_temp_bitcode(cgcx, &module, "thin-lto-after-rename");
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
2019-09-27 12:04:36 +00:00
|
|
|
|
|
|
|
|
|
{
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let _timer = cgcx
|
|
|
|
|
.prof
|
|
|
|
|
.generic_activity_with_arg("LLVM_thin_lto_resolve_weak", thin_module.name());
|
2019-09-27 12:04:36 +00:00
|
|
|
|
if !llvm::LLVMRustPrepareThinLTOResolveWeak(thin_module.shared.data.0, llmod) {
|
|
|
|
|
let msg = "failed to prepare thin LTO module";
|
2019-12-22 22:42:04 +00:00
|
|
|
|
return Err(write::llvm_err(&diag_handler, msg));
|
2019-09-27 12:04:36 +00:00
|
|
|
|
}
|
|
|
|
|
save_temp_bitcode(cgcx, &module, "thin-lto-after-resolve");
|
2018-10-23 15:01:35 +00:00
|
|
|
|
}
|
2019-09-27 12:04:36 +00:00
|
|
|
|
|
|
|
|
|
{
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let _timer = cgcx
|
|
|
|
|
.prof
|
|
|
|
|
.generic_activity_with_arg("LLVM_thin_lto_internalize", thin_module.name());
|
2019-09-27 12:04:36 +00:00
|
|
|
|
if !llvm::LLVMRustPrepareThinLTOInternalize(thin_module.shared.data.0, llmod) {
|
|
|
|
|
let msg = "failed to prepare thin LTO module";
|
2019-12-22 22:42:04 +00:00
|
|
|
|
return Err(write::llvm_err(&diag_handler, msg));
|
2019-09-27 12:04:36 +00:00
|
|
|
|
}
|
|
|
|
|
save_temp_bitcode(cgcx, &module, "thin-lto-after-internalize");
|
2018-10-23 15:01:35 +00:00
|
|
|
|
}
|
2019-09-27 12:04:36 +00:00
|
|
|
|
|
|
|
|
|
{
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let _timer =
|
|
|
|
|
cgcx.prof.generic_activity_with_arg("LLVM_thin_lto_import", thin_module.name());
|
2020-06-26 01:52:41 +00:00
|
|
|
|
if !llvm::LLVMRustPrepareThinLTOImport(thin_module.shared.data.0, llmod, target) {
|
2019-09-27 12:04:36 +00:00
|
|
|
|
let msg = "failed to prepare thin LTO module";
|
2019-12-22 22:42:04 +00:00
|
|
|
|
return Err(write::llvm_err(&diag_handler, msg));
|
2019-09-27 12:04:36 +00:00
|
|
|
|
}
|
|
|
|
|
save_temp_bitcode(cgcx, &module, "thin-lto-after-import");
|
2018-10-23 15:01:35 +00:00
|
|
|
|
}
|
2017-12-21 15:03:16 +00:00
|
|
|
|
|
2018-10-23 15:01:35 +00:00
|
|
|
|
// Ok now this is a bit unfortunate. This is also something you won't
|
|
|
|
|
// find upstream in LLVM's ThinLTO passes! This is a hack for now to
|
|
|
|
|
// work around bugs in LLVM.
|
|
|
|
|
//
|
|
|
|
|
// First discovered in #45511 it was found that as part of ThinLTO
|
|
|
|
|
// importing passes LLVM will import `DICompileUnit` metadata
|
|
|
|
|
// information across modules. This means that we'll be working with one
|
|
|
|
|
// LLVM module that has multiple `DICompileUnit` instances in it (a
|
|
|
|
|
// bunch of `llvm.dbg.cu` members). Unfortunately there's a number of
|
|
|
|
|
// bugs in LLVM's backend which generates invalid DWARF in a situation
|
|
|
|
|
// like this:
|
|
|
|
|
//
|
|
|
|
|
// https://bugs.llvm.org/show_bug.cgi?id=35212
|
|
|
|
|
// https://bugs.llvm.org/show_bug.cgi?id=35562
|
|
|
|
|
//
|
|
|
|
|
// While the first bug there is fixed the second ended up causing #46346
|
|
|
|
|
// which was basically a resurgence of #45511 after LLVM's bug 35212 was
|
|
|
|
|
// fixed.
|
|
|
|
|
//
|
|
|
|
|
// This function below is a huge hack around this problem. The function
|
|
|
|
|
// below is defined in `PassWrapper.cpp` and will basically "merge"
|
|
|
|
|
// all `DICompileUnit` instances in a module. Basically it'll take all
|
|
|
|
|
// the objects, rewrite all pointers of `DISubprogram` to point to the
|
|
|
|
|
// first `DICompileUnit`, and then delete all the other units.
|
|
|
|
|
//
|
|
|
|
|
// This is probably mangling to the debug info slightly (but hopefully
|
|
|
|
|
// not too much) but for now at least gets LLVM to emit valid DWARF (or
|
|
|
|
|
// so it appears). Hopefully we can remove this once upstream bugs are
|
|
|
|
|
// fixed in LLVM.
|
2019-09-27 12:04:36 +00:00
|
|
|
|
{
|
2020-02-07 14:01:23 +00:00
|
|
|
|
let _timer = cgcx
|
|
|
|
|
.prof
|
|
|
|
|
.generic_activity_with_arg("LLVM_thin_lto_patch_debuginfo", thin_module.name());
|
2019-09-27 12:04:36 +00:00
|
|
|
|
llvm::LLVMRustThinLTOPatchDICompileUnit(llmod, cu1);
|
|
|
|
|
save_temp_bitcode(cgcx, &module, "thin-lto-after-patch");
|
|
|
|
|
}
|
2018-10-23 15:01:35 +00:00
|
|
|
|
|
|
|
|
|
// Alright now that we've done everything related to the ThinLTO
|
|
|
|
|
// analysis it's time to run some optimizations! Here we use the same
|
|
|
|
|
// `run_pass_manager` as the "fat" LTO above except that we tell it to
|
|
|
|
|
// populate a thin-specific pass manager, which presumably LLVM treats a
|
|
|
|
|
// little differently.
|
2019-09-27 12:04:36 +00:00
|
|
|
|
{
|
|
|
|
|
info!("running thin lto passes over {}", module.name);
|
|
|
|
|
let config = cgcx.config(module.kind);
|
|
|
|
|
run_pass_manager(cgcx, &module, config, true);
|
|
|
|
|
save_temp_bitcode(cgcx, &module, "thin-lto-after-pm");
|
|
|
|
|
}
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
2018-10-23 15:01:35 +00:00
|
|
|
|
Ok(module)
|
rustc: Implement ThinLTO
This commit is an implementation of LLVM's ThinLTO for consumption in rustc
itself. Currently today LTO works by merging all relevant LLVM modules into one
and then running optimization passes. "Thin" LTO operates differently by having
more sharded work and allowing parallelism opportunities between optimizing
codegen units. Further down the road Thin LTO also allows *incremental* LTO
which should enable even faster release builds without compromising on the
performance we have today.
This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then
also implements two forms of ThinLTO:
* In one mode we'll *only* perform ThinLTO over the codegen units produced in a
single compilation. That is, we won't load upstream rlibs, but we'll instead
just perform ThinLTO amongst all codegen units produced by the compiler for
the local crate. This is intended to emulate a desired end point where we have
codegen units turned on by default for all crates and ThinLTO allows us to do
this without performance loss.
* In anther mode, like full LTO today, we'll optimize all upstream dependencies
in "thin" mode. Unlike today, however, this LTO step is fully parallelized so
should finish much more quickly.
There's a good bit of comments about what the implementation is doing and where
it came from, but the tl;dr; is that currently most of the support here is
copied from upstream LLVM. This code duplication is done for a number of
reasons:
* Controlling parallelism means we can use the existing jobserver support to
avoid overloading machines.
* We will likely want a slightly different form of incremental caching which
integrates with our own incremental strategy, but this is yet to be
determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as
having it tailored to fit our needs for the time being.
* Finally this allows us to reuse some artifacts such as our `TargetMachine`
creation, where all our options we used today aren't necessarily supported by
upstream LLVM yet.
My hope is that we can get some experience with this copy/paste in tree and then
eventually upstream some work to LLVM itself to avoid the duplication while
still ensuring our needs are met. Otherwise I fear that maintaining these
bindings may be quite costly over the years with LLVM updates!
2017-07-23 15:14:38 +00:00
|
|
|
|
}
|
2018-08-17 14:07:23 +00:00
|
|
|
|
|
2020-04-15 16:28:01 +00:00
|
|
|
|
/// Summarizes module import/export relationships used by LLVM's ThinLTO pass.
|
|
|
|
|
///
|
|
|
|
|
/// Note that we tend to have two such instances of `ThinLTOImportMaps` in use:
|
|
|
|
|
/// one loaded from a file that represents the relationships used during the
|
|
|
|
|
/// compilation associated with the incremental build artifacts we are
|
|
|
|
|
/// attempting to reuse, and another constructed via `from_thin_lto_data`, which
|
|
|
|
|
/// captures the relationships of ThinLTO in the current compilation.
|
2018-10-16 14:57:53 +00:00
|
|
|
|
// `Default` gives an instance in which both maps are empty.
#[derive(Debug, Default)]
|
2020-04-15 16:28:01 +00:00
|
|
|
|
pub struct ThinLTOImportMaps {
|
2018-08-17 14:07:23 +00:00
|
|
|
|
// key = LLVM name of the importing module, value = names of the modules it imports from
|
|
|
|
|
imports: FxHashMap<String, Vec<String>>,
|
2020-04-14 13:47:03 +00:00
|
|
|
|
// key = LLVM name of the exporting module, value = names of the modules it exports to
|
// (`load_from_file` fills this in as the inverse of `imports`)
|
|
|
|
|
exports: FxHashMap<String, Vec<String>>,
|
2018-08-17 14:07:23 +00:00
|
|
|
|
}
|
|
|
|
|
|
2020-04-15 16:28:01 +00:00
|
|
|
|
impl ThinLTOImportMaps {
|
|
|
|
|
/// Returns modules imported by `llvm_module_name` during some ThinLTO pass.
|
|
|
|
|
fn imports_of(&self, llvm_module_name: &str) -> &[String] {
|
2018-08-20 15:13:01 +00:00
|
|
|
|
self.imports.get(llvm_module_name).map(|v| &v[..]).unwrap_or(&[])
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-15 16:28:01 +00:00
|
|
|
|
/// Returns modules exported by `llvm_module_name` during some ThinLTO pass.
|
|
|
|
|
fn exports_of(&self, llvm_module_name: &str) -> &[String] {
|
2020-04-14 13:47:03 +00:00
|
|
|
|
self.exports.get(llvm_module_name).map(|v| &v[..]).unwrap_or(&[])
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-29 15:04:40 +00:00
|
|
|
|
fn save_to_file(&self, path: &Path) -> io::Result<()> {
|
|
|
|
|
use std::io::Write;
|
|
|
|
|
let file = File::create(path)?;
|
|
|
|
|
let mut writer = io::BufWriter::new(file);
|
|
|
|
|
for (importing_module_name, imported_modules) in &self.imports {
|
|
|
|
|
writeln!(writer, "{}", importing_module_name)?;
|
|
|
|
|
for imported_module in imported_modules {
|
|
|
|
|
writeln!(writer, " {}", imported_module)?;
|
|
|
|
|
}
|
|
|
|
|
writeln!(writer)?;
|
|
|
|
|
}
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-15 16:28:01 +00:00
|
|
|
|
fn load_from_file(path: &Path) -> io::Result<ThinLTOImportMaps> {
|
2019-11-29 15:04:40 +00:00
|
|
|
|
use std::io::BufRead;
|
|
|
|
|
let mut imports = FxHashMap::default();
|
2020-04-14 13:47:03 +00:00
|
|
|
|
let mut exports: FxHashMap<_, Vec<_>> = FxHashMap::default();
|
|
|
|
|
let mut current_module: Option<String> = None;
|
|
|
|
|
let mut current_imports: Vec<String> = vec![];
|
2019-11-29 15:04:40 +00:00
|
|
|
|
let file = File::open(path)?;
|
|
|
|
|
for line in io::BufReader::new(file).lines() {
|
|
|
|
|
let line = line?;
|
|
|
|
|
if line.is_empty() {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
let importing_module = current_module.take().expect("Importing module not set");
|
2020-04-14 13:47:03 +00:00
|
|
|
|
for imported in ¤t_imports {
|
|
|
|
|
exports.entry(imported.clone()).or_default().push(importing_module.clone());
|
|
|
|
|
}
|
2019-12-22 22:42:04 +00:00
|
|
|
|
imports.insert(importing_module, mem::replace(&mut current_imports, vec![]));
|
2020-02-26 12:03:46 +00:00
|
|
|
|
} else if line.starts_with(' ') {
|
2019-11-29 15:04:40 +00:00
|
|
|
|
// Space marks an imported module
|
|
|
|
|
assert_ne!(current_module, None);
|
|
|
|
|
current_imports.push(line.trim().to_string());
|
|
|
|
|
} else {
|
|
|
|
|
// Otherwise, beginning of a new module (must be start or follow empty line)
|
|
|
|
|
assert_eq!(current_module, None);
|
|
|
|
|
current_module = Some(line.trim().to_string());
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-04-15 16:28:01 +00:00
|
|
|
|
Ok(ThinLTOImportMaps { imports, exports })
|
2019-11-29 15:04:40 +00:00
|
|
|
|
}
|
|
|
|
|
|
2019-02-08 13:53:55 +00:00
|
|
|
|
/// Loads the ThinLTO import map from ThinLTOData.
|
2020-04-15 16:28:01 +00:00
|
|
|
|
unsafe fn from_thin_lto_data(data: *const llvm::ThinLTOData) -> ThinLTOImportMaps {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
unsafe extern "C" fn imported_module_callback(
|
|
|
|
|
payload: *mut libc::c_void,
|
|
|
|
|
importing_module_name: *const libc::c_char,
|
|
|
|
|
imported_module_name: *const libc::c_char,
|
|
|
|
|
) {
|
2020-04-15 16:28:01 +00:00
|
|
|
|
let map = &mut *(payload as *mut ThinLTOImportMaps);
|
2018-08-17 14:07:23 +00:00
|
|
|
|
let importing_module_name = CStr::from_ptr(importing_module_name);
|
|
|
|
|
let importing_module_name = module_name_to_str(&importing_module_name);
|
|
|
|
|
let imported_module_name = CStr::from_ptr(imported_module_name);
|
|
|
|
|
let imported_module_name = module_name_to_str(&imported_module_name);
|
2018-08-31 13:18:08 +00:00
|
|
|
|
|
2018-08-17 14:07:23 +00:00
|
|
|
|
if !map.imports.contains_key(importing_module_name) {
|
|
|
|
|
map.imports.insert(importing_module_name.to_owned(), vec![]);
|
|
|
|
|
}
|
2018-08-20 15:13:01 +00:00
|
|
|
|
|
2018-08-17 14:07:23 +00:00
|
|
|
|
map.imports
|
2019-12-22 22:42:04 +00:00
|
|
|
|
.get_mut(importing_module_name)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.push(imported_module_name.to_owned());
|
2020-04-14 13:47:03 +00:00
|
|
|
|
|
|
|
|
|
if !map.exports.contains_key(imported_module_name) {
|
|
|
|
|
map.exports.insert(imported_module_name.to_owned(), vec![]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
map.exports
|
|
|
|
|
.get_mut(imported_module_name)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.push(importing_module_name.to_owned());
|
2018-08-17 14:07:23 +00:00
|
|
|
|
}
|
2020-04-14 13:47:03 +00:00
|
|
|
|
|
2020-04-15 16:28:01 +00:00
|
|
|
|
let mut map = ThinLTOImportMaps::default();
|
2019-12-22 22:42:04 +00:00
|
|
|
|
llvm::LLVMRustGetThinLTOModuleImports(
|
|
|
|
|
data,
|
|
|
|
|
imported_module_callback,
|
|
|
|
|
&mut map as *mut _ as *mut libc::c_void,
|
|
|
|
|
);
|
2018-08-17 14:07:23 +00:00
|
|
|
|
map
|
|
|
|
|
}
|
2018-08-31 13:18:08 +00:00
|
|
|
|
}
|
2018-08-17 14:07:23 +00:00
|
|
|
|
|
2018-08-31 13:18:08 +00:00
|
|
|
|
fn module_name_to_str(c_str: &CStr) -> &str {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
c_str.to_str().unwrap_or_else(|e| {
|
|
|
|
|
bug!("Encountered non-utf8 LLVM module name `{}`: {}", c_str.to_string_lossy(), e)
|
|
|
|
|
})
|
2018-09-03 10:42:27 +00:00
|
|
|
|
}
|
2019-02-11 15:46:04 +00:00
|
|
|
|
|
2019-08-27 19:25:35 +00:00
|
|
|
|
pub fn parse_module<'a>(
|
2019-02-11 15:46:04 +00:00
|
|
|
|
cx: &'a llvm::Context,
|
|
|
|
|
name: &CStr,
|
|
|
|
|
data: &[u8],
|
|
|
|
|
diag_handler: &Handler,
|
|
|
|
|
) -> Result<&'a llvm::Module, FatalError> {
|
|
|
|
|
unsafe {
|
2019-12-22 22:42:04 +00:00
|
|
|
|
llvm::LLVMRustParseBitcodeForLTO(cx, data.as_ptr(), data.len(), name.as_ptr()).ok_or_else(
|
|
|
|
|
|| {
|
|
|
|
|
let msg = "failed to parse bitcode for LTO module";
|
|
|
|
|
write::llvm_err(&diag_handler, msg)
|
|
|
|
|
},
|
|
|
|
|
)
|
2019-02-11 15:46:04 +00:00
|
|
|
|
}
|
|
|
|
|
}
|