Mirror of https://github.com/rust-lang/rust.git (synced 2024-12-22 13:34:47 +00:00)
Auto merge of #44841 - alexcrichton:thinlto, r=michaelwoerister
rustc: Implement ThinLTO

This commit is an implementation of LLVM's ThinLTO for consumption in rustc itself. Today LTO works by merging all relevant LLVM modules into one and then running optimization passes. "Thin" LTO operates differently, sharding the work and allowing parallelism between optimizing codegen units. Further down the road ThinLTO also allows *incremental* LTO, which should enable even faster release builds without compromising on the performance we have today.

This commit uses a `-Z thinlto` flag to gate whether ThinLTO is enabled. It then implements two forms of ThinLTO:

* In one mode we'll *only* perform ThinLTO over the codegen units produced in a single compilation. That is, we won't load upstream rlibs, but we'll instead just perform ThinLTO amongst all codegen units produced by the compiler for the local crate. This is intended to emulate a desired end point where we have codegen units turned on by default for all crates and ThinLTO allows us to do this without performance loss.

* In another mode, like full LTO today, we'll optimize all upstream dependencies in "thin" mode. Unlike today, however, this LTO step is fully parallelized, so it should finish much more quickly.

There's a good bit of comments about what the implementation is doing and where it came from, but the tl;dr is that currently most of the support here is copied from upstream LLVM. This code duplication is done for a number of reasons:

* Controlling parallelism means we can use the existing jobserver support to avoid overloading machines.
* We will likely want a slightly different form of incremental caching which integrates with our own incremental strategy, but this is yet to be determined.
* This buys us some flexibility about when/where we run ThinLTO, as well as having it tailored to fit our needs for the time being.
* Finally, this allows us to reuse some artifacts such as our `TargetMachine` creation, where not all of the options we use today are necessarily supported by upstream LLVM yet.

My hope is that we can get some experience with this copy/paste in tree and then eventually upstream some work to LLVM itself to avoid the duplication while still ensuring our needs are met. Otherwise I fear that maintaining these bindings may be quite costly over the years with LLVM updates!
This commit is contained in: commit ac76206be4
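For orientation before the diff (not part of the commit): the two modes described above map onto an `LTOMode` enum added by this patch, selected from the existing `-C lto` flag and the new `-Z thinlto` flag. Below is a minimal sketch of that dispatch, condensed from the `generate_lto_work` / `execute_work_item` changes further down; `Opts` is a made-up stand-in for the relevant `CodegenContext` fields, not real compiler code.

    // Sketch only; condenses the mode selection from this diff.
    enum LTOMode {
        WholeCrateGraph, // like full LTO today: also optimize upstream deps
        JustThisCrate,   // ThinLTO only across this crate's codegen units
    }

    struct Opts { lto: bool, thinlto: bool, total_cgus: usize }

    fn lto_mode(opts: &Opts) -> Option<LTOMode> {
        if opts.lto {
            // -C lto requested explicitly: LTO the whole crate graph
            // (thin or fat depending on -Z thinlto).
            Some(LTOMode::WholeCrateGraph)
        } else if opts.thinlto && opts.total_cgus > 1 {
            // -Z thinlto alone: only ThinLTO amongst local codegen units.
            Some(LTOMode::JustThisCrate)
        } else {
            None // no LTO step; codegen each unit directly
        }
    }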
@@ -409,9 +407,7 @@ impl_stable_hash_for!(struct self::OutputFilenames {
    outputs
});

/// Codegen unit names generated by the numbered naming scheme will contain this
/// marker right before the index of the codegen unit.
pub const NUMBERED_CODEGEN_UNIT_MARKER: &'static str = ".cgu-";
pub const RUST_CGU_EXT: &str = "rust-cgu";

impl OutputFilenames {
    pub fn path(&self, flavor: OutputType) -> PathBuf {
@@ -442,22 +440,14 @@ impl OutputFilenames {
        let mut extension = String::new();

        if let Some(codegen_unit_name) = codegen_unit_name {
            if codegen_unit_name.contains(NUMBERED_CODEGEN_UNIT_MARKER) {
                // If we use the numbered naming scheme for modules, we don't want
                // the files to look like <crate-name><extra>.<crate-name>.<index>.<ext>
                // but simply <crate-name><extra>.<index>.<ext>
                let marker_offset = codegen_unit_name.rfind(NUMBERED_CODEGEN_UNIT_MARKER)
                                                     .unwrap();
                let index_offset = marker_offset + NUMBERED_CODEGEN_UNIT_MARKER.len();
                extension.push_str(&codegen_unit_name[index_offset .. ]);
            } else {
                extension.push_str(codegen_unit_name);
            };
            extension.push_str(codegen_unit_name);
        }

        if !ext.is_empty() {
        if !extension.is_empty() {
            extension.push_str(".");
            extension.push_str(RUST_CGU_EXT);
            extension.push_str(".");
        }

        extension.push_str(ext);
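Aside (not part of the patch): with the hunk above, a temp object file for a codegen unit now gets an extension of the form <cgu-name>.rust-cgu.<ext>; the `looks_like_rust` helper in the linker changes later in this diff keys off exactly that ".rust-cgu.o" suffix. A tiny illustration, using a made-up codegen-unit name:

    fn main() {
        // "mycrate0" is a hypothetical CGU name; the format mirrors the
        // extension-building code above: <cgu-name>.rust-cgu.<ext>.
        let (cgu_name, ext) = ("mycrate0", "o");
        let extension = format!("{}.{}.{}", cgu_name, "rust-cgu", ext);
        assert_eq!(extension, "mycrate0.rust-cgu.o");
    }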
@@ -1105,6 +1095,8 @@ options! {DebuggingOptions, DebuggingSetter, basic_debugging_options,
        "run the non-lexical lifetimes MIR pass"),
    trans_time_graph: bool = (false, parse_bool, [UNTRACKED],
        "generate a graphical HTML report of time spent in trans and LLVM"),
    thinlto: bool = (false, parse_bool, [TRACKED],
        "enable ThinLTO when possible"),
}

pub fn default_lib_output() -> CrateType {

@@ -115,6 +115,7 @@ fn main() {
        "linker",
        "asmparser",
        "mcjit",
        "lto",
        "interpreter",
        "instrumentation"];
@@ -345,6 +345,20 @@ pub enum PassKind {
    Module,
}

/// LLVMRustThinLTOData
pub enum ThinLTOData {}

/// LLVMRustThinLTOBuffer
pub enum ThinLTOBuffer {}

/// LLVMRustThinLTOModule
#[repr(C)]
pub struct ThinLTOModule {
    pub identifier: *const c_char,
    pub data: *const u8,
    pub len: usize,
}

// Opaque pointer types
#[allow(missing_copy_implementations)]
pub enum Module_opaque {}
@ -1271,6 +1285,9 @@ extern "C" {
|
||||
PM: PassManagerRef,
|
||||
Internalize: Bool,
|
||||
RunInliner: Bool);
|
||||
pub fn LLVMRustPassManagerBuilderPopulateThinLTOPassManager(
|
||||
PMB: PassManagerBuilderRef,
|
||||
PM: PassManagerRef) -> bool;
|
||||
|
||||
// Stuff that's in rustllvm/ because it's not upstream yet.
|
||||
|
||||
@ -1685,4 +1702,43 @@ extern "C" {
|
||||
pub fn LLVMRustModuleBufferLen(p: *const ModuleBuffer) -> usize;
|
||||
pub fn LLVMRustModuleBufferFree(p: *mut ModuleBuffer);
|
||||
pub fn LLVMRustModuleCost(M: ModuleRef) -> u64;
|
||||
|
||||
pub fn LLVMRustThinLTOAvailable() -> bool;
|
||||
pub fn LLVMRustWriteThinBitcodeToFile(PMR: PassManagerRef,
|
||||
M: ModuleRef,
|
||||
BC: *const c_char) -> bool;
|
||||
pub fn LLVMRustThinLTOBufferCreate(M: ModuleRef) -> *mut ThinLTOBuffer;
|
||||
pub fn LLVMRustThinLTOBufferFree(M: *mut ThinLTOBuffer);
|
||||
pub fn LLVMRustThinLTOBufferPtr(M: *const ThinLTOBuffer) -> *const c_char;
|
||||
pub fn LLVMRustThinLTOBufferLen(M: *const ThinLTOBuffer) -> size_t;
|
||||
pub fn LLVMRustCreateThinLTOData(
|
||||
Modules: *const ThinLTOModule,
|
||||
NumModules: c_uint,
|
||||
PreservedSymbols: *const *const c_char,
|
||||
PreservedSymbolsLen: c_uint,
|
||||
) -> *mut ThinLTOData;
|
||||
pub fn LLVMRustPrepareThinLTORename(
|
||||
Data: *const ThinLTOData,
|
||||
Module: ModuleRef,
|
||||
) -> bool;
|
||||
pub fn LLVMRustPrepareThinLTOResolveWeak(
|
||||
Data: *const ThinLTOData,
|
||||
Module: ModuleRef,
|
||||
) -> bool;
|
||||
pub fn LLVMRustPrepareThinLTOInternalize(
|
||||
Data: *const ThinLTOData,
|
||||
Module: ModuleRef,
|
||||
) -> bool;
|
||||
pub fn LLVMRustPrepareThinLTOImport(
|
||||
Data: *const ThinLTOData,
|
||||
Module: ModuleRef,
|
||||
) -> bool;
|
||||
pub fn LLVMRustFreeThinLTOData(Data: *mut ThinLTOData);
|
||||
pub fn LLVMRustParseBitcodeForThinLTO(
|
||||
Context: ContextRef,
|
||||
Data: *const u8,
|
||||
len: usize,
|
||||
Identifier: *const c_char,
|
||||
) -> ModuleRef;
|
||||
pub fn LLVMGetModuleIdentifier(M: ModuleRef, size: *mut usize) -> *const c_char;
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ use super::rpath::RPathConfig;
|
||||
use super::rpath;
|
||||
use metadata::METADATA_FILENAME;
|
||||
use rustc::session::config::{self, NoDebugInfo, OutputFilenames, OutputType, PrintRequest};
|
||||
use rustc::session::config::RUST_CGU_EXT;
|
||||
use rustc::session::filesearch;
|
||||
use rustc::session::search_paths::PathKind;
|
||||
use rustc::session::Session;
|
||||
@ -45,13 +46,9 @@ use syntax::attr;
|
||||
/// The LLVM module name containing crate-metadata. This includes a `.` on
|
||||
/// purpose, so it cannot clash with the name of a user-defined module.
|
||||
pub const METADATA_MODULE_NAME: &'static str = "crate.metadata";
|
||||
/// The name of the crate-metadata object file the compiler generates. Must
|
||||
/// match up with `METADATA_MODULE_NAME`.
|
||||
pub const METADATA_OBJ_NAME: &'static str = "crate.metadata.o";
|
||||
|
||||
// same as for metadata above, but for allocator shim
|
||||
pub const ALLOCATOR_MODULE_NAME: &'static str = "crate.allocator";
|
||||
pub const ALLOCATOR_OBJ_NAME: &'static str = "crate.allocator.o";
|
||||
|
||||
pub use rustc_trans_utils::link::{find_crate_name, filename_for_input, default_output_for_target,
|
||||
invalid_output_for_target, build_link_meta, out_filename,
|
||||
@ -129,6 +126,14 @@ fn command_path(sess: &Session) -> OsString {
|
||||
env::join_paths(new_path).unwrap()
|
||||
}
|
||||
|
||||
fn metadata_obj(outputs: &OutputFilenames) -> PathBuf {
|
||||
outputs.temp_path(OutputType::Object, Some(METADATA_MODULE_NAME))
|
||||
}
|
||||
|
||||
fn allocator_obj(outputs: &OutputFilenames) -> PathBuf {
|
||||
outputs.temp_path(OutputType::Object, Some(ALLOCATOR_MODULE_NAME))
|
||||
}
|
||||
|
||||
pub fn remove(sess: &Session, path: &Path) {
|
||||
match fs::remove_file(path) {
|
||||
Ok(..) => {}
|
||||
@ -174,9 +179,9 @@ pub fn link_binary(sess: &Session,
|
||||
remove(sess, &obj.object);
|
||||
}
|
||||
}
|
||||
remove(sess, &outputs.with_extension(METADATA_OBJ_NAME));
|
||||
remove(sess, &metadata_obj(outputs));
|
||||
if trans.allocator_module.is_some() {
|
||||
remove(sess, &outputs.with_extension(ALLOCATOR_OBJ_NAME));
|
||||
remove(sess, &allocator_obj(outputs));
|
||||
}
|
||||
}
|
||||
|
||||
@ -478,7 +483,7 @@ fn link_rlib<'a>(sess: &'a Session,
|
||||
|
||||
RlibFlavor::StaticlibBase => {
|
||||
if trans.allocator_module.is_some() {
|
||||
ab.add_file(&outputs.with_extension(ALLOCATOR_OBJ_NAME));
|
||||
ab.add_file(&allocator_obj(outputs));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -908,11 +913,11 @@ fn link_args(cmd: &mut Linker,
|
||||
// object file, so we link that in here.
|
||||
if crate_type == config::CrateTypeDylib ||
|
||||
crate_type == config::CrateTypeProcMacro {
|
||||
cmd.add_object(&outputs.with_extension(METADATA_OBJ_NAME));
|
||||
cmd.add_object(&metadata_obj(outputs));
|
||||
}
|
||||
|
||||
if trans.allocator_module.is_some() {
|
||||
cmd.add_object(&outputs.with_extension(ALLOCATOR_OBJ_NAME));
|
||||
cmd.add_object(&allocator_obj(outputs));
|
||||
}
|
||||
|
||||
// Try to strip as much out of the generated object by removing unused
|
||||
@ -1265,11 +1270,23 @@ fn add_upstream_rust_crates(cmd: &mut Linker,
|
||||
let canonical = f.replace("-", "_");
|
||||
let canonical_name = name.replace("-", "_");
|
||||
|
||||
// Look for `.rust-cgu.o` at the end of the filename to conclude
|
||||
// that this is a Rust-related object file.
|
||||
fn looks_like_rust(s: &str) -> bool {
|
||||
let path = Path::new(s);
|
||||
let ext = path.extension().and_then(|s| s.to_str());
|
||||
if ext != Some(OutputType::Object.extension()) {
|
||||
return false
|
||||
}
|
||||
let ext2 = path.file_stem()
|
||||
.and_then(|s| Path::new(s).extension())
|
||||
.and_then(|s| s.to_str());
|
||||
ext2 == Some(RUST_CGU_EXT)
|
||||
}
|
||||
|
||||
let is_rust_object =
|
||||
canonical.starts_with(&canonical_name) && {
|
||||
let num = &f[name.len()..f.len() - 2];
|
||||
num.len() > 0 && num[1..].parse::<u32>().is_ok()
|
||||
};
|
||||
canonical.starts_with(&canonical_name) &&
|
||||
looks_like_rust(&f);
|
||||
|
||||
// If we've been requested to skip all native object files
|
||||
// (those not generated by the rust compiler) then we can skip
|
||||
|
@ -9,23 +9,25 @@
|
||||
// except according to those terms.
|
||||
|
||||
use back::bytecode::{DecodedBytecode, RLIB_BYTECODE_EXTENSION};
|
||||
use back::write;
|
||||
use back::symbol_export;
|
||||
use rustc::session::config;
|
||||
use back::write::{ModuleConfig, with_llvm_pmb, CodegenContext};
|
||||
use back::write;
|
||||
use errors::{FatalError, Handler};
|
||||
use llvm;
|
||||
use llvm::archive_ro::ArchiveRO;
|
||||
use llvm::{ModuleRef, TargetMachineRef, True, False};
|
||||
use rustc::middle::exported_symbols::SymbolExportLevel;
|
||||
use rustc::util::common::time;
|
||||
use llvm;
|
||||
use rustc::hir::def_id::LOCAL_CRATE;
|
||||
use back::write::{ModuleConfig, with_llvm_pmb, CodegenContext};
|
||||
use {ModuleTranslation, ModuleKind};
|
||||
use rustc::middle::exported_symbols::SymbolExportLevel;
|
||||
use rustc::session::config;
|
||||
use rustc::util::common::time;
|
||||
use time_graph::Timeline;
|
||||
use {ModuleTranslation, ModuleLlvm, ModuleKind, ModuleSource};
|
||||
|
||||
use libc;
|
||||
|
||||
use std::ffi::CString;
|
||||
use std::slice;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub fn crate_type_allows_lto(crate_type: config::CrateType) -> bool {
|
||||
match crate_type {
|
||||
@ -45,14 +47,14 @@ pub enum LtoModuleTranslation {
|
||||
_serialized_bitcode: Vec<SerializedModule>,
|
||||
},
|
||||
|
||||
// Note the lack of other entries in this enum! Ideally one day this gap is
|
||||
// intended to be filled with a "Thin" LTO variant.
|
||||
Thin(ThinModule),
|
||||
}
|
||||
|
||||
impl LtoModuleTranslation {
|
||||
pub fn name(&self) -> &str {
|
||||
match *self {
|
||||
LtoModuleTranslation::Fat { .. } => "everything",
|
||||
LtoModuleTranslation::Thin(ref m) => m.name(),
|
||||
}
|
||||
}
|
||||
|
||||
@ -62,7 +64,9 @@ impl LtoModuleTranslation {
|
||||
/// points to LLVM data structures owned by this `LtoModuleTranslation`.
|
||||
/// It's intended that the module returned is immediately code generated and
|
||||
/// dropped, and then this LTO module is dropped.
|
||||
pub unsafe fn optimize(&mut self, cgcx: &CodegenContext)
|
||||
pub unsafe fn optimize(&mut self,
|
||||
cgcx: &CodegenContext,
|
||||
timeline: &mut Timeline)
|
||||
-> Result<ModuleTranslation, FatalError>
|
||||
{
|
||||
match *self {
|
||||
@ -71,9 +75,11 @@ impl LtoModuleTranslation {
|
||||
let config = cgcx.config(trans.kind);
|
||||
let llmod = trans.llvm().unwrap().llmod;
|
||||
let tm = trans.llvm().unwrap().tm;
|
||||
run_pass_manager(cgcx, tm, llmod, config);
|
||||
run_pass_manager(cgcx, tm, llmod, config, false);
|
||||
timeline.record("fat-done");
|
||||
Ok(trans)
|
||||
}
|
||||
LtoModuleTranslation::Thin(ref mut thin) => thin.optimize(cgcx, timeline),
|
||||
}
|
||||
}
|
||||
|
||||
@ -83,33 +89,31 @@ impl LtoModuleTranslation {
|
||||
match *self {
|
||||
// Only one module with fat LTO, so the cost doesn't matter.
|
||||
LtoModuleTranslation::Fat { .. } => 0,
|
||||
LtoModuleTranslation::Thin(ref m) => m.cost(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn run(cgcx: &CodegenContext, modules: Vec<ModuleTranslation>)
|
||||
pub enum LTOMode {
|
||||
WholeCrateGraph,
|
||||
JustThisCrate,
|
||||
}
|
||||
|
||||
pub fn run(cgcx: &CodegenContext,
|
||||
modules: Vec<ModuleTranslation>,
|
||||
mode: LTOMode,
|
||||
timeline: &mut Timeline)
|
||||
-> Result<Vec<LtoModuleTranslation>, FatalError>
|
||||
{
|
||||
let diag_handler = cgcx.create_diag_handler();
|
||||
if cgcx.opts.cg.prefer_dynamic {
|
||||
diag_handler.struct_err("cannot prefer dynamic linking when performing LTO")
|
||||
.note("only 'staticlib', 'bin', and 'cdylib' outputs are \
|
||||
supported with LTO")
|
||||
.emit();
|
||||
return Err(FatalError)
|
||||
}
|
||||
|
||||
// Make sure we actually can run LTO
|
||||
for crate_type in cgcx.crate_types.iter() {
|
||||
if !crate_type_allows_lto(*crate_type) {
|
||||
let e = diag_handler.fatal("lto can only be run for executables, cdylibs and \
|
||||
static library outputs");
|
||||
return Err(e)
|
||||
let export_threshold = match mode {
|
||||
LTOMode::WholeCrateGraph => {
|
||||
symbol_export::crates_export_threshold(&cgcx.crate_types)
|
||||
}
|
||||
}
|
||||
|
||||
let export_threshold =
|
||||
symbol_export::crates_export_threshold(&cgcx.crate_types);
|
||||
LTOMode::JustThisCrate => {
|
||||
SymbolExportLevel::Rust
|
||||
}
|
||||
};
|
||||
|
||||
let symbol_filter = &|&(ref name, _, level): &(String, _, SymbolExportLevel)| {
|
||||
if level.is_below_threshold(export_threshold) {
|
||||
@ -121,55 +125,81 @@ pub fn run(cgcx: &CodegenContext, modules: Vec<ModuleTranslation>)
|
||||
}
|
||||
};
|
||||
|
||||
let mut symbol_white_list: Vec<CString> = cgcx.exported_symbols[&LOCAL_CRATE]
|
||||
let mut symbol_white_list = cgcx.exported_symbols[&LOCAL_CRATE]
|
||||
.iter()
|
||||
.filter_map(symbol_filter)
|
||||
.collect();
|
||||
info!("{} symbols in whitelist", symbol_white_list.len());
|
||||
.collect::<Vec<CString>>();
|
||||
timeline.record("whitelist");
|
||||
|
||||
// For each of our upstream dependencies, find the corresponding rlib and
|
||||
// load the bitcode from the archive. Then merge it into the current LLVM
|
||||
// module that we've got.
|
||||
// If we're performing LTO for the entire crate graph, then for each of our
|
||||
// upstream dependencies, find the corresponding rlib and load the bitcode
|
||||
// from the archive.
|
||||
//
|
||||
// We save off all the bytecode and LLVM module ids for later processing
|
||||
// with either fat or thin LTO
|
||||
let mut upstream_modules = Vec::new();
|
||||
for &(cnum, ref path) in cgcx.each_linked_rlib_for_lto.iter() {
|
||||
symbol_white_list.extend(
|
||||
cgcx.exported_symbols[&cnum]
|
||||
.iter()
|
||||
.filter_map(symbol_filter));
|
||||
info!("{} symbols in whitelist after {}", symbol_white_list.len(), cnum);
|
||||
if let LTOMode::WholeCrateGraph = mode {
|
||||
if cgcx.opts.cg.prefer_dynamic {
|
||||
diag_handler.struct_err("cannot prefer dynamic linking when performing LTO")
|
||||
.note("only 'staticlib', 'bin', and 'cdylib' outputs are \
|
||||
supported with LTO")
|
||||
.emit();
|
||||
return Err(FatalError)
|
||||
}
|
||||
|
||||
let archive = ArchiveRO::open(&path).expect("wanted an rlib");
|
||||
let bytecodes = archive.iter().filter_map(|child| {
|
||||
child.ok().and_then(|c| c.name().map(|name| (name, c)))
|
||||
}).filter(|&(name, _)| name.ends_with(RLIB_BYTECODE_EXTENSION));
|
||||
for (name, data) in bytecodes {
|
||||
info!("adding bytecode {}", name);
|
||||
let bc_encoded = data.data();
|
||||
// Make sure we actually can run LTO
|
||||
for crate_type in cgcx.crate_types.iter() {
|
||||
if !crate_type_allows_lto(*crate_type) {
|
||||
let e = diag_handler.fatal("lto can only be run for executables, cdylibs and \
|
||||
static library outputs");
|
||||
return Err(e)
|
||||
}
|
||||
}
|
||||
|
||||
let (bc, id) = time(cgcx.time_passes, &format!("decode {}", name), || {
|
||||
match DecodedBytecode::new(bc_encoded) {
|
||||
Ok(b) => Ok((b.bytecode(), b.identifier().to_string())),
|
||||
Err(e) => Err(diag_handler.fatal(&e)),
|
||||
}
|
||||
})?;
|
||||
let bc = SerializedModule::FromRlib(bc);
|
||||
upstream_modules.push((bc, CString::new(id).unwrap()));
|
||||
for &(cnum, ref path) in cgcx.each_linked_rlib_for_lto.iter() {
|
||||
symbol_white_list.extend(
|
||||
cgcx.exported_symbols[&cnum]
|
||||
.iter()
|
||||
.filter_map(symbol_filter));
|
||||
|
||||
let archive = ArchiveRO::open(&path).expect("wanted an rlib");
|
||||
let bytecodes = archive.iter().filter_map(|child| {
|
||||
child.ok().and_then(|c| c.name().map(|name| (name, c)))
|
||||
}).filter(|&(name, _)| name.ends_with(RLIB_BYTECODE_EXTENSION));
|
||||
for (name, data) in bytecodes {
|
||||
info!("adding bytecode {}", name);
|
||||
let bc_encoded = data.data();
|
||||
|
||||
let (bc, id) = time(cgcx.time_passes, &format!("decode {}", name), || {
|
||||
match DecodedBytecode::new(bc_encoded) {
|
||||
Ok(b) => Ok((b.bytecode(), b.identifier().to_string())),
|
||||
Err(e) => Err(diag_handler.fatal(&e)),
|
||||
}
|
||||
})?;
|
||||
let bc = SerializedModule::FromRlib(bc);
|
||||
upstream_modules.push((bc, CString::new(id).unwrap()));
|
||||
}
|
||||
timeline.record(&format!("load: {}", path.display()));
|
||||
}
|
||||
}
|
||||
|
||||
// Internalize everything but the exported symbols of the current module
|
||||
let arr: Vec<*const libc::c_char> = symbol_white_list.iter()
|
||||
.map(|c| c.as_ptr())
|
||||
.collect();
|
||||
|
||||
fat_lto(cgcx, &diag_handler, modules, upstream_modules, &arr)
|
||||
let arr = symbol_white_list.iter().map(|c| c.as_ptr()).collect::<Vec<_>>();
|
||||
match mode {
|
||||
LTOMode::WholeCrateGraph if !cgcx.thinlto => {
|
||||
fat_lto(cgcx, &diag_handler, modules, upstream_modules, &arr, timeline)
|
||||
}
|
||||
_ => {
|
||||
thin_lto(&diag_handler, modules, upstream_modules, &arr, timeline)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn fat_lto(cgcx: &CodegenContext,
|
||||
diag_handler: &Handler,
|
||||
mut modules: Vec<ModuleTranslation>,
|
||||
mut serialized_modules: Vec<(SerializedModule, CString)>,
|
||||
symbol_white_list: &[*const libc::c_char])
|
||||
symbol_white_list: &[*const libc::c_char],
|
||||
timeline: &mut Timeline)
|
||||
-> Result<Vec<LtoModuleTranslation>, FatalError>
|
||||
{
|
||||
info!("going for a fat lto");
|
||||
@ -228,6 +258,7 @@ fn fat_lto(cgcx: &CodegenContext,
|
||||
Err(write::llvm_err(&diag_handler, msg))
|
||||
}
|
||||
})?;
|
||||
timeline.record(&format!("link {:?}", name));
|
||||
serialized_bitcode.push(bc_decoded);
|
||||
}
|
||||
cgcx.save_temp_bitcode(&module, "lto.input");
|
||||
@ -248,6 +279,7 @@ fn fat_lto(cgcx: &CodegenContext,
|
||||
}
|
||||
cgcx.save_temp_bitcode(&module, "lto.after-nounwind");
|
||||
}
|
||||
timeline.record("passes");
|
||||
|
||||
Ok(vec![LtoModuleTranslation::Fat {
|
||||
module: Some(module),
|
||||
@ -255,11 +287,143 @@ fn fat_lto(cgcx: &CodegenContext,
|
||||
}])
|
||||
}
|
||||
|
||||
/// Prepare "thin" LTO to get run on these modules.
///
/// The general structure of ThinLTO is quite different from the structure of
/// "fat" LTO above. With "fat" LTO all LLVM modules in question are merged into
/// one giant LLVM module, and then we run more optimization passes over this
/// big module after internalizing most symbols. Thin LTO, on the other hand,
/// avoids this large bottleneck through more targeted optimization.
///
/// At a high level Thin LTO looks like:
///
/// 1. Prepare a "summary" of each LLVM module in question which describes
///    the values inside, cost of the values, etc.
/// 2. Merge the summaries of all modules in question into one "index"
/// 3. Perform some global analysis on this index
/// 4. For each module, use the index and analysis calculated previously to
///    perform local transformations on the module, for example inlining
///    small functions from other modules.
/// 5. Run thin-specific optimization passes over each module, and then code
///    generate everything at the end.
///
/// The summary for each module is intended to be quite cheap, and the global
/// index is relatively quite cheap to create as well. As a result, the goal of
/// ThinLTO is to reduce the bottleneck on LTO and enable LTO to be used in more
/// situations. For example, one cheap optimization is that we can parallelize
/// all codegen modules, easily making use of all the cores on a machine.
///
/// With all that in mind, the function here is designed specifically to
/// calculate just the *index* for ThinLTO. This index will then be shared amongst
/// all of the `LtoModuleTranslation` units returned below and destroyed once
/// they all go out of scope.
fn thin_lto(diag_handler: &Handler,
|
||||
modules: Vec<ModuleTranslation>,
|
||||
serialized_modules: Vec<(SerializedModule, CString)>,
|
||||
symbol_white_list: &[*const libc::c_char],
|
||||
timeline: &mut Timeline)
|
||||
-> Result<Vec<LtoModuleTranslation>, FatalError>
|
||||
{
|
||||
unsafe {
|
||||
info!("going for that thin, thin LTO");
|
||||
|
||||
let mut thin_buffers = Vec::new();
|
||||
let mut module_names = Vec::new();
|
||||
let mut thin_modules = Vec::new();
|
||||
|
||||
// FIXME: right now, like with fat LTO, we serialize all in-memory
|
||||
// modules before working with them and ThinLTO. We really
|
||||
// shouldn't do this, however, and instead figure out how to
|
||||
// extract a summary from an in-memory module and then merge that
|
||||
// into the global index. It turns out that this loop is by far
|
||||
// the most expensive portion of this small bit of global
|
||||
// analysis!
|
||||
for (i, module) in modules.iter().enumerate() {
|
||||
info!("local module: {} - {}", i, module.llmod_id);
|
||||
let llvm = module.llvm().expect("can't lto pretranslated module");
|
||||
let name = CString::new(module.llmod_id.clone()).unwrap();
|
||||
let buffer = llvm::LLVMRustThinLTOBufferCreate(llvm.llmod);
|
||||
let buffer = ThinBuffer(buffer);
|
||||
thin_modules.push(llvm::ThinLTOModule {
|
||||
identifier: name.as_ptr(),
|
||||
data: buffer.data().as_ptr(),
|
||||
len: buffer.data().len(),
|
||||
});
|
||||
thin_buffers.push(buffer);
|
||||
module_names.push(name);
|
||||
timeline.record(&module.llmod_id);
|
||||
}
|
||||
|
||||
// FIXME: All upstream crates are deserialized internally in the
|
||||
// function below to extract their summary and modules. Note that
|
||||
// unlike the loop above we *must* decode and/or read something
|
||||
// here as these are all just serialized files on disk. An
|
||||
// improvement, however, to make here would be to store the
|
||||
// module summary separately from the actual module itself. Right
|
||||
// now this is store in one large bitcode file, and the entire
|
||||
// file is deflate-compressed. We could try to bypass some of the
|
||||
// decompression by storing the index uncompressed and only
|
||||
// lazily decompressing the bytecode if necessary.
|
||||
//
|
||||
// Note that truly taking advantage of this optimization will
|
||||
// likely be further down the road. We'd have to implement
|
||||
// incremental ThinLTO first where we could actually avoid
|
||||
// looking at upstream modules entirely sometimes (the contents,
|
||||
// we must always unconditionally look at the index).
|
||||
let mut serialized = Vec::new();
|
||||
for (module, name) in serialized_modules {
|
||||
info!("foreign module {:?}", name);
|
||||
thin_modules.push(llvm::ThinLTOModule {
|
||||
identifier: name.as_ptr(),
|
||||
data: module.data().as_ptr(),
|
||||
len: module.data().len(),
|
||||
});
|
||||
serialized.push(module);
|
||||
module_names.push(name);
|
||||
}
|
||||
|
||||
// Delegate to the C++ bindings to create some data here. Once this is a
|
||||
// tried-and-true interface we may wish to try to upstream some of this
|
||||
// to LLVM itself, right now we reimplement a lot of what they do
|
||||
// upstream...
|
||||
let data = llvm::LLVMRustCreateThinLTOData(
|
||||
thin_modules.as_ptr(),
|
||||
thin_modules.len() as u32,
|
||||
symbol_white_list.as_ptr(),
|
||||
symbol_white_list.len() as u32,
|
||||
);
|
||||
if data.is_null() {
|
||||
let msg = format!("failed to prepare thin LTO context");
|
||||
return Err(write::llvm_err(&diag_handler, msg))
|
||||
}
|
||||
let data = ThinData(data);
|
||||
info!("thin LTO data created");
|
||||
timeline.record("data");
|
||||
|
||||
// Throw our data in an `Arc` as we'll be sharing it across threads. We
|
||||
// also put all memory referenced by the C++ data (buffers, ids, etc)
|
||||
// into the arc as well. After this we'll create a thin module
|
||||
// translation per module in this data.
|
||||
let shared = Arc::new(ThinShared {
|
||||
data,
|
||||
thin_buffers,
|
||||
serialized_modules: serialized,
|
||||
module_names,
|
||||
});
|
||||
Ok((0..shared.module_names.len()).map(|i| {
|
||||
LtoModuleTranslation::Thin(ThinModule {
|
||||
shared: shared.clone(),
|
||||
idx: i,
|
||||
})
|
||||
}).collect())
|
||||
}
|
||||
}
|
||||
|
||||
fn run_pass_manager(cgcx: &CodegenContext,
|
||||
tm: TargetMachineRef,
|
||||
llmod: ModuleRef,
|
||||
config: &ModuleConfig) {
|
||||
|
||||
config: &ModuleConfig,
|
||||
thin: bool) {
|
||||
// Now we have one massive module inside of llmod. Time to run the
|
||||
// LTO-specific optimization passes that LLVM provides.
|
||||
//
|
||||
@ -274,9 +438,15 @@ fn run_pass_manager(cgcx: &CodegenContext,
|
||||
llvm::LLVMRustAddPass(pm, pass);
|
||||
|
||||
with_llvm_pmb(llmod, config, &mut |b| {
|
||||
llvm::LLVMPassManagerBuilderPopulateLTOPassManager(b, pm,
|
||||
/* Internalize = */ False,
|
||||
/* RunInliner = */ True);
|
||||
if thin {
|
||||
if !llvm::LLVMRustPassManagerBuilderPopulateThinLTOPassManager(b, pm) {
|
||||
panic!("this version of LLVM does not support ThinLTO");
|
||||
}
|
||||
} else {
|
||||
llvm::LLVMPassManagerBuilderPopulateLTOPassManager(b, pm,
|
||||
/* Internalize = */ False,
|
||||
/* RunInliner = */ True);
|
||||
}
|
||||
});
|
||||
|
||||
let pass = llvm::LLVMRustFindAndCreatePass("verify\0".as_ptr() as *const _);
|
||||
@ -331,3 +501,158 @@ impl Drop for ModuleBuffer {
|
||||
unsafe { llvm::LLVMRustModuleBufferFree(self.0); }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ThinModule {
|
||||
shared: Arc<ThinShared>,
|
||||
idx: usize,
|
||||
}
|
||||
|
||||
struct ThinShared {
|
||||
data: ThinData,
|
||||
thin_buffers: Vec<ThinBuffer>,
|
||||
serialized_modules: Vec<SerializedModule>,
|
||||
module_names: Vec<CString>,
|
||||
}
|
||||
|
||||
struct ThinData(*mut llvm::ThinLTOData);
|
||||
|
||||
unsafe impl Send for ThinData {}
|
||||
unsafe impl Sync for ThinData {}
|
||||
|
||||
impl Drop for ThinData {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
llvm::LLVMRustFreeThinLTOData(self.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ThinBuffer(*mut llvm::ThinLTOBuffer);
|
||||
|
||||
unsafe impl Send for ThinBuffer {}
|
||||
unsafe impl Sync for ThinBuffer {}
|
||||
|
||||
impl ThinBuffer {
|
||||
fn data(&self) -> &[u8] {
|
||||
unsafe {
|
||||
let ptr = llvm::LLVMRustThinLTOBufferPtr(self.0) as *const _;
|
||||
let len = llvm::LLVMRustThinLTOBufferLen(self.0);
|
||||
slice::from_raw_parts(ptr, len)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ThinBuffer {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
llvm::LLVMRustThinLTOBufferFree(self.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ThinModule {
|
||||
fn name(&self) -> &str {
|
||||
self.shared.module_names[self.idx].to_str().unwrap()
|
||||
}
|
||||
|
||||
fn cost(&self) -> u64 {
|
||||
// Yes, that's correct, we're using the size of the bytecode as an
|
||||
// indicator for how costly this codegen unit is.
|
||||
self.data().len() as u64
|
||||
}
|
||||
|
||||
fn data(&self) -> &[u8] {
|
||||
let a = self.shared.thin_buffers.get(self.idx).map(|b| b.data());
|
||||
a.unwrap_or_else(|| {
|
||||
let len = self.shared.thin_buffers.len();
|
||||
self.shared.serialized_modules[self.idx - len].data()
|
||||
})
|
||||
}
|
||||
|
||||
unsafe fn optimize(&mut self, cgcx: &CodegenContext, timeline: &mut Timeline)
|
||||
-> Result<ModuleTranslation, FatalError>
|
||||
{
|
||||
let diag_handler = cgcx.create_diag_handler();
|
||||
let tm = (cgcx.tm_factory)().map_err(|e| {
|
||||
write::llvm_err(&diag_handler, e)
|
||||
})?;
|
||||
|
||||
// Right now the implementation we've got only works over serialized
|
||||
// modules, so we create a fresh new LLVM context and parse the module
|
||||
// into that context. One day, however, we may do this for upstream
|
||||
// crates but for locally translated modules we may be able to reuse
|
||||
// that LLVM Context and Module.
|
||||
let llcx = llvm::LLVMContextCreate();
|
||||
let llmod = llvm::LLVMRustParseBitcodeForThinLTO(
|
||||
llcx,
|
||||
self.data().as_ptr(),
|
||||
self.data().len(),
|
||||
self.shared.module_names[self.idx].as_ptr(),
|
||||
);
|
||||
assert!(!llmod.is_null());
|
||||
let mtrans = ModuleTranslation {
|
||||
source: ModuleSource::Translated(ModuleLlvm {
|
||||
llmod,
|
||||
llcx,
|
||||
tm,
|
||||
}),
|
||||
llmod_id: self.name().to_string(),
|
||||
name: self.name().to_string(),
|
||||
kind: ModuleKind::Regular,
|
||||
};
|
||||
cgcx.save_temp_bitcode(&mtrans, "thin-lto-input");
|
||||
|
||||
// Like with "fat" LTO, get some better optimizations if landing pads
|
||||
// are disabled by removing all landing pads.
|
||||
if cgcx.no_landing_pads {
|
||||
llvm::LLVMRustMarkAllFunctionsNounwind(llmod);
|
||||
cgcx.save_temp_bitcode(&mtrans, "thin-lto-after-nounwind");
|
||||
timeline.record("nounwind");
|
||||
}
|
||||
|
||||
// Up next comes the per-module local analyses that we do for Thin LTO.
|
||||
// Each of these functions is basically copied from the LLVM
|
||||
// implementation and then tailored to suit this implementation. Ideally
|
||||
// each of these would be supported by upstream LLVM but that's perhaps
|
||||
// a patch for another day!
|
||||
//
|
||||
// You can find some more comments about these functions in the LLVM
|
||||
// bindings we've got (currently `PassWrapper.cpp`)
|
||||
if !llvm::LLVMRustPrepareThinLTORename(self.shared.data.0, llmod) {
|
||||
let msg = format!("failed to prepare thin LTO module");
|
||||
return Err(write::llvm_err(&diag_handler, msg))
|
||||
}
|
||||
cgcx.save_temp_bitcode(&mtrans, "thin-lto-after-rename");
|
||||
timeline.record("rename");
|
||||
if !llvm::LLVMRustPrepareThinLTOResolveWeak(self.shared.data.0, llmod) {
|
||||
let msg = format!("failed to prepare thin LTO module");
|
||||
return Err(write::llvm_err(&diag_handler, msg))
|
||||
}
|
||||
cgcx.save_temp_bitcode(&mtrans, "thin-lto-after-resolve");
|
||||
timeline.record("resolve");
|
||||
if !llvm::LLVMRustPrepareThinLTOInternalize(self.shared.data.0, llmod) {
|
||||
let msg = format!("failed to prepare thin LTO module");
|
||||
return Err(write::llvm_err(&diag_handler, msg))
|
||||
}
|
||||
cgcx.save_temp_bitcode(&mtrans, "thin-lto-after-internalize");
|
||||
timeline.record("internalize");
|
||||
if !llvm::LLVMRustPrepareThinLTOImport(self.shared.data.0, llmod) {
|
||||
let msg = format!("failed to prepare thin LTO module");
|
||||
return Err(write::llvm_err(&diag_handler, msg))
|
||||
}
|
||||
cgcx.save_temp_bitcode(&mtrans, "thin-lto-after-import");
|
||||
timeline.record("import");
|
||||
|
||||
// Alright now that we've done everything related to the ThinLTO
|
||||
// analysis it's time to run some optimizations! Here we use the same
|
||||
// `run_pass_manager` as the "fat" LTO above except that we tell it to
|
||||
// populate a thin-specific pass manager, which presumably LLVM treats a
|
||||
// little differently.
|
||||
info!("running thin lto passes over {}", mtrans.name);
|
||||
let config = cgcx.config(mtrans.kind);
|
||||
run_pass_manager(cgcx, tm, llmod, config, true);
|
||||
cgcx.save_temp_bitcode(&mtrans, "thin-lto-after-pm");
|
||||
timeline.record("thin-done");
|
||||
Ok(mtrans)
|
||||
}
|
||||
}
|
||||
|
@ -19,7 +19,7 @@ use rustc::session::config::{self, OutputFilenames, OutputType, OutputTypes, Pas
|
||||
AllPasses, Sanitizer};
|
||||
use rustc::session::Session;
|
||||
use rustc::util::nodemap::FxHashMap;
|
||||
use time_graph::{self, TimeGraph};
|
||||
use time_graph::{self, TimeGraph, Timeline};
|
||||
use llvm;
|
||||
use llvm::{ModuleRef, TargetMachineRef, PassManagerRef, DiagnosticInfoRef};
|
||||
use llvm::{SMDiagnosticRef, ContextRef};
|
||||
@ -303,6 +303,7 @@ pub struct CodegenContext {
|
||||
// Resouces needed when running LTO
|
||||
pub time_passes: bool,
|
||||
pub lto: bool,
|
||||
pub thinlto: bool,
|
||||
pub no_landing_pads: bool,
|
||||
pub save_temps: bool,
|
||||
pub exported_symbols: Arc<ExportedSymbols>,
|
||||
@ -315,6 +316,8 @@ pub struct CodegenContext {
|
||||
allocator_module_config: Arc<ModuleConfig>,
|
||||
pub tm_factory: Arc<Fn() -> Result<TargetMachineRef, String> + Send + Sync>,
|
||||
|
||||
// Number of cgus excluding the allocator/metadata modules
|
||||
pub total_cgus: usize,
|
||||
// Handler to use for diagnostics produced during codegen.
|
||||
pub diag_emitter: SharedEmitter,
|
||||
// LLVM passes added by plugins.
|
||||
@ -450,7 +453,8 @@ unsafe extern "C" fn diagnostic_handler(info: DiagnosticInfoRef, user: *mut c_vo
|
||||
unsafe fn optimize(cgcx: &CodegenContext,
|
||||
diag_handler: &Handler,
|
||||
mtrans: &ModuleTranslation,
|
||||
config: &ModuleConfig)
|
||||
config: &ModuleConfig,
|
||||
timeline: &mut Timeline)
|
||||
-> Result<(), FatalError>
|
||||
{
|
||||
let (llmod, llcx, tm) = match mtrans.source {
|
||||
@ -529,6 +533,7 @@ unsafe fn optimize(cgcx: &CodegenContext,
|
||||
// Finally, run the actual optimization passes
|
||||
time(config.time_passes, &format!("llvm function passes [{}]", module_name.unwrap()), ||
|
||||
llvm::LLVMRustRunFunctionPassManager(fpm, llmod));
|
||||
timeline.record("fpm");
|
||||
time(config.time_passes, &format!("llvm module passes [{}]", module_name.unwrap()), ||
|
||||
llvm::LLVMRunPassManager(mpm, llmod));
|
||||
|
||||
@ -543,7 +548,18 @@ fn generate_lto_work(cgcx: &CodegenContext,
|
||||
modules: Vec<ModuleTranslation>)
|
||||
-> Vec<(WorkItem, u64)>
|
||||
{
|
||||
let lto_modules = lto::run(cgcx, modules).unwrap_or_else(|e| panic!(e));
|
||||
let mut timeline = cgcx.time_graph.as_ref().map(|tg| {
|
||||
tg.start(TRANS_WORKER_TIMELINE,
|
||||
TRANS_WORK_PACKAGE_KIND,
|
||||
"generate lto")
|
||||
}).unwrap_or(Timeline::noop());
|
||||
let mode = if cgcx.lto {
|
||||
lto::LTOMode::WholeCrateGraph
|
||||
} else {
|
||||
lto::LTOMode::JustThisCrate
|
||||
};
|
||||
let lto_modules = lto::run(cgcx, modules, mode, &mut timeline)
|
||||
.unwrap_or_else(|e| panic!(e));
|
||||
|
||||
lto_modules.into_iter().map(|module| {
|
||||
let cost = module.cost();
|
||||
@ -554,9 +570,11 @@ fn generate_lto_work(cgcx: &CodegenContext,
|
||||
unsafe fn codegen(cgcx: &CodegenContext,
|
||||
diag_handler: &Handler,
|
||||
mtrans: ModuleTranslation,
|
||||
config: &ModuleConfig)
|
||||
config: &ModuleConfig,
|
||||
timeline: &mut Timeline)
|
||||
-> Result<CompiledModule, FatalError>
|
||||
{
|
||||
timeline.record("codegen");
|
||||
let (llmod, llcx, tm) = match mtrans.source {
|
||||
ModuleSource::Translated(ref llvm) => (llvm.llmod, llvm.llcx, llvm.tm),
|
||||
ModuleSource::Preexisting(_) => {
|
||||
@ -601,7 +619,18 @@ unsafe fn codegen(cgcx: &CodegenContext,
|
||||
|
||||
if write_bc {
|
||||
let bc_out_c = path2cstr(&bc_out);
|
||||
llvm::LLVMWriteBitcodeToFile(llmod, bc_out_c.as_ptr());
|
||||
if llvm::LLVMRustThinLTOAvailable() {
|
||||
with_codegen(tm, llmod, config.no_builtins, |cpm| {
|
||||
llvm::LLVMRustWriteThinBitcodeToFile(
|
||||
cpm,
|
||||
llmod,
|
||||
bc_out_c.as_ptr(),
|
||||
)
|
||||
});
|
||||
} else {
|
||||
llvm::LLVMWriteBitcodeToFile(llmod, bc_out_c.as_ptr());
|
||||
}
|
||||
timeline.record("bc");
|
||||
}
|
||||
|
||||
time(config.time_passes, &format!("codegen passes [{}]", module_name.unwrap()),
|
||||
@ -644,7 +673,8 @@ unsafe fn codegen(cgcx: &CodegenContext,
|
||||
with_codegen(tm, llmod, config.no_builtins, |cpm| {
|
||||
llvm::LLVMRustPrintModule(cpm, llmod, out.as_ptr(), demangle_callback);
|
||||
llvm::LLVMDisposePassManager(cpm);
|
||||
})
|
||||
});
|
||||
timeline.record("ir");
|
||||
}
|
||||
|
||||
if config.emit_asm {
|
||||
@ -665,6 +695,7 @@ unsafe fn codegen(cgcx: &CodegenContext,
|
||||
if config.emit_obj {
|
||||
llvm::LLVMDisposeModule(llmod);
|
||||
}
|
||||
timeline.record("asm");
|
||||
}
|
||||
|
||||
if write_obj {
|
||||
@ -672,6 +703,7 @@ unsafe fn codegen(cgcx: &CodegenContext,
|
||||
write_output_file(diag_handler, tm, cpm, llmod, &obj_out,
|
||||
llvm::FileType::ObjectFile)
|
||||
})?;
|
||||
timeline.record("obj");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -712,7 +744,8 @@ pub fn start_async_translation(tcx: TyCtxt,
|
||||
time_graph: Option<TimeGraph>,
|
||||
link: LinkMeta,
|
||||
metadata: EncodedMetadata,
|
||||
coordinator_receive: Receiver<Box<Any + Send>>)
|
||||
coordinator_receive: Receiver<Box<Any + Send>>,
|
||||
total_cgus: usize)
|
||||
-> OngoingCrateTranslation {
|
||||
let sess = tcx.sess;
|
||||
let crate_output = tcx.output_filenames(LOCAL_CRATE);
|
||||
@ -836,6 +869,7 @@ pub fn start_async_translation(tcx: TyCtxt,
|
||||
shared_emitter,
|
||||
trans_worker_send,
|
||||
coordinator_receive,
|
||||
total_cgus,
|
||||
client,
|
||||
time_graph.clone(),
|
||||
Arc::new(modules_config),
|
||||
@ -1080,7 +1114,9 @@ enum WorkItemResult {
|
||||
NeedsLTO(ModuleTranslation),
|
||||
}
|
||||
|
||||
fn execute_work_item(cgcx: &CodegenContext, work_item: WorkItem)
|
||||
fn execute_work_item(cgcx: &CodegenContext,
|
||||
work_item: WorkItem,
|
||||
timeline: &mut Timeline)
|
||||
-> Result<WorkItemResult, FatalError>
|
||||
{
|
||||
let diag_handler = cgcx.create_diag_handler();
|
||||
@ -1089,8 +1125,8 @@ fn execute_work_item(cgcx: &CodegenContext, work_item: WorkItem)
|
||||
WorkItem::Optimize(mtrans) => mtrans,
|
||||
WorkItem::LTO(mut lto) => {
|
||||
unsafe {
|
||||
let module = lto.optimize(cgcx)?;
|
||||
let module = codegen(cgcx, &diag_handler, module, config)?;
|
||||
let module = lto.optimize(cgcx, timeline)?;
|
||||
let module = codegen(cgcx, &diag_handler, module, config, timeline)?;
|
||||
return Ok(WorkItemResult::Compiled(module))
|
||||
}
|
||||
}
|
||||
@ -1140,9 +1176,27 @@ fn execute_work_item(cgcx: &CodegenContext, work_item: WorkItem)
|
||||
debug!("llvm-optimizing {:?}", module_name);
|
||||
|
||||
unsafe {
|
||||
optimize(cgcx, &diag_handler, &mtrans, config)?;
|
||||
if !cgcx.lto || mtrans.kind == ModuleKind::Metadata {
|
||||
let module = codegen(cgcx, &diag_handler, mtrans, config)?;
|
||||
optimize(cgcx, &diag_handler, &mtrans, config, timeline)?;
|
||||
|
||||
let lto = cgcx.lto;
|
||||
|
||||
let auto_thin_lto =
|
||||
cgcx.thinlto &&
|
||||
cgcx.total_cgus > 1 &&
|
||||
mtrans.kind != ModuleKind::Allocator;
|
||||
|
||||
// If we're a metadata module we never participate in LTO.
|
||||
//
|
||||
// If LTO was explicitly requested on the command line, we always
|
||||
// LTO everything else.
|
||||
//
|
||||
// If LTO *wasn't* explicitly requested and we're not a metdata
|
||||
// module, then we may automatically do ThinLTO if we've got
|
||||
// multiple codegen units. Note, however, that the allocator module
|
||||
// doesn't participate here automatically because of linker
|
||||
// shenanigans later on.
|
||||
if mtrans.kind == ModuleKind::Metadata || (!lto && !auto_thin_lto) {
|
||||
let module = codegen(cgcx, &diag_handler, mtrans, config, timeline)?;
|
||||
Ok(WorkItemResult::Compiled(module))
|
||||
} else {
|
||||
Ok(WorkItemResult::NeedsLTO(mtrans))
|
||||
@ -1187,6 +1241,7 @@ fn start_executing_work(tcx: TyCtxt,
|
||||
shared_emitter: SharedEmitter,
|
||||
trans_worker_send: Sender<Message>,
|
||||
coordinator_receive: Receiver<Box<Any + Send>>,
|
||||
total_cgus: usize,
|
||||
jobserver: Client,
|
||||
time_graph: Option<TimeGraph>,
|
||||
modules_config: Arc<ModuleConfig>,
|
||||
@ -1229,6 +1284,7 @@ fn start_executing_work(tcx: TyCtxt,
|
||||
crate_types: sess.crate_types.borrow().clone(),
|
||||
each_linked_rlib_for_lto,
|
||||
lto: sess.lto(),
|
||||
thinlto: sess.opts.debugging_opts.thinlto,
|
||||
no_landing_pads: sess.no_landing_pads(),
|
||||
save_temps: sess.opts.cg.save_temps,
|
||||
opts: Arc::new(sess.opts.clone()),
|
||||
@ -1246,6 +1302,7 @@ fn start_executing_work(tcx: TyCtxt,
|
||||
metadata_module_config: metadata_config,
|
||||
allocator_module_config: allocator_config,
|
||||
tm_factory: target_machine_factory(tcx.sess),
|
||||
total_cgus,
|
||||
};
|
||||
|
||||
// This is the "main loop" of parallel work happening for parallel codegen.
|
||||
@ -1743,12 +1800,13 @@ fn spawn_work(cgcx: CodegenContext, work: WorkItem) {
|
||||
// as a diagnostic was already sent off to the main thread - just
|
||||
// surface that there was an error in this worker.
|
||||
bomb.result = {
|
||||
let _timing_guard = cgcx.time_graph.as_ref().map(|tg| {
|
||||
let timeline = cgcx.time_graph.as_ref().map(|tg| {
|
||||
tg.start(time_graph::TimelineId(cgcx.worker),
|
||||
LLVM_WORK_PACKAGE_KIND,
|
||||
&work.name())
|
||||
});
|
||||
execute_work_item(&cgcx, work).ok()
|
||||
let mut timeline = timeline.unwrap_or(Timeline::noop());
|
||||
execute_work_item(&cgcx, work, &mut timeline).ok()
|
||||
};
|
||||
});
|
||||
}
|
||||
|
@ -886,6 +886,11 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
|
||||
|
||||
check_for_rustc_errors_attr(tcx);
|
||||
|
||||
if tcx.sess.opts.debugging_opts.thinlto {
|
||||
if unsafe { !llvm::LLVMRustThinLTOAvailable() } {
|
||||
tcx.sess.fatal("this compiler's LLVM does not support ThinLTO");
|
||||
}
|
||||
}
|
||||
|
||||
let crate_hash = tcx.dep_graph
|
||||
.fingerprint_of(&DepNode::new_no_params(DepKind::Krate));
|
||||
@ -925,7 +930,8 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
|
||||
time_graph.clone(),
|
||||
link_meta,
|
||||
metadata,
|
||||
rx);
|
||||
rx,
|
||||
1);
|
||||
|
||||
ongoing_translation.submit_pre_translated_module_to_llvm(tcx, metadata_module);
|
||||
ongoing_translation.translation_finished(tcx);
|
||||
@ -961,7 +967,8 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
|
||||
time_graph.clone(),
|
||||
link_meta,
|
||||
metadata,
|
||||
rx);
|
||||
rx,
|
||||
codegen_units.len());
|
||||
|
||||
// Translate an allocator shim, if any
|
||||
let allocator_module = if let Some(kind) = tcx.sess.allocator_kind.get() {
|
||||
@ -1372,7 +1379,9 @@ fn compile_codegen_unit<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
|
||||
// crashes if the module identifier is same as other symbols
|
||||
// such as a function name in the module.
|
||||
// 1. http://llvm.org/bugs/show_bug.cgi?id=11479
|
||||
let llmod_id = format!("{}.rs", cgu.name());
|
||||
let llmod_id = format!("{}-{}.rs",
|
||||
cgu.name(),
|
||||
tcx.crate_disambiguator(LOCAL_CRATE));
|
||||
|
||||
// Instantiate translation items without filling out definitions yet...
|
||||
let scx = SharedCrateContext::new(tcx);
|
||||
|
@ -108,7 +108,6 @@ use rustc::dep_graph::{DepNode, WorkProductId};
|
||||
use rustc::hir::def_id::DefId;
|
||||
use rustc::hir::map::DefPathData;
|
||||
use rustc::middle::trans::{Linkage, Visibility};
|
||||
use rustc::session::config::NUMBERED_CODEGEN_UNIT_MARKER;
|
||||
use rustc::ty::{self, TyCtxt, InstanceDef};
|
||||
use rustc::ty::item_path::characteristic_def_id_of_type;
|
||||
use rustc::util::nodemap::{FxHashMap, FxHashSet};
|
||||
@ -627,7 +626,7 @@ fn compute_codegen_unit_name<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
|
||||
}
|
||||
|
||||
fn numbered_codegen_unit_name(crate_name: &str, index: usize) -> InternedString {
|
||||
Symbol::intern(&format!("{}{}{}", crate_name, NUMBERED_CODEGEN_UNIT_MARKER, index)).as_str()
|
||||
Symbol::intern(&format!("{}{}", crate_name, index)).as_str()
|
||||
}
|
||||
|
||||
fn debug_dump<'a, 'b, 'tcx, I>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
|
||||
|
@ -9,11 +9,12 @@
|
||||
// except according to those terms.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
use std::io::prelude::*;
|
||||
use std::fs::File;
|
||||
|
||||
const OUTPUT_WIDTH_IN_PX: u64 = 1000;
|
||||
const TIME_LINE_HEIGHT_IN_PX: u64 = 20;
|
||||
@ -25,6 +26,7 @@ struct Timing {
|
||||
end: Instant,
|
||||
work_package_kind: WorkPackageKind,
|
||||
name: String,
|
||||
events: Vec<(String, Instant)>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Hash, Eq, PartialEq, Debug)]
|
||||
@ -44,9 +46,14 @@ pub struct TimeGraph {
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct WorkPackageKind(pub &'static [&'static str]);
|
||||
|
||||
pub struct RaiiToken {
|
||||
pub struct Timeline {
|
||||
token: Option<RaiiToken>,
|
||||
}
|
||||
|
||||
struct RaiiToken {
|
||||
graph: TimeGraph,
|
||||
timeline: TimelineId,
|
||||
events: Vec<(String, Instant)>,
|
||||
// The token must not be Send:
|
||||
_marker: PhantomData<*const ()>
|
||||
}
|
||||
@ -54,7 +61,7 @@ pub struct RaiiToken {
|
||||
|
||||
impl Drop for RaiiToken {
|
||||
fn drop(&mut self) {
|
||||
self.graph.end(self.timeline);
|
||||
self.graph.end(self.timeline, mem::replace(&mut self.events, Vec::new()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -68,7 +75,7 @@ impl TimeGraph {
|
||||
pub fn start(&self,
|
||||
timeline: TimelineId,
|
||||
work_package_kind: WorkPackageKind,
|
||||
name: &str) -> RaiiToken {
|
||||
name: &str) -> Timeline {
|
||||
{
|
||||
let mut table = self.data.lock().unwrap();
|
||||
|
||||
@ -81,14 +88,17 @@ impl TimeGraph {
|
||||
data.open_work_package = Some((Instant::now(), work_package_kind, name.to_string()));
|
||||
}
|
||||
|
||||
RaiiToken {
|
||||
graph: self.clone(),
|
||||
timeline,
|
||||
_marker: PhantomData,
|
||||
Timeline {
|
||||
token: Some(RaiiToken {
|
||||
graph: self.clone(),
|
||||
timeline,
|
||||
events: Vec::new(),
|
||||
_marker: PhantomData,
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
fn end(&self, timeline: TimelineId) {
|
||||
fn end(&self, timeline: TimelineId, events: Vec<(String, Instant)>) {
|
||||
let end = Instant::now();
|
||||
|
||||
let mut table = self.data.lock().unwrap();
|
||||
@ -100,6 +110,7 @@ impl TimeGraph {
|
||||
end,
|
||||
work_package_kind,
|
||||
name,
|
||||
events,
|
||||
});
|
||||
} else {
|
||||
bug!("end timing without start?")
|
||||
@ -113,13 +124,13 @@ impl TimeGraph {
|
||||
assert!(data.open_work_package.is_none());
|
||||
}
|
||||
|
||||
let mut timelines: Vec<PerThread> =
|
||||
let mut threads: Vec<PerThread> =
|
||||
table.values().map(|data| data.clone()).collect();
|
||||
|
||||
timelines.sort_by_key(|timeline| timeline.timings[0].start);
|
||||
threads.sort_by_key(|timeline| timeline.timings[0].start);
|
||||
|
||||
let earliest_instant = timelines[0].timings[0].start;
|
||||
let latest_instant = timelines.iter()
|
||||
let earliest_instant = threads[0].timings[0].start;
|
||||
let latest_instant = threads.iter()
|
||||
.map(|timeline| timeline.timings
|
||||
.last()
|
||||
.unwrap()
|
||||
@ -130,16 +141,46 @@ impl TimeGraph {
|
||||
|
||||
let mut file = File::create(format!("{}.html", output_filename)).unwrap();
|
||||
|
||||
writeln!(file, "<html>").unwrap();
|
||||
writeln!(file, "<head></head>").unwrap();
|
||||
writeln!(file, "<body>").unwrap();
|
||||
writeln!(file, "
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
#threads a {{
|
||||
position: absolute;
|
||||
overflow: hidden;
|
||||
}}
|
||||
#threads {{
|
||||
height: {total_height}px;
|
||||
width: {width}px;
|
||||
}}
|
||||
|
||||
.timeline {{
|
||||
display: none;
|
||||
width: {width}px;
|
||||
position: relative;
|
||||
}}
|
||||
|
||||
.timeline:target {{
|
||||
display: block;
|
||||
}}
|
||||
|
||||
.event {{
|
||||
position: absolute;
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id='threads'>
|
||||
",
|
||||
total_height = threads.len() * TIME_LINE_HEIGHT_STRIDE_IN_PX,
|
||||
width = OUTPUT_WIDTH_IN_PX,
|
||||
).unwrap();
|
||||
|
||||
let mut color = 0;
|
||||
|
||||
for (line_index, timeline) in timelines.iter().enumerate() {
|
||||
for (line_index, thread) in threads.iter().enumerate() {
|
||||
let line_top = line_index * TIME_LINE_HEIGHT_STRIDE_IN_PX;
|
||||
|
||||
for span in &timeline.timings {
|
||||
for span in &thread.timings {
|
||||
let start = distance(earliest_instant, span.start);
|
||||
let end = distance(earliest_instant, span.end);
|
||||
|
||||
@ -148,13 +189,13 @@ impl TimeGraph {
|
||||
|
||||
let colors = span.work_package_kind.0;
|
||||
|
||||
writeln!(file, "<div style='position:absolute; \
|
||||
overflow:hidden; \
|
||||
top:{}px; \
|
||||
left:{}px; \
|
||||
width:{}px; \
|
||||
height:{}px; \
|
||||
background:{};'>{}</div>",
|
||||
writeln!(file, "<a href='#timing{}'
|
||||
style='top:{}px; \
|
||||
left:{}px; \
|
||||
width:{}px; \
|
||||
height:{}px; \
|
||||
background:{};'>{}</a>",
|
||||
color,
|
||||
line_top,
|
||||
start,
|
||||
end - start,
|
||||
@ -167,8 +208,61 @@ impl TimeGraph {
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(file, "</body>").unwrap();
|
||||
writeln!(file, "</html>").unwrap();
|
||||
writeln!(file, "
|
||||
</div>
|
||||
").unwrap();
|
||||
|
||||
let mut idx = 0;
|
||||
for thread in threads.iter() {
|
||||
for timing in &thread.timings {
|
||||
let colors = timing.work_package_kind.0;
|
||||
let height = TIME_LINE_HEIGHT_STRIDE_IN_PX * timing.events.len();
|
||||
writeln!(file, "<div class='timeline'
|
||||
id='timing{}'
|
||||
style='background:{};height:{}px;'>",
|
||||
idx,
|
||||
colors[idx % colors.len()],
|
||||
height).unwrap();
|
||||
idx += 1;
|
||||
let max = distance(timing.start, timing.end);
|
||||
for (i, &(ref event, time)) in timing.events.iter().enumerate() {
|
||||
let i = i as u64;
|
||||
let time = distance(timing.start, time);
|
||||
let at = normalize(time, max, OUTPUT_WIDTH_IN_PX);
|
||||
writeln!(file, "<span class='event'
|
||||
style='left:{}px;\
|
||||
top:{}px;'>{}</span>",
|
||||
at,
|
||||
TIME_LINE_HEIGHT_IN_PX * i,
|
||||
event).unwrap();
|
||||
}
|
||||
writeln!(file, "</div>").unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(file, "
|
||||
</body>
|
||||
</html>
|
||||
").unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub fn noop() -> Timeline {
|
||||
Timeline { token: None }
|
||||
}
|
||||
|
||||
/// Record an event which happened at this moment on this timeline.
|
||||
///
|
||||
/// Events are displayed in the eventual HTML output where you can click on
|
||||
/// a particular timeline and it'll expand to all of the events that
|
||||
/// happened on that timeline. This can then be used to drill into a
|
||||
/// particular timeline and see what events are happening and taking the
|
||||
/// most time.
|
||||
pub fn record(&mut self, name: &str) {
|
||||
if let Some(ref mut token) = self.token {
|
||||
token.events.push((name.to_string(), Instant::now()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -26,7 +26,11 @@
|
||||
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
||||
|
||||
#if LLVM_VERSION_GE(4, 0)
|
||||
#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
|
||||
#include "llvm/Transforms/IPO/AlwaysInliner.h"
|
||||
#include "llvm/Transforms/IPO/FunctionImport.h"
|
||||
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
|
||||
#include "llvm/LTO/LTO.h"
|
||||
#endif
|
||||
|
||||
#include "llvm-c/Transforms/PassManagerBuilder.h"
|
||||
@ -102,6 +106,19 @@ extern "C" void LLVMRustAddPass(LLVMPassManagerRef PMR, LLVMPassRef RustPass) {
|
||||
PMB->add(Pass);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
bool LLVMRustPassManagerBuilderPopulateThinLTOPassManager(
|
||||
LLVMPassManagerBuilderRef PMBR,
|
||||
LLVMPassManagerRef PMR
|
||||
) {
|
||||
#if LLVM_VERSION_GE(4, 0)
|
||||
unwrap(PMBR)->populateThinLTOPassManager(*unwrap(PMR));
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef LLVM_COMPONENT_X86
|
||||
#define SUBTARGET_X86 SUBTARGET(X86)
|
||||
#else
|
||||
@ -740,3 +757,447 @@ extern "C" void LLVMRustSetModulePIELevel(LLVMModuleRef M) {
|
||||
unwrap(M)->setPIELevel(PIELevel::Level::Large);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" bool
|
||||
LLVMRustThinLTOAvailable() {
|
||||
#if LLVM_VERSION_GE(4, 0)
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if LLVM_VERSION_GE(4, 0)

// Here you'll find an implementation of ThinLTO as used by the Rust compiler
// right now. This ThinLTO support is only enabled on "recent ish" versions of
// LLVM, and otherwise it's just blanket rejected from other compilers.
//
// Most of this implementation is straight copied from LLVM. At the time of
// this writing it wasn't *quite* suitable to reuse more code from upstream
// for our purposes, but we should strive to upstream this support once it's
// ready to go! I figure we may want a bit of testing locally first before
// sending this upstream to LLVM. I hear though they're quite eager to receive
// feedback like this!
//
// If you're reading this code and wondering "what in the world" or you're
// thinking "good lord my LLVM upgrade is *still* failing due to these bindings"
// then fear not! (ok maybe fear a little). All code here is mostly based
// on `lib/LTO/ThinLTOCodeGenerator.cpp` in LLVM.
//
// You'll find that the general layout here roughly corresponds to the `run`
// method in that file as well as `ProcessThinLTOModule`. Functions are
// specifically commented below as well, but if you're updating this code
// or otherwise trying to understand it, the LLVM source will be useful in
// interpreting the mysteries within.
//
// Otherwise I'll apologize in advance, it probably requires a relatively
// significant investment on your part to "truly understand" what's going on
// here. Not saying I do myself, but it took me awhile staring at LLVM's source
// and various online resources about ThinLTO to make heads or tails of all
// this.

extern "C" bool
|
||||
LLVMRustWriteThinBitcodeToFile(LLVMPassManagerRef PMR,
|
||||
LLVMModuleRef M,
|
||||
const char *BcFile) {
|
||||
llvm::legacy::PassManager *PM = unwrap<llvm::legacy::PassManager>(PMR);
|
||||
std::error_code EC;
|
||||
llvm::raw_fd_ostream bc(BcFile, EC, llvm::sys::fs::F_None);
|
||||
if (EC) {
|
||||
LLVMRustSetLastError(EC.message().c_str());
|
||||
return false;
|
||||
}
|
||||
PM->add(createWriteThinLTOBitcodePass(bc));
|
||||
PM->run(*unwrap(M));
|
||||
delete PM;
|
||||
return true;
|
||||
}
|
||||
|
||||
// This is a shared data structure which *must* be threadsafe to share
// read-only amongst threads. This also corresponds basically to the arguments
// of the `ProcessThinLTOModule` function in the LLVM source.
struct LLVMRustThinLTOData {
  // The combined index that is the global analysis over all modules we're
  // performing ThinLTO for. This is mostly managed by LLVM.
  ModuleSummaryIndex Index;

  // All modules we may look at, stored as in-memory serialized versions. This
  // is later used when inlining to ensure we can extract any module to inline
  // from.
  StringMap<MemoryBufferRef> ModuleMap;

  // A set that we manage of everything we *don't* want internalized. Note that
  // this includes all transitive references right now as well, but it may not
  // always!
  DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;

  // Not 100% sure what these are, but they impact what's internalized and
  // what's inlined across modules, I believe.
  StringMap<FunctionImporter::ImportMapTy> ImportLists;
  StringMap<FunctionImporter::ExportSetTy> ExportLists;
  StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries;
};

// Just an argument to the `LLVMRustCreateThinLTOData` function below.
struct LLVMRustThinLTOModule {
  const char *identifier;
  const char *data;
  size_t len;
};

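Because `LLVMRustThinLTOModule` crosses the FFI boundary, the Rust side needs a `#[repr(C)]` mirror with the same field order. The sketch below is an assumed shape for that mirror (field names copied from the C++ definition above); the actual struct in rustc is not shown in this diff.

use std::os::raw::c_char;

// Must stay in sync with the C++ struct above: same fields, same order,
// C layout. The pointers borrow storage owned elsewhere (identifier string
// and serialized module bytes), so that storage must outlive any FFI call
// that receives this struct.
#[repr(C)]
pub struct ThinLTOModule {
    pub identifier: *const c_char,
    pub data: *const u8,
    pub len: usize,
}
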
// This is copied from `lib/LTO/ThinLTOCodeGenerator.cpp`. For a given symbol's
// summary list it picks the definition the linker would prefer: the first
// strong (non-weak, non-available-externally) definition if one exists,
// otherwise the first definition that isn't available_externally.
static const GlobalValueSummary *
getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) {
  auto StrongDefForLinker = llvm::find_if(
      GVSummaryList, [](const std::unique_ptr<GlobalValueSummary> &Summary) {
        auto Linkage = Summary->linkage();
        return !GlobalValue::isAvailableExternallyLinkage(Linkage) &&
               !GlobalValue::isWeakForLinker(Linkage);
      });
  if (StrongDefForLinker != GVSummaryList.end())
    return StrongDefForLinker->get();

  auto FirstDefForLinker = llvm::find_if(
      GVSummaryList, [](const std::unique_ptr<GlobalValueSummary> &Summary) {
        auto Linkage = Summary->linkage();
        return !GlobalValue::isAvailableExternallyLinkage(Linkage);
      });
  if (FirstDefForLinker == GVSummaryList.end())
    return nullptr;
  return FirstDefForLinker->get();
}

// This is a helper function we added that isn't present in LLVM's source.
//
// The way LTO works in Rust is that we typically have a number of symbols that
// we know ahead of time need to be preserved. We want to ensure that ThinLTO
// doesn't accidentally internalize any of these and otherwise is always
// ready to keep them linking correctly.
//
// This function will recursively walk the `GUID` provided and all of its
// references, as specified in the `Index`. In other words, we're taking a
// `GUID` as input, adding it to `Preserved`, and then taking all `GUID`
// items that the input references and recursing.
static void
addPreservedGUID(const ModuleSummaryIndex &Index,
                 DenseSet<GlobalValue::GUID> &Preserved,
                 GlobalValue::GUID GUID) {
  if (Preserved.count(GUID))
    return;
  Preserved.insert(GUID);

  auto SummaryList = Index.findGlobalValueSummaryList(GUID);
  if (SummaryList == Index.end())
    return;
  for (auto &Summary : SummaryList->second) {
    for (auto &Ref : Summary->refs()) {
      if (Ref.isGUID()) {
        addPreservedGUID(Index, Preserved, Ref.getGUID());
      } else {
        auto Value = Ref.getValue();
        addPreservedGUID(Index, Preserved, Value->getGUID());
      }
    }

    GlobalValueSummary *GVSummary = Summary.get();
    if (isa<FunctionSummary>(GVSummary)) {
      FunctionSummary *FS = cast<FunctionSummary>(GVSummary);
      for (auto &Call: FS->calls()) {
        if (Call.first.isGUID()) {
          addPreservedGUID(Index, Preserved, Call.first.getGUID());
        } else {
          auto Value = Call.first.getValue();
          addPreservedGUID(Index, Preserved, Value->getGUID());
        }
      }
      for (auto &GUID: FS->type_tests()) {
        addPreservedGUID(Index, Preserved, GUID);
      }
    }
  }
}

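The preservation walk above is just a transitive closure over the summary graph: mark the symbol, then recurse into everything it references, calls, or type-tests. A tiny, purely illustrative Rust analogue (plain integers standing in for GUIDs, an adjacency map standing in for the summary index) may make the recursion pattern clearer:

use std::collections::{HashMap, HashSet};

// Illustrative stand-ins: a GUID is just a u64, and `refs` plays the role of
// the references/calls/type_tests recorded in each summary.
fn add_preserved(refs: &HashMap<u64, Vec<u64>>, preserved: &mut HashSet<u64>, guid: u64) {
    if !preserved.insert(guid) {
        return; // already visited, stop the recursion
    }
    if let Some(targets) = refs.get(&guid) {
        for &target in targets {
            add_preserved(refs, preserved, target);
        }
    }
}

fn main() {
    let mut refs = HashMap::new();
    refs.insert(1, vec![2, 3]); // symbol 1 references 2 and 3
    refs.insert(3, vec![4]);    // symbol 3 references 4
    let mut preserved = HashSet::new();
    add_preserved(&refs, &mut preserved, 1);
    assert_eq!(preserved.len(), 4); // 1, 2, 3 and 4 are all kept alive
}
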
// The main entry point for creating the global ThinLTO analysis. The structure
// here is basically the same as before threads are spawned in the `run`
// function of `lib/LTO/ThinLTOCodeGenerator.cpp`.
extern "C" LLVMRustThinLTOData*
LLVMRustCreateThinLTOData(LLVMRustThinLTOModule *modules,
                          int num_modules,
                          const char **preserved_symbols,
                          int num_symbols) {
  auto Ret = llvm::make_unique<LLVMRustThinLTOData>();

  // Load each module's summary and merge it into one combined index
  for (int i = 0; i < num_modules; i++) {
    auto module = &modules[i];
    StringRef buffer(module->data, module->len);
    MemoryBufferRef mem_buffer(buffer, module->identifier);

    Ret->ModuleMap[module->identifier] = mem_buffer;

    Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
      object::ModuleSummaryIndexObjectFile::create(mem_buffer);
    if (!ObjOrErr) {
      LLVMRustSetLastError(toString(ObjOrErr.takeError()).c_str());
      return nullptr;
    }
    auto Index = (*ObjOrErr)->takeIndex();
    Ret->Index.mergeFrom(std::move(Index), i);
  }

  // Collect for each module the list of functions it defines (GUID -> Summary)
  Ret->Index.collectDefinedGVSummariesPerModule(Ret->ModuleToDefinedGVSummaries);

  // Convert the preserved symbols set from string to GUID; this is then needed
  // for internalization. We use `addPreservedGUID` to include any transitively
  // used symbol as well.
  for (int i = 0; i < num_symbols; i++) {
    addPreservedGUID(Ret->Index,
                     Ret->GUIDPreservedSymbols,
                     GlobalValue::getGUID(preserved_symbols[i]));
  }

  // Collect the import/export lists for all modules from the call-graph in the
  // combined index
  //
  // This is copied from `lib/LTO/ThinLTOCodeGenerator.cpp`
  computeDeadSymbols(Ret->Index, Ret->GUIDPreservedSymbols);
  ComputeCrossModuleImport(
    Ret->Index,
    Ret->ModuleToDefinedGVSummaries,
    Ret->ImportLists,
    Ret->ExportLists
  );

  // Resolve LinkOnce/Weak symbols. This has to be computed early because it
  // impacts the caching.
  //
  // This is copied from `lib/LTO/ThinLTOCodeGenerator.cpp`
  StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
  DenseMap<GlobalValue::GUID, const GlobalValueSummary *> PrevailingCopy;
  for (auto &I : Ret->Index) {
    if (I.second.size() > 1)
      PrevailingCopy[I.first] = getFirstDefinitionForLinker(I.second);
  }
  auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) {
    const auto &Prevailing = PrevailingCopy.find(GUID);
    if (Prevailing == PrevailingCopy.end())
      return true;
    return Prevailing->second == S;
  };
  auto recordNewLinkage = [&](StringRef ModuleIdentifier,
                              GlobalValue::GUID GUID,
                              GlobalValue::LinkageTypes NewLinkage) {
    ResolvedODR[ModuleIdentifier][GUID] = NewLinkage;
  };
  thinLTOResolveWeakForLinkerInIndex(Ret->Index, isPrevailing, recordNewLinkage);
  auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) {
    const auto &ExportList = Ret->ExportLists.find(ModuleIdentifier);
    return (ExportList != Ret->ExportLists.end() &&
            ExportList->second.count(GUID)) ||
           Ret->GUIDPreservedSymbols.count(GUID);
  };
  thinLTOInternalizeAndPromoteInIndex(Ret->Index, isExported);

  return Ret.release();
}

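From the Rust side, the entry point above would be reached with an array of serialized modules plus the list of symbols to preserve. The sketch below shows one plausible calling convention; the `ThinLTOModule` mirror, the opaque `ThinLTOData` handle, and the `create_thin_lto_data` helper are assumptions for illustration, and the `int` arguments are assumed to map to `i32`.

use std::ffi::CString;
use std::os::raw::c_char;

// #[repr(C)] mirror of the C struct defined earlier in this file.
#[repr(C)]
pub struct ThinLTOModule {
    pub identifier: *const c_char,
    pub data: *const u8,
    pub len: usize,
}

// Opaque handle to the C++ LLVMRustThinLTOData; assumed shape.
#[repr(C)]
pub struct ThinLTOData { _private: [u8; 0] }

extern "C" {
    fn LLVMRustCreateThinLTOData(
        modules: *mut ThinLTOModule,
        num_modules: i32,
        preserved_symbols: *const *const c_char,
        num_symbols: i32,
    ) -> *mut ThinLTOData;
    fn LLVMRustFreeThinLTOData(data: *mut ThinLTOData);
}

// Sketch of a caller: the serialized module bytes, identifier strings, and the
// preserved-symbol CStrings must all stay alive for the duration of the call.
unsafe fn create_thin_lto_data(
    modules: &mut [ThinLTOModule],
    preserved: &[CString],
) -> Option<*mut ThinLTOData> {
    let symbols: Vec<*const c_char> = preserved.iter().map(|s| s.as_ptr()).collect();
    let data = LLVMRustCreateThinLTOData(
        modules.as_mut_ptr(),
        modules.len() as i32,
        symbols.as_ptr(),
        symbols.len() as i32,
    );
    // A null return signals an error recorded via LLVMRustSetLastError.
    if data.is_null() { None } else { Some(data) }
}

The returned pointer would eventually be released with `LLVMRustFreeThinLTOData`, defined just below.
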
extern "C" void
|
||||
LLVMRustFreeThinLTOData(LLVMRustThinLTOData *Data) {
|
||||
delete Data;
|
||||
}
|
||||
|
||||
// Below are the various passes that happen *per module* when doing ThinLTO.
|
||||
//
|
||||
// In other words, these are the functions that are all run concurrently
|
||||
// with one another, one per module. The passes here correspond to the analysis
|
||||
// passes in `lib/LTO/ThinLTOCodeGenerator.cpp`, currently found in the
|
||||
// `ProcessThinLTOModule` function. Here they're split up into separate steps
|
||||
// so rustc can save off the intermediate bytecode between each step.
|
||||
|
||||
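Put together, the per-module sequence described above (rename, resolve weak symbols, internalize, then import) might be driven from the Rust side roughly as follows. Only the extern declarations mirror the functions defined below; the opaque handle types, error strings, and the `prepare_module` helper are illustrative assumptions.

// Assumed opaque handles standing in for LLVMRustThinLTOData and LLVMModuleRef.
#[repr(C)]
pub struct ThinLTOData { _private: [u8; 0] }
#[repr(C)]
pub struct Module { _private: [u8; 0] }

extern "C" {
    fn LLVMRustPrepareThinLTORename(data: *const ThinLTOData, m: *mut Module) -> bool;
    fn LLVMRustPrepareThinLTOResolveWeak(data: *const ThinLTOData, m: *mut Module) -> bool;
    fn LLVMRustPrepareThinLTOInternalize(data: *const ThinLTOData, m: *mut Module) -> bool;
    fn LLVMRustPrepareThinLTOImport(data: *const ThinLTOData, m: *mut Module) -> bool;
}

// One module's ThinLTO preparation, run concurrently with other modules;
// after these four steps the module goes through the usual optimization
// and codegen pipeline.
unsafe fn prepare_module(data: *const ThinLTOData, m: *mut Module) -> Result<(), &'static str> {
    if !LLVMRustPrepareThinLTORename(data, m) { return Err("rename step failed"); }
    if !LLVMRustPrepareThinLTOResolveWeak(data, m) { return Err("resolve-weak step failed"); }
    if !LLVMRustPrepareThinLTOInternalize(data, m) { return Err("internalize step failed"); }
    if !LLVMRustPrepareThinLTOImport(data, m) { return Err("import step failed"); }
    Ok(())
}
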
extern "C" bool
|
||||
LLVMRustPrepareThinLTORename(const LLVMRustThinLTOData *Data, LLVMModuleRef M) {
|
||||
Module &Mod = *unwrap(M);
|
||||
if (renameModuleForThinLTO(Mod, Data->Index)) {
|
||||
LLVMRustSetLastError("renameModuleForThinLTO failed");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" bool
|
||||
LLVMRustPrepareThinLTOResolveWeak(const LLVMRustThinLTOData *Data, LLVMModuleRef M) {
|
||||
Module &Mod = *unwrap(M);
|
||||
const auto &DefinedGlobals = Data->ModuleToDefinedGVSummaries.lookup(Mod.getModuleIdentifier());
|
||||
thinLTOResolveWeakForLinkerModule(Mod, DefinedGlobals);
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" bool
|
||||
LLVMRustPrepareThinLTOInternalize(const LLVMRustThinLTOData *Data, LLVMModuleRef M) {
|
||||
Module &Mod = *unwrap(M);
|
||||
const auto &DefinedGlobals = Data->ModuleToDefinedGVSummaries.lookup(Mod.getModuleIdentifier());
|
||||
thinLTOInternalizeModule(Mod, DefinedGlobals);
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" bool
|
||||
LLVMRustPrepareThinLTOImport(const LLVMRustThinLTOData *Data, LLVMModuleRef M) {
|
||||
Module &Mod = *unwrap(M);
|
||||
const auto &ImportList = Data->ImportLists.lookup(Mod.getModuleIdentifier());
|
||||
auto Loader = [&](StringRef Identifier) {
|
||||
const auto &Memory = Data->ModuleMap.lookup(Identifier);
|
||||
auto &Context = Mod.getContext();
|
||||
return getLazyBitcodeModule(Memory, Context, true, true);
|
||||
};
|
||||
FunctionImporter Importer(Data->Index, Loader);
|
||||
Expected<bool> Result = Importer.importFunctions(Mod, ImportList);
|
||||
if (!Result) {
|
||||
LLVMRustSetLastError(toString(Result.takeError()).c_str());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// This struct and various functions are sort of a hack right now, but the
// problem is that we've got in-memory LLVM modules after we generate and
// optimize all codegen-units for one compilation in rustc. To be compatible
// with the LTO support above we need to serialize the modules plus their
// ThinLTO summary into memory.
//
// This structure is basically an owned version of a serialized module, with
// a ThinLTO summary attached.
struct LLVMRustThinLTOBuffer {
  std::string data;
};

extern "C" LLVMRustThinLTOBuffer*
LLVMRustThinLTOBufferCreate(LLVMModuleRef M) {
  auto Ret = llvm::make_unique<LLVMRustThinLTOBuffer>();
  {
    raw_string_ostream OS(Ret->data);
    {
      legacy::PassManager PM;
      PM.add(createWriteThinLTOBitcodePass(OS));
      PM.run(*unwrap(M));
    }
  }
  return Ret.release();
}

extern "C" void
LLVMRustThinLTOBufferFree(LLVMRustThinLTOBuffer *Buffer) {
  delete Buffer;
}

extern "C" const void*
LLVMRustThinLTOBufferPtr(const LLVMRustThinLTOBuffer *Buffer) {
  return Buffer->data.data();
}

extern "C" size_t
LLVMRustThinLTOBufferLen(const LLVMRustThinLTOBuffer *Buffer) {
  return Buffer->data.length();
}

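The create/free/ptr/len quartet above is a natural fit for an RAII wrapper on the Rust side, so the buffer is always released and its bytes are only borrowed while the wrapper is alive. The wrapper below is a sketch under assumed opaque handle types; it is not the actual rustc type.

// Assumed opaque handles; `Buffer` stands in for LLVMRustThinLTOBuffer and
// `Module` for LLVMModuleRef.
#[repr(C)]
pub struct Buffer { _private: [u8; 0] }
#[repr(C)]
pub struct Module { _private: [u8; 0] }

extern "C" {
    fn LLVMRustThinLTOBufferCreate(m: *mut Module) -> *mut Buffer;
    fn LLVMRustThinLTOBufferFree(buffer: *mut Buffer);
    fn LLVMRustThinLTOBufferPtr(buffer: *const Buffer) -> *const u8;
    fn LLVMRustThinLTOBufferLen(buffer: *const Buffer) -> usize;
}

// RAII wrapper: the buffer owns the serialized module plus ThinLTO summary
// on the C++ side and is freed when the wrapper is dropped.
pub struct ThinBuffer(*mut Buffer);

impl ThinBuffer {
    pub unsafe fn new(m: *mut Module) -> ThinBuffer {
        ThinBuffer(LLVMRustThinLTOBufferCreate(m))
    }

    // Borrows the serialized bytes for as long as the wrapper lives.
    pub fn data(&self) -> &[u8] {
        unsafe {
            let ptr = LLVMRustThinLTOBufferPtr(self.0);
            let len = LLVMRustThinLTOBufferLen(self.0);
            std::slice::from_raw_parts(ptr, len)
        }
    }
}

impl Drop for ThinBuffer {
    fn drop(&mut self) {
        unsafe { LLVMRustThinLTOBufferFree(self.0) }
    }
}
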
// This is what we use to parse upstream bitcode for actual ThinLTO
// processing. We'll call this once per module optimized through ThinLTO, and
// it'll be called concurrently on many threads.
extern "C" LLVMModuleRef
LLVMRustParseBitcodeForThinLTO(LLVMContextRef Context,
                               const char *data,
                               size_t len,
                               const char *identifier) {
  StringRef Data(data, len);
  MemoryBufferRef Buffer(Data, identifier);
  unwrap(Context)->enableDebugTypeODRUniquing();
  Expected<std::unique_ptr<Module>> SrcOrError =
    parseBitcodeFile(Buffer, *unwrap(Context));
  if (!SrcOrError) {
    LLVMRustSetLastError(toString(SrcOrError.takeError()).c_str());
    return nullptr;
  }
  return wrap(std::move(*SrcOrError).release());
}

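Since the parse runs concurrently on many threads, each worker would parse into its own LLVM context. A minimal, assumption-laden sketch of such a call site (opaque handle types and the helper are illustrative only):

use std::os::raw::c_char;

// Assumed opaque handles for LLVMContextRef and LLVMModuleRef.
#[repr(C)]
pub struct Context { _private: [u8; 0] }
#[repr(C)]
pub struct Module { _private: [u8; 0] }

extern "C" {
    // Mirrors the definition above; returns a null module on parse errors.
    fn LLVMRustParseBitcodeForThinLTO(
        cx: *mut Context,
        data: *const c_char,
        len: usize,
        identifier: *const c_char,
    ) -> *mut Module;
}

// Each worker thread parses into its *own* context, so the concurrent calls
// described above never share an LLVMContext across threads.
unsafe fn parse_for_thin_lto(
    cx: *mut Context,
    bytes: &[u8],
    identifier: *const c_char,
) -> Option<*mut Module> {
    let module = LLVMRustParseBitcodeForThinLTO(
        cx,
        bytes.as_ptr() as *const c_char,
        bytes.len(),
        identifier,
    );
    if module.is_null() { None } else { Some(module) }
}
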
#else

extern "C" bool
LLVMRustWriteThinBitcodeToFile(LLVMPassManagerRef PMR,
                               LLVMModuleRef M,
                               const char *BcFile) {
  llvm_unreachable("ThinLTO not available");
}

struct LLVMRustThinLTOData {
};

struct LLVMRustThinLTOModule {
};

extern "C" LLVMRustThinLTOData*
LLVMRustCreateThinLTOData(LLVMRustThinLTOModule *modules,
                          int num_modules,
                          const char **preserved_symbols,
                          int num_symbols) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" bool
LLVMRustPrepareThinLTORename(const LLVMRustThinLTOData *Data, LLVMModuleRef M) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" bool
LLVMRustPrepareThinLTOResolveWeak(const LLVMRustThinLTOData *Data, LLVMModuleRef M) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" bool
LLVMRustPrepareThinLTOInternalize(const LLVMRustThinLTOData *Data, LLVMModuleRef M) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" bool
LLVMRustPrepareThinLTOImport(const LLVMRustThinLTOData *Data, LLVMModuleRef M) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" void
LLVMRustFreeThinLTOData(LLVMRustThinLTOData *Data) {
  llvm_unreachable("ThinLTO not available");
}

struct LLVMRustThinLTOBuffer {
};

extern "C" LLVMRustThinLTOBuffer*
LLVMRustThinLTOBufferCreate(LLVMModuleRef M) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" void
LLVMRustThinLTOBufferFree(LLVMRustThinLTOBuffer *Buffer) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" const void*
LLVMRustThinLTOBufferPtr(const LLVMRustThinLTOBuffer *Buffer) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" size_t
LLVMRustThinLTOBufferLen(const LLVMRustThinLTOBuffer *Buffer) {
  llvm_unreachable("ThinLTO not available");
}

extern "C" LLVMModuleRef
LLVMRustParseBitcodeForThinLTO(LLVMContextRef Context,
                               const char *data,
                               size_t len,
                               const char *identifier) {
  llvm_unreachable("ThinLTO not available");
}
#endif // LLVM_VERSION_GE(4, 0)

@ -11,7 +11,7 @@
|
||||
// ignore-tidy-linelength
|
||||
// compile-flags:-Zprint-trans-items=eager
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<drop_in_place_intrinsic::StructWithDtor[0]> @@ drop_in_place_intrinsic.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<drop_in_place_intrinsic::StructWithDtor[0]> @@ drop_in_place_intrinsic0[Internal]
|
||||
struct StructWithDtor(u32);
|
||||
|
||||
impl Drop for StructWithDtor {
|
||||
@ -22,7 +22,7 @@ impl Drop for StructWithDtor {
|
||||
//~ TRANS_ITEM fn drop_in_place_intrinsic::main[0]
|
||||
fn main() {
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<[drop_in_place_intrinsic::StructWithDtor[0]; 2]> @@ drop_in_place_intrinsic.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<[drop_in_place_intrinsic::StructWithDtor[0]; 2]> @@ drop_in_place_intrinsic0[Internal]
|
||||
let x = [StructWithDtor(0), StructWithDtor(1)];
|
||||
|
||||
drop_slice_in_place(&x);
|
||||
@ -34,7 +34,7 @@ fn drop_slice_in_place(x: &[StructWithDtor]) {
|
||||
// This is the interesting thing in this test case: Normally we would
|
||||
// not have drop-glue for the unsized [StructWithDtor]. This has to be
|
||||
// generated though when the drop_in_place() intrinsic is used.
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<[drop_in_place_intrinsic::StructWithDtor[0]]> @@ drop_in_place_intrinsic.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<[drop_in_place_intrinsic::StructWithDtor[0]]> @@ drop_in_place_intrinsic0[Internal]
|
||||
::std::ptr::drop_in_place(x as *const _ as *mut [StructWithDtor]);
|
||||
}
|
||||
}
|
||||
|
@ -45,7 +45,7 @@ enum EnumNoDrop<T1, T2> {
|
||||
struct NonGenericNoDrop(i32);
|
||||
|
||||
struct NonGenericWithDrop(i32);
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::NonGenericWithDrop[0]> @@ generic_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::NonGenericWithDrop[0]> @@ generic_drop_glue0[Internal]
|
||||
|
||||
impl Drop for NonGenericWithDrop {
|
||||
//~ TRANS_ITEM fn generic_drop_glue::{{impl}}[2]::drop[0]
|
||||
@ -54,11 +54,11 @@ impl Drop for NonGenericWithDrop {
|
||||
|
||||
//~ TRANS_ITEM fn generic_drop_glue::main[0]
|
||||
fn main() {
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::StructWithDrop[0]<i8, char>> @@ generic_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::StructWithDrop[0]<i8, char>> @@ generic_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn generic_drop_glue::{{impl}}[0]::drop[0]<i8, char>
|
||||
let _ = StructWithDrop { x: 0i8, y: 'a' }.x;
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::StructWithDrop[0]<&str, generic_drop_glue::NonGenericNoDrop[0]>> @@ generic_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::StructWithDrop[0]<&str, generic_drop_glue::NonGenericNoDrop[0]>> @@ generic_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn generic_drop_glue::{{impl}}[0]::drop[0]<&str, generic_drop_glue::NonGenericNoDrop[0]>
|
||||
let _ = StructWithDrop { x: "&str", y: NonGenericNoDrop(0) }.y;
|
||||
|
||||
@ -67,17 +67,17 @@ fn main() {
|
||||
|
||||
// This is supposed to generate drop-glue because it contains a field that
|
||||
// needs to be dropped.
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::StructNoDrop[0]<generic_drop_glue::NonGenericWithDrop[0], f64>> @@ generic_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::StructNoDrop[0]<generic_drop_glue::NonGenericWithDrop[0], f64>> @@ generic_drop_glue0[Internal]
|
||||
let _ = StructNoDrop { x: NonGenericWithDrop(0), y: 0f64 }.y;
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::EnumWithDrop[0]<i32, i64>> @@ generic_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::EnumWithDrop[0]<i32, i64>> @@ generic_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn generic_drop_glue::{{impl}}[1]::drop[0]<i32, i64>
|
||||
let _ = match EnumWithDrop::A::<i32, i64>(0) {
|
||||
EnumWithDrop::A(x) => x,
|
||||
EnumWithDrop::B(x) => x as i32
|
||||
};
|
||||
|
||||
//~TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::EnumWithDrop[0]<f64, f32>> @@ generic_drop_glue.cgu-0[Internal]
|
||||
//~TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<generic_drop_glue::EnumWithDrop[0]<f64, f32>> @@ generic_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn generic_drop_glue::{{impl}}[1]::drop[0]<f64, f32>
|
||||
let _ = match EnumWithDrop::B::<f64, f32>(1.0) {
|
||||
EnumWithDrop::A(x) => x,
|
||||
|
@ -31,13 +31,13 @@ impl<T> Trait for Struct<T> {
|
||||
fn main() {
|
||||
let s1 = Struct { _a: 0u32 };
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<instantiation_through_vtable::Struct[0]<u32>> @@ instantiation_through_vtable.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<instantiation_through_vtable::Struct[0]<u32>> @@ instantiation_through_vtable0[Internal]
|
||||
//~ TRANS_ITEM fn instantiation_through_vtable::{{impl}}[0]::foo[0]<u32>
|
||||
//~ TRANS_ITEM fn instantiation_through_vtable::{{impl}}[0]::bar[0]<u32>
|
||||
let _ = &s1 as &Trait;
|
||||
|
||||
let s1 = Struct { _a: 0u64 };
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<instantiation_through_vtable::Struct[0]<u64>> @@ instantiation_through_vtable.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<instantiation_through_vtable::Struct[0]<u64>> @@ instantiation_through_vtable0[Internal]
|
||||
//~ TRANS_ITEM fn instantiation_through_vtable::{{impl}}[0]::foo[0]<u64>
|
||||
//~ TRANS_ITEM fn instantiation_through_vtable::{{impl}}[0]::bar[0]<u64>
|
||||
let _ = &s1 as &Trait;
|
||||
|
@ -13,7 +13,7 @@
|
||||
|
||||
#![deny(dead_code)]
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<non_generic_drop_glue::StructWithDrop[0]> @@ non_generic_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<non_generic_drop_glue::StructWithDrop[0]> @@ non_generic_drop_glue0[Internal]
|
||||
struct StructWithDrop {
|
||||
x: i32
|
||||
}
|
||||
@ -27,7 +27,7 @@ struct StructNoDrop {
|
||||
x: i32
|
||||
}
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<non_generic_drop_glue::EnumWithDrop[0]> @@ non_generic_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<non_generic_drop_glue::EnumWithDrop[0]> @@ non_generic_drop_glue0[Internal]
|
||||
enum EnumWithDrop {
|
||||
A(i32)
|
||||
}
|
||||
|
@ -13,11 +13,11 @@
|
||||
|
||||
#![deny(dead_code)]
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::Root[0]> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::Root[0]> @@ transitive_drop_glue0[Internal]
|
||||
struct Root(Intermediate);
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::Intermediate[0]> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::Intermediate[0]> @@ transitive_drop_glue0[Internal]
|
||||
struct Intermediate(Leaf);
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::Leaf[0]> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::Leaf[0]> @@ transitive_drop_glue0[Internal]
|
||||
struct Leaf;
|
||||
|
||||
impl Drop for Leaf {
|
||||
@ -38,15 +38,15 @@ fn main() {
|
||||
|
||||
let _ = Root(Intermediate(Leaf));
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::RootGen[0]<u32>> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::IntermediateGen[0]<u32>> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::LeafGen[0]<u32>> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::RootGen[0]<u32>> @@ transitive_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::IntermediateGen[0]<u32>> @@ transitive_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::LeafGen[0]<u32>> @@ transitive_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn transitive_drop_glue::{{impl}}[1]::drop[0]<u32>
|
||||
let _ = RootGen(IntermediateGen(LeafGen(0u32)));
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::RootGen[0]<i16>> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::IntermediateGen[0]<i16>> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::LeafGen[0]<i16>> @@ transitive_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::RootGen[0]<i16>> @@ transitive_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::IntermediateGen[0]<i16>> @@ transitive_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<transitive_drop_glue::LeafGen[0]<i16>> @@ transitive_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn transitive_drop_glue::{{impl}}[1]::drop[0]<i16>
|
||||
let _ = RootGen(IntermediateGen(LeafGen(0i16)));
|
||||
}
|
||||
|
@ -13,7 +13,7 @@
|
||||
|
||||
#![deny(dead_code)]
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<tuple_drop_glue::Dropped[0]> @@ tuple_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<tuple_drop_glue::Dropped[0]> @@ tuple_drop_glue0[Internal]
|
||||
struct Dropped;
|
||||
|
||||
impl Drop for Dropped {
|
||||
@ -23,10 +23,10 @@ impl Drop for Dropped {
|
||||
|
||||
//~ TRANS_ITEM fn tuple_drop_glue::main[0]
|
||||
fn main() {
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<(u32, tuple_drop_glue::Dropped[0])> @@ tuple_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<(u32, tuple_drop_glue::Dropped[0])> @@ tuple_drop_glue0[Internal]
|
||||
let x = (0u32, Dropped);
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<(i16, (tuple_drop_glue::Dropped[0], bool))> @@ tuple_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<(tuple_drop_glue::Dropped[0], bool)> @@ tuple_drop_glue.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<(i16, (tuple_drop_glue::Dropped[0], bool))> @@ tuple_drop_glue0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<(tuple_drop_glue::Dropped[0], bool)> @@ tuple_drop_glue0[Internal]
|
||||
let x = (0i16, (Dropped, true));
|
||||
}
|
||||
|
@ -57,13 +57,13 @@ fn main()
|
||||
{
|
||||
// simple case
|
||||
let bool_sized = &true;
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<bool> @@ unsizing.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<bool> @@ unsizing0[Internal]
|
||||
//~ TRANS_ITEM fn unsizing::{{impl}}[0]::foo[0]
|
||||
let _bool_unsized = bool_sized as &Trait;
|
||||
|
||||
let char_sized = &'a';
|
||||
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<char> @@ unsizing.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<char> @@ unsizing0[Internal]
|
||||
//~ TRANS_ITEM fn unsizing::{{impl}}[1]::foo[0]
|
||||
let _char_unsized = char_sized as &Trait;
|
||||
|
||||
@ -73,13 +73,13 @@ fn main()
|
||||
_b: 2,
|
||||
_c: 3.0f64
|
||||
};
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<f64> @@ unsizing.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<f64> @@ unsizing0[Internal]
|
||||
//~ TRANS_ITEM fn unsizing::{{impl}}[2]::foo[0]
|
||||
let _struct_unsized = struct_sized as &Struct<Trait>;
|
||||
|
||||
// custom coercion
|
||||
let wrapper_sized = Wrapper(&0u32);
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<u32> @@ unsizing.cgu-0[Internal]
|
||||
//~ TRANS_ITEM fn core::ptr[0]::drop_in_place[0]<u32> @@ unsizing0[Internal]
|
||||
//~ TRANS_ITEM fn unsizing::{{impl}}[3]::foo[0]
|
||||
let _wrapper_sized = wrapper_sized as Wrapper<Trait>;
|
||||
}
|
||||
|
@@ -2,5 +2,5 @@

all:
	$(RUSTC) -C extra-filename=bar foo.rs -C save-temps
	rm $(TMPDIR)/foobar.0.o
	rm $(TMPDIR)/foobar.foo0.rust-cgu.o
	rm $(TMPDIR)/$(call BIN,foobar)

@@ -6,4 +6,4 @@
all:
	$(RUSTC) cci_lib.rs
	$(RUSTC) foo.rs --emit=llvm-ir -C codegen-units=3
	[ "$$(cat "$(TMPDIR)"/foo.?.ll | grep -c define\ .*cci_fn)" -eq "2" ]
	[ "$$(cat "$(TMPDIR)"/foo.*.ll | grep -c define\ .*cci_fn)" -eq "2" ]

@@ -8,7 +8,7 @@

all:
	$(RUSTC) foo.rs --emit=llvm-ir -C codegen-units=3
	[ "$$(cat "$(TMPDIR)"/foo.?.ll | grep -c define\ i32\ .*inlined)" -eq "0" ]
	[ "$$(cat "$(TMPDIR)"/foo.?.ll | grep -c define\ internal\ i32\ .*inlined)" -eq "2" ]
	[ "$$(cat "$(TMPDIR)"/foo.?.ll | grep -c define\ hidden\ i32\ .*normal)" -eq "1" ]
	[ "$$(cat "$(TMPDIR)"/foo.?.ll | grep -c declare\ hidden\ i32\ .*normal)" -eq "2" ]
	[ "$$(cat "$(TMPDIR)"/foo.*.ll | grep -c define\ i32\ .*inlined)" -eq "0" ]
	[ "$$(cat "$(TMPDIR)"/foo.*.ll | grep -c define\ internal\ i32\ .*inlined)" -eq "2" ]
	[ "$$(cat "$(TMPDIR)"/foo.*.ll | grep -c define\ hidden\ i32\ .*normal)" -eq "1" ]
	[ "$$(cat "$(TMPDIR)"/foo.*.ll | grep -c declare\ hidden\ i32\ .*normal)" -eq "2" ]

@@ -6,4 +6,4 @@

all:
	$(RUSTC) foo.rs --emit=llvm-ir -C codegen-units=3
	[ "$$(cat "$(TMPDIR)"/foo.?.ll | grep -c define\ .*magic_fn)" -eq "3" ]
	[ "$$(cat "$(TMPDIR)"/foo.*.ll | grep -c define\ .*magic_fn)" -eq "3" ]

src/test/run-pass/auxiliary/thin-lto-inlines-aux.rs (new file, 17 lines)
@@ -0,0 +1,17 @@
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// no-prefer-dynamic

#![crate_type = "rlib"]

pub fn bar() -> u32 {
    3
}
src/test/run-pass/thin-lto-inlines.rs (new file, 39 lines)
@@ -0,0 +1,39 @@
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// compile-flags: -Z thinlto -C codegen-units=8 -O
// min-llvm-version 4.0
// ignore-emscripten

// We want to assert here that ThinLTO will inline across codegen units. There's
// not really a great way to do that in general so we sort of hack around it by
// praying two functions go into separate codegen units and then assuming that
// if inlining *doesn't* happen the first byte of the functions will differ.

pub fn foo() -> u32 {
    bar::bar()
}

mod bar {
    pub fn bar() -> u32 {
        3
    }
}

fn main() {
    println!("{} {}", foo(), bar::bar());

    unsafe {
        let foo = foo as usize as *const u8;
        let bar = bar::bar as usize as *const u8;

        assert_eq!(*foo, *bar);
    }
}
src/test/run-pass/thin-lto-inlines2.rs (new file, 38 lines)
@@ -0,0 +1,38 @@
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// compile-flags: -Z thinlto -C codegen-units=8 -O -C lto
// aux-build:thin-lto-inlines-aux.rs
// min-llvm-version 4.0
// no-prefer-dynamic
// ignore-emscripten

// We want to assert here that ThinLTO will inline across codegen units. There's
// not really a great way to do that in general so we sort of hack around it by
// praying two functions go into separate codegen units and then assuming that
// if inlining *doesn't* happen the first byte of the functions will differ.

extern crate thin_lto_inlines_aux as bar;

pub fn foo() -> u32 {
    bar::bar()
}

fn main() {
    println!("{} {}", foo(), bar::bar());

    unsafe {
        let foo = foo as usize as *const u8;
        let bar = bar::bar as usize as *const u8;

        assert_eq!(*foo, *bar);
    }
}