From a09e9e9a2a0977d164173d8d08aac9fc3c9c3b4b Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 22 Mar 2018 13:39:52 -0700 Subject: [PATCH] ci: Don't use Travis caches for docker images This commit moves away from caching on Travis to our own caching on S3 for caching docker layers between builds. Unfortunately the Travis caches have over time had a few critical pain points: * Caches are only updated for successful builds, meaning that if a build times out or fails in a different location the successfully-created docker image isn't always cached. While this makes sense as a general rule of caches it hurts our use cases. * Caches are per-branch and builder which means that we don't have a separate cache on each release channel. All our merges go through the `auto` branch which means that they're all sharing the same cache, even those for merging to master/beta. This means that PRs which switch between master/beta will keep rebuilding and having cache misses. * Caches have historically been invalidated somewhat regularly a little more aggressively than we'd want (I think). * We don't always need to update the contents of the cache if the Docker image didn't change at all, and saving off the docker layers can sometimes be quite expensive. For all these reasons this commit drops the usage of Travis's built-in caching support. Instead our own caching is used by storing blobs to S3. Normally this would be a very risky endeavour but we're basically priming a cache for a cache (docker) so if we get this wrong the failure mode is longer builds, not stale caches. We'll notice that pretty quickly and hopefully fix it! The logic here is inserted directly into the `src/ci/docker/run.sh` script to download an image based on a shasum of the `Dockerfile` and other assorted files. This blob, if found, is loaded into docker and we record what layers were inserted. After docker finishes the build (hopefully quickly with lots of cache hits) we then see the sha of the final image. 
If it's one of the layers we loaded then there's no need to update the cache. Otherwise we upload our layers to the global cache, possibly overwriting what we previously just downloaded. This is hopefully a step towards mitigating #49278 although it doesn't completely fix it as it means we'll still probably have to retry builds that bust the cache. --- .travis.yml | 22 ++++++---------------- src/ci/docker/run.sh | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/.travis.yml b/.travis.yml index 41ea0c9afa8..091a5abdaa2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -183,7 +183,6 @@ matrix: if: branch = master AND type = push before_install: [] install: [] - cache: false sudo: false script: MESSAGE_FILE=$(mktemp -t msg.XXXXXX); @@ -201,7 +200,12 @@ env: - secure: "cFh8thThqEJLC98XKI5pfqflUzOlxsYPRW20AWRaYOOgYHPTiGWypTXiPbGSKaeAXTZoOA+DpQtEmefc0U6lt9dHc7a/MIaK6isFurjlnKYiLOeTruzyu1z7PWCeZ/jKXsU2RK/88DBtlNwfMdaMIeuKj14IVfpepPPL71ETbuk=" before_install: - - zcat $HOME/docker/rust-ci.tar.gz | docker load || true + # We'll use the AWS cli to download/upload cached docker layers, so install + # that here. + - if [ "$TRAVIS_OS_NAME" = linux ]; then + pip install --user awscli; + export PATH=$PATH:$HOME/.local/bin; + fi - mkdir -p $HOME/rustsrc # FIXME(#46924): these two commands are required to enable IPv6, # they shouldn't exist, please revert once more official solutions appeared. 
@@ -286,23 +290,9 @@ after_failure: # it happened - dmesg | grep -i kill -# Save tagged docker images we created and load them if they're available -# Travis saves caches whether the build failed or not, nuke rustsrc if -# the failure was while updating it (as it may be in a bad state) -# https://github.com/travis-ci/travis-ci/issues/4472 -before_cache: - - docker history -q rust-ci | - grep -v missing | - xargs docker save | - gzip > $HOME/docker/rust-ci.tar.gz - notifications: email: false -cache: - directories: - - $HOME/docker - before_deploy: - mkdir -p deploy/$TRAVIS_COMMIT - > diff --git a/src/ci/docker/run.sh b/src/ci/docker/run.sh index f743c976f91..2946cf7fc50 100755 --- a/src/ci/docker/run.sh +++ b/src/ci/docker/run.sh @@ -27,6 +27,21 @@ travis_fold start build_docker travis_time_start if [ -f "$docker_dir/$image/Dockerfile" ]; then + if [ "$CI" != "" ]; then + cksum=$(find $docker_dir/$image $docker_dir/scripts -type f | \ + sort | \ + xargs cat | \ + sha512sum | \ + awk '{print $1}') + s3url="s3://$SCCACHE_BUCKET/docker/$cksum" + url="https://s3-us-west-1.amazonaws.com/$SCCACHE_BUCKET/docker/$cksum" + echo "Attempting to download $s3url" + set +e + loaded_images=$(curl $url | docker load | sed 's/.* sha/sha/') + set -e + echo "Downloaded containers:\n$loaded_images" + fi + dockerfile="$docker_dir/$image/Dockerfile" if [ -x /usr/bin/cygpath ]; then context="`cygpath -w $docker_dir`" @@ -40,6 +55,23 @@ if [ -f "$docker_dir/$image/Dockerfile" ]; then -t rust-ci \ -f "$dockerfile" \ "$context" + + if [ "$s3url" != "" ]; then + digest=$(docker inspect rust-ci --format '{{.Id}}') + echo "Built container $digest" + if ! 
grep -q "$digest" <(echo "$loaded_images"); then + echo "Uploading finished image to $s3url" + set +e + docker history -q rust-ci | \ + grep -v missing | \ + xargs docker save | \ + gzip | \ + aws s3 cp - $s3url + set -e + else + echo "Looks like docker image is the same as before, not uploading" + fi + fi elif [ -f "$docker_dir/disabled/$image/Dockerfile" ]; then if [ -n "$TRAVIS_OS_NAME" ]; then echo Cannot run disabled images on travis!