nixpkgs/pkgs/build-support/build-fhs-userenv/chrootenv/chrootenv.c
Michal Sojka b681ad3254 buildFHSUserEnv: Allow having custom /opt in the FHS environment
buildFHSUserEnv is meant primarily for running 3rd-party software
which is difficult to patch for NixOS. Such software is often built to
run from /opt. Currently, running such a software from FHS environment
is difficult for two reasons:

1. If the 3rd-party software is put into the Nix store via a simple
   derivation (with e.g. installPhase = "dpkg-deb -x $src $out"), the
   content of /opt directory of that derivation does not appear in the
   FHSEnv even if the derivation is specified in targetPkgs. This is
   why we change env.nix.

2. If using buildFHSUserEnvChroot and the host system has the /opt
   directory, it always gets bind-mounted to the FHSEnv even if some
   targetPkgs contain /opt (NB buildFHSUserEnvBubblewrap does not have
   this problem). If that directory is not accessible for non-root
   users (which is what docker's containerd does with /opt :-(), the
   user running the FHSEnv cannot use it.

   With the change in chrootenv.c, /opt is not bind-mounted to the
   container, but instead created as user-modifiable symlink to
   /host/opt (see the init attribute in
   build-fhs-userenv/default.nix). If needed, the user can remove this
   symlink and create an empty /opt directory which is under his/her
   control.
2021-06-27 08:33:51 +02:00

170 lines
4.5 KiB
C

#define _GNU_SOURCE
#include <glib.h>
#include <glib/gstdio.h>
#include <errno.h>
#include <sched.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#define fail(s, err) g_error("%s: %s: %s", __func__, s, g_strerror(err))
#define fail_if(expr) \
if (expr) \
fail(#expr, errno);
const gchar *bind_blacklist[] = {"bin", "etc", "host", "real-host", "usr", "lib", "lib64", "lib32", "sbin", "opt", NULL};
int pivot_root(const char *new_root, const char *put_old) {
return syscall(SYS_pivot_root, new_root, put_old);
}
void mount_tmpfs(const gchar *target) {
fail_if(mount("none", target, "tmpfs", 0, NULL));
}
void bind_mount(const gchar *source, const gchar *target) {
fail_if(g_mkdir(target, 0755));
fail_if(mount(source, target, NULL, MS_BIND | MS_REC, NULL));
}
const gchar *create_tmpdir() {
gchar *prefix =
g_build_filename(g_get_tmp_dir(), "chrootenvXXXXXX", NULL);
fail_if(!g_mkdtemp_full(prefix, 0755));
return prefix;
}
void pivot_host(const gchar *guest) {
g_autofree gchar *point = g_build_filename(guest, "host", NULL);
fail_if(g_mkdir(point, 0755));
fail_if(pivot_root(guest, point));
}
void bind_mount_item(const gchar *host, const gchar *guest, const gchar *name) {
g_autofree gchar *source = g_build_filename(host, name, NULL);
g_autofree gchar *target = g_build_filename(guest, name, NULL);
if (G_LIKELY(g_file_test(source, G_FILE_TEST_IS_DIR)))
bind_mount(source, target);
}
void bind(const gchar *host, const gchar *guest) {
mount_tmpfs(guest);
pivot_host(guest);
g_autofree gchar *host_dir = g_build_filename("/host", host, NULL);
g_autoptr(GError) err = NULL;
g_autoptr(GDir) dir = g_dir_open(host_dir, 0, &err);
if (err != NULL)
fail("g_dir_open", errno);
const gchar *item;
while ((item = g_dir_read_name(dir)))
if (!g_strv_contains(bind_blacklist, item))
bind_mount_item(host_dir, "/", item);
}
void spit(const char *path, char *fmt, ...) {
va_list args;
va_start(args, fmt);
FILE *f = g_fopen(path, "w");
if (f == NULL)
fail("g_fopen", errno);
g_vfprintf(f, fmt, args);
fclose(f);
}
int main(gint argc, gchar **argv) {
const gchar *self = *argv++;
if (argc < 2) {
g_message("%s command [arguments...]", self);
return 1;
}
g_autofree const gchar *prefix = create_tmpdir();
pid_t cpid = fork();
if (cpid < 0)
fail("fork", errno);
else if (cpid == 0) {
uid_t uid = getuid();
gid_t gid = getgid();
int namespaces = CLONE_NEWNS;
if (uid != 0) {
namespaces |= CLONE_NEWUSER;
}
if (unshare(namespaces) < 0) {
int unshare_errno = errno;
g_message("Requires Linux version >= 3.19 built with CONFIG_USER_NS");
if (g_file_test("/proc/sys/kernel/unprivileged_userns_clone",
G_FILE_TEST_EXISTS))
g_message("Run: sudo sysctl -w kernel.unprivileged_userns_clone=1");
fail("unshare", unshare_errno);
}
// hide all mounts we do from the parent
fail_if(mount(0, "/", 0, MS_PRIVATE | MS_REC, 0));
if (uid != 0) {
spit("/proc/self/setgroups", "deny");
spit("/proc/self/uid_map", "%d %d 1", uid, uid);
spit("/proc/self/gid_map", "%d %d 1", gid, gid);
}
// If there is a /host directory, assume this is nested chrootenv and use it as host instead.
gboolean nested_host = g_file_test("/host", G_FILE_TEST_EXISTS | G_FILE_TEST_IS_DIR);
g_autofree const gchar *host = nested_host ? "/host" : "/";
bind(host, prefix);
// Replace /host by an actual (inner) /host.
if (nested_host) {
fail_if(g_mkdir("/real-host", 0755));
fail_if(mount("/host/host", "/real-host", NULL, MS_BIND | MS_REC, NULL));
// For some reason umount("/host") returns EBUSY even immediately after
// pivot_root. We detach it at least to keep `/proc/mounts` from blowing
// up in nested cases.
fail_if(umount2("/host", MNT_DETACH));
fail_if(mount("/real-host", "/host", NULL, MS_MOVE, NULL));
fail_if(rmdir("/real-host"));
}
fail_if(chdir("/"));
fail_if(execvp(*argv, argv));
}
else {
int status;
fail_if(waitpid(cpid, &status, 0) != cpid);
fail_if(rmdir(prefix));
if (WIFEXITED(status))
return WEXITSTATUS(status);
else if (WIFSIGNALED(status))
kill(getpid(), WTERMSIG(status));
return 1;
}
}