nixos/wrappers: allow setuid and setgid wrappers to run in user namespaces
In user namespaces where an unprivileged user is mapped as root and root
is unmapped, setuid bits have no effect. However setuid root
executables like mount are still usable *in the namespace* as the user
already has the required privileges. This commit detects the situation
where the wrapper gained no privileges that the parent process did not
already have and in this case does less sanity checking. In short there
is no need to be picky since the parent already can execute the foo.real
executable themselves.
Details:
man 7 user_namespaces:
Set-user-ID and set-group-ID programs
When a process inside a user namespace executes a set-user-ID
(set-group-ID) program, the process's effective user (group) ID
inside the namespace is changed to whatever value is mapped for
the user (group) ID of the file. However, if either the user or
the group ID of the file has no mapping inside the namespace, the
set-user-ID (set-group-ID) bit is silently ignored: the new
program is executed, but the process's effective user (group) ID
is left unchanged. (This mirrors the semantics of executing a
set-user-ID or set-group-ID program that resides on a filesystem
that was mounted with the MS_NOSUID flag, as described in
mount(2).)
The effect of the setuid bit is that the real user id is preserved and
the effective and saved set-user-ids (suid) are changed to the owner of the wrapper.
We detect that no privilege was gained by checking that euid == suid
== ruid. In this case we stop checking that euid == owner of the
wrapper file.
As a reminder here are the values of euid, ruid, suid, stat.st_uid and
stat.st_mode & S_ISUID in various cases when running a setuid 42 executable as user 1000:
Normal case:
ruid=1000 euid=42 suid=42
setuid=2048, st_uid=42
nosuid mount:
ruid=1000 euid=1000 suid=1000
setuid=2048, st_uid=42
inside unshare -rm:
ruid=0 euid=0 suid=0
setuid=2048, st_uid=65534
inside unshare -rm, on a suid mount:
ruid=0 euid=0 suid=0
setuid=2048, st_uid=65534
2023-05-13 12:00:00 +00:00
|
|
|
#define _GNU_SOURCE
|
2016-06-30 23:59:32 +00:00
|
|
|
#include <stdlib.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <unistd.h>
|
2022-01-27 07:12:00 +00:00
|
|
|
#include <stdnoreturn.h>
|
2016-06-30 23:59:32 +00:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
2021-01-14 07:24:27 +00:00
|
|
|
#include <sys/xattr.h>
|
2016-06-30 23:59:32 +00:00
|
|
|
#include <fcntl.h>
|
|
|
|
#include <dirent.h>
|
|
|
|
#include <errno.h>
|
2017-01-26 08:41:00 +00:00
|
|
|
#include <linux/capability.h>
|
|
|
|
#include <sys/prctl.h>
|
2018-03-24 17:02:24 +00:00
|
|
|
#include <limits.h>
|
2021-01-14 07:24:27 +00:00
|
|
|
#include <stdint.h>
|
|
|
|
#include <syscall.h>
|
|
|
|
#include <byteswap.h>
|
2016-06-30 23:59:32 +00:00
|
|
|
|
2023-10-04 19:16:06 +00:00
|
|
|
// imported from glibc
|
|
|
|
#include "unsecvars.h"
|
|
|
|
|
2022-11-04 23:09:32 +00:00
|
|
|
#ifndef SOURCE_PROG
|
|
|
|
#error SOURCE_PROG should be defined via preprocessor commandline
|
|
|
|
#endif
|
|
|
|
|
nixos/wrappers: allow setuid and setgid wrappers to run in user namespaces
In user namespaces where an unprivileged user is mapped as root and root
is unmapped, setuid bits have no effect. However setuid root
executables like mount are still usable *in the namespace* as the user
already has the required privileges. This commit detects the situation
where the wrapper gained no privileges that the parent process did not
already have and in this case does less sanity checking. In short there
is no need to be picky since the parent already can execute the foo.real
executable themselves.
Details:
man 7 user_namespaces:
Set-user-ID and set-group-ID programs
When a process inside a user namespace executes a set-user-ID
(set-group-ID) program, the process's effective user (group) ID
inside the namespace is changed to whatever value is mapped for
the user (group) ID of the file. However, if either the user or
the group ID of the file has no mapping inside the namespace, the
set-user-ID (set-group-ID) bit is silently ignored: the new
program is executed, but the process's effective user (group) ID
is left unchanged. (This mirrors the semantics of executing a
set-user-ID or set-group-ID program that resides on a filesystem
that was mounted with the MS_NOSUID flag, as described in
mount(2).)
The effect of the setuid bit is that the real user id is preserved and
the effective and saved set-user-ids (suid) are changed to the owner of the wrapper.
We detect that no privilege was gained by checking that euid == suid
== ruid. In this case we stop checking that euid == owner of the
wrapper file.
As a reminder here are the values of euid, ruid, suid, stat.st_uid and
stat.st_mode & S_ISUID in various cases when running a setuid 42 executable as user 1000:
Normal case:
ruid=1000 euid=42 suid=42
setuid=2048, st_uid=42
nosuid mount:
ruid=1000 euid=1000 suid=1000
setuid=2048, st_uid=42
inside unshare -rm:
ruid=0 euid=0 suid=0
setuid=2048, st_uid=65534
inside unshare -rm, on a suid mount:
ruid=0 euid=0 suid=0
setuid=2048, st_uid=65534
2023-05-13 12:00:00 +00:00
|
|
|
// Evaluates `expr`; when it is false, hands the stringified expression to
// assert_failure(). Yields a void expression in either case.
#define ASSERT(expr) (!(expr) ? assert_failure(#expr) : (void) 0)
|
2016-06-30 23:59:32 +00:00
|
|
|
|
|
|
|
// Process environment; passed to execve() in main().
extern char **environ;

// Wrapper debug variable name: when this environment variable is set,
// extra diagnostics are written to stderr.
static const char *const wrapper_debug = "WRAPPER_DEBUG";

// Capability number of cap_setpcap. <linux/capability.h> normally provides
// it already; only define it here when it is missing.
#ifndef CAP_SETPCAP
#define CAP_SETPCAP 8
#endif

// Convert a little-endian 32-bit value to host byte order: a byte swap on
// big-endian hosts, the identity otherwise.
#if __BYTE_ORDER == __BIG_ENDIAN
#define LE32_TO_H(x) bswap_32(x)
#else
#define LE32_TO_H(x) (x)
#endif
|
|
|
|
|
2022-01-27 07:12:00 +00:00
|
|
|
// Failure path of ASSERT: writes the failed assertion text to stderr,
// flushes the stream and terminates the process with abort().
// Never returns.
static noreturn void assert_failure(const char *assertion) {
    fprintf(stderr, "Assertion `%s` in NixOS's wrapper.c failed.\n", assertion);
    fflush(stderr);
    abort();
}
|
|
|
|
|
2021-01-14 07:24:27 +00:00
|
|
|
// Read the unsigned number stored in /proc/sys/kernel/cap_last_cap
// (per capabilities(7): the highest capability number supported by the
// running kernel).
//
// On success stores the value in *last_cap and returns 0.
// On failure prints a diagnostic to stderr, leaves *last_cap untouched and
// returns a negative errno value (never 0).
int get_last_cap(unsigned *last_cap) {
    FILE *file = fopen("/proc/sys/kernel/cap_last_cap", "r");
    if (file == NULL) {
        // Save errno before fprintf() can clobber it; fall back to EIO so a
        // failure can never be reported as 0.
        int saved_errno = errno != 0 ? errno : EIO;
        fprintf(stderr, "failed to open /proc/sys/kernel/cap_last_cap: %s\n", strerror(saved_errno));
        return -saved_errno;
    }

    unsigned value = 0;
    errno = 0;
    int res = fscanf(file, "%u", &value);
    if (res != 1) {
        // res == EOF: read error or empty file; res == 0: content is not a
        // number. Neither case is guaranteed to set errno, hence the fallback.
        int saved_errno = errno != 0 ? errno : EIO;
        fclose(file);
        fprintf(stderr, "could not read number from /proc/sys/kernel/cap_last_cap: %s\n", strerror(saved_errno));
        return -saved_errno;
    }

    fclose(file);
    *last_cap = value;
    return 0;
}
|
|
|
|
|
|
|
|
// Given the path to this program, fetch its configured capability set
|
|
|
|
// (as set by `setcap ... /path/to/file`) and raise those capabilities
|
|
|
|
// into the Ambient set.
|
2021-01-14 07:24:27 +00:00
|
|
|
static int make_caps_ambient(const char *self_path) {
|
|
|
|
struct vfs_ns_cap_data data = {};
|
|
|
|
int r = getxattr(self_path, "security.capability", &data, sizeof(data));
|
|
|
|
|
|
|
|
if (r < 0) {
|
|
|
|
if (errno == ENODATA) {
|
|
|
|
// no capabilities set
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
fprintf(stderr, "cannot get capabilities for %s: %s", self_path, strerror(errno));
|
|
|
|
return 1;
|
|
|
|
}
|
2016-06-30 23:59:32 +00:00
|
|
|
|
2021-01-14 07:24:27 +00:00
|
|
|
size_t size;
|
|
|
|
uint32_t version = LE32_TO_H(data.magic_etc) & VFS_CAP_REVISION_MASK;
|
|
|
|
switch (version) {
|
|
|
|
case VFS_CAP_REVISION_1:
|
|
|
|
size = VFS_CAP_U32_1;
|
|
|
|
break;
|
|
|
|
case VFS_CAP_REVISION_2:
|
|
|
|
case VFS_CAP_REVISION_3:
|
|
|
|
size = VFS_CAP_U32_3;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
fprintf(stderr, "BUG! Unsupported capability version 0x%x on %s. Report to NixOS bugtracker\n", version, self_path);
|
|
|
|
return 1;
|
|
|
|
}
|
2017-01-30 18:59:29 +00:00
|
|
|
|
2021-01-14 07:24:27 +00:00
|
|
|
const struct __user_cap_header_struct header = {
|
|
|
|
.version = _LINUX_CAPABILITY_VERSION_3,
|
|
|
|
.pid = getpid(),
|
|
|
|
};
|
|
|
|
struct __user_cap_data_struct user_data[2] = {};
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; i++) {
|
|
|
|
// merge inheritable & permitted into one
|
|
|
|
user_data[i].permitted = user_data[i].inheritable =
|
|
|
|
LE32_TO_H(data.data[i].inheritable) | LE32_TO_H(data.data[i].permitted);
|
2016-06-30 23:59:32 +00:00
|
|
|
}
|
|
|
|
|
2021-01-14 07:24:27 +00:00
|
|
|
if (syscall(SYS_capset, &header, &user_data) < 0) {
|
|
|
|
fprintf(stderr, "failed to inherit capabilities: %s", strerror(errno));
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
unsigned last_cap;
|
|
|
|
r = get_last_cap(&last_cap);
|
|
|
|
if (r < 0) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
uint64_t set = user_data[0].permitted | (uint64_t)user_data[1].permitted << 32;
|
|
|
|
for (unsigned cap = 0; cap < last_cap; cap++) {
|
|
|
|
if (!(set & (1ULL << cap))) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for the cap_setpcap capability, we set this on the
|
|
|
|
// wrapper so it can elevate the capabilities to the Ambient
|
|
|
|
// set but we do not want to propagate it down into the
|
|
|
|
// wrapped program.
|
|
|
|
//
|
|
|
|
// TODO: what happens if that's the behavior you want
|
|
|
|
// though???? I'm preferring a strict vs. loose policy here.
|
|
|
|
if (cap == CAP_SETPCAP) {
|
|
|
|
if(getenv(wrapper_debug)) {
|
|
|
|
fprintf(stderr, "cap_setpcap in set, skipping it\n");
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, (unsigned long) cap, 0, 0)) {
|
|
|
|
fprintf(stderr, "cannot raise the capability %d into the ambient set: %s\n", cap, strerror(errno));
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
if (getenv(wrapper_debug)) {
|
|
|
|
fprintf(stderr, "raised %d into the ambient capability set\n", cap);
|
|
|
|
}
|
2016-06-30 23:59:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-10-04 19:16:06 +00:00
|
|
|
// These are environment variable aliases for glibc tunables.
// This list shouldn't grow further, since this is a legacy mechanism.
// Any future tunables are expected to only be accessible through GLIBC_TUNABLES.
//
// They are not included in the glibc-provided UNSECURE_ENVVARS list,
// since any SUID executable ignores them. This wrapper also serves
// executables that are merely granted ambient capabilities, rather than
// being SUID, and hence don't run in secure mode. We'd like to defend
// those in depth as well, so we clear these variables explicitly.
//
// Except for MALLOC_CHECK_ (which is marked SXID_ERASE), these are all
// marked SXID_IGNORE (ignored in secure mode), so even the glibc version
// of this wrapper would leave them intact.
//
// Format: a sequence of NUL-terminated names packed into one string literal;
// the literal's own terminator marks the end of the list.
#define UNSECURE_ENVVARS_TUNABLES \
    "MALLOC_CHECK_\0" \
    "MALLOC_TOP_PAD_\0" \
    "MALLOC_PERTURB_\0" \
    "MALLOC_MMAP_THRESHOLD_\0" \
    "MALLOC_TRIM_THRESHOLD_\0" \
    "MALLOC_MMAP_MAX_\0" \
    "MALLOC_ARENA_MAX\0" \
    "MALLOC_ARENA_TEST\0"
|
|
|
|
|
2021-01-14 07:24:27 +00:00
|
|
|
int main(int argc, char **argv) {
|
2022-01-27 07:14:53 +00:00
|
|
|
ASSERT(argc >= 1);
|
2023-08-24 06:35:11 +00:00
|
|
|
|
2024-02-01 17:31:37 +00:00
|
|
|
// argv[0] goes into a lot of places, to a far greater degree than other elements
|
|
|
|
// of argv. glibc has had buffer overflows relating to argv[0], eg CVE-2023-6246.
|
|
|
|
// Since we expect the wrappers to be invoked from either $PATH or /run/wrappers/bin,
|
|
|
|
// there should be no reason to pass any particularly large values here, so we can
|
|
|
|
// be strict for strictness' sake.
|
|
|
|
ASSERT(strlen(argv[0]) < 512);
|
|
|
|
|
2023-10-04 19:16:06 +00:00
|
|
|
int debug = getenv(wrapper_debug) != NULL;
|
|
|
|
|
|
|
|
// Drop insecure environment variables explicitly
|
|
|
|
//
|
|
|
|
// glibc does this automatically in SUID binaries, but we'd like to cover this:
|
|
|
|
//
|
|
|
|
// a) before it gets to glibc
|
|
|
|
// b) in binaries that are only granted ambient capabilities by the wrapper,
|
|
|
|
// but don't run with an altered effective UID/GID, nor directly gain
|
|
|
|
// capabilities themselves, and thus don't run in secure mode.
|
|
|
|
//
|
|
|
|
// We're using musl, which doesn't drop environment variables in secure mode,
|
|
|
|
// and we'd also like glibc-specific variables to be covered.
|
|
|
|
//
|
|
|
|
// If we don't explicitly unset them, it's quite easy to just set LD_PRELOAD,
|
|
|
|
// have it passed through to the wrapped program, and gain privileges.
|
|
|
|
for (char *unsec = UNSECURE_ENVVARS_TUNABLES UNSECURE_ENVVARS; *unsec; unsec = strchr(unsec, 0) + 1) {
|
|
|
|
if (debug) {
|
|
|
|
fprintf(stderr, "unsetting %s\n", unsec);
|
|
|
|
}
|
|
|
|
unsetenv(unsec);
|
|
|
|
}
|
|
|
|
|
2017-02-14 00:03:06 +00:00
|
|
|
// Read the capabilities set on the wrapper and raise them in to
|
2021-01-14 07:24:27 +00:00
|
|
|
// the ambient set so the program we're wrapping receives the
|
2016-06-30 23:59:32 +00:00
|
|
|
// capabilities too!
|
2022-11-14 13:45:36 +00:00
|
|
|
if (make_caps_ambient("/proc/self/exe") != 0) {
|
2021-01-14 07:24:27 +00:00
|
|
|
return 1;
|
|
|
|
}
|
2016-06-30 23:59:32 +00:00
|
|
|
|
2022-11-04 23:09:32 +00:00
|
|
|
execve(SOURCE_PROG, argv, environ);
|
2016-06-30 23:59:32 +00:00
|
|
|
|
|
|
|
fprintf(stderr, "%s: cannot run `%s': %s\n",
|
2022-11-04 23:09:32 +00:00
|
|
|
argv[0], SOURCE_PROG, strerror(errno));
|
2016-06-30 23:59:32 +00:00
|
|
|
|
2021-01-14 07:24:27 +00:00
|
|
|
return 1;
|
2016-06-30 23:59:32 +00:00
|
|
|
}
|