diff --git a/nixos/modules/virtualisation/oci-containers.nix b/nixos/modules/virtualisation/oci-containers.nix index 161b4f6027b2..3adf4530aeec 100644 --- a/nixos/modules/virtualisation/oci-containers.nix +++ b/nixos/modules/virtualisation/oci-containers.nix @@ -1,4 +1,10 @@ -{ config, options, lib, pkgs, ... }: +{ + config, + options, + lib, + pkgs, + ... +}: with lib; let @@ -8,7 +14,8 @@ let defaultBackend = options.virtualisation.oci-containers.backend.default; containerOptions = - { ... }: { + { ... }: + { options = { @@ -77,8 +84,8 @@ let }; cmd = mkOption { - type = with types; listOf str; - default = []; + type = with types; listOf str; + default = [ ]; description = "Commandline arguments to pass to the image's entrypoint."; example = literalExpression '' ["--port=9000"] @@ -87,7 +94,7 @@ let labels = mkOption { type = with types; attrsOf str; - default = {}; + default = { }; description = "Labels to attach to the container at runtime."; example = literalExpression '' { @@ -105,26 +112,26 @@ let environment = mkOption { type = with types; attrsOf str; - default = {}; + default = { }; description = "Environment variables to set for this container."; example = literalExpression '' { DATABASE_HOST = "db.example.com"; DATABASE_PORT = "3306"; } - ''; + ''; }; environmentFiles = mkOption { type = with types; listOf path; - default = []; + default = [ ]; description = "Environment files for this container."; example = literalExpression '' [ /path/to/.env /path/to/.env.secret ] - ''; + ''; }; log-driver = mkOption { @@ -147,7 +154,7 @@ let ports = mkOption { type = with types; listOf str; - default = []; + default = [ ]; description = '' Network ports to publish from the container to the outer host. @@ -194,7 +201,7 @@ let volumes = mkOption { type = with types; listOf str; - default = []; + default = [ ]; description = '' List of volumes to attach to this container. 
@@ -222,7 +229,7 @@ let

       dependsOn = mkOption {
         type = with types; listOf str;
-        default = [];
+        default = [ ];
         description = ''
           Define which other containers this one depends on. They will be added to both After and Requires for
           the unit.
@@ -247,14 +254,17 @@ let

       preRunExtraOptions = mkOption {
         type = with types; listOf str;
-        default = [];
+        default = [ ];
         description = "Extra options for {command}`${defaultBackend}` that go before the `run` argument.";
-        example = [ "--runtime" "runsc" ];
+        example = [
+          "--runtime"
+          "runsc"
+        ];
       };

       extraOptions = mkOption {
         type = with types; listOf str;
-        default = [];
+        default = [ ];
         description = "Extra options for {command}`${defaultBackend} run`.";
         example = literalExpression ''
           ["--network=host"]
@@ -262,177 +272,293 @@ let
       };

       autoStart = mkOption {
-        type = types.bool;
+        type = with types; bool;
         default = true;
         description = ''
           When enabled, the container is automatically started on boot.
           If this option is set to false, the container has to be started on-demand via its service.
         '';
       };
+
+      pull = mkOption {
+        type =
+          with types;
+          enum [
+            "always"
+            "missing"
+            "never"
+            "newer"
+          ];
+        default = "missing";
+        description = ''
+          Image pull policy for the container. Must be one of: always, missing, never, newer
+        '';
+      };
+
+      capAdd = mkOption {
+        type = with types; lazyAttrsOf (nullOr bool);
+        default = { };
+        description = ''
+          Capabilities to add to container
+        '';
+        example = literalExpression ''
+          {
+            SYS_ADMIN = true;
+          }
+        '';
+      };
+
+      capDrop = mkOption {
+        type = with types; lazyAttrsOf (nullOr bool);
+        default = { };
+        description = ''
+          Capabilities to drop from container
+        '';
+        example = literalExpression ''
+          {
+            SYS_ADMIN = true;
+          }
+        '';
+      };
+
+      devices = mkOption {
+        type = with types; listOf str;
+        default = [ ];
+        description = ''
+          List of devices to attach to this container.
+ ''; + example = literalExpression '' + [ + "/dev/dri:/dev/dri" + ] + ''; + }; + + privileged = mkOption { + type = with types; bool; + default = false; + description = '' + Give extended privileges to the container + ''; + }; + + networks = mkOption { + type = with types; listOf str; + default = [ ]; + description = '' + Networks to attach the container to + ''; + }; }; }; - isValidLogin = login: login.username != null && login.passwordFile != null && login.registry != null; + isValidLogin = + login: login.username != null && login.passwordFile != null && login.registry != null; - mkService = name: container: let - dependsOn = map (x: "${cfg.backend}-${x}.service") container.dependsOn; - escapedName = escapeShellArg name; - preStartScript = pkgs.writeShellApplication { - name = "pre-start"; - runtimeInputs = [ ]; - text = '' - ${cfg.backend} rm -f ${name} || true - ${optionalString (isValidLogin container.login) '' - # try logging in, if it fails, check if image exists locally - ${cfg.backend} login \ - ${container.login.registry} \ - --username ${container.login.username} \ - --password-stdin < ${container.login.passwordFile} \ - || ${cfg.backend} image inspect ${container.image} >/dev/null \ - || { echo "image doesn't exist locally and login failed" >&2 ; exit 1; } - ''} - ${optionalString (container.imageFile != null) '' - ${cfg.backend} load -i ${container.imageFile} - ''} - ${optionalString (container.imageStream != null) '' - ${container.imageStream} | ${cfg.backend} load - ''} - ${optionalString (cfg.backend == "podman") '' - rm -f /run/podman-${escapedName}.ctr-id - ''} - ''; + mkService = + name: container: + let + dependsOn = map (x: "${cfg.backend}-${x}.service") container.dependsOn; + escapedName = escapeShellArg name; + preStartScript = pkgs.writeShellApplication { + name = "pre-start"; + runtimeInputs = [ ]; + text = '' + ${cfg.backend} rm -f ${name} || true + ${optionalString (isValidLogin container.login) '' + # try logging in, if it fails, check 
if image exists locally + ${cfg.backend} login \ + ${container.login.registry} \ + --username ${container.login.username} \ + --password-stdin < ${container.login.passwordFile} \ + || ${cfg.backend} image inspect ${container.image} >/dev/null \ + || { echo "image doesn't exist locally and login failed" >&2 ; exit 1; } + ''} + ${optionalString (container.imageFile != null) '' + ${cfg.backend} load -i ${container.imageFile} + ''} + ${optionalString (container.imageStream != null) '' + ${container.imageStream} | ${cfg.backend} load + ''} + ${optionalString (cfg.backend == "podman") '' + rm -f /run/podman-${escapedName}.ctr-id + ''} + ''; + }; + in + { + wantedBy = [ ] ++ optional (container.autoStart) "multi-user.target"; + wants = lib.optional ( + container.imageFile == null && container.imageStream == null + ) "network-online.target"; + after = + lib.optionals (cfg.backend == "docker") [ + "docker.service" + "docker.socket" + ] + # if imageFile or imageStream is not set, the service needs the network to download the image from the registry + ++ lib.optionals (container.imageFile == null && container.imageStream == null) [ + "network-online.target" + ] + ++ dependsOn; + requires = dependsOn; + environment = proxy_env; + + path = + if cfg.backend == "docker" then + [ config.virtualisation.docker.package ] + else if cfg.backend == "podman" then + [ config.virtualisation.podman.package ] + else + throw "Unhandled backend: ${cfg.backend}"; + + script = concatStringsSep " \\\n " ( + [ + "exec ${cfg.backend} " + ] + ++ map escapeShellArg container.preRunExtraOptions + ++ [ + "run" + "--rm" + "--name=${escapedName}" + "--log-driver=${container.log-driver}" + ] + ++ optional (container.entrypoint != null) "--entrypoint=${escapeShellArg container.entrypoint}" + ++ optional (container.hostname != null) "--hostname=${escapeShellArg container.hostname}" + ++ lib.optionals (cfg.backend == "podman") [ + "--cidfile=/run/podman-${escapedName}.ctr-id" + "--cgroups=no-conmon" + 
"--sdnotify=conmon" + "-d" + "--replace" + ] + ++ (mapAttrsToList (k: v: "-e ${escapeShellArg k}=${escapeShellArg v}") container.environment) + ++ map (f: "--env-file ${escapeShellArg f}") container.environmentFiles + ++ map (p: "-p ${escapeShellArg p}") container.ports + ++ optional (container.user != null) "-u ${escapeShellArg container.user}" + ++ map (v: "-v ${escapeShellArg v}") container.volumes + ++ (mapAttrsToList (k: v: "-l ${escapeShellArg k}=${escapeShellArg v}") container.labels) + ++ optional (container.workdir != null) "-w ${escapeShellArg container.workdir}" + ++ optional (container.privileged) "--privileged" + ++ mapAttrsToList (k: _: "--cap-add=${escapeShellArg k}") ( + filterAttrs (_: v: v == true) container.capAdd + ) + ++ mapAttrsToList (k: _: "--cap-drop=${escapeShellArg k}") ( + filterAttrs (_: v: v == true) container.capDrop + ) + ++ map (d: "--device=${escapeShellArg d}") container.devices + ++ map (n: "--network=${escapeShellArg n}") container.networks + ++ [ "--pull ${escapeShellArg container.pull}" ] + ++ map escapeShellArg container.extraOptions + ++ [ container.image ] + ++ map escapeShellArg container.cmd + ); + + preStop = + if cfg.backend == "podman" then + "podman stop --ignore --cidfile=/run/podman-${escapedName}.ctr-id" + else + "${cfg.backend} stop ${name} || true"; + + postStop = + if cfg.backend == "podman" then + "podman rm -f --ignore --cidfile=/run/podman-${escapedName}.ctr-id" + else + "${cfg.backend} rm -f ${name} || true"; + + serviceConfig = + { + ### There is no generalized way of supporting `reload` for docker + ### containers. Some containers may respond well to SIGHUP sent to their + ### init process, but it is not guaranteed; some apps have other reload + ### mechanisms, some don't have a reload signal at all, and some docker + ### images just have broken signal handling. 
The best compromise in this + ### case is probably to leave ExecReload undefined, so `systemctl reload` + ### will at least result in an error instead of potentially undefined + ### behaviour. + ### + ### Advanced users can still override this part of the unit to implement + ### a custom reload handler, since the result of all this is a normal + ### systemd service from the perspective of the NixOS module system. + ### + # ExecReload = ...; + ### + ExecStartPre = [ "${preStartScript}/bin/pre-start" ]; + TimeoutStartSec = 0; + TimeoutStopSec = 120; + Restart = "always"; + } + // optionalAttrs (cfg.backend == "podman") { + Environment = "PODMAN_SYSTEMD_UNIT=podman-${name}.service"; + Type = "notify"; + NotifyAccess = "all"; + }; }; - in { - wantedBy = [] ++ optional (container.autoStart) "multi-user.target"; - wants = lib.optional (container.imageFile == null && container.imageStream == null) "network-online.target"; - after = lib.optionals (cfg.backend == "docker") [ "docker.service" "docker.socket" ] - # if imageFile or imageStream is not set, the service needs the network to download the image from the registry - ++ lib.optionals (container.imageFile == null && container.imageStream == null) [ "network-online.target" ] - ++ dependsOn; - requires = dependsOn; - environment = proxy_env; - path = - if cfg.backend == "docker" then [ config.virtualisation.docker.package ] - else if cfg.backend == "podman" then [ config.virtualisation.podman.package ] - else throw "Unhandled backend: ${cfg.backend}"; - - script = concatStringsSep " \\\n " ([ - "exec ${cfg.backend} " - ] ++ map escapeShellArg container.preRunExtraOptions ++ [ - "run" - "--rm" - "--name=${escapedName}" - "--log-driver=${container.log-driver}" - ] ++ optional (container.entrypoint != null) - "--entrypoint=${escapeShellArg container.entrypoint}" - ++ optional (container.hostname != null) - "--hostname=${escapeShellArg container.hostname}" - ++ lib.optionals (cfg.backend == "podman") [ - 
"--cidfile=/run/podman-${escapedName}.ctr-id" - "--cgroups=no-conmon" - "--sdnotify=conmon" - "-d" - "--replace" - ] ++ (mapAttrsToList (k: v: "-e ${escapeShellArg k}=${escapeShellArg v}") container.environment) - ++ map (f: "--env-file ${escapeShellArg f}") container.environmentFiles - ++ map (p: "-p ${escapeShellArg p}") container.ports - ++ optional (container.user != null) "-u ${escapeShellArg container.user}" - ++ map (v: "-v ${escapeShellArg v}") container.volumes - ++ (mapAttrsToList (k: v: "-l ${escapeShellArg k}=${escapeShellArg v}") container.labels) - ++ optional (container.workdir != null) "-w ${escapeShellArg container.workdir}" - ++ map escapeShellArg container.extraOptions - ++ [container.image] - ++ map escapeShellArg container.cmd - ); - - preStop = if cfg.backend == "podman" - then "podman stop --ignore --cidfile=/run/podman-${escapedName}.ctr-id" - else "${cfg.backend} stop ${name} || true"; - - postStop = if cfg.backend == "podman" - then "podman rm -f --ignore --cidfile=/run/podman-${escapedName}.ctr-id" - else "${cfg.backend} rm -f ${name} || true"; - - serviceConfig = { - ### There is no generalized way of supporting `reload` for docker - ### containers. Some containers may respond well to SIGHUP sent to their - ### init process, but it is not guaranteed; some apps have other reload - ### mechanisms, some don't have a reload signal at all, and some docker - ### images just have broken signal handling. The best compromise in this - ### case is probably to leave ExecReload undefined, so `systemctl reload` - ### will at least result in an error instead of potentially undefined - ### behaviour. - ### - ### Advanced users can still override this part of the unit to implement - ### a custom reload handler, since the result of all this is a normal - ### systemd service from the perspective of the NixOS module system. 
- ### - # ExecReload = ...; - ### - ExecStartPre = [ "${preStartScript}/bin/pre-start" ]; - TimeoutStartSec = 0; - TimeoutStopSec = 120; - Restart = "always"; - } // optionalAttrs (cfg.backend == "podman") { - Environment="PODMAN_SYSTEMD_UNIT=podman-${name}.service"; - Type="notify"; - NotifyAccess="all"; - }; - }; - -in { +in +{ imports = [ - ( - lib.mkChangedOptionModule - [ "docker-containers" ] - [ "virtualisation" "oci-containers" ] - (oldcfg: { - backend = "docker"; - containers = lib.mapAttrs (n: v: builtins.removeAttrs (v // { - extraOptions = v.extraDockerOptions or []; - }) [ "extraDockerOptions" ]) oldcfg.docker-containers; - }) - ) + (lib.mkChangedOptionModule [ "docker-containers" ] [ "virtualisation" "oci-containers" ] (oldcfg: { + backend = "docker"; + containers = lib.mapAttrs ( + n: v: + builtins.removeAttrs ( + v + // { + extraOptions = v.extraDockerOptions or [ ]; + } + ) [ "extraDockerOptions" ] + ) oldcfg.docker-containers; + })) ]; options.virtualisation.oci-containers = { backend = mkOption { - type = types.enum [ "podman" "docker" ]; + type = types.enum [ + "podman" + "docker" + ]; default = if versionAtLeast config.system.stateVersion "22.05" then "podman" else "docker"; description = "The underlying Docker implementation to use."; }; containers = mkOption { - default = {}; + default = { }; type = types.attrsOf (types.submodule containerOptions); description = "OCI (Docker) containers to run as systemd services."; }; }; - config = lib.mkIf (cfg.containers != {}) (lib.mkMerge [ - { - systemd.services = mapAttrs' (n: v: nameValuePair "${cfg.backend}-${n}" (mkService n v)) cfg.containers; + config = lib.mkIf (cfg.containers != { }) ( + lib.mkMerge [ + { + systemd.services = mapAttrs' ( + n: v: nameValuePair "${cfg.backend}-${n}" (mkService n v) + ) cfg.containers; - assertions = - let - toAssertion = _: { imageFile, imageStream, ... 
}: - { assertion = imageFile == null || imageStream == null; + assertions = + let + toAssertion = + _: + { imageFile, imageStream, ... }: + { + assertion = imageFile == null || imageStream == null; - message = "You can only define one of imageFile and imageStream"; - }; + message = "You can only define one of imageFile and imageStream"; + }; - in + in lib.mapAttrsToList toAssertion cfg.containers; - } - (lib.mkIf (cfg.backend == "podman") { - virtualisation.podman.enable = true; - }) - (lib.mkIf (cfg.backend == "docker") { - virtualisation.docker.enable = true; - }) - ]); + } + (lib.mkIf (cfg.backend == "podman") { + virtualisation.podman.enable = true; + }) + (lib.mkIf (cfg.backend == "docker") { + virtualisation.docker.enable = true; + }) + ] + ); } diff --git a/nixos/tests/oci-containers.nix b/nixos/tests/oci-containers.nix index bc80f2d9b5db..09075c20d79b 100644 --- a/nixos/tests/oci-containers.nix +++ b/nixos/tests/oci-containers.nix @@ -22,6 +22,16 @@ let image = "nginx-container"; imageStream = pkgs.dockerTools.examples.nginxStream; ports = ["8181:80"]; + capAdd = { + CAP_AUDIT_READ = true; + }; + capDrop = { + CAP_AUDIT_WRITE = true; + }; + privileged = false; + devices = [ + "/dev/random:/dev/random" + ]; }; }; @@ -32,11 +42,18 @@ let }; testScript = '' + import json + start_all() ${backend}.wait_for_unit("${backend}-nginx.service") ${backend}.wait_for_open_port(8181) ${backend}.wait_until_succeeds("curl -f http://localhost:8181 | grep Hello") + output = json.loads(${backend}.succeed("${backend} inspect nginx --format json").strip())[0] ${backend}.succeed("systemctl stop ${backend}-nginx.service", timeout=10) + assert output['HostConfig']['CapAdd'] == ["CAP_AUDIT_READ"] + assert output['HostConfig']['CapDrop'] == ${if backend == "docker" then "[\"CAP_AUDIT_WRITE\"]" else "[]"} # Rootless podman runs with no capabilities so it cannot drop them + assert output['HostConfig']['Privileged'] == False + assert output['HostConfig']['Devices'] == [{'PathOnHost': 
'/dev/random', 'PathInContainer': '/dev/random', 'CgroupPermissions': '${if backend == "docker" then "rwm" else ""}'}] ''; };