tree-wide: cudaPackages should not break default eval

cudaPackages: guard expressions against null values
2024-11-22 23:13:19 +00:00 · 2023-12-14 22:19:02 +00:00 · 2023-12-14 22:19:02 +00:00 · 9bebd9e72d
commit 9bebd9e72d
parent 501a1af970
14 changed files with 119 additions and 98 deletions
--- a/pkgs/applications/science/math/caffe/default.nix
+++ b/pkgs/applications/science/math/caffe/default.nix
@ -153,7 +153,7 @@ stdenv.mkDerivation rec {
      || cudaSupport
      || !(leveldbSupport -> (leveldb != null && snappy != null))
      || !(cudnnSupport -> (hasCudnn && cudaSupport))
-      || !(ncclSupport -> cudaSupport)
+      || !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported))
      || !(pythonSupport -> (python != null && numpy != null))
    ;
    license = licenses.bsd2;
--- a/pkgs/development/cuda-modules/cudnn/shims.nix
+++ b/pkgs/development/cuda-modules/cudnn/shims.nix
@ -1,10 +1,18 @@
 # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix
-{package, redistArch}:
 {
-  featureRelease.${redistArch}.outputs = {
-    lib = true;
-    static = true;
-    dev = true;
+  lib,
+  package,
+  # redistArch :: String
+  # String is `"unsupported"` if the given architecture is unsupported.
+  redistArch,
+}:
+{
+  featureRelease = lib.optionalAttrs (redistArch != "unsupported") {
+    ${redistArch}.outputs = {
+      lib = true;
+      static = true;
+      dev = true;
+    };
  };
  redistribRelease = {
    name = "NVIDIA CUDA Deep Neural Network library (cuDNN)";
--- a/pkgs/development/cuda-modules/cutensor/extension.nix
+++ b/pkgs/development/cuda-modules/cutensor/extension.nix
@ -92,6 +92,7 @@ let
  # A release is supported if it has a libPath that matches our CUDA version for our platform.
  # LibPath are not constant across the same release -- one platform may support fewer
  # CUDA versions than another.
+  # redistArch :: String
  redistArch = flags.getRedistArch hostPlatform.system;
  # platformIsSupported :: Manifests -> Boolean
  platformIsSupported =
--- a/pkgs/development/cuda-modules/flags.nix
+++ b/pkgs/development/cuda-modules/flags.nix
@ -131,39 +131,29 @@ let
  # `linux-aarch64` redist (which is for Jetson devices) if we're building any Jetson devices.
  # Since both are based on aarch64, we can only have one or the other, otherwise there's an
  # ambiguity as to which should be used.
+  # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of
+  # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported
+  # systems gracefully.
  # getRedistArch :: String -> String
-  getRedistArch =
-    nixSystem:
-    if nixSystem == "aarch64-linux" then
-      if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa"
-    else if nixSystem == "x86_64-linux" then
-      "linux-x86_64"
-    else if nixSystem == "ppc64le-linux" then
-      "linux-ppc64le"
-    else if nixSystem == "x86_64-windows" then
-      "windows-x86_64"
-    else
-      "unsupported";
+  getRedistArch = nixSystem: { # lookup table; systems not listed fall through to "unsupported"
+    x86_64-linux = "linux-x86_64";
+    aarch64-linux = if jetsonTargets == [] then "linux-sbsa" else "linux-aarch64";
+    ppc64le-linux = "linux-ppc64le";
+    x86_64-windows = "windows-x86_64";
+  }.${nixSystem} or "unsupported";

  # Maps NVIDIA redist arch to Nix system.
-  # It is imperative that we include the boolean condition based on jetsonTargets to ensure
-  # we don't advertise availability of packages only available on server-grade ARM
-  # as being available for the Jetson, since both `linux-sbsa` and `linux-aarch64` are
-  # mapped to the Nix system `aarch64-linux`.
-  getNixSystem =
-    redistArch:
-    if redistArch == "linux-sbsa" && jetsonTargets == [] then
-      "aarch64-linux"
-    else if redistArch == "linux-aarch64" && jetsonTargets != [] then
-      "aarch64-linux"
-    else if redistArch == "linux-x86_64" then
-      "x86_64-linux"
-    else if redistArch == "linux-ppc64le" then
-      "ppc64le-linux"
-    else if redistArch == "windows-x86_64" then
-      "x86_64-windows"
-    else
-      "unsupported-${redistArch}";
+  # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of
+  # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported
+  # systems gracefully.
+  # getNixSystem :: String -> String
+  getNixSystem = redistArch: { # lookup table; unknown arches yield "unsupported-<redistArch>"
+    linux-x86_64 = "x86_64-linux";
+    linux-sbsa = "aarch64-linux";
+    linux-aarch64 = "aarch64-linux";
+    linux-ppc64le = "ppc64le-linux";
+    windows-x86_64 = "x86_64-windows";
+  }.${redistArch} or "unsupported-${redistArch}";

  formatCapabilities =
    {
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@ -42,6 +42,9 @@ let
  # Get the redist architectures for which package provides distributables.
  # These are used by meta.platforms.
  supportedRedistArchs = builtins.attrNames featureRelease;
+  # redistArch :: String
+  # The redistArch is the name of the architecture for which the redistributable is built.
+  # It is `"unsupported"` if the redistributable is not supported on the target platform.
  redistArch = flags.getRedistArch hostPlatform.system;
 in
 backendStdenv.mkDerivation (
@ -86,8 +89,18 @@ backendStdenv.mkDerivation (
          "sample"
          "python"
        ];
+        # Filter out outputs that don't exist in the redistributable.
+        # NOTE: In the case the redistributable isn't supported on the target platform,
+        # we will have `outputs = [ "out" ] ++ possibleOutputs`. This is of note because platforms which
+        # aren't supported would otherwise have evaluation errors when trying to access outputs other than `out`.
+        # The alternative would be to have `outputs = [ "out" ]` when `redistArch = "unsupported"`, but that would
+        # require adding guards throughout the entirety of the CUDA package set to ensure `cudaSupport` is true --
+        # recall that OfBorg will evaluate packages marked as broken and that `cudaPackages` will be evaluated with
+        # `cudaSupport = false`!
        additionalOutputs =
-          if redistArch == "unsupported" then possibleOutputs else builtins.filter hasOutput possibleOutputs;
+          if redistArch == "unsupported"
+          then possibleOutputs
+          else builtins.filter hasOutput possibleOutputs;
        # The out output is special -- it's the default output and we always include it.
        outputs = [ "out" ] ++ additionalOutputs;
      in
@ -114,19 +127,28 @@ backendStdenv.mkDerivation (
    # Useful for introspecting why something went wrong.
    # Maps descriptions of why the derivation would be marked broken to
    # booleans indicating whether that description is true.
-    brokenConditions = {};
-
-    src = fetchurl {
-      url =
-        if (builtins.hasAttr redistArch redistribRelease) then
-          "https://developer.download.nvidia.com/compute/${redistName}/redist/${
-            redistribRelease.${redistArch}.relative_path
-          }"
-        else
-          "cannot-construct-an-url-for-the-${redistArch}-platform";
-      sha256 = redistribRelease.${redistArch}.sha256 or lib.fakeHash;
+    # brokenConditions :: AttrSet Bool
+    brokenConditions = {
+      # Using an unrecognized redistArch
+      "Unrecognized NixOS platform ${hostPlatform.system}" = redistArch == "unsupported";
+      # Trying to build for a platform that doesn't have a redistributable
+      "Unsupported NixOS platform (or configuration) ${hostPlatform.system}" = finalAttrs.src == null;
    };

+    # src :: Optional Derivation
+    src = trivial.pipe redistArch [
+      # If redistArch doesn't exist in redistribRelease, return null.
+      (redistArch: redistribRelease.${redistArch} or null)
+      # If the release is non-null, fetch the source; otherwise, return null.
+      (trivial.mapNullable (
+        { relative_path, sha256, ... }:
+        fetchurl {
+          url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}";
+          inherit sha256;
+        }
+      ))
+    ];
+
    postPatch = ''
      if [[ -d pkg-config ]] ; then
        mkdir -p share/pkg-config
@ -284,16 +306,12 @@ backendStdenv.mkDerivation (
    meta = {
      description = "${redistribRelease.name}. By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}";
      sourceProvenance = [sourceTypes.binaryNativeCode];
-      platforms =
-        lists.concatMap
-          (
-            redistArch:
-            let
-              nixSystem = flags.getNixSystem redistArch;
-            in
-            lists.optionals (!(strings.hasPrefix "unsupported-" nixSystem)) [ nixSystem ]
-          )
-          supportedRedistArchs;
+      platforms = trivial.pipe supportedRedistArchs [
+        # Map each redist arch to the equivalent nix system, or to "unsupported-${redistArch}" if there is no equivalent.
+        (builtins.map flags.getNixSystem)
+        # Filter out unsupported systems
+        (builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem)))
+      ];
      broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions);
      license = licenses.unfree;
      maintainers = teams.cuda.members;
--- a/pkgs/development/cuda-modules/generic-builders/multiplex.nix
+++ b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
@ -20,7 +20,7 @@
  # The featureRelease is used to populate meta.platforms (by way of looking at the attribute names)
  # and to determine the outputs of the package.
  # shimFn :: {package, redistArch} -> AttrSet
-  shimsFn ? ({package, redistArch}: throw "shimsFn must be provided"),
+  shimsFn ? (throw "shimsFn must be provided"),
  # fixupFn :: Path
  # A path (or nix expression) to be evaluated with callPackage and then
  # provided to the package's overrideAttrs function.
@ -29,16 +29,8 @@
  # - cudaVersion
  # - mkVersionedPackageName
  # - package
-  fixupFn ? (
-    {
-      final,
-      cudaVersion,
-      mkVersionedPackageName,
-      package,
-      ...
-    }:
-    throw "fixupFn must be provided"
-  ),
+  # - ...
+  fixupFn ? (throw "fixupFn must be provided"),
 }:
 let
  inherit (lib)
@ -80,9 +72,11 @@ let
    && strings.versionAtLeast package.maxCudaVersion cudaVersion;

  # Get all of the packages for our given platform.
+  # redistArch :: String
+  # Value is `"unsupported"` if the platform is not supported.
  redistArch = flags.getRedistArch hostPlatform.system;

-  allReleases = builtins.concatMap (xs: xs) (builtins.attrValues releaseSets);
+  allReleases = lists.flatten (builtins.attrValues releaseSets);

  # All the supported packages we can build for our platform.
  # perSystemReleases :: List Package
--- a/pkgs/development/cuda-modules/nccl/default.nix
+++ b/pkgs/development/cuda-modules/nccl/default.nix
@ -100,6 +100,9 @@ backendStdenv.mkDerivation (
      homepage = "https://developer.nvidia.com/nccl";
      license = licenses.bsd3;
      platforms = platforms.linux;
+      # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
+      # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
+      badPlatforms = lib.optionals cudaFlags.isJetsonBuild [ "aarch64-linux" ];
      maintainers =
        with maintainers;
        [
--- a/pkgs/development/cuda-modules/tensorrt/fixup.nix
+++ b/pkgs/development/cuda-modules/tensorrt/fixup.nix
@ -11,18 +11,17 @@
 }:
 let
  inherit (lib)
+    attrsets
    maintainers
    meta
    strings
    versions
    ;
-  targetArch =
-    if hostPlatform.isx86_64 then
-      "x86_64-linux-gnu"
-    else if hostPlatform.isAarch64 then
-      "aarch64-linux-gnu"
-    else
-      "unsupported";
+  # targetArch :: String
+  targetArch = { # GNU target triple per Nix system; anything else is "unsupported"
+    aarch64-linux = "aarch64-linux-gnu";
+    x86_64-linux = "x86_64-linux-gnu";
+  }.${hostPlatform.system} or "unsupported";
 in
 finalAttrs: prevAttrs: {
  # Useful for inspecting why something went wrong.
@ -69,7 +68,7 @@ finalAttrs: prevAttrs: {

  preInstall =
    (prevAttrs.preInstall or "")
-    + ''
+    + strings.optionalString (targetArch != "unsupported") ''
      # Replace symlinks to bin and lib with the actual directories from targets.
      for dir in bin lib; do
        rm "$dir"
--- a/pkgs/development/cuda-modules/tensorrt/shims.nix
+++ b/pkgs/development/cuda-modules/tensorrt/shims.nix
@ -1,13 +1,21 @@
 # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix
-{package, redistArch}:
 {
-  featureRelease.${redistArch}.outputs = {
-    bin = true;
-    lib = true;
-    static = true;
-    dev = true;
-    sample = true;
-    python = true;
+  lib,
+  package,
+  # redistArch :: String
+  # String is `"unsupported"` if the given architecture is unsupported.
+  redistArch,
+}:
+{
+  featureRelease = lib.optionalAttrs (redistArch != "unsupported") {
+    ${redistArch}.outputs = {
+      bin = true;
+      lib = true;
+      static = true;
+      dev = true;
+      sample = true;
+      python = true;
+    };
  };
  redistribRelease = {
    name = "TensorRT: a high-performance deep learning interface";
--- a/pkgs/development/libraries/science/math/magma/generic.nix
+++ b/pkgs/development/libraries/science/math/magma/generic.nix
@ -159,7 +159,7 @@ stdenv.mkDerivation {
    description = "Matrix Algebra on GPU and Multicore Architectures";
    license = licenses.bsd3;
    homepage = "http://icl.cs.utk.edu/magma/index.html";
-    platforms = platforms.unix;
+    platforms = platforms.linux;
    maintainers = with maintainers; [ connorbaker ];

    # Cf. https://bitbucket.org/icl/magma/src/fcfe5aa61c1a4c664b36a73ebabbdbab82765e9f/CMakeLists.txt#lines-20
--- a/pkgs/development/libraries/xgboost/default.nix
+++ b/pkgs/development/libraries/xgboost/default.nix
@ -14,7 +14,7 @@
 , rPackages
 }@inputs:

-assert ncclSupport -> cudaSupport;
+assert ncclSupport -> (cudaSupport && !cudaPackages.nccl.meta.unsupported);
 # Disable regular tests when building the R package
 # because 1) the R package runs its own tests and
 # 2) the R package creates a different binary shared
--- a/pkgs/development/python-modules/jaxlib/default.nix
+++ b/pkgs/development/python-modules/jaxlib/default.nix
@ -64,7 +64,8 @@ let
    # aarch64-darwin is broken because of https://github.com/bazelbuild/rules_cc/pull/136
    # however even with that fix applied, it doesn't work for everyone:
    # https://github.com/NixOS/nixpkgs/pull/184395#issuecomment-1207287129
-    broken = stdenv.isDarwin;
+    # NOTE: We always build with NCCL; if it is unsupported, then our build is broken.
+    broken = stdenv.isDarwin || nccl.meta.unsupported;
  };

  cudatoolkit_joined = symlinkJoin {
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@ -7,7 +7,8 @@
  magma,
  magma-hip,
  magma-cuda-static,
-  useSystemNccl ? true,
+  # Use the system NCCL as long as it is supported.
+  useSystemNccl ? !cudaPackages.nccl.meta.unsupported,
  MPISupport ? false, mpi,
  buildDocs ? false,

@ -57,6 +58,7 @@
 let
  inherit (lib) attrsets lists strings trivial;
  inherit (cudaPackages) cudaFlags cudnn nccl;
+  ncclSupported = cudaSupport && !cudaPackages.nccl.meta.unsupported;

  setBool = v: if v then "1" else "0";

@ -121,6 +123,7 @@ let
    "Unsupported CUDA version" = cudaSupport && !(builtins.elem cudaPackages.cudaMajorVersion [ "11" "12" ]);
    "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit);
    "Magma cudaPackages does not match cudaPackages" = cudaSupport && (effectiveMagma.cudaPackages != cudaPackages);
+    "Requested system NCCL, but cudaPackages.nccl is not supported" = cudaSupport && useSystemNccl && !ncclSupported; # gate on cudaSupport: ncclSupported is always false without it, which would mark non-CUDA builds broken
  };
 in buildPythonPackage rec {
  pname = "torch";
@ -273,9 +276,9 @@ in buildPythonPackage rec {
  PYTORCH_BUILD_VERSION = version;
  PYTORCH_BUILD_NUMBER = 0;

-  USE_NCCL = setBool (cudaSupport && cudaPackages ? nccl);
-  USE_SYSTEM_NCCL = setBool useSystemNccl;                  # don't build pytorch's third_party NCCL
-  USE_STATIC_NCCL = setBool useSystemNccl;
+  USE_NCCL = setBool (cudaSupport && ncclSupported);
+  USE_SYSTEM_NCCL = setBool (cudaSupport && useSystemNccl);                  # don't build pytorch's third_party NCCL
+  USE_STATIC_NCCL = setBool (cudaSupport && useSystemNccl);

  # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
  # (upstream seems to have fixed this in the wrong place?)
@ -363,7 +366,7 @@ in buildPythonPackage rec {
    ] ++ lists.optionals (cudaPackages ? cudnn) [
      cudnn.dev
      cudnn.lib
-    ] ++ lists.optionals (useSystemNccl && cudaPackages ? nccl) [
+    ] ++ lists.optionals (useSystemNccl && ncclSupported) [
      # Some platforms do not support NCCL (i.e., Jetson)
      nccl.dev # Provides nccl.h AND a static copy of NCCL!
    ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
--- a/pkgs/top-level/cuda-packages.nix
+++ b/pkgs/top-level/cuda-packages.nix
@ -73,10 +73,6 @@ let
        # Loose packages
        cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {};
        saxpy = final.callPackage ../development/cuda-modules/saxpy {};
-      }
-      # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
-      # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
-      // attrsets.optionalAttrs (!flags.isJetsonBuild) {
        nccl = final.callPackage ../development/cuda-modules/nccl {};
        nccl-tests = final.callPackage ../development/cuda-modules/nccl-tests {};
      }