9 files changed, 114 insertions, 107 deletions
diff --git a/doc/languages-frameworks/cuda.section.md b/doc/languages-frameworks/cuda.section.md
index fccf66bf79d..27bae33bc71 100644
--- a/doc/languages-frameworks/cuda.section.md
+++ b/doc/languages-frameworks/cuda.section.md
@@ -32,3 +32,22 @@ mypkg = let
   }});
 in callPackage { inherit cudaPackages; };
 ```
+
+The CUDA NVCC compiler requires flags to determine which hardware you
+want to target for in terms of SASS (real hardware) or PTX (JIT kernels).
+
+Nixpkgs tries to target support real architecture defaults based on the
+CUDA toolkit version with PTX support for future hardware.  Experienced
+users may optmize this configuration for a variety of reasons such as
+reducing binary size and compile time, supporting legacy hardware, or
+optimizing for specific hardware.
+
+You may provide capabilities to add support or reduce binary size through
+`config` using `cudaCapabilities = [ "6.0" "7.0" ];` and
+`cudaForwardCompat = true;` if you want PTX support for future hardware.
+
+Please consult [GPUs supported](https://en.wikipedia.org/wiki/CUDA#GPUs_supported)
+for your specific card(s).
+
+Library maintainers should consult [NVCC Docs](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/)
+and release notes for their software package.
diff --git a/pkgs/applications/science/math/mxnet/default.nix b/pkgs/applications/science/math/mxnet/default.nix
index 9a1b550d882..dcba888ce2f 100644
--- a/pkgs/applications/science/math/mxnet/default.nix
+++ b/pkgs/applications/science/math/mxnet/default.nix
@@ -2,11 +2,10 @@
 , opencv3, gtest, blas, gomp, llvmPackages, perl
 , cudaSupport ? config.cudaSupport or false, cudaPackages ? {}, nvidia_x11
 , cudnnSupport ? cudaSupport
-, cudaCapabilities ? [ "3.7" "5.0" "6.0" "7.0" "7.5" "8.0" "8.6" ]
 }:
 
 let
-  inherit (cudaPackages) cudatoolkit cudnn;
+  inherit (cudaPackages) cudatoolkit cudaFlags cudnn;
 in
 
 assert cudnnSupport -> cudaSupport;
@@ -51,7 +50,7 @@ stdenv.mkDerivation rec {
       "-DUSE_OLDCMAKECUDA=ON"  # see https://github.com/apache/incubator-mxnet/issues/10743
       "-DCUDA_ARCH_NAME=All"
       "-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc"
-      "-DMXNET_CUDA_ARCH=${lib.concatStringsSep ";" cudaCapabilities}"
+      "-DMXNET_CUDA_ARCH=${cudaFlags.cudaCapabilitiesSemiColonString}"
     ] else [ "-DUSE_CUDA=OFF" ])
     ++ lib.optional (!cudnnSupport) "-DUSE_CUDNN=OFF";
 
diff --git a/pkgs/development/compilers/cudatoolkit/extension.nix b/pkgs/development/compilers/cudatoolkit/extension.nix
index 862c8316799..c11f12b118a 100644
--- a/pkgs/development/compilers/cudatoolkit/extension.nix
+++ b/pkgs/development/compilers/cudatoolkit/extension.nix
@@ -10,6 +10,8 @@ final: prev: let
   ### Add classic cudatoolkit package
   cudatoolkit = buildCudaToolkitPackage ((attrs: attrs // { gcc = prev.pkgs.${attrs.gcc}; }) cudatoolkitVersions.${final.cudaVersion});
 
+  cudaFlags = final.callPackage ./flags.nix {};
+
 in {
-  inherit cudatoolkit;
+  inherit cudatoolkit cudaFlags;
 }
diff --git a/pkgs/development/compilers/cudatoolkit/flags.nix b/pkgs/development/compilers/cudatoolkit/flags.nix
new file mode 100644
index 00000000000..24f653ded29
--- /dev/null
+++ b/pkgs/development/compilers/cudatoolkit/flags.nix
@@ -0,0 +1,78 @@
+{ config
+, lib
+, cudatoolkit
+}:
+let
+
+  # Flags are determined based on your CUDA toolkit by default.  You may benefit
+  # from improved performance, reduced file size, or greater hardware suppport by
+  # passing a configuration based on your specific GPU environment.
+  #
+  # config.cudaCapabilities: list of hardware generations to support (e.g., "8.0")
+  # config.cudaForwardCompat: bool for compatibility with future GPU generations
+  #
+  # Please see the accompanying documentation or https://github.com/NixOS/nixpkgs/pull/205351
+
+  defaultCudaCapabilities = rec {
+    cuda9 = [
+      "3.0"
+      "3.5"
+      "5.0"
+      "5.2"
+      "6.0"
+      "6.1"
+      "7.0"
+    ];
+
+    cuda10 = cuda9 ++ [
+      "7.5"
+    ];
+
+    cuda11 = [
+      "3.5"
+      "5.0"
+      "5.2"
+      "6.0"
+      "6.1"
+      "7.0"
+      "7.5"
+      "8.0"
+      "8.6"
+    ];
+
+  };
+
+  cudaMicroarchitectureNames = {
+    "3" = "Kepler";
+    "5" = "Maxwell";
+    "6" = "Pascal";
+    "7" = "Volta";
+    "8" = "Ampere";
+    "9" = "Hopper";
+  };
+
+  defaultCudaArchList = defaultCudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";
+  cudaRealCapabilities = config.cudaCapabilities or defaultCudaArchList;
+  capabilitiesForward = "${lib.last cudaRealCapabilities}+PTX";
+
+  dropDot = ver: builtins.replaceStrings ["."] [""] ver;
+
+  archMapper = feat: map (ver: "${feat}_${dropDot ver}");
+  gencodeMapper = feat: map (ver: "-gencode=arch=compute_${dropDot ver},code=${feat}_${dropDot ver}");
+  cudaRealArchs = archMapper "sm" cudaRealCapabilities;
+  cudaPTXArchs = archMapper "compute" cudaRealCapabilities;
+  cudaArchs = cudaRealArchs ++ [ (lib.last cudaPTXArchs) ];
+
+  cudaArchNames = lib.unique (map (v: cudaMicroarchitectureNames.${lib.versions.major v}) cudaRealCapabilities);
+  cudaCapabilities = cudaRealCapabilities ++ lib.optional (config.cudaForwardCompat or true) capabilitiesForward;
+  cudaGencode = gencodeMapper "sm" cudaRealCapabilities ++ lib.optionals (config.cudaForwardCompat or true) (gencodeMapper "compute" [ (lib.last cudaPTXArchs) ]);
+
+  cudaCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaCapabilities;
+  cudaCapabilitiesSemiColonString = lib.strings.concatStringsSep ";" cudaCapabilities;
+  cudaRealCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaRealCapabilities;
+
+in
+{
+   inherit cudaArchs cudaArchNames cudaCapabilities cudaCapabilitiesCommaString cudaCapabilitiesSemiColonString
+     cudaRealCapabilities cudaRealCapabilitiesCommaString cudaGencode cudaRealArchs cudaPTXArchs;
+}
diff --git a/pkgs/development/libraries/science/math/magma/default.nix b/pkgs/development/libraries/science/math/magma/default.nix
index 05d7d4fa184..06b4e12d04e 100644
--- a/pkgs/development/libraries/science/math/magma/default.nix
+++ b/pkgs/development/libraries/science/math/magma/default.nix
@@ -1,7 +1,7 @@
 { lib, stdenv, fetchurl, cmake, gfortran, ninja, cudaPackages, libpthreadstubs, lapack, blas }:
 
 let
-  inherit (cudaPackages) cudatoolkit;
+  inherit (cudaPackages) cudatoolkit cudaFlags;
 in
 
 assert let majorIs = lib.versions.major cudatoolkit.version;
@@ -10,36 +10,6 @@ assert let majorIs = lib.versions.major cudatoolkit.version;
 let
   version = "2.6.2";
 
-  # We define a specific set of CUDA compute capabilities here,
-  # because CUDA 11 does not support compute capability 3.0. Also,
-  # we use it to enable newer capabilities that are not enabled
-  # by magma by default. The list of supported architectures
-  # can be found in magma's top-level CMakeLists.txt.
-  cudaCapabilities = rec {
-    cuda9 = [
-      "Kepler"  # 3.0, 3.5
-      "Maxwell" # 5.0
-      "Pascal"  # 6.0
-      "Volta"   # 7.0
-    ];
-
-    cuda10 = [
-      "Turing"  # 7.5
-    ] ++ cuda9;
-
-    cuda11 = [
-      "sm_35"   # sm_30 is not supported by CUDA 11
-      "Maxwell" # 5.0
-      "Pascal"  # 6.0
-      "Volta"   # 7.0
-      "Turing"  # 7.5
-      "Ampere"  # 8.0
-    ];
-  };
-
-  capabilityString = lib.strings.concatStringsSep ","
-    cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";
-
 in stdenv.mkDerivation {
   pname = "magma";
   inherit version;
@@ -53,7 +23,9 @@ in stdenv.mkDerivation {
 
   buildInputs = [ cudatoolkit libpthreadstubs lapack blas ];
 
-  cmakeFlags = [ "-DGPU_TARGET=${capabilityString}" ];
+  cmakeFlags = [
+    "-DGPU_TARGET=${builtins.concatStringsSep "," cudaFlags.cudaRealArchs}"
+  ];
 
   doCheck = false;
 
diff --git a/pkgs/development/python-modules/jaxlib/default.nix b/pkgs/development/python-modules/jaxlib/default.nix
index 37bfe4d739f..4018655cc48 100644
--- a/pkgs/development/python-modules/jaxlib/default.nix
+++ b/pkgs/development/python-modules/jaxlib/default.nix
@@ -41,7 +41,6 @@
 , zlib
 
   # CUDA flags:
-, cudaCapabilities ? [ "sm_35" "sm_50" "sm_60" "sm_70" "sm_75" "compute_80" ]
 , cudaSupport ? false
 , cudaPackages ? {}
 
@@ -50,7 +49,7 @@
 }:
 
 let
-  inherit (cudaPackages) cudatoolkit cudnn nccl;
+  inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
 
   pname = "jaxlib";
   version = "0.3.22";
@@ -165,7 +164,7 @@ let
       build --action_env TF_CUDA_PATHS="${cudatoolkit_joined},${cudnn},${nccl}"
       build --action_env TF_CUDA_VERSION="${lib.versions.majorMinor cudatoolkit.version}"
       build --action_env TF_CUDNN_VERSION="${lib.versions.major cudnn.version}"
-      build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${lib.concatStringsSep "," cudaCapabilities}"
+      build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${cudaFlags.cudaRealCapabilitiesCommaString}"
     '' + ''
       CFG
     '';
diff --git a/pkgs/development/python-modules/tensorflow/default.nix b/pkgs/development/python-modules/tensorflow/default.nix
index 39461fd7953..a549fe393b9 100644
--- a/pkgs/development/python-modules/tensorflow/default.nix
+++ b/pkgs/development/python-modules/tensorflow/default.nix
@@ -22,8 +22,6 @@
 , tensorboardSupport ? true
 # XLA without CUDA is broken
 , xlaSupport ? cudaSupport
-# Default from ./configure script
-, cudaCapabilities ? [ "sm_35" "sm_50" "sm_60" "sm_70" "sm_75" "compute_80" ]
 , sse42Support ? stdenv.hostPlatform.sse4_2Support
 , avx2Support  ? stdenv.hostPlatform.avx2Support
 , fmaSupport   ? stdenv.hostPlatform.fmaSupport
@@ -32,7 +30,7 @@
 }:
 
 let
-  inherit (cudaPackages) cudatoolkit cudnn nccl;
+  inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
 in
 
 assert cudaSupport -> cudatoolkit != null
@@ -305,7 +303,7 @@ let
     TF_CUDA_PATHS = lib.optionalString cudaSupport "${cudatoolkit_joined},${cudnn},${nccl}";
     GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin";
     GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/gcc";
-    TF_CUDA_COMPUTE_CAPABILITIES = lib.concatStringsSep "," cudaCapabilities;
+    TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArchs;
 
     postPatch = ''
       # bazel 3.3 should work just as well as bazel 3.1
diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index 887738f2c5f..17ecd3f280b 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -3,7 +3,6 @@
   mklDnnSupport ? true, useSystemNccl ? true,
   MPISupport ? false, mpi,
   buildDocs ? false,
-  cudaArchList ? null,
 
   # Native build inputs
   cmake, util-linux, linkFarm, symlinkJoin, which, pybind11, removeReferencesTo,
@@ -33,7 +32,7 @@
   isPy3k, pythonOlder }:
 
 let
-  inherit (cudaPackages) cudatoolkit cudnn nccl;
+  inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
 in
 
 # assert that everything needed for cuda is present and that the correct cuda versions are used
@@ -52,64 +51,6 @@ let
     paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
   };
 
-  # Give an explicit list of supported architectures for the build, See:
-  # - pytorch bug report: https://github.com/pytorch/pytorch/issues/23573
-  # - pytorch-1.2.0 build on nixpks: https://github.com/NixOS/nixpkgs/pull/65041
-  #
-  # This list was selected by omitting the TORCH_CUDA_ARCH_LIST parameter,
-  # observing the fallback option (which selected all architectures known
-  # from cudatoolkit_10_0, pytorch-1.2, and python-3.6), and doing a binary
-  # searching to find offending architectures.
-  #
-  # NOTE: Because of sandboxing, this derivation can't auto-detect the hardware's
-  # cuda architecture, so there is also now a problem around new architectures
-  # not being supported until explicitly added to this derivation.
-  #
-  # FIXME: CMake is throwing the following warning on python-1.2:
-  #
-  # ```
-  # CMake Warning at cmake/public/utils.cmake:172 (message):
-  #   In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST
-  #   to cmake instead of implicitly setting it as an env variable.  This will
-  #   become a FATAL_ERROR in future version of pytorch.
-  # ```
-  # If this is causing problems for your build, this derivation may have to strip
-  # away the standard `buildPythonPackage` and use the
-  # [*Adjust Build Options*](https://github.com/pytorch/pytorch/tree/v1.2.0#adjust-build-options-optional)
-  # instructions. This will also add more flexibility around configurations
-  # (allowing FBGEMM to be built in pytorch-1.1), and may future proof this
-  # derivation.
-  brokenArchs = [ "3.0" ]; # this variable is only used as documentation.
-
-  cudaCapabilities = rec {
-    cuda9 = [
-      "3.5"
-      "5.0"
-      "5.2"
-      "6.0"
-      "6.1"
-      "7.0"
-      "7.0+PTX"  # I am getting a "undefined architecture compute_75" on cuda 9
-                 # which leads me to believe this is the final cuda-9-compatible architecture.
-    ];
-
-    cuda10 = cuda9 ++ [
-      "7.5"
-      "7.5+PTX"  # < most recent architecture as of cudatoolkit_10_0 and pytorch-1.2.0
-    ];
-
-    cuda11 = cuda10 ++ [
-      "8.0"
-      "8.0+PTX"  # < CUDA toolkit 11.0
-      "8.6"
-      "8.6+PTX"  # < CUDA toolkit 11.1
-    ];
-  };
-  final_cudaArchList =
-    if !cudaSupport || cudaArchList != null
-    then cudaArchList
-    else cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";
-
   # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
   # LD_LIBRARY_PATH=/run/opengl-driver/lib.  We only use the stub
   # libcuda.so from cudatoolkit for running tests, so that we don’t have
@@ -153,7 +94,7 @@ in buildPythonPackage rec {
   ];
 
   preConfigure = lib.optionalString cudaSupport ''
-    export TORCH_CUDA_ARCH_LIST="${lib.strings.concatStringsSep ";" final_cudaArchList}"
+    export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}"
     export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
   '' + lib.optionalString (cudaSupport && cudnn != null) ''
     export CUDNN_INCLUDE_DIR=${cudnn}/include
@@ -308,7 +249,6 @@ in buildPythonPackage rec {
 
   passthru = {
     inherit cudaSupport cudaPackages;
-    cudaArchList = final_cudaArchList;
     # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability.
     blasProvider = blas.provider;
   };
diff --git a/pkgs/development/python-modules/torchvision/default.nix b/pkgs/development/python-modules/torchvision/default.nix
index 223ef3f1d86..212401efe54 100644
--- a/pkgs/development/python-modules/torchvision/default.nix
+++ b/pkgs/development/python-modules/torchvision/default.nix
@@ -15,7 +15,7 @@
 }:
 
 let
-  inherit (torch.cudaPackages) cudatoolkit cudnn;
+  inherit (torch.cudaPackages) cudatoolkit cudaFlags cudnn;
 
   cudatoolkit_joined = symlinkJoin {
     name = "${cudatoolkit.name}-unsplit";
@@ -45,7 +45,7 @@ in buildPythonPackage rec {
   propagatedBuildInputs = [ numpy pillow torch scipy ];
 
   preBuild = lib.optionalString cudaSupport ''
-    export TORCH_CUDA_ARCH_LIST="${cudaArchStr}"
+    export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}"
     export FORCE_CUDA=1
   '';