summary refs log tree commit diff
path: root/lib/fileset
diff options
context:
space:
mode:
authorSilvan Mosberger <contact@infinisil.com>2023-10-11 17:33:32 +0200
committerGitHub <noreply@github.com>2023-10-11 17:33:32 +0200
commit006b28f2a8c66b951af691da5f9bc3f426b60a8f (patch)
tree601ecd9dbfd601d2e2efa46dfca3e7fc662aae56 /lib/fileset
parent91ac36c5b7a1ca134e066e7aded243951ca355a7 (diff)
parent389be8db81592ce4c5519a96a459593e07eb82f7 (diff)
downloadnixpkgs-006b28f2a8c66b951af691da5f9bc3f426b60a8f.tar
nixpkgs-006b28f2a8c66b951af691da5f9bc3f426b60a8f.tar.gz
nixpkgs-006b28f2a8c66b951af691da5f9bc3f426b60a8f.tar.bz2
nixpkgs-006b28f2a8c66b951af691da5f9bc3f426b60a8f.tar.lz
nixpkgs-006b28f2a8c66b951af691da5f9bc3f426b60a8f.tar.xz
nixpkgs-006b28f2a8c66b951af691da5f9bc3f426b60a8f.tar.zst
nixpkgs-006b28f2a8c66b951af691da5f9bc3f426b60a8f.zip
Merge pull request #257356 from tweag/fileset.intersect
`lib.fileset.intersection`: init
Diffstat (limited to 'lib/fileset')
-rw-r--r--lib/fileset/README.md35
-rw-r--r--lib/fileset/default.nix41
-rw-r--r--lib/fileset/internal.nix115
-rwxr-xr-xlib/fileset/tests.sh95
4 files changed, 278 insertions, 8 deletions
diff --git a/lib/fileset/README.md b/lib/fileset/README.md
index d2215803692..ebe13f08fde 100644
--- a/lib/fileset/README.md
+++ b/lib/fileset/README.md
@@ -58,7 +58,8 @@ An attribute set with these values:
 
 - `_internalBase` (path):
   Any files outside of this path cannot influence the set of files.
-  This is always a directory.
+  This is always a directory and should be as long as possible.
+  This is used by `lib.fileset.toSource` to check that all files are under the `root` argument
 
 - `_internalBaseRoot` (path):
   The filesystem root of `_internalBase`, same as `(lib.path.splitRoot _internalBase).root`.
@@ -143,9 +144,37 @@ Arguments:
   - (-) Leaves us with no identity element for `union` and no reasonable return value for `unions []`.
     From a set theory perspective, which has a well-known notion of empty sets, this is unintuitive.
 
+### No intersection for lists
+
+While there is `intersection a b`, there is no function `intersections [ a b c ]`.
+
+Arguments:
+- (+) There is no known use case for such a function, it can be added later if a use case arises
+- (+) There is no suitable return value for `intersections [ ]`, see also "Nullary intersections" [here](https://en.wikipedia.org/w/index.php?title=List_of_set_identities_and_relations&oldid=1177174035#Definitions)
+  - (-) Could throw an error for that case
+  - (-) Create a special value to represent "all the files" and return that
+    - (+) Such a value could then not be used with `fileFilter` unless the internal representation is changed considerably
+  - (-) Could return the empty file set
+    - (+) This would be wrong in set theory
+- (-) Inconsistent with `union` and `unions`
+
+### Intersection base path
+
+The base path of the result of an `intersection` is the longest base path of the arguments.
+E.g. the base path of `intersection ./foo ./foo/bar` is `./foo/bar`.
+Meanwhile `intersection ./foo ./bar` returns the empty file set without a base path.
+
+Arguments:
+- Alternative: Use the common prefix of all base paths as the resulting base path
+  - (-) This is unnecessarily strict, because the purpose of the base path is to track the directory under which files _could_ be in the file set. It should be as long as possible.
+    All files contained in `intersection ./foo ./foo/bar` will be under `./foo/bar` (never just under `./foo`), and `intersection ./foo ./bar` will never contain any files (never under `./.`).
+    This would lead to `toSource` having to unexpectedly throw errors for cases such as `toSource { root = ./foo; fileset = intersect ./foo base; }`, where `base` may be `./bar` or `./.`.
+  - (-) There is no benefit to the user, since base path is not directly exposed in the interface
+
 ### Empty directories
 
-File sets can only represent a _set_ of local files, directories on their own are not representable.
+File sets can only represent a _set_ of local files.
+Directories on their own are not representable.
 
 Arguments:
 - (+) There does not seem to be a sensible set of combinators when directories can be represented on their own.
@@ -161,7 +190,7 @@ Arguments:
 
   - `./.` represents all files in `./.` _and_ the directory itself, but not its subdirectories, meaning that at least `./.` will be preserved even if it's empty.
 
-    In that case, `intersect ./. ./foo` should only include files and no directories themselves, since `./.` includes only `./.` as a directory, and same for `./foo`, so there's no overlap in directories.
+    In that case, `intersection ./. ./foo` should only include files and no directories themselves, since `./.` includes only `./.` as a directory, and same for `./foo`, so there's no overlap in directories.
     But intuitively this operation should result in the same as `./foo` – everything else is just confusing.
 - (+) This matches how Git only supports files, so developers should already be used to it.
 - (-) Empty directories (even if they contain nested directories) are neither representable nor preserved when coercing from paths.
diff --git a/lib/fileset/default.nix b/lib/fileset/default.nix
index 93a552262b0..7bd70167038 100644
--- a/lib/fileset/default.nix
+++ b/lib/fileset/default.nix
@@ -7,6 +7,7 @@ let
     _toSourceFilter
     _unionMany
     _printFileset
+    _intersection
     ;
 
   inherit (builtins)
@@ -18,6 +19,7 @@ let
     ;
 
   inherit (lib.lists)
+    elemAt
     imap0
     ;
 
@@ -277,6 +279,45 @@ If a directory does not recursively contain any file, it is omitted from the sto
       ];
 
   /*
+    The file set containing all files that are in both of two given file sets.
+    See also [Intersection (set theory)](https://en.wikipedia.org/wiki/Intersection_(set_theory)).
+
+    The given file sets are evaluated as lazily as possible,
+    with the first argument being evaluated first if needed.
+
+    Type:
+      intersection :: FileSet -> FileSet -> FileSet
+
+    Example:
+      # Limit the selected files to the ones in ./., so only ./src and ./Makefile
+      intersection ./. (unions [ ../LICENSE ./src ./Makefile ])
+  */
+  intersection =
+    # The first file set.
+    # This argument can also be a path,
+    # which gets [implicitly coerced to a file set](#sec-fileset-path-coercion).
+    fileset1:
+    # The second file set.
+    # This argument can also be a path,
+    # which gets [implicitly coerced to a file set](#sec-fileset-path-coercion).
+    fileset2:
+    let
+      filesets = _coerceMany "lib.fileset.intersection" [
+        {
+          context = "first argument";
+          value = fileset1;
+        }
+        {
+          context = "second argument";
+          value = fileset2;
+        }
+      ];
+    in
+    _intersection
+      (elemAt filesets 0)
+      (elemAt filesets 1);
+
+  /*
     Incrementally evaluate and trace a file set in a pretty way.
     This function is only intended for debugging purposes.
     The exact tracing format is unspecified and may change.
diff --git a/lib/fileset/internal.nix b/lib/fileset/internal.nix
index d18e37e9d6e..546b93f158a 100644
--- a/lib/fileset/internal.nix
+++ b/lib/fileset/internal.nix
@@ -461,6 +461,43 @@ rec {
     else
       nonEmpty;
 
+  # Transforms the filesetTree of a file set to a shorter base path, e.g.
+  # _shortenTreeBase [ "foo" ] (_create /foo/bar null)
+  # => { bar = null; }
+  _shortenTreeBase = targetBaseComponents: fileset:
+    let
+      recurse = index:
+        # If we haven't reached the required depth yet
+        if index < length fileset._internalBaseComponents then
+          # Create an attribute set and recurse as the value, this can be lazily evaluated this way
+          { ${elemAt fileset._internalBaseComponents index} = recurse (index + 1); }
+        else
+          # Otherwise we reached the appropriate depth, here's the original tree
+          fileset._internalTree;
+    in
+    recurse (length targetBaseComponents);
+
+  # Transforms the filesetTree of a file set to a longer base path, e.g.
+  # _lengthenTreeBase [ "foo" "bar" ] (_create /foo { bar.baz = "regular"; })
+  # => { baz = "regular"; }
+  _lengthenTreeBase = targetBaseComponents: fileset:
+    let
+      recurse = index: tree:
+        # If the filesetTree is an attribute set and we haven't reached the required depth yet
+        if isAttrs tree && index < length targetBaseComponents then
+          # Recurse with the tree under the right component (which might not exist)
+          recurse (index + 1) (tree.${elemAt targetBaseComponents index} or null)
+        else
+          # For all values here we can just return the tree itself:
+          # tree == null -> the result is also null, everything is excluded
+          # tree == "directory" -> the result is also "directory",
+          #   because the base path is always a directory and everything is included
+          # isAttrs tree -> the result is `tree`
+          #   because we don't need to recurse any more since `index == length longestBaseComponents`
+          tree;
+    in
+    recurse (length fileset._internalBaseComponents) fileset._internalTree;
+
   # Computes the union of a list of filesets.
   # The filesets must already be coerced and validated to be in the same filesystem root
   # Type: [ Fileset ] -> Fileset
@@ -497,11 +534,7 @@ rec {
       # So the tree under `/foo/bar` gets nested under `{ bar = ...; ... }`,
       # while the tree under `/foo/baz` gets nested under `{ baz = ...; ... }`
       # Therefore allowing combined operations over them.
-      trees = map (fileset:
-        setAttrByPath
-          (drop (length commonBaseComponents) fileset._internalBaseComponents)
-          fileset._internalTree
-        ) filesetsWithBase;
+      trees = map (_shortenTreeBase commonBaseComponents) filesetsWithBase;
 
       # Folds all trees together into a single one using _unionTree
       # We do not use a fold here because it would cause a thunk build-up
@@ -533,4 +566,76 @@ rec {
       # The non-null elements have to be attribute sets representing partial trees
       # We need to recurse into those
       zipAttrsWith (name: _unionTrees) withoutNull;
+
+  # Computes the intersection of a list of filesets.
+  # The filesets must already be coerced and validated to be in the same filesystem root
+  # Type: Fileset -> Fileset -> Fileset
+  _intersection = fileset1: fileset2:
+    let
+      # The common base components prefix, e.g.
+      # (/foo/bar, /foo/bar/baz) -> /foo/bar
+      # (/foo/bar, /foo/baz) -> /foo
+      commonBaseComponentsLength =
+        # TODO: Have a `lib.lists.commonPrefixLength` function such that we don't need the list allocation from commonPrefix here
+        length (
+          commonPrefix
+            fileset1._internalBaseComponents
+            fileset2._internalBaseComponents
+        );
+
+      # To be able to intersect filesetTree's together, they need to have the same base path.
+      # Base paths can be intersected by taking the longest one (if any)
+
+      # The fileset with the longest base, if any, e.g.
+      # (/foo/bar, /foo/bar/baz) -> /foo/bar/baz
+      # (/foo/bar, /foo/baz) -> null
+      longestBaseFileset =
+        if commonBaseComponentsLength == length fileset1._internalBaseComponents then
+          # The common prefix is the same as the first path, so the second path is equal or longer
+          fileset2
+        else if commonBaseComponentsLength == length fileset2._internalBaseComponents then
+          # The common prefix is the same as the second path, so the first path is longer
+          fileset1
+        else
+          # The common prefix is neither the first nor the second path
+          # This means there's no overlap between the two sets
+          null;
+
+      # Whether the result should be the empty value without a base
+      resultIsEmptyWithoutBase =
+        # If either fileset is the empty fileset without a base, the intersection is too
+        fileset1._internalIsEmptyWithoutBase
+        || fileset2._internalIsEmptyWithoutBase
+        # If there is no overlap between the base paths
+        || longestBaseFileset == null;
+
+      # Lengthen each fileset's tree to the longest base prefix
+      tree1 = _lengthenTreeBase longestBaseFileset._internalBaseComponents fileset1;
+      tree2 = _lengthenTreeBase longestBaseFileset._internalBaseComponents fileset2;
+
+      # With two filesetTree's with the same base, we can compute their intersection
+      resultTree = _intersectTree tree1 tree2;
+    in
+    if resultIsEmptyWithoutBase then
+      _emptyWithoutBase
+    else
+      _create longestBaseFileset._internalBase resultTree;
+
+  # The intersection of two filesetTree's with the same base path
+  # The second element is only evaluated as much as necessary.
+  # Type: filesetTree -> filesetTree -> filesetTree
+  _intersectTree = lhs: rhs:
+    if isAttrs lhs && isAttrs rhs then
+      # Both sides are attribute sets, we can recurse for the attributes existing on both sides
+      mapAttrs
+        (name: _intersectTree lhs.${name})
+        (builtins.intersectAttrs lhs rhs)
+    else if lhs == null || isString rhs then
+      # If the lhs is null, the result should also be null
+      # And if the rhs is the identity element
+      # (a string, aka it includes everything), then it's also the lhs
+      lhs
+    else
+      # In all other cases it's the rhs
+      rhs;
 }
diff --git a/lib/fileset/tests.sh b/lib/fileset/tests.sh
index 9e09da80924..7a104654983 100755
--- a/lib/fileset/tests.sh
+++ b/lib/fileset/tests.sh
@@ -587,6 +587,97 @@ done
 # So, just using 1000 files for now.
 checkFileset 'unions (mapAttrsToList (name: _: ./. + "/${name}/a") (builtins.readDir ./.))'
 
+
+## lib.fileset.intersection
+
+
+# Different filesystem roots in root and fileset are not supported
+mkdir -p {foo,bar}/mock-root
+expectFailure 'with ((import <nixpkgs/lib>).extend (import <nixpkgs/lib/fileset/mock-splitRoot.nix>)).fileset;
+  toSource { root = ./.; fileset = intersection ./foo/mock-root ./bar/mock-root; }
+' 'lib.fileset.intersection: Filesystem roots are not the same:
+\s*first argument: root "'"$work"'/foo/mock-root"
+\s*second argument: root "'"$work"'/bar/mock-root"
+\s*Different roots are not supported.'
+rm -rf -- *
+
+# Coercion errors show the correct context
+expectFailure 'toSource { root = ./.; fileset = intersection ./a ./.; }' 'lib.fileset.intersection: first argument \('"$work"'/a\) does not exist.'
+expectFailure 'toSource { root = ./.; fileset = intersection ./. ./b; }' 'lib.fileset.intersection: second argument \('"$work"'/b\) does not exist.'
+
+# The tree of later arguments should not be evaluated if a former argument already excludes all files
+tree=(
+    [a]=0
+)
+checkFileset 'intersection _emptyWithoutBase (_create ./. (abort "This should not be used!"))'
+# We don't have any combinators that can explicitly remove files yet, so we need to rely on internal functions to test this for now
+checkFileset 'intersection (_create ./. { a = null; }) (_create ./. { a = abort "This should not be used!"; })'
+
+# If either side is empty, the result is empty
+tree=(
+    [a]=0
+)
+checkFileset 'intersection _emptyWithoutBase _emptyWithoutBase'
+checkFileset 'intersection _emptyWithoutBase (_create ./. null)'
+checkFileset 'intersection (_create ./. null) _emptyWithoutBase'
+checkFileset 'intersection (_create ./. null) (_create ./. null)'
+
+# If the intersection base paths are not overlapping, the result is empty and has no base path
+mkdir a b c
+touch {a,b,c}/x
+expectEqual 'toSource { root = ./c; fileset = intersection ./a ./b; }' 'toSource { root = ./c; fileset = _emptyWithoutBase; }'
+rm -rf -- *
+
+# If the intersection exists, the resulting base path is the longest of them
+mkdir a
+touch x a/b
+expectEqual 'toSource { root = ./a; fileset = intersection ./a ./.; }' 'toSource { root = ./a; fileset = ./a; }'
+expectEqual 'toSource { root = ./a; fileset = intersection ./. ./a; }' 'toSource { root = ./a; fileset = ./a; }'
+rm -rf -- *
+
+# Also finds the intersection with null'd filesetTree's
+tree=(
+    [a]=0
+    [b]=1
+    [c]=0
+)
+checkFileset 'intersection (_create ./. { a = "regular"; b = "regular"; c = null; }) (_create ./. { a = null; b = "regular"; c = "regular"; })'
+
+# Actually computes the intersection between files
+tree=(
+    [a]=0
+    [b]=0
+    [c]=1
+    [d]=1
+    [e]=0
+    [f]=0
+)
+checkFileset 'intersection (unions [ ./a ./b ./c ./d ]) (unions [ ./c ./d ./e ./f ])'
+
+tree=(
+    [a/x]=0
+    [a/y]=0
+    [b/x]=1
+    [b/y]=1
+    [c/x]=0
+    [c/y]=0
+)
+checkFileset 'intersection ./b ./.'
+checkFileset 'intersection ./b (unions [ ./a/x ./a/y ./b/x ./b/y ./c/x ./c/y ])'
+
+# Complicated case
+tree=(
+    [a/x]=0
+    [a/b/i]=1
+    [c/d/x]=0
+    [c/d/f]=1
+    [c/x]=0
+    [c/e/i]=1
+    [c/e/j]=1
+)
+checkFileset 'intersection (unions [ ./a/b ./c/d ./c/e ]) (unions [ ./a ./c/d/f ./c/e ])'
+
+
 ## Tracing
 
 # The second trace argument is returned
@@ -609,6 +700,10 @@ rm -rf -- *
 # The empty file set without a base also prints as empty
 expectTrace '_emptyWithoutBase' '(empty)'
 expectTrace 'unions [ ]' '(empty)'
+mkdir foo bar
+touch {foo,bar}/x
+expectTrace 'intersection ./foo ./bar' '(empty)'
+rm -rf -- *
 
 # If a directory is fully included, print it as such
 touch a