summary refs log tree commit diff
path: root/pkgs/build-support/references-by-popularity
diff options
context:
space:
mode:
authorGraham Christensen <graham@grahamc.com>2019-03-05 16:37:49 -0500
committerGraham Christensen <graham@grahamc.com>2019-03-05 16:37:52 -0500
commit09362bc3e88f7e78953bcd312b6bd86191ce4d65 (patch)
tree976a8acb6cbc079a2441bec70a50f71c77b3a55c /pkgs/build-support/references-by-popularity
parent54826e7471fb5c8ccfff47fd0389f7a89ec3b67d (diff)
downloadnixpkgs-09362bc3e88f7e78953bcd312b6bd86191ce4d65.tar
nixpkgs-09362bc3e88f7e78953bcd312b6bd86191ce4d65.tar.gz
nixpkgs-09362bc3e88f7e78953bcd312b6bd86191ce4d65.tar.bz2
nixpkgs-09362bc3e88f7e78953bcd312b6bd86191ce4d65.tar.lz
nixpkgs-09362bc3e88f7e78953bcd312b6bd86191ce4d65.tar.xz
nixpkgs-09362bc3e88f7e78953bcd312b6bd86191ce4d65.tar.zst
nixpkgs-09362bc3e88f7e78953bcd312b6bd86191ce4d65.zip
references-by-popularity: cache computation to avoid memory bloat
On very large graphs (14k+ paths), we'd end up with a massive in
memory tree of mostly duplication.

We can safely cache trees and point back to them later, saving
memory.
Diffstat (limited to 'pkgs/build-support/references-by-popularity')
-rw-r--r--pkgs/build-support/references-by-popularity/closure-graph.py34
1 files changed, 30 insertions, 4 deletions
diff --git a/pkgs/build-support/references-by-popularity/closure-graph.py b/pkgs/build-support/references-by-popularity/closure-graph.py
index 145ba1bc9b3..579f3b041fa 100644
--- a/pkgs/build-support/references-by-popularity/closure-graph.py
+++ b/pkgs/build-support/references-by-popularity/closure-graph.py
@@ -338,11 +338,23 @@ class TestMakeLookup(unittest.TestCase):
 #                   /nix/store/tux: {}
 #   }
 # }
+subgraphs_cache = {}
 def make_graph_segment_from_root(root, lookup):
+    global subgraphs_cache
     children = {}
     for ref in lookup[root]:
-        debug("Making graph segments on {}".format(ref))
-        children[ref] = make_graph_segment_from_root(ref, lookup)
+        # make_graph_segment_from_root is a pure function, and will
+        # always return the same result based on a given input. Thus,
+        # cache computation.
+        #
+        # Python's assignment will use a pointer, preventing memory
+        # bloat for large graphs.
+        if ref not in subgraphs_cache:
+            debug("Subgraph Cache miss on {}".format(ref))
+            subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup)
+        else:
+            debug("Subgraph Cache hit on {}".format(ref))
+        children[ref] = subgraphs_cache[ref]
     return children
 
 class TestMakeGraphSegmentFromRoot(unittest.TestCase):
@@ -393,13 +405,27 @@ class TestMakeGraphSegmentFromRoot(unittest.TestCase):
 #   /nix/store/baz: 4
 #   /nix/store/tux: 6
 # ]
+popularity_cache = {}
 def graph_popularity_contest(full_graph):
+    global popularity_cache
     popularity = defaultdict(int)
     for path, subgraph in full_graph.items():
-        debug("Calculating popularity under {}".format(path))
         popularity[path] += 1
-        subcontest = graph_popularity_contest(subgraph)
+        # graph_popularity_contest is a pure function, and will
+        # always return the same result based on a given input. Thus,
+        # cache computation.
+        #
+        # Python's assignment will use a pointer, preventing memory
+        # bloat for large graphs.
+        if path not in popularity_cache:
+            debug("Popularity Cache miss on {}", path)
+            popularity_cache[path] = graph_popularity_contest(subgraph)
+        else:
+            debug("Popularity Cache hit on {}", path)
+
+        subcontest = popularity_cache[path]
         for subpath, subpopularity in subcontest.items():
+            debug("Calculating popularity for {}", subpath)
             popularity[subpath] += subpopularity + 1
 
     return popularity