summary refs log tree commit diff
path: root/pkgs/development/tools/parsing/tree-sitter/update.nix
diff options
context:
space:
mode:
Diffstat (limited to 'pkgs/development/tools/parsing/tree-sitter/update.nix')
-rw-r--r--pkgs/development/tools/parsing/tree-sitter/update.nix213
1 files changed, 187 insertions, 26 deletions
diff --git a/pkgs/development/tools/parsing/tree-sitter/update.nix b/pkgs/development/tools/parsing/tree-sitter/update.nix
index 2a3575a44d2..55237b298d2 100644
--- a/pkgs/development/tools/parsing/tree-sitter/update.nix
+++ b/pkgs/development/tools/parsing/tree-sitter/update.nix
@@ -1,66 +1,227 @@
-{ writeShellScript, nix-prefetch-git
+{ writeShellScript, nix-prefetch-git, formats, lib
 , curl, jq, xe
 , src }:
 
+# Grammar list:
+# https://github.com/tree-sitter/tree-sitter/blob/master/docs/index.md
+
 let
-  # print all the grammar names mentioned in the fetch-fixtures script
-  getGrammarNames = writeShellScript "get-grammars.sh" ''
+  # Grammars we want to fetch from the tree-sitter github orga
+  knownTreeSitterOrgGrammarRepos = [
+    "tree-sitter-javascript"
+    "tree-sitter-c"
+    "tree-sitter-swift"
+    "tree-sitter-json"
+    "tree-sitter-cpp"
+    "tree-sitter-ruby"
+    "tree-sitter-go"
+    "tree-sitter-c-sharp"
+    "tree-sitter-python"
+    "tree-sitter-typescript"
+    "tree-sitter-rust"
+    "tree-sitter-bash"
+    "tree-sitter-php"
+    "tree-sitter-java"
+    "tree-sitter-scala"
+    "tree-sitter-ocaml"
+    "tree-sitter-julia"
+    "tree-sitter-agda"
+    "tree-sitter-fluent"
+    "tree-sitter-html"
+    "tree-sitter-haskell"
+    "tree-sitter-regex"
+    "tree-sitter-css"
+    "tree-sitter-verilog"
+    "tree-sitter-jsdoc"
+    "tree-sitter-ql"
+    "tree-sitter-embedded-template"
+    "tree-sitter-tsq"
+  ];
+  knownTreeSitterOrgGrammarReposJson = jsonFile "known-tree-sitter-org-grammar-repos" knownTreeSitterOrgGrammarRepos;
+
+  # repos of the tree-sitter github orga we want to ignore (not grammars)
+  ignoredTreeSitterOrgRepos = [
+    "tree-sitter"
+    "tree-sitter-cli"
+    # this is the haskell language bindings, tree-sitter-haskell is the grammar
+    "haskell-tree-sitter"
+    # this is the ruby language bindings, tree-sitter-ruby is the grammar
+    "ruby-tree-sitter"
+    # this is the (unmaintained) rust language bindings, tree-sitter-rust is the grammar
+    "rust-tree-sitter"
+    # this is the nodejs language bindings, tree-sitter-javascript is the grammar
+    "node-tree-sitter"
+    # this is the python language bindings, tree-sitter-python is the grammar
+    "py-tree-sitter"
+    # afl fuzzing for tree sitter
+    "afl-tree-sitter"
+    # archived
+    "highlight-schema"
+    # website
+    "tree-sitter.github.io"
+    # not maintained
+    "tree-sitter-razor"
+    # rust library for constructing arbitrary graph structures from source code
+    "tree-sitter-graph"
+  ];
+  ignoredTreeSitterOrgReposJson = jsonFile "ignored-tree-sitter-org-repos" ignoredTreeSitterOrgRepos;
+
+  # Additional grammars that are not in the official github orga.
+  # If you need a grammar that already exists in the official orga,
+  # make sure to give it a different name.
+  otherGrammars = {
+    "tree-sitter-nix" = {
+      orga = "cstrahan";
+      repo = "tree-sitter-nix";
+    };
+    "tree-sitter-latex" = {
+      orga = "latex-lsp";
+      repo = "tree-sitter-latex";
+    };
+    "tree-sitter-lua" = {
+      orga = "nvim-treesitter";
+      repo = "tree-sitter-lua";
+    };
+    "tree-sitter-fennel" = {
+      orga = "travonted";
+      repo = "tree-sitter-fennel";
+    };
+    "tree-sitter-markdown" = {
+      orga = "ikatyang";
+      repo = "tree-sitter-markdown";
+    };
+    "tree-sitter-svelte" = {
+      orga = "Himujjal";
+      repo = "tree-sitter-svelte";
+    };
+    "tree-sitter-yaml" = {
+      orga = "ikatyang";
+      repo = "tree-sitter-yaml";
+    };
+    "tree-sitter-toml" = {
+      orga = "ikatyang";
+      repo = "tree-sitter-toml";
+    };
+    "tree-sitter-zig" = {
+      orga = "GrayJack";
+      repo = "tree-sitter-zig";
+    };
+  };
+
+  allGrammars =
+    let
+      treeSitterOrgaGrammars =
+        lib.listToAttrs (map (repo:
+          { name = repo;
+            value = {
+              orga = "tree-sitter";
+              inherit repo;
+            };
+          })
+        knownTreeSitterOrgGrammarRepos);
+
+    in
+      mergeAttrsUnique otherGrammars treeSitterOrgaGrammars;
+
+  # TODO: move to lib
+  mergeAttrsUnique = left: right:
+    let intersect = lib.intersectLists (lib.attrNames left) (lib.attrNames right); in
+    assert
+      lib.assertMsg (intersect == [])
+        (lib.concatStringsSep "\n" [
+          "mergeAttrsUnique: keys in attrset overlapping:"
+          "left: ${lib.generators.toPretty {} (lib.getAttrs intersect left)}"
+          "right: ${lib.generators.toPretty {} (lib.getAttrs intersect right)}"
+        ]);
+    left // right;
+
+
+
+  jsonFile = name: val: (formats.json {}).generate name val;
+
+  # check the tree-sitter orga repos
+  checkTreeSitterRepos = writeShellScript "get-grammars.sh" ''
     set -euo pipefail
-    sed -ne 's/^fetch_grammar \(\S*\).*$/\1/p' \
-      ${src}/script/fetch-fixtures
+    res=$(${jq}/bin/jq \
+      --slurpfile known "${knownTreeSitterOrgGrammarReposJson}" \
+      --slurpfile ignore "${ignoredTreeSitterOrgReposJson}" \
+      '. - ($known[0] + $ignore[0])' \
+      )
+    if [ ! "$res" == "[]" ]; then
+      echo "These repositories are neither known nor ignored:" 1>&2
+      echo "$res" 1>&2
+      exit 1
+    fi
   '';
 
   # TODO
   urlEscape = x: x;
-  # TODO
-  urlEscapeSh = writeShellScript "escape-url" ''printf '%s' "$1"'';
 
   # generic bash script to find the latest github release for a repo
-  latestGithubRelease = { owner }: writeShellScript "latest-github-release" ''
+  latestGithubRelease = { orga, repo }: writeShellScript "latest-github-release" ''
     set -euo pipefail
-    repo="$1"
     res=$(${curl}/bin/curl \
       --silent \
-      "https://api.github.com/repos/${urlEscape owner}/$(${urlEscapeSh} "$repo")/releases/latest")
-    if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message')" =~ "rate limit" ]]; then
+      "https://api.github.com/repos/${urlEscape orga}/${urlEscape repo}/releases/latest")
+    if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message?')" =~ "rate limit" ]]; then
       echo "rate limited" >&2
     fi
     release=$(printf "%s" "$res" | ${jq}/bin/jq '.tag_name')
     # github sometimes returns an empty list even tough there are releases
     if [ "$release" = "null" ]; then
-      echo "uh-oh, latest for $repo is not there, using HEAD" >&2
+      echo "uh-oh, latest for ${orga + "/" + repo} is not there, using HEAD" >&2
       release="HEAD"
     fi
     echo "$release"
   '';
 
+  # find the latest repos of a github organization
+  latestGithubRepos = { orga }: writeShellScript "latest-github-repos" ''
+    set -euo pipefail
+    res=$(${curl}/bin/curl \
+      --silent \
+      'https://api.github.com/orgs/${urlEscape orga}/repos?per_page=100')
+
+    if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message?')" =~ "rate limit" ]]; then
+      echo "rate limited" >&2   #
+    fi
+
+    printf "%s" "$res" | ${jq}/bin/jq 'map(.name)' \
+      || echo "failed $res"
+  '';
+
   # update one tree-sitter grammar repo and print their nix-prefetch-git output
-  updateGrammar = { owner }: writeShellScript "update-grammar.sh" ''
+  updateGrammar = { orga, repo }: writeShellScript "update-grammar.sh" ''
     set -euo pipefail
-    repo="$1"
-    latest="$(${latestGithubRelease { inherit owner; }} "$repo")"
-    echo "Fetching latest release ($latest) of $repo …" >&2
+    latest="$(${latestGithubRelease { inherit orga repo; }})"
+    echo "Fetching latest release ($latest) of ${repo} …" >&2
     ${nix-prefetch-git}/bin/nix-prefetch-git \
       --quiet \
       --no-deepClone \
-      --url "https://github.com/${urlEscape owner}/$(${urlEscapeSh} "$repo")" \
+      --url "https://github.com/${urlEscape orga}/${urlEscape repo}" \
       --rev "$latest"
     '';
 
+  foreachSh = attrs: f:
+    lib.concatMapStringsSep "\n" f
+    (lib.mapAttrsToList (k: v: { name = k; } // v) attrs);
+
   update-all-grammars = writeShellScript "update-all-grammars.sh" ''
     set -euo pipefail
-    grammarNames=$(${getGrammarNames})
+    echo "fetching list of grammars" 1>&2
+    treeSitterRepos=$(${latestGithubRepos { orga = "tree-sitter"; }})
+    echo "checking the tree-sitter repo list against the grammars we know" 1>&2
+    printf '%s' "$treeSitterRepos" | ${checkTreeSitterRepos}
     outputDir="${toString ./.}/grammars"
+    echo "writing files to $outputDir" 1>&2
     mkdir -p "$outputDir"
-    updateCommand=$(printf \
-      '${updateGrammar { owner = "tree-sitter"; }} "$1" > "%s/$1.json"' \
-      "$outputDir")
-    printf '%s' "$grammarNames" \
-      | ${xe}/bin/xe printf "tree-sitter-%s\n" {} \
-      | ${xe}/bin/xe -j2 -s "$updateCommand"
+    ${foreachSh allGrammars
+      ({name, orga, repo}: ''${updateGrammar { inherit orga repo; }} > $outputDir/${name}.json'')}
     ( echo "{"
-      printf '%s' "$grammarNames" \
-        | ${xe}/bin/xe -s 'printf "  %s = (builtins.fromJSON (builtins.readFile ./tree-sitter-%s.json));\n" "$1" "$1"'
+      ${foreachSh allGrammars
+        ({name, ...}: ''
+           # indentation hack
+             printf "  %s = (builtins.fromJSON (builtins.readFile ./%s.json));\n" "${name}" "${name}"'')}
       echo "}" ) \
       > "$outputDir/default.nix"
   '';