# pkgs/applications/networking/cluster/hadoop/default.nix
{ lib
, stdenv
, fetchurl
, makeWrapper
, autoPatchelfHook
, jdk8_headless
, jdk11_headless
, bash
, coreutils
, which
, bzip2
, cyrus_sasl
, protobuf
, snappy
, zlib
, zstd
, openssl
, glibc
, nixosTests
, sparkSupport ? true
, spark
}:

with lib;

assert elem stdenv.system [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];

let
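  # common builds one Hadoop release from the official binary tarball.
  # platformAttrs maps Nix systems to the version/hash to fetch, and
  # libPatches is shell code appended to installPhase to fix up the bundled
  # native libraries.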
  common = { pname, platformAttrs, untarDir ? "${pname}-${version}", jdk, openssl ? null, nativeLibs ? [ ], libPatches ? "", tests }:
    stdenv.mkDerivation rec {
      inherit pname jdk libPatches untarDir openssl;
      version = platformAttrs.${stdenv.system}.version or (throw "Unsupported system: ${stdenv.system}");
      src = fetchurl {
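        # Upstream publishes separate "-aarch64" tarballs for ARM systems.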
        url = "mirror://apache/hadoop/common/hadoop-${version}/hadoop-${version}" + optionalString stdenv.isAarch64 "-aarch64" + ".tar.gz";
        inherit (platformAttrs.${stdenv.system}) hash;
      };
      doCheck = true;

      nativeBuildInputs = [ makeWrapper ]
        ++ optionals (stdenv.isLinux && (nativeLibs != [ ] || libPatches != "")) [ autoPatchelfHook ];
      buildInputs = [ openssl ] ++ nativeLibs;

      installPhase = ''
        mkdir -p $out/{lib/${untarDir}/conf,bin,lib}
        mv * $out/lib/${untarDir}
      '' + optionalString stdenv.isLinux ''
        # All releases need container-executor, but some cannot go through
        # autoPatchelfHook because they reference broken OpenSSL versions, so
        # set the ELF interpreter directly.
        patchelf --set-interpreter ${stdenv.cc.bintools.dynamicLinker} $out/lib/${untarDir}/bin/container-executor
      '' + ''
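        # Wrap every extensionless executable in bin/ so it gets sane defaults
        # for JAVA_HOME, HADOOP_HOME and HADOOP_CONF_DIR, preferring
        # /etc/hadoop-conf whenever that directory exists.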
        for n in $(find $out/lib/${untarDir}/bin -type f ! -name "*.*"); do
          makeWrapper "$n" "$out/bin/$(basename $n)"\
            --set-default JAVA_HOME ${jdk.home}\
            --set-default HADOOP_HOME $out/lib/${untarDir}\
            --run "test -d /etc/hadoop-conf && export HADOOP_CONF_DIR=\''${HADOOP_CONF_DIR-'/etc/hadoop-conf/'}"\
            --set-default HADOOP_CONF_DIR $out/lib/${untarDir}/etc/hadoop/\
            --prefix PATH : "${makeBinPath [ bash coreutils which ]}"\
            --prefix JAVA_LIBRARY_PATH : "${makeLibraryPath buildInputs}"
        done
      '' + optionalString sparkSupport ''
        # Add the spark shuffle service jar to YARN
        cp ${spark.src}/yarn/spark-${spark.version}-yarn-shuffle.jar $out/lib/${untarDir}/share/hadoop/yarn/
      '' + libPatches;

      passthru = { inherit tests; };

      meta = recursiveUpdate {
        homepage = "https://hadoop.apache.org/";
        description = "Framework for distributed processing of large data sets across clusters of computers";
        license = licenses.asl20;
        sourceProvenance = with sourceTypes; [ binaryBytecode ];

        longDescription = ''
          The Apache Hadoop software library is a framework that allows for
          the distributed processing of large data sets across clusters of
          computers using a simple programming model. It is designed to
          scale up from single servers to thousands of machines, each
          offering local computation and storage. Rather than rely on
          hardware to deliver high-availability, the library itself is
          designed to detect and handle failures at the application layer,
          so delivering a highly-available service on top of a cluster of
          computers, each of which may be prone to failures.
        '';
        maintainers = with maintainers; [ illustris ];
        platforms = attrNames platformAttrs;
      } (attrByPath [ stdenv.system "meta" ] {} platformAttrs);
    };
in
{
  # Different versions of Hadoop support different Java runtime versions.
  # https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions
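  # For example (a sketch; assumes this file is imported via callPackages, so
  # each attribute gains an .override over the arguments declared above):
  #   hadoop_3_3.override { jdk11_headless = jdk8_headless; }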
  hadoop_3_3 = common rec {
    pname = "hadoop";
    platformAttrs = rec {
      x86_64-linux = {
        version = "3.3.5";
        hash = "sha256-RG4FypL6I6YGF6ixeUbe3kcoGvFQQEFhfLfV9i50JSo=";
      };
      x86_64-darwin = x86_64-linux;
      aarch64-linux = {
        version = "3.3.5";
        hash = "sha256-qcKjbE881isauWBxIv+NY0UFbYit704/Re8Kdl6x1LA=";
      };
      aarch64-darwin = aarch64-linux;
    };
    untarDir = "${pname}-${platformAttrs.${stdenv.system}.version}";
    jdk = jdk11_headless;
    inherit openssl;
    # TODO: Package and add Intel Storage Acceleration Library
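    # Runtime dependencies of the bundled native libraries, resolved by
    # autoPatchelfHook.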
    nativeLibs = [ stdenv.cc.cc.lib protobuf zlib snappy ];
    libPatches = ''
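      # Expose the shared libraries that the prebuilt native code loads at
      # runtime.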
      ln -s ${getLib cyrus_sasl}/lib/libsasl2.so $out/lib/${untarDir}/lib/native/libsasl2.so.2
      ln -s ${getLib openssl}/lib/libcrypto.so $out/lib/${untarDir}/lib/native/
      ln -s ${getLib zlib}/lib/libz.so.1 $out/lib/${untarDir}/lib/native/
      ln -s ${getLib zstd}/lib/libzstd.so.1 $out/lib/${untarDir}/lib/native/
      ln -s ${getLib bzip2}/lib/libbz2.so.1 $out/lib/${untarDir}/lib/native/
    '' + optionalString stdenv.isLinux ''
      # libjvm.so for Java >=11
      patchelf --add-rpath ${jdk.home}/lib/server $out/lib/${untarDir}/lib/native/libnativetask.so.1.0.0
      # Java 8 has libjvm.so at a different path
      patchelf --add-rpath ${jdk.home}/jre/lib/amd64/server $out/lib/${untarDir}/lib/native/libnativetask.so.1.0.0
      # NixOS/nixpkgs#193370
      # This workaround is needed to use protobuf 3.19
      patchelf --replace-needed libprotobuf.so.18 libprotobuf.so $out/lib/${untarDir}/lib/native/libhdfspp.so
    '';
    tests = nixosTests.hadoop;
  };
  hadoop_3_2 = common rec {
    pname = "hadoop";
    platformAttrs.x86_64-linux = {
      version = "3.2.4";
      hash = "sha256-qt2gpMr+NHuiVR+/zFRzRyRKG725/ZNBIM69z9J9wNw=";
    };
    jdk = jdk8_headless;
    # Not using the native libraries because of the broken openssl_1_0_2
    # dependency; they can be added back manually, as sketched below.
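    # A manual override sketch (hypothetical: assumes an openssl_1_0_2 package
    # is in scope; adding it to buildInputs alone may not be sufficient):
    #   hadoop_3_2.overrideAttrs (old: {
    #     buildInputs = (old.buildInputs or [ ]) ++ [ openssl_1_0_2 ];
    #   })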
    tests = nixosTests.hadoop_3_2;
  };
  hadoop2 = common rec {
    pname = "hadoop";
    platformAttrs.x86_64-linux = {
      version = "2.10.2";
      hash = "sha256-xhA4zxqIRGNhIeBnJO9dLKf/gx/Bq+uIyyZwsIafEyo=";
    };
    jdk = jdk8_headless;
    tests = nixosTests.hadoop2;
  };
}
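
# Usage sketch (assumption: these attributes are exposed in nixpkgs as
# pkgs.hadoop_3_3, pkgs.hadoop_3_2 and pkgs.hadoop2):
#
#   nix-shell -p hadoop_3_3 --run "hadoop version"
#
# The wrappers above default HADOOP_CONF_DIR to the bundled etc/hadoop, so the
# CLI tools run out of the box; create /etc/hadoop-conf (or export
# HADOOP_CONF_DIR) to point them at real cluster configuration.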