summary refs log tree commit diff
path: root/pkgs/development/python-modules/pyarrow/default.nix
blob: 141d90a5a971f9c6a241bf48162ef5cd58f90447 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Python bindings for Apache Arrow. The sources are the `python/` subdirectory
# of the arrow-cpp source tree, and the version is inherited from arrow-cpp.
{ lib
, stdenv
, buildPythonPackage
, python
, isPy3k
, arrow-cpp
, cffi
, cloudpickle
, cmake
, cython
, fsspec
, hypothesis
, numpy
, pandas
, pytestCheckHook
, pytest-lazy-fixture
, pkg-config
, scipy
, setuptools-scm
, six
}:

let
  # Render a boolean as the integer 1 or 0 for the PYARROW_WITH_* variables.
  zero_or_one = cond: if cond then 1 else 0;

  # arrow-cpp overridden to use the same Python interpreter as this package.
  _arrow-cpp = arrow-cpp.override { python3 = python; };
in

buildPythonPackage rec {
  pname = "pyarrow";
  disabled = !isPy3k;

  # Version and source tarball always track the arrow-cpp derivation.
  inherit (_arrow-cpp) version src;

  sourceRoot = "apache-arrow-${version}/python";

  nativeBuildInputs = [ cmake cython pkg-config setuptools-scm ];
  propagatedBuildInputs = [ numpy six cloudpickle scipy fsspec cffi ];
  checkInputs = [
    hypothesis
    pandas
    pytestCheckHook
    pytest-lazy-fixture
  ];

  PYARROW_BUILD_TYPE = "release";

  # Optional components. Flight and S3 follow the corresponding arrow-cpp
  # options; Plasma is switched off on Darwin.
  PYARROW_WITH_DATASET = zero_or_one true;
  PYARROW_WITH_FLIGHT = zero_or_one _arrow-cpp.enableFlight;
  PYARROW_WITH_HDFS = zero_or_one true;
  PYARROW_WITH_PARQUET = zero_or_one true;
  PYARROW_WITH_PLASMA = zero_or_one (!stdenv.isDarwin);
  PYARROW_WITH_S3 = zero_or_one _arrow-cpp.enableS3;

  # Sets the install RPATH to arrow-cpp's lib directory (presumably so the
  # extension modules can locate the Arrow shared libraries at runtime).
  PYARROW_CMAKE_OPTIONS = [
    "-DCMAKE_INSTALL_RPATH=${ARROW_HOME}/lib"
  ];

  ARROW_HOME = _arrow-cpp;
  PARQUET_HOME = _arrow-cpp;

  # Empty string when checks are disabled.
  ARROW_TEST_DATA = lib.optionalString doCheck _arrow-cpp.ARROW_TEST_DATA;

  doCheck = true;
  # cmake is listed in nativeBuildInputs, but its configure hook is disabled
  # here (presumably because the Python build invokes cmake itself).
  dontUseCmakeConfigure = true;

  # Hand the Nix core count to the pyarrow build.
  preBuild = ''
    export PYARROW_PARALLEL=$NIX_BUILD_CORES
  '';

  pytestFlagsArray = [
    # Deselect a single test because pyarrow prints a 2-line error message where
    # only a single line is expected. The additional line of output comes from
    # the glog library which is an optional dependency of arrow-cpp that is
    # enabled in nixpkgs.
    # Upstream Issue: https://issues.apache.org/jira/browse/ARROW-11393
    "--deselect=pyarrow/tests/test_memory.py::test_env_var"
    # These tests require access to S3 via the internet.
    "--deselect=pyarrow/tests/test_fs.py::test_resolve_s3_region"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws_region_selection"
    "--deselect=pyarrow/tests/test_fs.py::test_s3_options"
  ] ++ lib.optionals stdenv.isDarwin [
    # Requires loopback networking
    "--deselect=pyarrow/tests/test_ipc.py::test_socket_"
  ];

  dontUseSetuptoolsCheck = true;
  # Remove everything under pyarrow/ except the tests directory before running
  # the test suite (presumably so the tests import the installed package rather
  # than the unbuilt source tree).
  preCheck = ''
    shopt -s extglob
    rm -r pyarrow/!(tests)
  '';

  # Optional submodules are only checked when the matching PYARROW_WITH_*
  # switch above is enabled; importing them otherwise would fail the check.
  pythonImportsCheck = [ "pyarrow" ] ++ map (module: "pyarrow.${module}") ([
    "compute"
    "csv"
    "dataset"
    "feather"
    "fs"
    "hdfs"
    "json"
    "parquet"
  ] ++ lib.optionals _arrow-cpp.enableFlight [ "flight" ]
    ++ lib.optionals (!stdenv.isDarwin) [ "plasma" ]);

  meta = with lib; {
    description = "A cross-language development platform for in-memory data";
    homepage = "https://arrow.apache.org/";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ veprbl cpcloud ];
  };
}