Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/arrow/array.py: 13%
397 statements
« prev ^ index » next — coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from typing import (
4 TYPE_CHECKING,
5 Any,
6 TypeVar,
7)
9import numpy as np
11from pandas._libs import lib
12from pandas._typing import (
13 Dtype,
14 PositionalIndexer,
15 TakeIndexer,
16 npt,
17)
18from pandas.compat import (
19 pa_version_under1p01,
20 pa_version_under2p0,
21 pa_version_under3p0,
22 pa_version_under4p0,
23 pa_version_under5p0,
24 pa_version_under6p0,
25 pa_version_under7p0,
26)
27from pandas.util._decorators import (
28 deprecate_nonkeyword_arguments,
29 doc,
30)
32from pandas.core.dtypes.common import (
33 is_array_like,
34 is_bool_dtype,
35 is_integer,
36 is_integer_dtype,
37 is_scalar,
38)
39from pandas.core.dtypes.missing import isna
41from pandas.core.algorithms import resolve_na_sentinel
42from pandas.core.arraylike import OpsMixin
43from pandas.core.arrays.base import ExtensionArray
44from pandas.core.indexers import (
45 check_array_indexer,
46 unpack_tuple_and_ellipses,
47 validate_indices,
48)
if not pa_version_under1p01:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
    from pandas.core.arrays.arrow.dtype import ArrowDtype

    # Comparison kernels: available in every supported pyarrow version.
    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    # Kleene-logic kernels require pyarrow >= 2.0; the reflected ("r")
    # variants simply swap the operands.
    if pa_version_under2p0:
        ARROW_LOGICAL_FUNCS = {
            "and": NotImplemented,
            "rand": NotImplemented,
            "or": NotImplemented,
            "ror": NotImplemented,
            "xor": NotImplemented,
            "rxor": NotImplemented,
        }
    else:
        ARROW_LOGICAL_FUNCS = {
            "and": pc.and_kleene,
            "rand": lambda x, y: pc.and_kleene(y, x),
            "or": pc.or_kleene,
            "ror": lambda x, y: pc.or_kleene(y, x),
            "xor": pc.xor,
            "rxor": lambda x, y: pc.xor(y, x),
        }
79 def cast_for_truediv(
80 arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
81 ) -> pa.ChunkedArray:
82 # Ensure int / int -> float mirroring Python/Numpy behavior
83 # as pc.divide_checked(int, int) -> int
84 if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
85 pa_object.type
86 ):
87 return arrow_array.cast(pa.float64())
88 return arrow_array
90 def floordiv_compat(
91 left: pa.ChunkedArray | pa.Array | pa.Scalar,
92 right: pa.ChunkedArray | pa.Array | pa.Scalar,
93 ) -> pa.ChunkedArray:
94 # Ensure int // int -> int mirroring Python/Numpy behavior
95 # as pc.floor(pc.divide_checked(int, int)) -> float
96 result = pc.floor(pc.divide_checked(left, right))
97 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
98 result = result.cast(left.type)
99 return result
101 ARROW_ARITHMETIC_FUNCS = {
102 "add": NotImplemented if pa_version_under2p0 else pc.add_checked,
103 "radd": NotImplemented
104 if pa_version_under2p0
105 else lambda x, y: pc.add_checked(y, x),
106 "sub": NotImplemented if pa_version_under2p0 else pc.subtract_checked,
107 "rsub": NotImplemented
108 if pa_version_under2p0
109 else lambda x, y: pc.subtract_checked(y, x),
110 "mul": NotImplemented if pa_version_under2p0 else pc.multiply_checked,
111 "rmul": NotImplemented
112 if pa_version_under2p0
113 else lambda x, y: pc.multiply_checked(y, x),
114 "truediv": NotImplemented
115 if pa_version_under2p0
116 else lambda x, y: pc.divide_checked(cast_for_truediv(x, y), y),
117 "rtruediv": NotImplemented
118 if pa_version_under2p0
119 else lambda x, y: pc.divide_checked(y, cast_for_truediv(x, y)),
120 "floordiv": NotImplemented
121 if pa_version_under2p0
122 else lambda x, y: floordiv_compat(x, y),
123 "rfloordiv": NotImplemented
124 if pa_version_under2p0
125 else lambda x, y: floordiv_compat(y, x),
126 "mod": NotImplemented,
127 "rmod": NotImplemented,
128 "divmod": NotImplemented,
129 "rdivmod": NotImplemented,
130 "pow": NotImplemented if pa_version_under4p0 else pc.power_checked,
131 "rpow": NotImplemented
132 if pa_version_under4p0
133 else lambda x, y: pc.power_checked(y, x),
134 }
if TYPE_CHECKING:
    # Only needed for annotations; avoids a circular import at runtime.
    from pandas import Series

# TypeVar so methods on subclasses are annotated to return the subclass type.
ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
def to_pyarrow_type(
    dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
    """
    Convert dtype to a pyarrow type instance.

    Returns None when no dtype was supplied so callers can fall back to
    pyarrow's own type inference.
    """
    if isinstance(dtype, ArrowDtype):
        return dtype.pyarrow_dtype
    if isinstance(dtype, pa.DataType):
        return dtype
    if dtype:
        # Accepts numpy dtypes and plain python types too.
        return pa.from_numpy_dtype(dtype)
    return None
class ArrowExtensionArray(OpsMixin, ExtensionArray):
    """
    Pandas ExtensionArray backed by a PyArrow ChunkedArray.

    .. warning::

       ArrowExtensionArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    Most methods are implemented using `pyarrow compute functions. <https://arrow.apache.org/docs/python/api/compute.html>`__
    Some methods may either raise an exception or raise a ``PerformanceWarning`` if an
    associated compute function is not available based on the installed version of PyArrow.

    Please install the latest version of PyArrow to enable the best functionality and avoid
    potential bugs in prior versions of PyArrow.

    Examples
    --------
    Create an ArrowExtensionArray with :func:`pandas.array`:

    >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
    <ArrowExtensionArray>
    [1, 1, <NA>]
    Length: 3, dtype: int64[pyarrow]
    """  # noqa: E501 (http link too long)

    # Backing buffer (always a ChunkedArray) and its pandas-facing dtype.
    _data: pa.ChunkedArray
    _dtype: ArrowDtype
207 def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
208 if pa_version_under1p01:
209 msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
210 raise ImportError(msg)
211 if isinstance(values, pa.Array):
212 self._data = pa.chunked_array([values])
213 elif isinstance(values, pa.ChunkedArray):
214 self._data = values
215 else:
216 raise ValueError(
217 f"Unsupported type '{type(values)}' for ArrowExtensionArray"
218 )
219 self._dtype = ArrowDtype(self._data.type)
221 @classmethod
222 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
223 """
224 Construct a new ExtensionArray from a sequence of scalars.
225 """
226 pa_dtype = to_pyarrow_type(dtype)
227 is_cls = isinstance(scalars, cls)
228 if is_cls or isinstance(scalars, (pa.Array, pa.ChunkedArray)):
229 if is_cls:
230 scalars = scalars._data
231 if pa_dtype:
232 scalars = scalars.cast(pa_dtype)
233 return cls(scalars)
234 else:
235 return cls(
236 pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True))
237 )
    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy=False
    ):
        """
        Construct a new ExtensionArray from a sequence of strings.

        The strings are parsed with the pandas converter that matches the
        target pyarrow type, then handed to :meth:`_from_sequence`.

        Raises
        ------
        NotImplementedError
            If no string parser exists for the requested pyarrow type.
        """
        pa_type = to_pyarrow_type(dtype)
        if pa_type is None:
            # Let pyarrow try to infer or raise
            scalars = strings
        elif pa.types.is_timestamp(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise")
        elif pa.types.is_date(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            # Parse as datetimes, then keep only the date component.
            scalars = to_datetime(strings, errors="raise").date
        elif pa.types.is_duration(pa_type):
            from pandas.core.tools.timedeltas import to_timedelta

            scalars = to_timedelta(strings, errors="raise")
        elif pa.types.is_time(pa_type):
            from pandas.core.tools.times import to_time

            # "coerce" to allow "null times" (None) to not raise
            scalars = to_time(strings, errors="coerce")
        elif pa.types.is_boolean(pa_type):
            from pandas.core.arrays import BooleanArray

            scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy()
        elif (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
        return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
    def __getitem__(self, item: PositionalIndexer):
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                # Empty indexer: build an empty array of the right arrow type.
                # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
                    pa_dtype = pa.string()
                else:
                    pa_dtype = self._dtype.pyarrow_dtype
                return type(self)(pa.chunked_array([], type=pa_dtype))
            elif is_integer_dtype(item.dtype):
                return self.take(item)
            elif is_bool_dtype(item.dtype):
                return type(self)(self._data.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )
        elif isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        # error: Non-overlapping identity check (left operand type:
        # "Union[Union[int, integer[Any]], Union[slice, List[int],
        # ndarray[Any, Any]]]", right operand type: "ellipsis")
        if item is Ellipsis:  # type: ignore[comparison-overlap]
            # TODO: should be handled by pyarrow?
            item = slice(None)

        if is_scalar(item) and not is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        value = self._data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            # A pyarrow scalar: unbox to Python, mapping null to the dtype's
            # NA sentinel.
            scalar = value.as_py()
            if scalar is None:
                return self._dtype.na_value
            else:
                return scalar
357 def __arrow_array__(self, type=None):
358 """Convert myself to a pyarrow ChunkedArray."""
359 return self._data
361 def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
362 if pa_version_under2p0:
363 raise NotImplementedError("__invert__ not implement for pyarrow < 2.0")
364 return type(self)(pc.invert(self._data))
366 def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
367 return type(self)(pc.negate_checked(self._data))
369 def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
370 return type(self)(self._data)
372 def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
373 return type(self)(pc.abs_checked(self._data))
    def _cmp_method(self, other, op):
        """
        Element-wise comparison dispatching to ARROW_CMP_FUNCS; returns a
        pandas BooleanArray.  Falls back to a numpy comparison when pyarrow
        cannot compare against the scalar type.
        """
        from pandas.arrays import BooleanArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, other)
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                # pyarrow can't compare against this scalar: compare the
                # valid entries in numpy space and mask out the NAs.
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                return BooleanArray(result, mask)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )

        # Convert the pyarrow boolean result to numpy before boxing.
        if pa_version_under2p0:
            result = result.to_pandas().values
        else:
            result = result.to_numpy()
        return BooleanArray._from_sequence(result)
403 def _evaluate_op_method(self, other, op, arrow_funcs):
404 pc_func = arrow_funcs[op.__name__]
405 if pc_func is NotImplemented:
406 raise NotImplementedError(f"{op.__name__} not implemented.")
407 if isinstance(other, ArrowExtensionArray):
408 result = pc_func(self._data, other._data)
409 elif isinstance(other, (np.ndarray, list)):
410 result = pc_func(self._data, pa.array(other, from_pandas=True))
411 elif is_scalar(other):
412 result = pc_func(self._data, pa.scalar(other))
413 else:
414 raise NotImplementedError(
415 f"{op.__name__} not implemented for {type(other)}"
416 )
417 return type(self)(result)
    def _logical_method(self, other, op):
        # Kleene-logic ops (and/or/xor and their reflections).
        return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

    def _arith_method(self, other, op):
        # Checked arithmetic ops (add/sub/mul/div/pow and their reflections).
        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
    def equals(self, other) -> bool:
        """Return True when *other* is an ArrowExtensionArray with equal data."""
        if not isinstance(other, ArrowExtensionArray):
            return False
        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
        # TODO: is this documented somewhere?
        return self._data == other._data
    @property
    def dtype(self) -> ArrowDtype:
        """
        An instance of 'ExtensionDtype'.
        """
        return self._dtype

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._data.nbytes

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._data)

    @property
    def _hasna(self) -> bool:
        # True when the backing ChunkedArray contains at least one null.
        return self._data.null_count > 0
    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        if pa_version_under2p0:
            # Old pyarrow lacks to_numpy on this result; go through pandas.
            return self._data.is_null().to_pandas().values
        else:
            return self._data.is_null().to_numpy()
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
    def argsort(
        self,
        ascending: bool = True,
        kind: str = "quicksort",
        na_position: str = "last",
        *args,
        **kwargs,
    ) -> np.ndarray:
        """
        Return the indices that would sort this array.

        ``kind`` is accepted for API compatibility; the pyarrow path picks
        its own algorithm.  Unknown ``na_position`` values fall through to
        the (slower) base-class implementation.
        """
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None or pa_version_under7p0:
            # Although pc.array_sort_indices exists in version 6
            # there's a bug that affects the pa.ChunkedArray backing
            # https://issues.apache.org/jira/browse/ARROW-12042
            fallback_performancewarning("7")
            return super().argsort(
                ascending=ascending, kind=kind, na_position=na_position
            )

        result = pc.array_sort_indices(
            self._data, order=order, null_placement=null_placement
        )
        if pa_version_under2p0:
            np_result = result.to_pandas().values
        else:
            np_result = result.to_numpy()
        # ExtensionArray.argsort contract: platform-int positions.
        return np_result.astype(np.intp, copy=False)
    def _argmin_max(self, skipna: bool, method: str) -> int:
        """Shared body of argmin/argmax; ``method`` is "min" or "max"."""
        if self._data.length() in (0, self._data.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        if pa_version_under6p0:
            raise NotImplementedError(
                f"arg{method} only implemented for pyarrow version >= 6.0"
            )

        # Find the extreme value, then locate its first position.
        value = getattr(pc, method)(self._data, skip_nulls=skipna)
        return pc.index(self._data, value).as_py()
    def argmin(self, skipna: bool = True) -> int:
        # Position of the minimum; edge cases handled in _argmin_max.
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        # Position of the maximum; edge cases handled in _argmin_max.
        return self._argmin_max(skipna, "max")
    def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._data)
535 def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
536 """
537 Return ArrowExtensionArray without NA values.
539 Returns
540 -------
541 ArrowExtensionArray
542 """
543 if pa_version_under6p0:
544 fallback_performancewarning(version="6")
545 return super().dropna()
546 else:
547 return type(self)(pc.drop_null(self._data))
    def isin(self, values) -> npt.NDArray[np.bool_]:
        """Boolean mask of which elements of self appear in *values*."""
        if pa_version_under2p0:
            fallback_performancewarning(version="2")
            return super().isin(values)

        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
        # for null values, so we short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        kwargs = {}
        if pa_version_under3p0:
            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
            # with unexpected keyword argument in pyarrow 3.0.0+
            kwargs["skip_null"] = True

        result = pc.is_in(
            self._data, value_set=pa.array(values, from_pandas=True), **kwargs
        )
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        if pa_version_under2p0:
            # Old pyarrow lacks ChunkedArray.to_numpy; round-trip via pandas.
            values = self._data.to_pandas().values
        else:
            values = self._data.to_numpy()
        return values, self.dtype.na_value
    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        na_sentinel: int | lib.NoDefault = lib.no_default,
        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    ) -> tuple[np.ndarray, ExtensionArray]:
        resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
        # Dictionary-encode so the dictionary becomes the uniques and the
        # per-element dictionary indices become the codes.
        if pa_version_under4p0:
            encoded = self._data.dictionary_encode()
        else:
            null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
            encoded = self._data.dictionary_encode(null_encoding=null_encoding)
        indices = pa.chunked_array(
            [c.indices for c in encoded.chunks], type=encoded.type.index_type
        ).to_pandas()
        if indices.dtype.kind == "f":
            # Floating indices mean nulls were present; replace NaN codes
            # with the sentinel (or -1 when uniques will include NA).
            indices[np.isnan(indices)] = (
                resolved_na_sentinel if resolved_na_sentinel is not None else -1
            )
        indices = indices.astype(np.int64, copy=False)

        if encoded.num_chunks:
            uniques = type(self)(encoded.chunk(0).dictionary)
            if resolved_na_sentinel is None and pa_version_under4p0:
                # TODO: share logic with BaseMaskedArray.factorize
                # Insert na with the proper code
                na_mask = indices.values == -1
                na_index = na_mask.argmax()
                if na_mask[na_index]:
                    # na_code is the position NA takes in sorted-first-seen
                    # order; shift later codes up to make room for it.
                    na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
                    uniques = uniques.insert(na_code, self.dtype.na_value)
                    indices[indices >= na_code] += 1
                    indices[indices == -1] = na_code
        else:
            uniques = type(self)(pa.array([], type=encoded.type.value_type))

        return indices.values, uniques
    def reshape(self, *args, **kwargs):
        # The backing pyarrow.ChunkedArray is strictly one-dimensional, so
        # reshape is rejected outright regardless of the requested shape.
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )
    def take(
        self,
        indices: TakeIndexer,
        allow_fill: bool = False,
        fill_value: Any = None,
    ) -> ArrowExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int or one-dimensional np.ndarray of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              other negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Sequence[int]", variable has type "ndarray")
            indices_array = indices  # type: ignore[assignment]

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                # Masked positions come back as nulls from pa take.
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    # Nulls already are the NA fill; nothing more to do.
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._data.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))
734 def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
735 """
736 Compute the ArrowExtensionArray of unique values.
738 Returns
739 -------
740 ArrowExtensionArray
741 """
742 if pa_version_under2p0:
743 fallback_performancewarning(version="2")
744 return super().unique()
745 else:
746 return type(self)(pc.unique(self._data))
    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import (
            Index,
            Series,
        )

        # StructArray of (value, count) pairs.
        vc = self._data.value_counts()

        values = vc.field(0)
        counts = vc.field(1)
        if dropna and self._data.null_count > 0:
            # Drop the row counting nulls.
            mask = values.is_valid()
            values = values.filter(mask)
            counts = counts.filter(mask)

        # No missing values so we can adhere to the interface and return a numpy array.
        counts = np.array(counts)

        index = Index(type(self)(values))

        return Series(counts, index=index).astype("Int64")
786 @classmethod
787 def _concat_same_type(
788 cls: type[ArrowExtensionArrayT], to_concat
789 ) -> ArrowExtensionArrayT:
790 """
791 Concatenate multiple ArrowExtensionArrays.
793 Parameters
794 ----------
795 to_concat : sequence of ArrowExtensionArrays
797 Returns
798 -------
799 ArrowExtensionArray
800 """
801 chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
802 arr = pa.chunked_array(chunks)
803 return cls(arr)
    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        if name == "sem":
            # pyarrow has no sem kernel: compose it as stddev / sqrt(n - ddof).
            def pyarrow_meth(data, skipna, **kwargs):
                numerator = pc.stddev(data, skip_nulls=skipna, **kwargs)
                denominator = pc.sqrt_checked(
                    pc.subtract_checked(
                        pc.count(self._data, skip_nulls=skipna), kwargs["ddof"]
                    )
                )
                return pc.divide_checked(numerator, denominator)

        else:
            # Map pandas reduction names onto the pyarrow kernel names.
            pyarrow_name = {
                "median": "approximate_median",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)
        try:
            result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if pc.is_null(result).as_py():
            # Null reduction result maps to the dtype's NA sentinel.
            return self.dtype.na_value
        return result.as_py()
    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Returns
        -------
        None
        """
        key = check_array_indexer(self, key)
        indices = self._indexing_key_to_indices(key)
        value = self._maybe_convert_setitem_value(value)

        # Sort positions so the chunk-wise writer can consume them in order;
        # values are reordered to stay aligned with their positions.
        argsort = np.argsort(indices)
        indices = indices[argsort]

        if is_scalar(value):
            value = np.broadcast_to(value, len(self))
        elif len(indices) != len(value):
            raise ValueError("Length of indexer and values mismatch")
        else:
            value = np.asarray(value)[argsort]

        self._data = self._set_via_chunk_iteration(indices=indices, value=value)
905 def _indexing_key_to_indices(
906 self, key: int | slice | np.ndarray
907 ) -> npt.NDArray[np.intp]:
908 """
909 Convert indexing key for self into positional indices.
911 Parameters
912 ----------
913 key : int | slice | np.ndarray
915 Returns
916 -------
917 npt.NDArray[np.intp]
918 """
919 n = len(self)
920 if isinstance(key, slice):
921 indices = np.arange(n)[key]
922 elif is_integer(key):
923 # error: Invalid index type "List[Union[int, ndarray[Any, Any]]]"
924 # for "ndarray[Any, dtype[signedinteger[Any]]]"; expected type
925 # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
926 # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
927 # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]
928 # , Tuple[Union[SupportsIndex, _SupportsArray[dtype[Union[bool_
929 # , integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
930 # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]], ...]]"
931 indices = np.arange(n)[[key]] # type: ignore[index]
932 elif is_bool_dtype(key):
933 key = np.asarray(key)
934 if len(key) != n:
935 raise ValueError("Length of indexer and values mismatch")
936 indices = key.nonzero()[0]
937 else:
938 key = np.asarray(key)
939 indices = np.arange(n)[key]
940 return indices
942 # TODO: redefine _rank using pc.rank with pyarrow 9.0
944 def _quantile(
945 self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
946 ) -> ArrowExtensionArrayT:
947 """
948 Compute the quantiles of self for each quantile in `qs`.
950 Parameters
951 ----------
952 qs : np.ndarray[float64]
953 interpolation: str
955 Returns
956 -------
957 same type as self
958 """
959 if pa_version_under4p0:
960 raise NotImplementedError(
961 "quantile only supported for pyarrow version >= 4.0"
962 )
963 result = pc.quantile(self._data, q=qs, interpolation=interpolation)
964 return type(self)(result)
    def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
        """
        Returns the mode(s) of the ExtensionArray.

        Always returns `ExtensionArray` even if only one value.

        Parameters
        ----------
        dropna : bool, default True
            Don't consider counts of NA values.
            Not implemented by pyarrow.

        Returns
        -------
        same type as self
            Sorted, if possible.
        """
        if pa_version_under6p0:
            raise NotImplementedError("mode only supported for pyarrow version >= 6.0")
        # Ask pyarrow for every distinct value with its count ...
        modes = pc.mode(self._data, pc.count_distinct(self._data).as_py())
        values = modes.field(0)
        counts = modes.field(1)
        # counts sorted descending i.e counts[0] = max
        # ... then keep only the value(s) tied for the maximum count.
        mask = pc.equal(counts, counts[0])
        most_common = values.filter(mask)
        return type(self)(most_common)
    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value
        # Currently a pass-through; conversion happens later when the value
        # is written into the chunks.
        return value
    def _set_via_chunk_iteration(
        self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
    ) -> pa.ChunkedArray:
        """
        Loop through the array chunks and set the new values while
        leaving the chunking layout unchanged.

        Parameters
        ----------
        indices : npt.NDArray[np.intp]
            Position indices for the underlying ChunkedArray.

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Notes
        -----
        Assumes that indices is sorted. Caller is responsible for sorting.
        """
        new_data = []
        stop = 0
        for chunk in self._data.iterchunks():
            # [start, stop) is this chunk's global position range.
            start, stop = stop, stop + len(chunk)
            if len(indices) == 0 or stop <= indices[0]:
                # No remaining target positions fall in this chunk.
                new_data.append(chunk)
            else:
                # Split off the indices (and matching values) belonging to
                # this chunk, rebased to chunk-local positions.
                n = int(np.searchsorted(indices, stop, side="left"))
                c_ind = indices[:n] - start
                indices = indices[n:]
                n = len(c_ind)
                c_value, value = value[:n], value[n:]
                new_data.append(self._replace_with_indices(chunk, c_ind, c_value))
        return pa.chunked_array(new_data)
    @classmethod
    def _replace_with_indices(
        cls,
        chunk: pa.Array,
        indices: npt.NDArray[np.intp],
        value: npt.NDArray[Any],
    ) -> pa.Array:
        """
        Replace items selected with a set of positional indices.

        Analogous to pyarrow.compute.replace_with_mask, except that replacement
        positions are identified via indices rather than a mask.

        Parameters
        ----------
        chunk : pa.Array
        indices : npt.NDArray[np.intp]
        value : npt.NDArray[Any]
            Replacement value(s).

        Returns
        -------
        pa.Array
        """
        n = len(indices)

        if n == 0:
            return chunk

        start, stop = indices[[0, -1]]

        if (stop - start) == (n - 1):
            # fast path for a contiguous set of indices
            arrays = [
                chunk[:start],
                pa.array(value, type=chunk.type, from_pandas=True),
                chunk[stop + 1 :],
            ]
            arrays = [arr for arr in arrays if len(arr)]
            if len(arrays) == 1:
                return arrays[0]
            return pa.concat_arrays(arrays)

        # Non-contiguous: build a boolean mask over the chunk.
        mask = np.zeros(len(chunk), dtype=np.bool_)
        mask[indices] = True

        if pa_version_under5p0:
            # No replace_with_mask kernel: do the replacement in numpy space.
            arr = chunk.to_numpy(zero_copy_only=False)
            arr[mask] = value
            return pa.array(arr, type=chunk.type)

        if isna(value).all():
            # All-NA replacement: write nulls at the masked positions.
            return pc.if_else(mask, None, chunk)

        return pc.replace_with_mask(chunk, mask, value)