Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/arrow/array.py: 13%
397 statements
« prev ^ index » next — coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from typing import (
4 TYPE_CHECKING,
5 Any,
6 TypeVar,
7)
9import numpy as np
11from pandas._libs import lib
12from pandas._typing import (
13 Dtype,
14 PositionalIndexer,
15 TakeIndexer,
16 npt,
17)
18from pandas.compat import (
19 pa_version_under1p01,
20 pa_version_under2p0,
21 pa_version_under3p0,
22 pa_version_under4p0,
23 pa_version_under5p0,
24 pa_version_under6p0,
25 pa_version_under7p0,
26)
27from pandas.util._decorators import (
28 deprecate_nonkeyword_arguments,
29 doc,
30)
32from pandas.core.dtypes.common import (
33 is_array_like,
34 is_bool_dtype,
35 is_integer,
36 is_integer_dtype,
37 is_scalar,
38)
39from pandas.core.dtypes.missing import isna
41from pandas.core.algorithms import resolve_na_sentinel
42from pandas.core.arraylike import OpsMixin
43from pandas.core.arrays.base import ExtensionArray
44from pandas.core.indexers import (
45 check_array_indexer,
46 unpack_tuple_and_ellipses,
47 validate_indices,
48)
if not pa_version_under1p01:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
    from pandas.core.arrays.arrow.dtype import ArrowDtype

    # Comparison kernels: available in every supported pyarrow version.
    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    # Kleene-logic kernels require pyarrow >= 2.0; the reflected ("r")
    # variants simply swap the operands.
    if pa_version_under2p0:
        ARROW_LOGICAL_FUNCS = {
            "and": NotImplemented,
            "rand": NotImplemented,
            "or": NotImplemented,
            "ror": NotImplemented,
            "xor": NotImplemented,
            "rxor": NotImplemented,
        }
    else:
        ARROW_LOGICAL_FUNCS = {
            "and": pc.and_kleene,
            "rand": lambda x, y: pc.and_kleene(y, x),
            "or": pc.or_kleene,
            "ror": lambda x, y: pc.or_kleene(y, x),
            "xor": pc.xor,
            "rxor": lambda x, y: pc.xor(y, x),
        }
79 def cast_for_truediv(
80 arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
81 ) -> pa.ChunkedArray:
82 # Ensure int / int -> float mirroring Python/Numpy behavior
83 # as pc.divide_checked(int, int) -> int
84 if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
85 pa_object.type
86 ):
87 return arrow_array.cast(pa.float64())
88 return arrow_array
90 def floordiv_compat(
91 left: pa.ChunkedArray | pa.Array | pa.Scalar,
92 right: pa.ChunkedArray | pa.Array | pa.Scalar,
93 ) -> pa.ChunkedArray:
94 # Ensure int // int -> int mirroring Python/Numpy behavior
95 # as pc.floor(pc.divide_checked(int, int)) -> float
96 result = pc.floor(pc.divide_checked(left, right))
97 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
98 result = result.cast(left.type)
99 return result
101 ARROW_ARITHMETIC_FUNCS = {
102 "add": NotImplemented if pa_version_under2p0 else pc.add_checked,
103 "radd": NotImplemented
104 if pa_version_under2p0
105 else lambda x, y: pc.add_checked(y, x),
106 "sub": NotImplemented if pa_version_under2p0 else pc.subtract_checked,
107 "rsub": NotImplemented
108 if pa_version_under2p0
109 else lambda x, y: pc.subtract_checked(y, x),
110 "mul": NotImplemented if pa_version_under2p0 else pc.multiply_checked,
111 "rmul": NotImplemented
112 if pa_version_under2p0
113 else lambda x, y: pc.multiply_checked(y, x),
114 "truediv": NotImplemented
115 if pa_version_under2p0
116 else lambda x, y: pc.divide_checked(cast_for_truediv(x, y), y),
117 "rtruediv": NotImplemented
118 if pa_version_under2p0
119 else lambda x, y: pc.divide_checked(y, cast_for_truediv(x, y)),
120 "floordiv": NotImplemented
121 if pa_version_under2p0
122 else lambda x, y: floordiv_compat(x, y),
123 "rfloordiv": NotImplemented
124 if pa_version_under2p0
125 else lambda x, y: floordiv_compat(y, x),
126 "mod": NotImplemented,
127 "rmod": NotImplemented,
128 "divmod": NotImplemented,
129 "rdivmod": NotImplemented,
130 "pow": NotImplemented if pa_version_under4p0 else pc.power_checked,
131 "rpow": NotImplemented
132 if pa_version_under4p0
133 else lambda x, y: pc.power_checked(y, x),
134 }
if TYPE_CHECKING:
    # Only needed for annotations; avoids a circular import at runtime.
    from pandas import Series

# TypeVar so methods on subclasses are annotated to return the subclass type.
ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
def to_pyarrow_type(
    dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
    """
    Convert dtype to a pyarrow type instance.

    Returns None when no dtype was supplied so callers can fall back to
    pyarrow's own type inference.
    """
    if isinstance(dtype, ArrowDtype):
        return dtype.pyarrow_dtype
    if isinstance(dtype, pa.DataType):
        return dtype
    if dtype:
        # Accepts numpy dtypes and plain python types too.
        return pa.from_numpy_dtype(dtype)
    return None
class ArrowExtensionArray(OpsMixin, ExtensionArray):
    """
    Pandas ExtensionArray backed by a PyArrow ChunkedArray.

    .. warning::

       ArrowExtensionArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    Most methods are implemented using `pyarrow compute functions. <https://arrow.apache.org/docs/python/api/compute.html>`__
    Some methods may either raise an exception or raise a ``PerformanceWarning`` if an
    associated compute function is not available based on the installed version of PyArrow.

    Please install the latest version of PyArrow to enable the best functionality and avoid
    potential bugs in prior versions of PyArrow.

    Examples
    --------
    Create an ArrowExtensionArray with :func:`pandas.array`:

    >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
    <ArrowExtensionArray>
    [1, 1, <NA>]
    Length: 3, dtype: int64[pyarrow]
    """  # noqa: E501 (http link too long)

    # Backing buffer (always a ChunkedArray) and its pandas-facing dtype.
    _data: pa.ChunkedArray
    _dtype: ArrowDtype
207 def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
208 if pa_version_under1p01:
209 msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
210 raise ImportError(msg)
211 if isinstance(values, pa.Array):
212 self._data = pa.chunked_array([values])
213 elif isinstance(values, pa.ChunkedArray):
214 self._data = values
215 else:
216 raise ValueError(
217 f"Unsupported type '{type(values)}' for ArrowExtensionArray"
218 )
219 self._dtype = ArrowDtype(self._data.type)
221 @classmethod
222 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
223 """
224 Construct a new ExtensionArray from a sequence of scalars.
225 """
226 pa_dtype = to_pyarrow_type(dtype)
227 is_cls = isinstance(scalars, cls)
228 if is_cls or isinstance(scalars, (pa.Array, pa.ChunkedArray)):
229 if is_cls:
230 scalars = scalars._data
231 if pa_dtype:
232 scalars = scalars.cast(pa_dtype)
233 return cls(scalars)
234 else:
235 return cls(
236 pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True))
237 )
    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy=False
    ):
        """
        Construct a new ExtensionArray from a sequence of strings.

        The strings are parsed with the pandas converter that matches the
        target pyarrow type, then handed to :meth:`_from_sequence`.

        Raises
        ------
        NotImplementedError
            If no string parser exists for the requested pyarrow type.
        """
        pa_type = to_pyarrow_type(dtype)
        if pa_type is None:
            # Let pyarrow try to infer or raise
            scalars = strings
        elif pa.types.is_timestamp(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise")
        elif pa.types.is_date(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            # Parse as datetimes, then keep only the date component.
            scalars = to_datetime(strings, errors="raise").date
        elif pa.types.is_duration(pa_type):
            from pandas.core.tools.timedeltas import to_timedelta

            scalars = to_timedelta(strings, errors="raise")
        elif pa.types.is_time(pa_type):
            from pandas.core.tools.times import to_time

            # "coerce" to allow "null times" (None) to not raise
            scalars = to_time(strings, errors="coerce")
        elif pa.types.is_boolean(pa_type):
            from pandas.core.arrays import BooleanArray

            scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy()
        elif (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
        return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
    def __getitem__(self, item: PositionalIndexer):
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                # Empty indexer: build an empty array of the right arrow type.
                # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
                    pa_dtype = pa.string()
                else:
                    pa_dtype = self._dtype.pyarrow_dtype
                return type(self)(pa.chunked_array([], type=pa_dtype))
            elif is_integer_dtype(item.dtype):
                return self.take(item)
            elif is_bool_dtype(item.dtype):
                return type(self)(self._data.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )
        elif isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        # error: Non-overlapping identity check (left operand type:
        # "Union[Union[int, integer[Any]], Union[slice, List[int],
        # ndarray[Any, Any]]]", right operand type: "ellipsis")
        if item is Ellipsis:  # type: ignore[comparison-overlap]
            # TODO: should be handled by pyarrow?
            item = slice(None)

        if is_scalar(item) and not is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        value = self._data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            # A pyarrow scalar: unbox to Python, mapping null to the dtype's
            # NA sentinel.
            scalar = value.as_py()
            if scalar is None:
                return self._dtype.na_value
            else:
                return scalar
357 def __arrow_array__(self, type=None):
358 """Convert myself to a pyarrow ChunkedArray."""
359 return self._data
361 def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
362 if pa_version_under2p0:
363 raise NotImplementedError("__invert__ not implement for pyarrow < 2.0")
364 return type(self)(pc.invert(self._data))
366 def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
367 return type(self)(pc.negate_checked(self._data))
369 def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
370 return type(self)(self._data)
372 def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
373 return type(self)(pc.abs_checked(self._data))
    def _cmp_method(self, other, op):
        """
        Element-wise comparison dispatching to ARROW_CMP_FUNCS; returns a
        pandas BooleanArray.  Falls back to a numpy comparison when pyarrow
        cannot compare against the scalar type.
        """
        from pandas.arrays import BooleanArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, other)
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                # pyarrow can't compare against this scalar: compare the
                # valid entries in numpy space and mask out the NAs.
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                return BooleanArray(result, mask)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )

        # Convert the pyarrow boolean result to numpy before boxing.
        if pa_version_under2p0:
            result = result.to_pandas().values
        else:
            result = result.to_numpy()
        return BooleanArray._from_sequence(result)
403 def _evaluate_op_method(self, other, op, arrow_funcs):
404 pc_func = arrow_funcs[op.__name__]
405 if pc_func is NotImplemented:
406 raise NotImplementedError(f"{op.__name__} not implemented.")
407 if isinstance(other, ArrowExtensionArray):
408 result = pc_func(self._data, other._data)
409 elif isinstance(other, (np.ndarray, list)):
410 result = pc_func(self._data, pa.array(other, from_pandas=True))
411 elif is_scalar(other):
412 result = pc_func(self._data, pa.scalar(other))
413 else:
414 raise NotImplementedError(
415 f"{op.__name__} not implemented for {type(other)}"
416 )
417 return type(self)(result)
    def _logical_method(self, other, op):
        # Kleene-logic ops (and/or/xor and their reflections).
        return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

    def _arith_method(self, other, op):
        # Checked arithmetic ops (add/sub/mul/div/pow and their reflections).
        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
    def equals(self, other) -> bool:
        """Return True when *other* is an ArrowExtensionArray with equal data."""
        if not isinstance(other, ArrowExtensionArray):
            return False
        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
        # TODO: is this documented somewhere?
        return self._data == other._data
    @property
    def dtype(self) -> ArrowDtype:
        """
        An instance of 'ExtensionDtype'.
        """
        return self._dtype

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._data.nbytes

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._data)

    @property
    def _hasna(self) -> bool:
        # True when the backing ChunkedArray contains at least one null.
        return self._data.null_count > 0
    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        if pa_version_under2p0:
            # Old pyarrow lacks to_numpy on this result; go through pandas.
            return self._data.is_null().to_pandas().values
        else:
            return self._data.is_null().to_numpy()
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
    def argsort(
        self,
        ascending: bool = True,
        kind: str = "quicksort",
        na_position: str = "last",
        *args,
        **kwargs,
    ) -> np.ndarray:
        """
        Return the indices that would sort this array.

        ``kind`` is accepted for API compatibility; the pyarrow path picks
        its own algorithm.  Unknown ``na_position`` values fall through to
        the (slower) base-class implementation.
        """
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None or pa_version_under7p0:
            # Although pc.array_sort_indices exists in version 6
            # there's a bug that affects the pa.ChunkedArray backing
            # https://issues.apache.org/jira/browse/ARROW-12042
            fallback_performancewarning("7")
            return super().argsort(
                ascending=ascending, kind=kind, na_position=na_position
            )

        result = pc.array_sort_indices(
            self._data, order=order, null_placement=null_placement
        )
        if pa_version_under2p0:
            np_result = result.to_pandas().values
        else:
            np_result = result.to_numpy()
        # ExtensionArray.argsort contract: platform-int positions.
        return np_result.astype(np.intp, copy=False)
    def _argmin_max(self, skipna: bool, method: str) -> int:
        """Shared body of argmin/argmax; ``method`` is "min" or "max"."""
        if self._data.length() in (0, self._data.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        if pa_version_under6p0:
            raise NotImplementedError(
                f"arg{method} only implemented for pyarrow version >= 6.0"
            )

        # Find the extreme value, then locate its first position.
        value = getattr(pc, method)(self._data, skip_nulls=skipna)
        return pc.index(self._data, value).as_py()
    def argmin(self, skipna: bool = True) -> int:
        # Position of the minimum; edge cases handled in _argmin_max.
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        # Position of the maximum; edge cases handled in _argmin_max.
        return self._argmin_max(skipna, "max")
    def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._data)
535 def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
536 """
537 Return ArrowExtensionArray without NA values.
539 Returns
540 -------
541 ArrowExtensionArray
542 """
543 if pa_version_under6p0:
544 fallback_performancewarning(version="6")
545 return super().dropna()
546 else:
547 return type(self)(pc.drop_null(self._data))
    def isin(self, values) -> npt.NDArray[np.bool_]:
        """Boolean mask of which elements of self appear in *values*."""
        if pa_version_under2p0:
            fallback_performancewarning(version="2")
            return super().isin(values)

        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
        # for null values, so we short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        kwargs = {}
        if pa_version_under3p0:
            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
            # with unexpected keyword argument in pyarrow 3.0.0+
            kwargs["skip_null"] = True

        result = pc.is_in(
            self._data, value_set=pa.array(values, from_pandas=True), **kwargs
        )
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        if pa_version_under2p0:
            # Old pyarrow lacks ChunkedArray.to_numpy; round-trip via pandas.
            values = self._data.to_pandas().values
        else:
            values = self._data.to_numpy()
        return values, self.dtype.na_value
    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        na_sentinel: int | lib.NoDefault = lib.no_default,
        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    ) -> tuple[np.ndarray, ExtensionArray]:
        resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
        # Dictionary-encode so the dictionary becomes the uniques and the
        # per-element dictionary indices become the codes.
        if pa_version_under4p0:
            encoded = self._data.dictionary_encode()
        else:
            null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
            encoded = self._data.dictionary_encode(null_encoding=null_encoding)
        indices = pa.chunked_array(
            [c.indices for c in encoded.chunks], type=encoded.type.index_type
        ).to_pandas()
        if indices.dtype.kind == "f":
            # Floating indices mean nulls were present; replace NaN codes
            # with the sentinel (or -1 when uniques will include NA).
            indices[np.isnan(indices)] = (
                resolved_na_sentinel if resolved_na_sentinel is not None else -1
            )
        indices = indices.astype(np.int64, copy=False)

        if encoded.num_chunks:
            uniques = type(self)(encoded.chunk(0).dictionary)
            if resolved_na_sentinel is None and pa_version_under4p0:
                # TODO: share logic with BaseMaskedArray.factorize
                # Insert na with the proper code
                na_mask = indices.values == -1
                na_index = na_mask.argmax()
                if na_mask[na_index]:
                    # na_code is the position NA takes in sorted-first-seen
                    # order; shift later codes up to make room for it.
                    na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
                    uniques = uniques.insert(na_code, self.dtype.na_value)
                    indices[indices >= na_code] += 1
                    indices[indices == -1] = na_code
        else:
            uniques = type(self)(pa.array([], type=encoded.type.value_type))

        return indices.values, uniques
    def reshape(self, *args, **kwargs):
        # The backing pyarrow.ChunkedArray is strictly one-dimensional, so
        # reshape is rejected outright regardless of the requested shape.
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )
    def take(
        self,
        indices: TakeIndexer,
        allow_fill: bool = False,
        fill_value: Any = None,
    ) -> ArrowExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int or one-dimensional np.ndarray of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              other negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Sequence[int]", variable has type "ndarray")
            indices_array = indices  # type: ignore[assignment]

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                # Masked positions come back as nulls from pa take.
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    # Nulls already are the NA fill; nothing more to do.
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._data.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))
734 def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
735 """
736 Compute the ArrowExtensionArray of unique values.
738 Returns
739 -------
740 ArrowExtensionArray
741 """
742 if pa_version_under2p0:
743 fallback_performancewarning(version="2")
744 return super().unique()
745 else:
746 return type(self)(pc.unique(self._data))
    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import (
            Index,
            Series,
        )

        # StructArray of (value, count) pairs.
        vc = self._data.value_counts()

        values = vc.field(0)
        counts = vc.field(1)
        if dropna and self._data.null_count > 0:
            # Drop the row counting nulls.
            mask = values.is_valid()
            values = values.filter(mask)
            counts = counts.filter(mask)

        # No missing values so we can adhere to the interface and return a numpy array.
        counts = np.array(counts)

        index = Index(type(self)(values))

        return Series(counts, index=index).astype("Int64")
786 @classmethod
787 def _concat_same_type(
788 cls: type[ArrowExtensionArrayT], to_concat
789 ) -> ArrowExtensionArrayT:
790 """
791 Concatenate multiple ArrowExtensionArrays.
793 Parameters
794 ----------
795 to_concat : sequence of ArrowExtensionArrays
797 Returns
798 -------
799 ArrowExtensionArray
800 """
801 chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
802 arr = pa.chunked_array(chunks)
803 return cls(arr)
    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        if name == "sem":
            # pyarrow has no sem kernel: compose it as stddev / sqrt(n - ddof).
            def pyarrow_meth(data, skipna, **kwargs):
                numerator = pc.stddev(data, skip_nulls=skipna, **kwargs)
                denominator = pc.sqrt_checked(
                    pc.subtract_checked(
                        pc.count(self._data, skip_nulls=skipna), kwargs["ddof"]
                    )
                )
                return pc.divide_checked(numerator, denominator)

        else:
            # Map pandas reduction names onto the pyarrow kernel names.
            pyarrow_name = {
                "median": "approximate_median",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)
        try:
            result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if pc.is_null(result).as_py():
            # Null reduction result maps to the dtype's NA sentinel.
            return self.dtype.na_value
        return result.as_py()
    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Returns
        -------
        None
        """
        key = check_array_indexer(self, key)
        indices = self._indexing_key_to_indices(key)
        value = self._maybe_convert_setitem_value(value)

        # Sort positions so the chunk-wise writer can consume them in order;
        # values are reordered to stay aligned with their positions.
        argsort = np.argsort(indices)
        indices = indices[argsort]

        if is_scalar(value):
            value = np.broadcast_to(value, len(self))
        elif len(indices) != len(value):
            raise ValueError("Length of indexer and values mismatch")
        else:
            value = np.asarray(value)[argsort]

        self._data = self._set_via_chunk_iteration(indices=indices, value=value)
905 def _indexing_key_to_indices(
906 self, key: int | slice | np.ndarray
907 ) -> npt.NDArray[np.intp]:
908 """
909 Convert indexing key for self into positional indices.
911 Parameters
912 ----------
913 key : int | slice | np.ndarray
915 Returns
916 -------
917 npt.NDArray[np.intp]
918 """
919 n = len(self)
920 if isinstance(key, slice):
921 indices = np.arange(n)[key]
922 elif is_integer(key):
923 # error: Invalid index type "List[Union[int, ndarray[Any, Any]]]"
924 # for "ndarray[Any, dtype[signedinteger[Any]]]"; expected type
925 # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
926 # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
927 # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]
928 # , Tuple[Union[SupportsIndex, _SupportsArray[dtype[Union[bool_
929 # , integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
930 # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]], ...]]"
931 indices = np.arange(n)[[key]] # type: ignore[index]
932 elif is_bool_dtype(key):
933 key = np.asarray(key)
934 if len(key) != n:
935 raise ValueError("Length of indexer and values mismatch")
936 indices = key.nonzero()[0]
937 else:
938 key = np.asarray(key)
939 indices = np.arange(n)[key]
940 return indices
942 # TODO: redefine _rank using pc.rank with pyarrow 9.0
944 def _quantile(
945 self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
946 ) -> ArrowExtensionArrayT:
947 """
948 Compute the quantiles of self for each quantile in `qs`.
950 Parameters
951 ----------
952 qs : np.ndarray[float64]
953 interpolation: str
955 Returns
956 -------
957 same type as self
958 """
959 if pa_version_under4p0:
960 raise NotImplementedError(
961 "quantile only supported for pyarrow version >= 4.0"
962 )
963 result = pc.quantile(self._data, q=qs, interpolation=interpolation)
964 return type(self)(result)
    def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
        """
        Returns the mode(s) of the ExtensionArray.

        Always returns `ExtensionArray` even if only one value.

        Parameters
        ----------
        dropna : bool, default True
            Don't consider counts of NA values.
            Not implemented by pyarrow.

        Returns
        -------
        same type as self
            Sorted, if possible.
        """
        if pa_version_under6p0:
            raise NotImplementedError("mode only supported for pyarrow version >= 6.0")
        # Ask pyarrow for every distinct value with its count ...
        modes = pc.mode(self._data, pc.count_distinct(self._data).as_py())
        values = modes.field(0)
        counts = modes.field(1)
        # counts sorted descending i.e counts[0] = max
        # ... then keep only the value(s) tied for the maximum count.
        mask = pc.equal(counts, counts[0])
        most_common = values.filter(mask)
        return type(self)(most_common)
    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value
        # Currently a pass-through; conversion happens later when the value
        # is written into the chunks.
        return value
    def _set_via_chunk_iteration(
        self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
    ) -> pa.ChunkedArray:
        """
        Loop through the array chunks and set the new values while
        leaving the chunking layout unchanged.

        Parameters
        ----------
        indices : npt.NDArray[np.intp]
            Position indices for the underlying ChunkedArray.

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Notes
        -----
        Assumes that indices is sorted. Caller is responsible for sorting.
        """
        new_data = []
        stop = 0
        for chunk in self._data.iterchunks():
            # [start, stop) is this chunk's global position range.
            start, stop = stop, stop + len(chunk)
            if len(indices) == 0 or stop <= indices[0]:
                # No remaining target positions fall in this chunk.
                new_data.append(chunk)
            else:
                # Split off the indices (and matching values) belonging to
                # this chunk, rebased to chunk-local positions.
                n = int(np.searchsorted(indices, stop, side="left"))
                c_ind = indices[:n] - start
                indices = indices[n:]
                n = len(c_ind)
                c_value, value = value[:n], value[n:]
                new_data.append(self._replace_with_indices(chunk, c_ind, c_value))
        return pa.chunked_array(new_data)
    @classmethod
    def _replace_with_indices(
        cls,
        chunk: pa.Array,
        indices: npt.NDArray[np.intp],
        value: npt.NDArray[Any],
    ) -> pa.Array:
        """
        Replace items selected with a set of positional indices.

        Analogous to pyarrow.compute.replace_with_mask, except that replacement
        positions are identified via indices rather than a mask.

        Parameters
        ----------
        chunk : pa.Array
        indices : npt.NDArray[np.intp]
        value : npt.NDArray[Any]
            Replacement value(s).

        Returns
        -------
        pa.Array
        """
        n = len(indices)

        if n == 0:
            return chunk

        start, stop = indices[[0, -1]]

        if (stop - start) == (n - 1):
            # fast path for a contiguous set of indices
            arrays = [
                chunk[:start],
                pa.array(value, type=chunk.type, from_pandas=True),
                chunk[stop + 1 :],
            ]
            arrays = [arr for arr in arrays if len(arr)]
            if len(arrays) == 1:
                return arrays[0]
            return pa.concat_arrays(arrays)

        # Non-contiguous: build a boolean mask over the chunk.
        mask = np.zeros(len(chunk), dtype=np.bool_)
        mask[indices] = True

        if pa_version_under5p0:
            # No replace_with_mask kernel: do the replacement in numpy space.
            arr = chunk.to_numpy(zero_copy_only=False)
            arr[mask] = value
            return pa.array(arr, type=chunk.type)

        if isna(value).all():
            # All-NA replacement: write nulls at the masked positions.
            return pc.if_else(mask, None, chunk)

        return pc.replace_with_mask(chunk, mask, value)