Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/arrow/array.py: 13%

397 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    TypeVar,
)

import numpy as np

from pandas._libs import lib
from pandas._typing import (
    Dtype,
    PositionalIndexer,
    TakeIndexer,
    npt,
)
from pandas.compat import (
    pa_version_under1p01,
    pa_version_under2p0,
    pa_version_under3p0,
    pa_version_under4p0,
    pa_version_under5p0,
    pa_version_under6p0,
    pa_version_under7p0,
)
from pandas.util._decorators import (
    deprecate_nonkeyword_arguments,
    doc,
)

from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer,
    is_integer_dtype,
    is_scalar,
)
from pandas.core.dtypes.missing import isna

from pandas.core.algorithms import resolve_na_sentinel
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import ExtensionArray
from pandas.core.indexers import (
    check_array_indexer,
    unpack_tuple_and_ellipses,
    validate_indices,
)

if not pa_version_under1p01:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
    from pandas.core.arrays.arrow.dtype import ArrowDtype

    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    ARROW_LOGICAL_FUNCS = {
        "and": NotImplemented if pa_version_under2p0 else pc.and_kleene,
        "rand": NotImplemented
        if pa_version_under2p0
        else lambda x, y: pc.and_kleene(y, x),
        "or": NotImplemented if pa_version_under2p0 else pc.or_kleene,
        "ror": NotImplemented
        if pa_version_under2p0
        else lambda x, y: pc.or_kleene(y, x),
        "xor": NotImplemented if pa_version_under2p0 else pc.xor,
        "rxor": NotImplemented if pa_version_under2p0 else lambda x, y: pc.xor(y, x),
    }

    def cast_for_truediv(
        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
    ) -> pa.ChunkedArray:
        # Ensure int / int -> float mirroring Python/Numpy behavior
        # as pc.divide_checked(int, int) -> int
        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
            pa_object.type
        ):
            return arrow_array.cast(pa.float64())
        return arrow_array

    def floordiv_compat(
        left: pa.ChunkedArray | pa.Array | pa.Scalar,
        right: pa.ChunkedArray | pa.Array | pa.Scalar,
    ) -> pa.ChunkedArray:
        # Ensure int // int -> int mirroring Python/Numpy behavior
        # as pc.floor(pc.divide_checked(int, int)) -> float
        result = pc.floor(pc.divide_checked(left, right))
        if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
            result = result.cast(left.type)
        return result
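
    # Illustrative sketch (assumes a pyarrow build providing these kernels;
    # not part of the module's tests): "//" keeps integer inputs integer,
    # while "/" is routed through float64 via cast_for_truediv.
    #
    #     >>> a = pa.chunked_array([[7, 8]])
    #     >>> floordiv_compat(a, pa.scalar(2)).type
    #     DataType(int64)
    #     >>> pc.divide_checked(cast_for_truediv(a, pa.scalar(2)), pa.scalar(2)).type
    #     DataType(double)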

    ARROW_ARITHMETIC_FUNCS = {
        "add": NotImplemented if pa_version_under2p0 else pc.add_checked,
        "radd": NotImplemented
        if pa_version_under2p0
        else lambda x, y: pc.add_checked(y, x),
        "sub": NotImplemented if pa_version_under2p0 else pc.subtract_checked,
        "rsub": NotImplemented
        if pa_version_under2p0
        else lambda x, y: pc.subtract_checked(y, x),
        "mul": NotImplemented if pa_version_under2p0 else pc.multiply_checked,
        "rmul": NotImplemented
        if pa_version_under2p0
        else lambda x, y: pc.multiply_checked(y, x),
        "truediv": NotImplemented
        if pa_version_under2p0
        else lambda x, y: pc.divide_checked(cast_for_truediv(x, y), y),
        "rtruediv": NotImplemented
        if pa_version_under2p0
        else lambda x, y: pc.divide_checked(y, cast_for_truediv(x, y)),
        "floordiv": NotImplemented
        if pa_version_under2p0
        else lambda x, y: floordiv_compat(x, y),
        "rfloordiv": NotImplemented
        if pa_version_under2p0
        else lambda x, y: floordiv_compat(y, x),
        "mod": NotImplemented,
        "rmod": NotImplemented,
        "divmod": NotImplemented,
        "rdivmod": NotImplemented,
        "pow": NotImplemented if pa_version_under4p0 else pc.power_checked,
        "rpow": NotImplemented
        if pa_version_under4p0
        else lambda x, y: pc.power_checked(y, x),
    }

if TYPE_CHECKING:
    from pandas import Series

ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")


def to_pyarrow_type(
    dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
    """
    Convert dtype to a pyarrow type instance.
    """
    if isinstance(dtype, ArrowDtype):
        pa_dtype = dtype.pyarrow_dtype
    elif isinstance(dtype, pa.DataType):
        pa_dtype = dtype
    elif dtype:
        # Accepts python types too
        pa_dtype = pa.from_numpy_dtype(dtype)
    else:
        pa_dtype = None
    return pa_dtype
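
# Illustrative resolution of the three accepted inputs (a sketch, assuming
# pyarrow is importable; ArrowDtype instances unwrap to .pyarrow_dtype):
#
#     >>> to_pyarrow_type(pa.int64())           # pa.DataType passes through
#     DataType(int64)
#     >>> to_pyarrow_type(np.dtype("float64"))  # numpy dtype via pa.from_numpy_dtype
#     DataType(double)
#     >>> to_pyarrow_type(None) is None         # falsy input: no type requested
#     True
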

class ArrowExtensionArray(OpsMixin, ExtensionArray):
    """
    Pandas ExtensionArray backed by a PyArrow ChunkedArray.

    .. warning::

       ArrowExtensionArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    Most methods are implemented using `pyarrow compute functions <https://arrow.apache.org/docs/python/api/compute.html>`__.
    Some methods may raise an exception or emit a ``PerformanceWarning`` if an
    associated compute function is not available for the installed version of PyArrow.

    Please install the latest version of PyArrow to enable the best functionality and avoid
    potential bugs in prior versions of PyArrow.

    Examples
    --------
    Create an ArrowExtensionArray with :func:`pandas.array`:

    >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
    <ArrowExtensionArray>
    [1, 1, <NA>]
    Length: 3, dtype: int64[pyarrow]
    """  # noqa: E501 (http link too long)

    _data: pa.ChunkedArray
    _dtype: ArrowDtype

    def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
        if pa_version_under1p01:
            msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
            raise ImportError(msg)
        if isinstance(values, pa.Array):
            self._data = pa.chunked_array([values])
        elif isinstance(values, pa.ChunkedArray):
            self._data = values
        else:
            raise ValueError(
                f"Unsupported type '{type(values)}' for ArrowExtensionArray"
            )
        self._dtype = ArrowDtype(self._data.type)
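
    # Construction sketch (assuming pyarrow>=1.0): a pa.Array is normalized to
    # a single-chunk pa.ChunkedArray, and the dtype is derived from it.
    #
    #     >>> arr = ArrowExtensionArray(pa.array([1, 2, None]))
    #     >>> arr._data.num_chunks
    #     1
    #     >>> arr.dtype
    #     int64[pyarrow]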

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
        """
        Construct a new ExtensionArray from a sequence of scalars.
        """
        pa_dtype = to_pyarrow_type(dtype)
        is_cls = isinstance(scalars, cls)
        if is_cls or isinstance(scalars, (pa.Array, pa.ChunkedArray)):
            if is_cls:
                scalars = scalars._data
            if pa_dtype:
                scalars = scalars.cast(pa_dtype)
            return cls(scalars)
        else:
            return cls(
                pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True))
            )

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy=False
    ):
        """
        Construct a new ExtensionArray from a sequence of strings.
        """
        pa_type = to_pyarrow_type(dtype)
        if pa_type is None:
            # Let pyarrow try to infer or raise
            scalars = strings
        elif pa.types.is_timestamp(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise")
        elif pa.types.is_date(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise").date
        elif pa.types.is_duration(pa_type):
            from pandas.core.tools.timedeltas import to_timedelta

            scalars = to_timedelta(strings, errors="raise")
        elif pa.types.is_time(pa_type):
            from pandas.core.tools.times import to_time

            # "coerce" to allow "null times" (None) to not raise
            scalars = to_time(strings, errors="coerce")
        elif pa.types.is_boolean(pa_type):
            from pandas.core.arrays import BooleanArray

            scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy()
        elif (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
        return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
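
    # Parsing sketch (assuming pyarrow>=1.0): numeric target types route
    # through pandas' to_numeric before the arrow conversion.
    #
    #     >>> ArrowExtensionArray._from_sequence_of_strings(["1", "2"], dtype=pa.int64())
    #     <ArrowExtensionArray>
    #     [1, 2]
    #     Length: 2, dtype: int64[pyarrow]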

    def __getitem__(self, item: PositionalIndexer):
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
                    pa_dtype = pa.string()
                else:
                    pa_dtype = self._dtype.pyarrow_dtype
                return type(self)(pa.chunked_array([], type=pa_dtype))
            elif is_integer_dtype(item.dtype):
                return self.take(item)
            elif is_bool_dtype(item.dtype):
                return type(self)(self._data.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )
        elif isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        # error: Non-overlapping identity check (left operand type:
        # "Union[Union[int, integer[Any]], Union[slice, List[int],
        # ndarray[Any, Any]]]", right operand type: "ellipsis")
        if item is Ellipsis:  # type: ignore[comparison-overlap]
            # TODO: should be handled by pyarrow?
            item = slice(None)

        if is_scalar(item) and not is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        value = self._data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            scalar = value.as_py()
            if scalar is None:
                return self._dtype.na_value
            else:
                return scalar
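
    # Indexing sketch (assuming pyarrow>=1.0):
    #
    #     >>> arr = pd.array([1, 2, None], dtype="int64[pyarrow]")
    #     >>> arr[0]      # scalar position -> Python scalar
    #     1
    #     >>> arr[2]      # missing element -> the dtype's NA value
    #     <NA>
    #     >>> arr[1:]     # slice -> ArrowExtensionArray
    #     <ArrowExtensionArray>
    #     [2, <NA>]
    #     Length: 2, dtype: int64[pyarrow]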

    def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow ChunkedArray."""
        return self._data

    def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        if pa_version_under2p0:
            raise NotImplementedError("__invert__ not implemented for pyarrow < 2.0")
        return type(self)(pc.invert(self._data))

    def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.negate_checked(self._data))

    def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(self._data)

    def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.abs_checked(self._data))

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, other)
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                return BooleanArray(result, mask)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )

        if pa_version_under2p0:
            result = result.to_pandas().values
        else:
            result = result.to_numpy()
        return BooleanArray._from_sequence(result)
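
    # Comparison sketch (assuming pyarrow>=2.0): results come back as a
    # nullable BooleanArray, with NA propagated from missing elements.
    #
    #     >>> pd.array([1, 2, None], dtype="int64[pyarrow]") == 1
    #     <BooleanArray>
    #     [True, False, <NA>]
    #     Length: 3, dtype: boolean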

    def _evaluate_op_method(self, other, op, arrow_funcs):
        pc_func = arrow_funcs[op.__name__]
        if pc_func is NotImplemented:
            raise NotImplementedError(f"{op.__name__} not implemented.")
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, pa.array(other, from_pandas=True))
        elif is_scalar(other):
            result = pc_func(self._data, pa.scalar(other))
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return type(self)(result)

    def _logical_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

    def _arith_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)

    def equals(self, other) -> bool:
        if not isinstance(other, ArrowExtensionArray):
            return False
        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
        # TODO: is this documented somewhere?
        return self._data == other._data

    @property
    def dtype(self) -> ArrowDtype:
        """
        An instance of 'ExtensionDtype'.
        """
        return self._dtype

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._data.nbytes

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._data)

    @property
    def _hasna(self) -> bool:
        return self._data.null_count > 0

    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        if pa_version_under2p0:
            return self._data.is_null().to_pandas().values
        else:
            return self._data.is_null().to_numpy()

    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
    def argsort(
        self,
        ascending: bool = True,
        kind: str = "quicksort",
        na_position: str = "last",
        *args,
        **kwargs,
    ) -> np.ndarray:
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None or pa_version_under7p0:
            # Although pc.array_sort_indices exists in version 6
            # there's a bug that affects the pa.ChunkedArray backing
            # https://issues.apache.org/jira/browse/ARROW-12042
            fallback_performancewarning("7")
            return super().argsort(
                ascending=ascending, kind=kind, na_position=na_position
            )

        result = pc.array_sort_indices(
            self._data, order=order, null_placement=null_placement
        )
        if pa_version_under2p0:
            np_result = result.to_pandas().values
        else:
            np_result = result.to_numpy()
        return np_result.astype(np.intp, copy=False)

    def _argmin_max(self, skipna: bool, method: str) -> int:
        if self._data.length() in (0, self._data.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        if pa_version_under6p0:
            raise NotImplementedError(
                f"arg{method} only implemented for pyarrow version >= 6.0"
            )

        value = getattr(pc, method)(self._data, skip_nulls=skipna)
        return pc.index(self._data, value).as_py()

    def argmin(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "max")

    def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._data)

    def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return ArrowExtensionArray without NA values.

        Returns
        -------
        ArrowExtensionArray
        """
        if pa_version_under6p0:
            fallback_performancewarning(version="6")
            return super().dropna()
        else:
            return type(self)(pc.drop_null(self._data))

    def isin(self, values) -> npt.NDArray[np.bool_]:
        if pa_version_under2p0:
            fallback_performancewarning(version="2")
            return super().isin(values)

        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
        # for null values, so we short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        kwargs = {}
        if pa_version_under3p0:
            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
            # with unexpected keyword argument in pyarrow 3.0.0+
            kwargs["skip_null"] = True

        result = pc.is_in(
            self._data, value_set=pa.array(values, from_pandas=True), **kwargs
        )
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
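
    # isin sketch (assuming pyarrow>=2.0): missing values never match, and an
    # empty value set short-circuits to all-False.
    #
    #     >>> pd.array([1, None], dtype="int64[pyarrow]").isin([1])
    #     array([ True, False])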

    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        if pa_version_under2p0:
            values = self._data.to_pandas().values
        else:
            values = self._data.to_numpy()
        return values, self.dtype.na_value

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        na_sentinel: int | lib.NoDefault = lib.no_default,
        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    ) -> tuple[np.ndarray, ExtensionArray]:
        resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
        if pa_version_under4p0:
            encoded = self._data.dictionary_encode()
        else:
            null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
            encoded = self._data.dictionary_encode(null_encoding=null_encoding)
        indices = pa.chunked_array(
            [c.indices for c in encoded.chunks], type=encoded.type.index_type
        ).to_pandas()
        if indices.dtype.kind == "f":
            indices[np.isnan(indices)] = (
                resolved_na_sentinel if resolved_na_sentinel is not None else -1
            )
        indices = indices.astype(np.int64, copy=False)

        if encoded.num_chunks:
            uniques = type(self)(encoded.chunk(0).dictionary)
            if resolved_na_sentinel is None and pa_version_under4p0:
                # TODO: share logic with BaseMaskedArray.factorize
                # Insert na with the proper code
                na_mask = indices.values == -1
                na_index = na_mask.argmax()
                if na_mask[na_index]:
                    na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
                    uniques = uniques.insert(na_code, self.dtype.na_value)
                    indices[indices >= na_code] += 1
                    indices[indices == -1] = na_code
        else:
            uniques = type(self)(pa.array([], type=encoded.type.value_type))

        return indices.values, uniques
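
    # Factorize sketch (assuming pyarrow>=4.0): integer codes plus uniques,
    # with -1 marking missing values under the default sentinel.
    #
    #     >>> codes, uniques = pd.array([1, 1, None], dtype="int64[pyarrow]").factorize()
    #     >>> codes
    #     array([ 0,  0, -1])
    #     >>> uniques
    #     <ArrowExtensionArray>
    #     [1]
    #     Length: 1, dtype: int64[pyarrow]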

    def reshape(self, *args, **kwargs):
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )

    def take(
        self,
        indices: TakeIndexer,
        allow_fill: bool = False,
        fill_value: Any = None,
    ) -> ArrowExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int or one-dimensional np.ndarray of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Sequence[int]", variable has type "ndarray")
            indices_array = indices  # type: ignore[assignment]

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._data.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))
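
    # take() sketch (assuming pyarrow>=1.0): negative indices wrap from the
    # right by default and mark missing values when allow_fill=True.
    #
    #     >>> arr = pd.array([10, 20, 30], dtype="int64[pyarrow]")
    #     >>> arr.take([0, -1])
    #     <ArrowExtensionArray>
    #     [10, 30]
    #     Length: 2, dtype: int64[pyarrow]
    #     >>> arr.take([0, -1], allow_fill=True)
    #     <ArrowExtensionArray>
    #     [10, <NA>]
    #     Length: 2, dtype: int64[pyarrow]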

    def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Compute the ArrowExtensionArray of unique values.

        Returns
        -------
        ArrowExtensionArray
        """
        if pa_version_under2p0:
            fallback_performancewarning(version="2")
            return super().unique()
        else:
            return type(self)(pc.unique(self._data))

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import (
            Index,
            Series,
        )

        vc = self._data.value_counts()

        values = vc.field(0)
        counts = vc.field(1)
        if dropna and self._data.null_count > 0:
            mask = values.is_valid()
            values = values.filter(mask)
            counts = counts.filter(mask)

        # No missing values so we can adhere to the interface and return a numpy array.
        counts = np.array(counts)

        index = Index(type(self)(values))

        return Series(counts, index=index).astype("Int64")
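
    # value_counts sketch (assuming pyarrow>=1.0):
    #
    #     >>> pd.array([1, 1, None], dtype="int64[pyarrow]").value_counts()
    #     1    2
    #     dtype: Int64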

    @classmethod
    def _concat_same_type(
        cls: type[ArrowExtensionArrayT], to_concat
    ) -> ArrowExtensionArrayT:
        """
        Concatenate multiple ArrowExtensionArrays.

        Parameters
        ----------
        to_concat : sequence of ArrowExtensionArrays

        Returns
        -------
        ArrowExtensionArray
        """
        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
        arr = pa.chunked_array(chunks)
        return cls(arr)

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        if name == "sem":

            def pyarrow_meth(data, skipna, **kwargs):
                numerator = pc.stddev(data, skip_nulls=skipna, **kwargs)
                denominator = pc.sqrt_checked(
                    pc.subtract_checked(
                        pc.count(self._data, skip_nulls=skipna), kwargs["ddof"]
                    )
                )
                return pc.divide_checked(numerator, denominator)

        else:
            pyarrow_name = {
                "median": "approximate_median",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)
        try:
            result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if pc.is_null(result).as_py():
            return self.dtype.na_value
        return result.as_py()
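
    # Reduction sketch (assuming a pyarrow providing the matching kernel;
    # reductions are reached through Series methods):
    #
    #     >>> pd.Series([1, 2, None], dtype="int64[pyarrow]").sum()
    #     3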

    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.

        Returns
        -------
        None
        """
        key = check_array_indexer(self, key)
        indices = self._indexing_key_to_indices(key)
        value = self._maybe_convert_setitem_value(value)

        argsort = np.argsort(indices)
        indices = indices[argsort]

        if is_scalar(value):
            value = np.broadcast_to(value, len(self))
        elif len(indices) != len(value):
            raise ValueError("Length of indexer and values mismatch")
        else:
            value = np.asarray(value)[argsort]

        self._data = self._set_via_chunk_iteration(indices=indices, value=value)

    def _indexing_key_to_indices(
        self, key: int | slice | np.ndarray
    ) -> npt.NDArray[np.intp]:
        """
        Convert indexing key for self into positional indices.

        Parameters
        ----------
        key : int | slice | np.ndarray

        Returns
        -------
        npt.NDArray[np.intp]
        """
        n = len(self)
        if isinstance(key, slice):
            indices = np.arange(n)[key]
        elif is_integer(key):
            # error: Invalid index type "List[Union[int, ndarray[Any, Any]]]"
            # for "ndarray[Any, dtype[signedinteger[Any]]]"; expected type
            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
            # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]
            # , Tuple[Union[SupportsIndex, _SupportsArray[dtype[Union[bool_
            # , integer[Any]]]], _NestedSequence[_SupportsArray[dtype[Union
            # [bool_, integer[Any]]]]], _NestedSequence[Union[bool, int]]], ...]]"
            indices = np.arange(n)[[key]]  # type: ignore[index]
        elif is_bool_dtype(key):
            key = np.asarray(key)
            if len(key) != n:
                raise ValueError("Length of indexer and values mismatch")
            indices = key.nonzero()[0]
        else:
            key = np.asarray(key)
            indices = np.arange(n)[key]
        return indices
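
    # Key normalization sketch, for an array of length 4:
    #
    #     slice(1, 3)                   -> array([1, 2])
    #     2                             -> array([2])
    #     [True, False, True, False]    -> array([0, 2])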

    # TODO: redefine _rank using pc.rank with pyarrow 9.0

    def _quantile(
        self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
    ) -> ArrowExtensionArrayT:
        """
        Compute the quantiles of self for each quantile in `qs`.

        Parameters
        ----------
        qs : np.ndarray[float64]
        interpolation : str

        Returns
        -------
        same type as self
        """
        if pa_version_under4p0:
            raise NotImplementedError(
                "quantile only supported for pyarrow version >= 4.0"
            )
        result = pc.quantile(self._data, q=qs, interpolation=interpolation)
        return type(self)(result)

    def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
        """
        Returns the mode(s) of the ExtensionArray.

        Always returns `ExtensionArray` even if only one value.

        Parameters
        ----------
        dropna : bool, default True
            Don't consider counts of NA values.
            Not implemented by pyarrow.

        Returns
        -------
        same type as self
            Sorted, if possible.
        """
        if pa_version_under6p0:
            raise NotImplementedError("mode only supported for pyarrow version >= 6.0")
        modes = pc.mode(self._data, pc.count_distinct(self._data).as_py())
        values = modes.field(0)
        counts = modes.field(1)
        # counts are sorted descending, i.e. counts[0] is the max
        mask = pc.equal(counts, counts[0])
        most_common = values.filter(mask)
        return type(self)(most_common)

    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value
        return value

    def _set_via_chunk_iteration(
        self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
    ) -> pa.ChunkedArray:
        """
        Loop through the array chunks and set the new values while
        leaving the chunking layout unchanged.

        Parameters
        ----------
        indices : npt.NDArray[np.intp]
            Position indices for the underlying ChunkedArray.

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.

        Notes
        -----
        Assumes that indices is sorted. Caller is responsible for sorting.
        """
        new_data = []
        stop = 0
        for chunk in self._data.iterchunks():
            start, stop = stop, stop + len(chunk)
            if len(indices) == 0 or stop <= indices[0]:
                new_data.append(chunk)
            else:
                n = int(np.searchsorted(indices, stop, side="left"))
                c_ind = indices[:n] - start
                indices = indices[n:]
                n = len(c_ind)
                c_value, value = value[:n], value[n:]
                new_data.append(self._replace_with_indices(chunk, c_ind, c_value))
        return pa.chunked_array(new_data)

    @classmethod
    def _replace_with_indices(
        cls,
        chunk: pa.Array,
        indices: npt.NDArray[np.intp],
        value: npt.NDArray[Any],
    ) -> pa.Array:
        """
        Replace items selected with a set of positional indices.

        Analogous to pyarrow.compute.replace_with_mask, except that replacement
        positions are identified via indices rather than a mask.

        Parameters
        ----------
        chunk : pa.Array
        indices : npt.NDArray[np.intp]
        value : npt.NDArray[Any]
            Replacement value(s).

        Returns
        -------
        pa.Array
        """
        n = len(indices)

        if n == 0:
            return chunk

        start, stop = indices[[0, -1]]

        if (stop - start) == (n - 1):
            # fast path for a contiguous set of indices
            arrays = [
                chunk[:start],
                pa.array(value, type=chunk.type, from_pandas=True),
                chunk[stop + 1 :],
            ]
            arrays = [arr for arr in arrays if len(arr)]
            if len(arrays) == 1:
                return arrays[0]
            return pa.concat_arrays(arrays)

        mask = np.zeros(len(chunk), dtype=np.bool_)
        mask[indices] = True

        if pa_version_under5p0:
            arr = chunk.to_numpy(zero_copy_only=False)
            arr[mask] = value
            return pa.array(arr, type=chunk.type)

        if isna(value).all():
            return pc.if_else(mask, None, chunk)

        return pc.replace_with_mask(chunk, mask, value)
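
    # Replacement sketch (assuming pyarrow>=5.0 so pc.replace_with_mask is
    # available; contiguous indices take the slice-and-concat fast path):
    #
    #     >>> chunk = pa.array([1, 2, 3, 4])
    #     >>> ArrowExtensionArray._replace_with_indices(
    #     ...     chunk, np.array([1, 2]), np.array([20, 30])
    #     ... ).to_pylist()
    #     [1, 20, 30, 4]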