1""" 

2SparseArray data structure 

3""" 

4from __future__ import annotations 

5 

6from collections import abc 

7import numbers 

8import operator 

9from typing import ( 

10 TYPE_CHECKING, 

11 Any, 

12 Callable, 

13 Literal, 

14 Sequence, 

15 TypeVar, 

16 cast, 

17 overload, 

18) 

19import warnings 

20 

21import numpy as np 

22 

23from pandas._libs import lib 

24import pandas._libs.sparse as splib 

25from pandas._libs.sparse import ( 

26 BlockIndex, 

27 IntIndex, 

28 SparseIndex, 

29) 

30from pandas._libs.tslibs import NaT 

31from pandas._typing import ( 

32 ArrayLike, 

33 AstypeArg, 

34 Dtype, 

35 NpDtype, 

36 PositionalIndexer, 

37 Scalar, 

38 ScalarIndexer, 

39 SequenceIndexer, 

40 npt, 

41) 

42from pandas.compat.numpy import function as nv 

43from pandas.errors import PerformanceWarning 

44from pandas.util._exceptions import find_stack_level 

45from pandas.util._validators import ( 

46 validate_bool_kwarg, 

47 validate_insert_loc, 

48) 

49 

50from pandas.core.dtypes.astype import astype_nansafe 

51from pandas.core.dtypes.cast import ( 

52 construct_1d_arraylike_from_scalar, 

53 find_common_type, 

54 maybe_box_datetimelike, 

55) 

56from pandas.core.dtypes.common import ( 

57 is_array_like, 

58 is_bool_dtype, 

59 is_datetime64_any_dtype, 

60 is_datetime64tz_dtype, 

61 is_dtype_equal, 

62 is_integer, 

63 is_list_like, 

64 is_object_dtype, 

65 is_scalar, 

66 is_string_dtype, 

67 pandas_dtype, 

68) 

69from pandas.core.dtypes.generic import ( 

70 ABCIndex, 

71 ABCSeries, 

72) 

73from pandas.core.dtypes.missing import ( 

74 isna, 

75 na_value_for_dtype, 

76 notna, 

77) 

78 

79from pandas.core import arraylike 

80import pandas.core.algorithms as algos 

81from pandas.core.array_algos.quantile import quantile_with_mask 

82from pandas.core.arraylike import OpsMixin 

83from pandas.core.arrays import ExtensionArray 

84from pandas.core.arrays.sparse.dtype import SparseDtype 

85from pandas.core.base import PandasObject 

86import pandas.core.common as com 

87from pandas.core.construction import ( 

88 extract_array, 

89 sanitize_array, 

90) 

91from pandas.core.indexers import ( 

92 check_array_indexer, 

93 unpack_tuple_and_ellipses, 

94) 

95from pandas.core.missing import interpolate_2d 

96from pandas.core.nanops import check_below_min_count 

97import pandas.core.ops as ops 

98 

99import pandas.io.formats.printing as printing 

100 


# See https://github.com/python/typing/issues/684
if TYPE_CHECKING:
    from enum import Enum

    class ellipsis(Enum):
        Ellipsis = "..."

    Ellipsis = ellipsis.Ellipsis

    from scipy.sparse import spmatrix

    from pandas._typing import (
        FillnaOptions,
        NumpySorter,
    )

    SparseIndexKind = Literal["integer", "block"]

    from pandas import Series

else:
    ellipsis = type(Ellipsis)


# ----------------------------------------------------------------------------
# Array

SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")

_sparray_doc_kwargs = {"klass": "SparseArray"}


def _get_fill(arr: SparseArray) -> np.ndarray:
    """
    Create a 0-dim ndarray containing the fill value

    Parameters
    ----------
    arr : SparseArray

    Returns
    -------
    fill_value : ndarray
        0-dim ndarray with just the fill value.

    Notes
    -----
    coerce fill_value to arr dtype if possible
    int64 SparseArray can have NaN as fill_value if there are no missing values
    """
    try:
        return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
    except ValueError:
        return np.asarray(arr.fill_value)


def _sparse_array_op(
    left: SparseArray, right: SparseArray, op: Callable, name: str
) -> SparseArray:
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : Union[SparseArray, ndarray]
    right : Union[SparseArray, ndarray]
    op : Callable
        The binary operation to perform
    name : str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        left = left.astype(ltype, copy=False)
        right = right.astype(rtype, copy=False)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == "r":
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        if (
            name in ["floordiv", "mod"]
            and (right == 0).any()
            and left.dtype.kind in ["i", "u"]
        ):
            # Match the non-Sparse Series behavior
            opname = f"sparse_{name}_float64"
            left_sp_values = left_sp_values.astype("float64")
            right_sp_values = right_sp_values.astype("float64")

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if name == "divmod":
        # result is a 2-tuple
        # error: Incompatible return value type (got "Tuple[SparseArray,
        # SparseArray]", expected "SparseArray")
        return (  # type: ignore[return-value]
            _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
            _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
        )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
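
# A minimal sketch (an editor's illustration, not part of pandas) of how
# _sparse_array_op keeps arithmetic sparse: when both operands share a
# fill_value of 0, the op is applied to sp_values and to the fill only.
#
#   >>> import operator
#   >>> left = SparseArray([0, 1, 0])
#   >>> right = SparseArray([0, 0, 2])
#   >>> _sparse_array_op(left, right, operator.add, "add")
#   [0, 1, 2]
#   Fill: 0
#   IntIndex
#   Indices: array([1, 2], dtype=int32)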


def _wrap_result(
    name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
) -> SparseArray:
    """
    wrap op result to have correct dtype
    """
    if name.startswith("__"):
        # e.g. __eq__ --> eq
        name = name[2:-2]

    if name in ("eq", "ne", "lt", "gt", "le", "ge"):
        dtype = bool

    fill_value = lib.item_from_zerodim(fill_value)

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(
        data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
    )


class SparseArray(OpsMixin, PandasObject, ExtensionArray):
    """
    An ExtensionArray for storing sparse data.

    Parameters
    ----------
    data : array-like or scalar
        A dense array of values to store in the SparseArray. This may contain
        `fill_value`.
    sparse_index : SparseIndex, optional
    index : Index

        .. deprecated:: 1.4.0
            Use a function like `np.full` to construct an array with the desired
            repeats of the scalar value instead.

    fill_value : scalar, optional
        Elements in data that are ``fill_value`` are not stored in the
        SparseArray. For memory savings, this should be the most common value
        in `data`. By default, `fill_value` depends on the dtype of `data`:

        =========== ==========
        data.dtype  na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        False
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The fill value is potentially specified in three ways. In order of
        precedence, these are

        1. The `fill_value` argument
        2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
           a ``SparseDtype``
        3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
           is not a ``SparseDtype`` and `data` is a ``SparseArray``.

    kind : str
        Can be 'integer' or 'block', default is 'integer'.
        The type of storage for sparse locations.

        * 'block': Stores a `block` and `block_length` for each
          contiguous *span* of sparse values. This is best when
          sparse data tends to be clumped together, with large
          regions of ``fill_value`` values between sparse values.
        * 'integer': uses an integer to store the location of
          each sparse value.

    dtype : np.dtype or SparseDtype, optional
        The dtype to use for the SparseArray. For numpy dtypes, this
        determines the dtype of ``self.sp_values``. For SparseDtype,
        this determines ``self.sp_values`` and ``self.fill_value``.
    copy : bool, default False
        Whether to explicitly copy the incoming `data` array.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> from pandas.arrays import SparseArray
    >>> arr = SparseArray([0, 0, 1, 2])
    >>> arr
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)
    """

    _subtyp = "sparse_array"  # register ABCSparseArray
    _hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"])
    _sparse_index: SparseIndex
    _sparse_values: np.ndarray
    _dtype: SparseDtype

    def __init__(
        self,
        data,
        sparse_index=None,
        index=None,
        fill_value=None,
        kind: SparseIndexKind = "integer",
        dtype: Dtype | None = None,
        copy: bool = False,
    ) -> None:

        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle user-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if index is not None:
            warnings.warn(
                "The index argument has been deprecated and will be "
                "removed in a future version. Use a function like np.full "
                "to construct an array with the desired repeats of the "
                "scalar value instead.\n\n",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        if index is not None and not is_scalar(data):
            raise Exception("must only pass scalars with an index")

        if is_scalar(data):
            if index is not None and data is None:
                data = np.nan

            if index is not None:
                npoints = len(index)
            elif sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
            dtype = data.dtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # TODO: What should the empty dtype be? Object or float?

            # error: Argument "dtype" to "array" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
            # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
            data = np.array([], dtype=dtype)  # type: ignore[arg-type]

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series

                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = object
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
            # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int],
            # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any,
            # Any]]]"
            sparse_values = np.asarray(
                data.sp_values, dtype=dtype  # type: ignore[arg-type]
            )
        elif sparse_index is None:
            data = extract_array(data, extract_numpy=True)
            if not isinstance(data, np.ndarray):
                # EA
                if is_datetime64tz_dtype(data.dtype):
                    warnings.warn(
                        f"Creating SparseArray from {data.dtype} data "
                        "loses timezone information. Cast to object before "
                        "sparse to retain timezone information.",
                        UserWarning,
                        stacklevel=find_stack_level(),
                    )
                    data = np.asarray(data, dtype="datetime64[ns]")
                    if fill_value is NaT:
                        fill_value = np.datetime64("NaT", "ns")
                data = np.asarray(data)
            sparse_values, sparse_index, fill_value = make_sparse(
                # error: Argument "dtype" to "make_sparse" has incompatible type
                # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
                # "Union[str, dtype[Any], None]"
                data,
                kind=kind,
                fill_value=fill_value,
                dtype=dtype,  # type: ignore[arg-type]
            )
        else:
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
            # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int],
            # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any,
            # Any]]]"
            sparse_values = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
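
    # A hedged illustration (not part of pandas) of the fill_value
    # precedence described above: a SparseDtype supplies both the subtype
    # and the fill_value when neither is given explicitly.
    #
    #   >>> SparseArray([1.0, 0.0, 0.0], dtype=SparseDtype("float64", fill_value=0.0))
    #   [1.0, 0.0, 0.0]
    #   Fill: 0.0
    #   IntIndex
    #   Indices: array([0], dtype=int32)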

    @classmethod
    def _simple_new(
        cls: type[SparseArrayT],
        sparse_array: np.ndarray,
        sparse_index: SparseIndex,
        dtype: SparseDtype,
    ) -> SparseArrayT:
        new = object.__new__(cls)
        new._sparse_index = sparse_index
        new._sparse_values = sparse_array
        new._dtype = dtype
        return new

    @classmethod
    def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
        """
        Create a SparseArray from a scipy.sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing. So we need to sort loc, and arr accordingly.
        data = data.tocsc()
        data.sort_indices()
        arr = data.data
        idx = data.indices

        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.indices] = self.sp_values
        return out
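
    # Densification sketch (illustrative only): np.asarray goes through
    # __array__ above, allocating a full array of fill_value and then
    # scattering sp_values into the stored positions.
    #
    #   >>> np.asarray(SparseArray([0, 1, 0, 2]))
    #   array([0, 1, 0, 2])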

    def __setitem__(self, key, value):
        # I suppose we could allow setting of non-fill_value elements.
        # TODO(SparseArray.__setitem__): remove special cases in
        # ExtensionBlock.where
        msg = "SparseArray does not support item assignment via setitem"
        raise TypeError(msg)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        return cls(scalars, dtype=dtype)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls(values, dtype=original.dtype)

    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------
    @property
    def sp_index(self) -> SparseIndex:
        """
        The SparseIndex containing the location of non- ``fill_value`` points.
        """
        return self._sparse_index

    @property
    def sp_values(self) -> np.ndarray:
        """
        An ndarray containing the non- ``fill_value`` values.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        return self._sparse_values

    @property
    def dtype(self) -> SparseDtype:
        return self._dtype

    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.
        """
        return self.dtype.fill_value

    @fill_value.setter
    def fill_value(self, value):
        self._dtype = SparseDtype(self.dtype.subtype, value)

    @property
    def kind(self) -> SparseIndexKind:
        """
        The kind of sparse index for this array. One of {'integer', 'block'}.
        """
        if isinstance(self.sp_index, IntIndex):
            return "integer"
        else:
            return "block"

    @property
    def _valid_sp_values(self) -> np.ndarray:
        sp_vals = self.sp_values
        mask = notna(sp_vals)
        return sp_vals[mask]

    def __len__(self) -> int:
        return self.sp_index.length

    @property
    def _null_fill_value(self) -> bool:
        return self._dtype._is_na_fill_value

    def _fill_value_matches(self, fill_value) -> bool:
        if self._null_fill_value:
            return isna(fill_value)
        else:
            return self.fill_value == fill_value

    @property
    def nbytes(self) -> int:
        return self.sp_values.nbytes + self.sp_index.nbytes

    @property
    def density(self) -> float:
        """
        The percent of non- ``fill_value`` points, as a decimal.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.density
        0.6
        """
        return self.sp_index.npoints / self.sp_index.length

    @property
    def npoints(self) -> int:
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        return self.sp_index.npoints

    def isna(self):
        # If null fill value, we want SparseDtype[bool, true]
        # to preserve the same memory usage.
        dtype = SparseDtype(bool, self._null_fill_value)
        if self._null_fill_value:
            return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
        mask = np.full(len(self), False, dtype=np.bool8)
        mask[self.sp_index.indices] = isna(self.sp_values)
        return type(self)(mask, fill_value=False, dtype=dtype)

    def fillna(
        self: SparseArrayT,
        value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> SparseArrayT:
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as all `fill_value` methods will be converted to
               an in-memory ndarray

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This preserves
        the amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        elif method is not None:
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(
                msg,
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
            new_values = np.asarray(self)
            # interpolate_2d modifies new_values inplace
            interpolate_2d(new_values, method=method, limit=limit)
            return type(self)(new_values, fill_value=self.fill_value)

        else:
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

        return self._simple_new(new_values, self._sparse_index, new_dtype)
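
    # A hedged example (illustrative, not a pandas doctest) of the dtype
    # note above: with an NA fill_value, fillna only swaps the dtype's
    # fill_value, leaving the memory footprint unchanged.
    #
    #   >>> s = SparseArray([1.0, np.nan, 2.0])
    #   >>> s.fillna(0.0).dtype
    #   Sparse[float64, 0.0]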

    def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:

        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])

    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            return -1

        indices = self.sp_index.indices
        if not len(indices) or indices[0] > 0:
            return 0

        # append a number larger than 1 at the end to handle the case
        # where the fill value only appears in the tail of the array
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1
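
    # Worked example (illustrative only) of the diff trick above: for
    # [1, 0, 2] with fill_value=0, the stored indices are [0, 2], so
    # diff = [2, 2]; the first gap larger than 1 starts after stored
    # index 0, giving a first-fill-value location of 1.
    #
    #   >>> SparseArray([1, 0, 2])._first_fill_value_loc()
    #   1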

    def unique(self: SparseArrayT) -> SparseArrayT:
        uniques = algos.unique(self.sp_values)
        if len(self.sp_values) != len(self):
            fill_loc = self._first_fill_value_loc()
            # In order to align with the behavior of pd.unique or
            # pd.Series.unique, we should keep the original order;
            # here we use unique again to find the insertion place.
            # Since the length of sp_values is not large, the minor
            # performance hit is worth the correctness.
            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
            uniques = np.insert(uniques, insert_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)

    def _values_for_factorize(self):
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value

    def factorize(
        self,
        na_sentinel: int | lib.NoDefault = lib.no_default,
        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    ) -> tuple[np.ndarray, SparseArray]:
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(
            np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
        )
        if na_sentinel is lib.no_default:
            na_sentinel = -1
        if use_na_sentinel is lib.no_default or use_na_sentinel:
            codes[codes == -1] = na_sentinel
        uniques_sp = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques_sp

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import (
            Index,
            Series,
        )

        keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0 and (not self._null_fill_value or not dropna):
            mask = isna(keys) if self._null_fill_value else keys == self.fill_value
            if mask.any():
                counts[mask] += fcounts
            else:
                # error: Argument 1 to "insert" has incompatible type "Union[
                # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
                # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
                # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
                # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
                # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
                keys = np.insert(keys, 0, self.fill_value)  # type: ignore[arg-type]
                counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndex):
            index = Index(keys)
        else:
            index = keys
        return Series(counts, index=index)

    def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str):

        if self._null_fill_value or self.sp_index.ngaps == 0:
            # We can avoid densifying
            npvalues = self.sp_values
            mask = np.zeros(npvalues.shape, dtype=bool)
        else:
            npvalues = self.to_numpy()
            mask = self.isna()

        fill_value = na_value_for_dtype(npvalues.dtype, compat=False)
        res_values = quantile_with_mask(
            npvalues,
            mask,
            fill_value,
            qs,
            interpolation,
        )

        # Special case: the returned array isn't _really_ sparse, so we don't
        # wrap it in a SparseArray
        return res_values

    # --------
    # Indexing
    # --------
    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self: SparseArrayT,
        key: SequenceIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT:
        ...

    def __getitem__(
        self: SparseArrayT,
        key: PositionalIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT | Any:

        if isinstance(key, tuple):
            key = unpack_tuple_and_ellipses(key)
            # Non-overlapping identity check (left operand type:
            # "Union[Union[Union[int, integer[Any]], Union[slice, List[int],
            # ndarray[Any, Any]]], Tuple[Union[int, ellipsis], ...]]",
            # right operand type: "ellipsis")
            if key is Ellipsis:  # type: ignore[comparison-overlap]
                raise ValueError("Cannot slice with Ellipsis")

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
            # for "ndarray[Any, Any]"; expected type
            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
            # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
            # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
            # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
            # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
            # _NestedSequence[Union[bool, int]]], ...]]"
            data_slice = self.to_dense()[key]  # type: ignore[index]
        elif isinstance(key, slice):

            # Avoid densifying when handling contiguous slices
            if key.step is None or key.step == 1:
                start = 0 if key.start is None else key.start
                if start < 0:
                    start += len(self)

                end = len(self) if key.stop is None else key.stop
                if end < 0:
                    end += len(self)

                indices = self.sp_index.indices
                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
                sp_vals = self.sp_values[keep_inds]

                sp_index = indices[keep_inds].copy()

                # If we've sliced to not include the start of the array, all our indices
                # should be shifted. NB: here we are careful to also not shift by a
                # negative value for a case like [0, 1][-100:] where the start index
                # should be treated like 0
                if start > 0:
                    sp_index -= start

                # Length of our result should match applying this slice to a range
                # of the length of our original array
                new_len = len(range(len(self))[key])
                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
            else:
                indices = np.arange(len(self), dtype=np.int32)[key]
                return self.take(indices)

        elif not is_list_like(key):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        else:
            if isinstance(key, SparseArray):
                # NOTE: If we guarantee that SparseDtype(bool)
                # has only fill_value - true, false or nan
                # (see GH PR 44955)
                # we can apply mask very fast:
                if is_bool_dtype(key):
                    if isna(key.fill_value):
                        return self.take(key.sp_index.indices[key.sp_values])
                    if not key.fill_value:
                        return self.take(key.sp_index.indices)
                    n = len(self)
                    mask = np.full(n, True, dtype=np.bool8)
                    mask[key.sp_index.indices] = False
                    return self.take(np.arange(n)[mask])
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                # mypy doesn't know we have an array here
                key = cast(np.ndarray, key)
                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        return type(self)(data_slice, kind=self.kind)

    def _get_val_at(self, loc):
        loc = validate_insert_loc(loc, len(self))

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            return self.fill_value
        else:
            val = self.sp_values[sp_loc]
            val = maybe_box_datetimelike(val, self.sp_values.dtype)
            return val
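
    # Sketch (illustrative only) of the contiguous-slice fast path in
    # __getitem__ above: a step-1 slice filters and shifts the stored
    # indices without ever densifying.
    #
    #   >>> SparseArray([0, 1, 0, 2])[1:3]
    #   [1, 0]
    #   Fill: 0
    #   IntIndex
    #   Indices: array([0], dtype=int32)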

    def take(
        self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
    ) -> SparseArrayT:
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        dtype = None
        if indices.size == 0:
            result = np.array([], dtype="object")
            dtype = self.dtype
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
        else:
            return self._take_without_fill(indices)

        return type(self)(
            result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
        )

    def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        # sp_indexer may be -1 for two reasons
        # 1.) we took for an index of -1 (new)
        # 2.) we took a value that was self.fill_value (old)
        sp_indexer = self.sp_index.lookup_array(indices)
        new_fill_indices = indices == -1
        old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

        if self.sp_index.npoints == 0 and old_fill_indices.all():
            # We've looked up all valid points on an all-sparse array.
            taken = np.full(
                sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
            )

        elif self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
        else:
            taken = self.sp_values.take(sp_indexer)

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken
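
    # Hedged example (illustrative only) of the two meanings of -1 noted
    # above: with allow_fill=True an index of -1 requests the NA fill,
    # while a position that holds self.fill_value is an "old" fill.
    #
    #   >>> SparseArray([1.0, np.nan, 2.0]).take([0, -1], allow_fill=True)
    #   [1.0, nan]
    #   Fill: nan
    #   IntIndex
    #   Indices: array([0], dtype=int32)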

    def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
        to_shift = indices < 0

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            else:
                raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            indices = indices.copy()
            indices[to_shift] += n

        sp_indexer = self.sp_index.lookup_array(indices)
        value_mask = sp_indexer != -1
        new_sp_values = self.sp_values[sp_indexer[value_mask]]

        value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

        new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
        return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)

    def searchsorted(
        self,
        v: ArrayLike | object,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:

        msg = "searchsorted requires high memory usage."
        warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
        if not is_scalar(v):
            v = np.asarray(v)
        v = np.asarray(v)
        return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)

    def copy(self: SparseArrayT) -> SparseArrayT:
        values = self.sp_values.copy()
        return self._simple_new(values, self.sp_index, self.dtype)

    @classmethod
    def _concat_same_type(
        cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
    ) -> SparseArrayT:
        fill_value = to_concat[0].fill_value

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        sp_index: SparseIndex
        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                int_idx = arr.sp_index.indices.copy()
                int_idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(int_idx)

            data = np.concatenate(values)
            indices_arr = np.concatenate(indices)
            # error: Argument 2 to "IntIndex" has incompatible type
            # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
            # expected "Sequence[int]"
            sp_index = IntIndex(length, indices_arr)  # type: ignore[arg-type]

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                block_idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(block_idx.blocs.copy() + length)
                blengths.append(block_idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs_arr = np.concatenate(blocs)
            blengths_arr = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs_arr, blengths_arr)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)
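
    # Illustration (not part of pandas) of the index-offset logic above:
    # each array's stored indices are shifted by the running length before
    # being concatenated.
    #
    #   >>> a = SparseArray([1, 0])
    #   >>> b = SparseArray([0, 2])
    #   >>> SparseArray._concat_same_type([a, b])
    #   [1, 0, 0, 2]
    #   Fill: 0
    #   IntIndex
    #   Indices: array([0, 3], dtype=int32)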

    def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(SparseDtype(np.dtype('int32')))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(SparseDtype(np.dtype('float64')))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a SparseDtype, you can also change the fill value as well.

        >>> arr.astype(SparseDtype("float64", fill_value=0.0))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0.0, 0.0, 1.0, 2.0]
        Fill: 0.0
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        if is_dtype_equal(dtype, self._dtype):
            if not copy:
                return self
            else:
                return self.copy()

        future_dtype = pandas_dtype(dtype)
        if not isinstance(future_dtype, SparseDtype):
            # GH#34457
            warnings.warn(
                "The behavior of .astype from SparseDtype to a non-sparse dtype "
                "is deprecated. In a future version, this will return a non-sparse "
                "array with the requested dtype. To retain the old behavior, use "
                "`obj.astype(SparseDtype(dtype))`",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        dtype = self.dtype.update_dtype(dtype)
        subtype = pandas_dtype(dtype._subtype_with_str)
        sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)

        # error: Argument 1 to "_simple_new" of "SparseArray" has incompatible type
        # "ExtensionArray"; expected "ndarray"
        return self._simple_new(
            sp_values, self.sp_index, dtype  # type: ignore[arg-type]
        )

    def map(self: SparseArrayT, mapper) -> SparseArrayT:
        """
        Map categories using an input mapping or function.

        Parameters
        ----------
        mapper : dict, Series, callable
            The correspondence from old values to new.

        Returns
        -------
        SparseArray
            The output array will have the same density as the input.
            The output fill value will be the result of applying the
            mapping to ``self.fill_value``

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 1, 2])
        >>> arr.map(lambda x: x + 10)
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map({0: 10, 1: 11, 2: 12})
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)
        """
        # this is used in apply.
        # We get hit since we're an "is_extension_type" but regular extension
        # types are not hit. This may be worth adding to the interface.
        if isinstance(mapper, ABCSeries):
            mapper = mapper.to_dict()

        if isinstance(mapper, abc.Mapping):
            fill_value = mapper.get(self.fill_value, self.fill_value)
            sp_values = [mapper.get(x, None) for x in self.sp_values]
        else:
            fill_value = mapper(self.fill_value)
            sp_values = [mapper(x) for x in self.sp_values]

        return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)

    def to_dense(self) -> np.ndarray:
        """
        Convert SparseArray to a NumPy array.

        Returns
        -------
        arr : NumPy array
        """
        return np.asarray(self, dtype=self.sp_values.dtype)

    def _where(self, mask, value):
        # NB: may not preserve dtype, e.g. result may be Sparse[float64]
        # while self is Sparse[int64]
        naive_implementation = np.where(mask, self, value)
        dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
        result = type(self)._from_sequence(naive_implementation, dtype=dtype)
        return result

    # ------------------------------------------------------------------------
    # IO
    # ------------------------------------------------------------------------
    def __setstate__(self, state) -> None:
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            self.__dict__.update(state)

    def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
        if self.fill_value == 0:
            return (self.sp_index.indices,)
        else:
            return (self.sp_index.indices[self.sp_values != 0],)

    # ------------------------------------------------------------------------
    # Reductions
    # ------------------------------------------------------------------------

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            arr = self.dropna()

        return getattr(arr, name)(**kwargs)

    def all(self, axis=None, *args, **kwargs):
        """
        Tests whether all elements evaluate True

        Returns
        -------
        all : bool

        See Also
        --------
        numpy.all
        """
        nv.validate_all(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and not np.all(self.fill_value):
            return False

        return values.all()

    def any(self, axis=0, *args, **kwargs):
        """
        Tests whether at least one element evaluates True

        Returns
        -------
        any : bool

        See Also
        --------
        numpy.any
        """
        nv.validate_any(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and np.any(self.fill_value):
            return True

        return values.any().item()

    def sum(
        self, axis: int = 0, min_count: int = 0, skipna: bool = True, *args, **kwargs
    ) -> Scalar:
        """
        Sum of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        min_count : int, default 0
            The required number of valid values to perform the summation. If fewer
            than ``min_count`` valid values are present, the result will be the
            missing value indicator for the subarray type.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        has_na = self.sp_index.ngaps > 0 and not self._null_fill_value

        if has_na and not skipna:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

        if self._null_fill_value:
            if check_below_min_count(valid_vals.shape, None, min_count):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum
        else:
            nsparse = self.sp_index.ngaps
            if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum + self.fill_value * nsparse
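
    # Hedged example (illustrative only) of the min_count handling above:
    # gaps holding a non-NA fill_value count toward min_count, so the
    # threshold is reduced by ngaps before the check.
    #
    #   >>> SparseArray([0, 0, 1]).sum(min_count=4)
    #   nan
    #   >>> SparseArray([0, 0, 1]).sum(min_count=3)
    #   1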

    def cumsum(self, axis: int = 0, *args, **kwargs) -> SparseArray:
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any NA/null values will
        be skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            return SparseArray(self.to_dense()).cumsum()

        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )

    def mean(self, axis=0, *args, **kwargs):
        """
        Mean of non-NA/null values

        Returns
        -------
        mean : float
        """
        nv.validate_mean(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        ct = len(valid_vals)

        if self._null_fill_value:
            return sp_sum / ct
        else:
            nsparse = self.sp_index.ngaps
            return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

    def max(self, *, axis: int | None = None, skipna: bool = True):
        """
        Max of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("max", skipna=skipna)

    def min(self, *, axis: int | None = None, skipna: bool = True):
        """
        Min of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("min", skipna=skipna)

    def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
        """
        Min/max of non-NA/null values

        Parameters
        ----------
        kind : {"min", "max"}
        skipna : bool

        Returns
        -------
        scalar
        """
        valid_vals = self._valid_sp_values
        has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0

        if len(valid_vals) > 0:
            sp_min_max = getattr(valid_vals, kind)()

            # If a non-null fill value is currently present, it might be the min/max
            if has_nonnull_fill_vals:
                func = max if kind == "max" else min
                return func(sp_min_max, self.fill_value)
            elif skipna:
                return sp_min_max
            elif self.sp_index.ngaps == 0:
                # No NAs present
                return sp_min_max
            else:
                return na_value_for_dtype(self.dtype.subtype, compat=False)
        elif has_nonnull_fill_vals:
            return self.fill_value
        else:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

    def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:

        values = self._sparse_values
        index = self._sparse_index.indices
        mask = np.asarray(isna(values))
        func = np.argmax if kind == "argmax" else np.argmin

        idx = np.arange(values.shape[0])
        non_nans = values[~mask]
        non_nan_idx = idx[~mask]

        _candidate = non_nan_idx[func(non_nans)]
        candidate = index[_candidate]

        if isna(self.fill_value):
            return candidate
        if kind == "argmin" and self[candidate] < self.fill_value:
            return candidate
        if kind == "argmax" and self[candidate] > self.fill_value:
            return candidate
        _loc = self._first_fill_value_loc()
        if _loc == -1:
            # fill_value doesn't exist
            return candidate
        else:
            return _loc

    def argmax(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmax")

    def argmin(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmin")

    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
            res = arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )
            return res

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        if len(inputs) == 1:
            # No alignment necessary.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if ufunc.nout > 1:
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif method == "reduce":
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        new_inputs = tuple(np.asarray(x) for x in inputs)
        result = getattr(ufunc, method)(*new_inputs, **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if ufunc.nout > 1:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)
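
    # Sketch (illustrative only) of the single-input fast path above: a
    # unary ufunc is applied to sp_values and to the fill_value, so the
    # result stays sparse without densifying.
    #
    #   >>> np.abs(SparseArray([-1, 0, 1]))
    #   [1, 0, 1]
    #   Fill: 0
    #   IntIndex
    #   Indices: array([0, 2], dtype=int32)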

    # ------------------------------------------------------------------------
    # Ops
    # ------------------------------------------------------------------------

    def _arith_method(self, other, op):
        op_name = op.__name__

        if isinstance(other, SparseArray):
            return _sparse_array_op(self, other, op, op_name)

        elif is_scalar(other):
            with np.errstate(all="ignore"):
                fill = op(_get_fill(self), np.asarray(other))
                result = op(self.sp_values, other)

            if op_name == "divmod":
                left, right = result
                lfill, rfill = fill
                return (
                    _wrap_result(op_name, left, self.sp_index, lfill),
                    _wrap_result(op_name, right, self.sp_index, rfill),
                )

            return _wrap_result(op_name, result, self.sp_index, fill)

        else:
            other = np.asarray(other)
            with np.errstate(all="ignore"):
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                if not isinstance(other, SparseArray):
                    dtype = getattr(other, "dtype", None)
                    other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
                return _sparse_array_op(self, other, op, op_name)

    def _cmp_method(self, other, op) -> SparseArray:
        if not is_scalar(other) and not isinstance(other, type(self)):
            # convert list-like to ndarray
            other = np.asarray(other)

        if isinstance(other, np.ndarray):
            # TODO: make this more flexible than just ndarray...
            other = SparseArray(other, fill_value=self.fill_value)

        if isinstance(other, SparseArray):
            if len(self) != len(other):
                raise ValueError(
                    f"operands have mismatched length {len(self)} and {len(other)}"
                )

            op_name = op.__name__.strip("_")
            return _sparse_array_op(self, other, op, op_name)
        else:
            # scalar
            with np.errstate(all="ignore"):
                fill_value = op(self.fill_value, other)
                result = np.full(len(self), fill_value, dtype=np.bool_)
                result[self.sp_index.indices] = op(self.sp_values, other)

            return type(self)(
                result,
                fill_value=fill_value,
                dtype=np.bool_,
            )

    _logical_method = _cmp_method

    def _unary_method(self, op) -> SparseArray:
        fill_value = op(np.array(self.fill_value)).item()
        dtype = SparseDtype(self.dtype.subtype, fill_value)
        # NOTE: if fill_value doesn't change
        # we just have to apply op to sp_values
        if isna(self.fill_value) or fill_value == self.fill_value:
            values = op(self.sp_values)
            return type(self)._simple_new(values, self.sp_index, self.dtype)
        # In the other case we have to recalculate the indexes
        return type(self)(op(self.to_dense()), dtype=dtype)

    def __pos__(self) -> SparseArray:
        return self._unary_method(operator.pos)

    def __neg__(self) -> SparseArray:
        return self._unary_method(operator.neg)

    def __invert__(self) -> SparseArray:
        return self._unary_method(operator.invert)

    def __abs__(self) -> SparseArray:
        return self._unary_method(operator.abs)

    # ----------
    # Formatting
    # ----------
    def __repr__(self) -> str:
        pp_str = printing.pprint_thing(self)
        pp_fill = printing.pprint_thing(self.fill_value)
        pp_index = printing.pprint_thing(self.sp_index)
        return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"

    def _formatter(self, boxed=False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None


def make_sparse(
    arr: np.ndarray,
    kind: SparseIndexKind = "block",
    fill_value=None,
    dtype: NpDtype | None = None,
):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    assert isinstance(arr, np.ndarray)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # cast to object comparison to be safe
        if is_string_dtype(arr.dtype):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # numpy's element-wise equality check does not distinguish
            # element types, e.g. 0, 0.0, and False are all treated as
            # equal, so we have to check both type and value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = make_sparse_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        # error: Argument "dtype" to "astype_nansafe" has incompatible type "Union[str,
        # dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]"
        sparsified_values = astype_nansafe(
            sparsified_values, dtype=dtype  # type: ignore[arg-type]
        )
    # TODO: copy
    return sparsified_values, index, fill_value
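
# A hedged usage sketch (illustrative only): make_sparse splits a dense
# ndarray into the stored values, a SparseIndex over their positions, and
# the resolved fill_value.
#
#   >>> values, index, fill = make_sparse(np.array([0, 0, 1, 2]), kind="integer")
#   >>> values
#   array([1, 2])
#   >>> index
#   IntIndex
#   Indices: array([2, 3], dtype=int32)
#   >>> fill
#   0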


@overload
def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
    ...


@overload
def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
    ...


def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
    index: SparseIndex
    if kind == "block":
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    elif kind == "integer":
        index = IntIndex(length, indices)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index
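
# Illustration (not part of pandas) of the two index kinds: "integer"
# stores one position per value, while "block" compresses runs of
# consecutive positions into (location, length) pairs.
#
#   >>> idx = np.array([2, 3], dtype=np.int32)
#   >>> make_sparse_index(4, idx, "integer")
#   IntIndex
#   Indices: array([2, 3], dtype=int32)
#   >>> make_sparse_index(4, idx, "block")
#   BlockIndex
#   Block locations: array([2], dtype=int32)
#   Block lengths: array([2], dtype=int32)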