Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/sparse/array.py: 13%
803 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2SparseArray data structure
3"""
4from __future__ import annotations
6from collections import abc
7import numbers
8import operator
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13 Literal,
14 Sequence,
15 TypeVar,
16 cast,
17 overload,
18)
19import warnings
21import numpy as np
23from pandas._libs import lib
24import pandas._libs.sparse as splib
25from pandas._libs.sparse import (
26 BlockIndex,
27 IntIndex,
28 SparseIndex,
29)
30from pandas._libs.tslibs import NaT
31from pandas._typing import (
32 ArrayLike,
33 AstypeArg,
34 Dtype,
35 NpDtype,
36 PositionalIndexer,
37 Scalar,
38 ScalarIndexer,
39 SequenceIndexer,
40 npt,
41)
42from pandas.compat.numpy import function as nv
43from pandas.errors import PerformanceWarning
44from pandas.util._exceptions import find_stack_level
45from pandas.util._validators import (
46 validate_bool_kwarg,
47 validate_insert_loc,
48)
50from pandas.core.dtypes.astype import astype_nansafe
51from pandas.core.dtypes.cast import (
52 construct_1d_arraylike_from_scalar,
53 find_common_type,
54 maybe_box_datetimelike,
55)
56from pandas.core.dtypes.common import (
57 is_array_like,
58 is_bool_dtype,
59 is_datetime64_any_dtype,
60 is_datetime64tz_dtype,
61 is_dtype_equal,
62 is_integer,
63 is_list_like,
64 is_object_dtype,
65 is_scalar,
66 is_string_dtype,
67 pandas_dtype,
68)
69from pandas.core.dtypes.generic import (
70 ABCIndex,
71 ABCSeries,
72)
73from pandas.core.dtypes.missing import (
74 isna,
75 na_value_for_dtype,
76 notna,
77)
79from pandas.core import arraylike
80import pandas.core.algorithms as algos
81from pandas.core.array_algos.quantile import quantile_with_mask
82from pandas.core.arraylike import OpsMixin
83from pandas.core.arrays import ExtensionArray
84from pandas.core.arrays.sparse.dtype import SparseDtype
85from pandas.core.base import PandasObject
86import pandas.core.common as com
87from pandas.core.construction import (
88 extract_array,
89 sanitize_array,
90)
91from pandas.core.indexers import (
92 check_array_indexer,
93 unpack_tuple_and_ellipses,
94)
95from pandas.core.missing import interpolate_2d
96from pandas.core.nanops import check_below_min_count
97import pandas.core.ops as ops
99import pandas.io.formats.printing as printing
# See https://github.com/python/typing/issues/684
if TYPE_CHECKING:
    from enum import Enum

    # Shim so annotations can spell the type of a literal `...` key;
    # only type checkers ever see this Enum.
    class ellipsis(Enum):
        Ellipsis = "..."

    Ellipsis = ellipsis.Ellipsis

    from scipy.sparse import spmatrix

    from pandas._typing import (
        FillnaOptions,
        NumpySorter,
    )

    SparseIndexKind = Literal["integer", "block"]

    from pandas import Series

else:
    # At runtime `ellipsis` is simply the builtin type of `...`.
    ellipsis = type(Ellipsis)
# ----------------------------------------------------------------------------
# Array

# TypeVar bound to SparseArray so classmethods can return the subclass type.
SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")

# Substitution kwargs for docstrings shared across sparse accessors.
_sparray_doc_kwargs = {"klass": "SparseArray"}
133def _get_fill(arr: SparseArray) -> np.ndarray:
134 """
135 Create a 0-dim ndarray containing the fill value
137 Parameters
138 ----------
139 arr : SparseArray
141 Returns
142 -------
143 fill_value : ndarray
144 0-dim ndarray with just the fill value.
146 Notes
147 -----
148 coerce fill_value to arr dtype if possible
149 int64 SparseArray can have NaN as fill_value if there is no missing
150 """
151 try:
152 return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
153 except ValueError:
154 return np.asarray(arr.fill_value)
def _sparse_array_op(
    left: SparseArray, right: SparseArray, op: Callable, name: str
) -> SparseArray:
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : Union[SparseArray, ndarray]
    right : Union[SparseArray, ndarray]
    op : Callable
        The binary operation to perform
    name str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        # Coerce both operands to a common subtype before dispatching.
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        left = left.astype(ltype, copy=False)
        right = right.astype(rtype, copy=False)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        # At least one side is fully dense: operate on the dense values.
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        # Reuse the dense side's index, which covers every position.
        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # Same sparsity structure: operate on the stored values only.
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        # Mismatched sparse structure: dispatch to the cython sparse op.
        if name[0] == "r":
            # Reflected op (e.g. "radd"): swap operands, use the plain op.
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        if (
            name in ["floordiv", "mod"]
            and (right == 0).any()
            and left.dtype.kind in ["i", "u"]
        ):
            # Match the non-Sparse Series behavior
            opname = f"sparse_{name}_float64"
            left_sp_values = left_sp_values.astype("float64")
            right_sp_values = right_sp_values.astype("float64")

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if name == "divmod":
        # result is a 2-tuple
        # error: Incompatible return value type (got "Tuple[SparseArray,
        # SparseArray]", expected "SparseArray")
        return (  # type: ignore[return-value]
            _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
            _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
        )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
265def _wrap_result(
266 name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
267) -> SparseArray:
268 """
269 wrap op result to have correct dtype
270 """
271 if name.startswith("__"):
272 # e.g. __eq__ --> eq
273 name = name[2:-2]
275 if name in ("eq", "ne", "lt", "gt", "le", "ge"):
276 dtype = bool
278 fill_value = lib.item_from_zerodim(fill_value)
280 if is_bool_dtype(dtype):
281 # fill_value may be np.bool_
282 fill_value = bool(fill_value)
283 return SparseArray(
284 data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
285 )
288class SparseArray(OpsMixin, PandasObject, ExtensionArray):
289 """
290 An ExtensionArray for storing sparse data.
292 Parameters
293 ----------
294 data : array-like or scalar
295 A dense array of values to store in the SparseArray. This may contain
296 `fill_value`.
297 sparse_index : SparseIndex, optional
298 index : Index
300 .. deprecated:: 1.4.0
301 Use a function like `np.full` to construct an array with the desired
302 repeats of the scalar value instead.
304 fill_value : scalar, optional
305 Elements in data that are ``fill_value`` are not stored in the
306 SparseArray. For memory savings, this should be the most common value
307 in `data`. By default, `fill_value` depends on the dtype of `data`:
309 =========== ==========
310 data.dtype na_value
311 =========== ==========
312 float ``np.nan``
313 int ``0``
314 bool False
315 datetime64 ``pd.NaT``
316 timedelta64 ``pd.NaT``
317 =========== ==========
319 The fill value is potentially specified in three ways. In order of
320 precedence, these are
322 1. The `fill_value` argument
323 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
324 a ``SparseDtype``
325 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
326 is not a ``SparseDtype`` and `data` is a ``SparseArray``.
328 kind : str
329 Can be 'integer' or 'block', default is 'integer'.
330 The type of storage for sparse locations.
332 * 'block': Stores a `block` and `block_length` for each
333 contiguous *span* of sparse values. This is best when
334 sparse data tends to be clumped together, with large
335 regions of ``fill-value`` values between sparse values.
336 * 'integer': uses an integer to store the location of
337 each sparse value.
339 dtype : np.dtype or SparseDtype, optional
340 The dtype to use for the SparseArray. For numpy dtypes, this
341 determines the dtype of ``self.sp_values``. For SparseDtype,
342 this determines ``self.sp_values`` and ``self.fill_value``.
343 copy : bool, default False
344 Whether to explicitly copy the incoming `data` array.
346 Attributes
347 ----------
348 None
350 Methods
351 -------
352 None
354 Examples
355 --------
356 >>> from pandas.arrays import SparseArray
357 >>> arr = SparseArray([0, 0, 1, 2])
358 >>> arr
359 [0, 0, 1, 2]
360 Fill: 0
361 IntIndex
362 Indices: array([2, 3], dtype=int32)
363 """
365 _subtyp = "sparse_array" # register ABCSparseArray
366 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"])
367 _sparse_index: SparseIndex
368 _sparse_values: np.ndarray
369 _dtype: SparseDtype
    def __init__(
        self,
        data,
        sparse_index=None,
        index=None,
        fill_value=None,
        kind: SparseIndexKind = "integer",
        dtype: Dtype | None = None,
        copy: bool = False,
    ) -> None:
        # See the class docstring for the fill_value precedence rules
        # implemented below: explicit argument > SparseDtype.fill_value >
        # fill value inferred from `data`.

        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle use-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            # Work with the numpy subtype from here on.
            dtype = dtype.subtype

        if index is not None:
            warnings.warn(
                "The index argument has been deprecated and will be "
                "removed in a future version. Use a function like np.full "
                "to construct an array with the desired repeats of the "
                "scalar value instead.\n\n",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        if index is not None and not is_scalar(data):
            raise Exception("must only pass scalars with an index")

        if is_scalar(data):
            # Broadcast the scalar to the length implied by index /
            # sparse_index (default 1).
            if index is not None and data is None:
                data = np.nan

            if index is not None:
                npoints = len(index)
            elif sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
            dtype = data.dtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # TODO: What should the empty dtype be? Object or float?

            # error: Argument "dtype" to "array" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
            # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
            data = np.array([], dtype=dtype)  # type: ignore[arg-type]

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series

                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = object
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            # Infer a fill value from the (requested or actual) dtype.
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
            # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int],
            # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any,
            # Any]]]"
            sparse_values = np.asarray(
                data.sp_values, dtype=dtype  # type: ignore[arg-type]
            )
        elif sparse_index is None:
            # Dense input: build the sparse representation ourselves.
            data = extract_array(data, extract_numpy=True)
            if not isinstance(data, np.ndarray):
                # EA
                if is_datetime64tz_dtype(data.dtype):
                    warnings.warn(
                        f"Creating SparseArray from {data.dtype} data "
                        "loses timezone information. Cast to object before "
                        "sparse to retain timezone information.",
                        UserWarning,
                        stacklevel=find_stack_level(),
                    )
                    data = np.asarray(data, dtype="datetime64[ns]")
                    if fill_value is NaT:
                        fill_value = np.datetime64("NaT", "ns")
                data = np.asarray(data)
            sparse_values, sparse_index, fill_value = make_sparse(
                # error: Argument "dtype" to "make_sparse" has incompatible type
                # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
                # "Union[str, dtype[Any], None]"
                data,
                kind=kind,
                fill_value=fill_value,
                dtype=dtype,  # type: ignore[arg-type]
            )
        else:
            # Caller supplied a sparse_index: `data` must already be the
            # stored (non-fill) values and agree with the index length.
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
            # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int],
            # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any,
            # Any]]]"
            sparse_values = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
527 @classmethod
528 def _simple_new(
529 cls: type[SparseArrayT],
530 sparse_array: np.ndarray,
531 sparse_index: SparseIndex,
532 dtype: SparseDtype,
533 ) -> SparseArrayT:
534 new = object.__new__(cls)
535 new._sparse_index = sparse_index
536 new._sparse_values = sparse_array
537 new._dtype = dtype
538 return new
    @classmethod
    def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
        """
        Create a SparseArray from a scipy.sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.sp_matrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing. So we need to sort loc, and arr accordingly.
        data = data.tocsc()
        data.sort_indices()
        arr = data.data
        idx = data.indices

        # The fill value for data coming from a sparse matrix is always
        # zero (in the matrix's own dtype).
        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)
    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        # Densify: materialize a full ndarray with gaps set to fill_value.
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.indices] = self.sp_values
        return out
611 def __setitem__(self, key, value):
612 # I suppose we could allow setting of non-fill_value elements.
613 # TODO(SparseArray.__setitem__): remove special cases in
614 # ExtensionBlock.where
615 msg = "SparseArray does not support item assignment via setitem"
616 raise TypeError(msg)
    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        # ExtensionArray constructor hook. `copy` is accepted for API
        # compatibility but not forwarded; __init__ defaults to copy=False.
        return cls(scalars, dtype=dtype)
    @classmethod
    def _from_factorized(cls, values, original):
        # Rebuild from factorized values; `original` supplies the dtype
        # (and hence the fill value).
        return cls(values, dtype=original.dtype)
    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------

    @property
    def sp_index(self) -> SparseIndex:
        """
        The SparseIndex containing the location of non- ``fill_value`` points.
        """
        # Thin accessor over the backing _sparse_index attribute.
        return self._sparse_index
    @property
    def sp_values(self) -> np.ndarray:
        """
        An ndarray containing the non- ``fill_value`` values.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        # Thin accessor over the backing _sparse_values ndarray.
        return self._sparse_values
    @property
    def dtype(self) -> SparseDtype:
        """The SparseDtype describing this array's subtype and fill value."""
        return self._dtype
    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.
        """
        # The fill value lives on the dtype, not on the array itself.
        return self.dtype.fill_value
    @fill_value.setter
    def fill_value(self, value):
        # Only the dtype is rebuilt; sp_values and sp_index are untouched,
        # so stored values equal to the new fill value are NOT re-sparsified.
        self._dtype = SparseDtype(self.dtype.subtype, value)
666 @property
667 def kind(self) -> SparseIndexKind:
668 """
669 The kind of sparse index for this array. One of {'integer', 'block'}.
670 """
671 if isinstance(self.sp_index, IntIndex):
672 return "integer"
673 else:
674 return "block"
676 @property
677 def _valid_sp_values(self) -> np.ndarray:
678 sp_vals = self.sp_values
679 mask = notna(sp_vals)
680 return sp_vals[mask]
    def __len__(self) -> int:
        # Length of the logical (dense) array, not the number of stored values.
        return self.sp_index.length
    @property
    def _null_fill_value(self) -> bool:
        # True when the fill value is an NA value (delegated to the dtype).
        return self._dtype._is_na_fill_value
689 def _fill_value_matches(self, fill_value) -> bool:
690 if self._null_fill_value:
691 return isna(fill_value)
692 else:
693 return self.fill_value == fill_value
    @property
    def nbytes(self) -> int:
        # Footprint is the stored values plus the sparse index; the implicit
        # fill positions cost nothing.
        return self.sp_values.nbytes + self.sp_index.nbytes
699 @property
700 def density(self) -> float:
701 """
702 The percent of non- ``fill_value`` points, as decimal.
704 Examples
705 --------
706 >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
707 >>> s.density
708 0.6
709 """
710 return self.sp_index.npoints / self.sp_index.length
    @property
    def npoints(self) -> int:
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        # Delegates to the sparse index.
        return self.sp_index.npoints
725 def isna(self):
726 # If null fill value, we want SparseDtype[bool, true]
727 # to preserve the same memory usage.
728 dtype = SparseDtype(bool, self._null_fill_value)
729 if self._null_fill_value:
730 return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
731 mask = np.full(len(self), False, dtype=np.bool8)
732 mask[self.sp_index.indices] = isna(self.sp_values)
733 return type(self)(mask, fill_value=False, dtype=dtype)
    def fillna(
        self: SparseArrayT,
        value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> SparseArrayT:
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as all `fill_value` methods will be converted to
               an in-memory ndarray

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        # Exactly one of `value` / `method` must be given.
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        elif method is not None:
            # Method-based filling densifies the whole array.
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(
                msg,
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
            new_values = np.asarray(self)
            # interpolate_2d modifies new_values inplace
            interpolate_2d(new_values, method=method, limit=limit)
            return type(self)(new_values, fill_value=self.fill_value)

        else:
            # Value-based filling only touches the stored values.
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

        return self._simple_new(new_values, self._sparse_index, new_dtype)
    def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:
        """
        Shift values by ``periods`` positions, filling the vacated slots
        with ``fill_value`` (defaults to the dtype's NA value).
        """
        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        # Block of filler values to prepend/append; capped at len(self)
        # since shifting further than the length vacates everything.
        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])
    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
            Position of the first gap, or -1 if the array contains no
            fill values.
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            # Empty, or fully dense: no fill value present anywhere.
            return -1

        indices = self.sp_index.indices
        if not len(indices) or indices[0] > 0:
            # Position 0 is not a stored point, so it holds the fill value.
            return 0

        # a number larger than 1 should be appended to
        # the last in case of fill value only appears
        # in the tail of array
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1
    def unique(self: SparseArrayT) -> SparseArrayT:
        """Return unique values, preserving order of first appearance."""
        uniques = algos.unique(self.sp_values)
        if len(self.sp_values) != len(self):
            # There is at least one gap, so the fill value occurs in the
            # dense array and must be inserted at its first-appearance slot.
            fill_loc = self._first_fill_value_loc()
            # Inorder to align the behavior of pd.unique or
            # pd.Series.unique, we should keep the original
            # order, here we use unique again to find the
            # insertion place. Since the length of sp_values
            # is not large, maybe minor performance hurt
            # is worthwhile to the correctness.
            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
            uniques = np.insert(uniques, insert_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)
    def _values_for_factorize(self):
        """Return ``(dense values, na_value)`` as expected by factorize/hashing."""
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value
    def factorize(
        self,
        na_sentinel: int | lib.NoDefault = lib.no_default,
        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    ) -> tuple[np.ndarray, SparseArray]:
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(
            np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
        )
        if na_sentinel is lib.no_default:
            na_sentinel = -1
        if use_na_sentinel is lib.no_default or use_na_sentinel:
            # Remap the default -1 NA marker to the requested sentinel.
            codes[codes == -1] = na_sentinel
        uniques_sp = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques_sp
    def value_counts(self, dropna: bool = True) -> Series:
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import (
            Index,
            Series,
        )

        keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        # The gaps all hold the fill value; fold their count in (unless the
        # fill value is NA and dropna is requested).
        if fcounts > 0 and (not self._null_fill_value or not dropna):
            mask = isna(keys) if self._null_fill_value else keys == self.fill_value
            if mask.any():
                counts[mask] += fcounts
            else:
                # error: Argument 1 to "insert" has incompatible type "Union[
                # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
                # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
                # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
                # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
                # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
                keys = np.insert(keys, 0, self.fill_value)  # type: ignore[arg-type]
                counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndex):
            index = Index(keys)
        else:
            index = keys
        return Series(counts, index=index)
    def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str):
        """Compute quantiles, avoiding densification when possible."""
        if self._null_fill_value or self.sp_index.ngaps == 0:
            # We can avoid densifying
            npvalues = self.sp_values
            mask = np.zeros(npvalues.shape, dtype=bool)
        else:
            npvalues = self.to_numpy()
            # NOTE(review): this is a SparseArray, not an ndarray, while the
            # other branch produces an ndarray mask — confirm that
            # quantile_with_mask accepts both.
            mask = self.isna()

        fill_value = na_value_for_dtype(npvalues.dtype, compat=False)
        res_values = quantile_with_mask(
            npvalues,
            mask,
            fill_value,
            qs,
            interpolation,
        )

        # Special case: the returned array isn't _really_ sparse, so we don't
        # wrap it in a SparseArray
        return res_values
    # --------
    # Indexing
    # --------

    # Overloads: a scalar key returns a scalar; a sequence key (or a tuple
    # containing an ellipsis) returns a SparseArray.
    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self: SparseArrayT,
        key: SequenceIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT:
        ...
    def __getitem__(
        self: SparseArrayT,
        key: PositionalIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT | Any:
        # Dispatch on key type: integer -> scalar; tuple -> densify; slice ->
        # sparse fastpath; array-like/SparseArray -> take.

        if isinstance(key, tuple):
            key = unpack_tuple_and_ellipses(key)
            # Non-overlapping identity check (left operand type:
            # "Union[Union[Union[int, integer[Any]], Union[slice, List[int],
            # ndarray[Any, Any]]], Tuple[Union[int, ellipsis], ...]]",
            # right operand type: "ellipsis")
            if key is Ellipsis:  # type: ignore[comparison-overlap]
                raise ValueError("Cannot slice with Ellipsis")

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
            # for "ndarray[Any, Any]"; expected type
            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
            # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
            # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
            # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
            # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
            # _NestedSequence[Union[bool, int]]], ...]]"
            data_slice = self.to_dense()[key]  # type: ignore[index]
        elif isinstance(key, slice):

            # Avoid densifying when handling contiguous slices
            if key.step is None or key.step == 1:
                # Normalize negative bounds to absolute positions.
                start = 0 if key.start is None else key.start
                if start < 0:
                    start += len(self)

                end = len(self) if key.stop is None else key.stop
                if end < 0:
                    end += len(self)

                # Keep only the stored points falling inside [start, end).
                indices = self.sp_index.indices
                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
                sp_vals = self.sp_values[keep_inds]

                sp_index = indices[keep_inds].copy()

                # If we've sliced to not include the start of the array, all our indices
                # should be shifted. NB: here we are careful to also not shift by a
                # negative value for a case like [0, 1][-100:] where the start index
                # should be treated like 0
                if start > 0:
                    sp_index -= start

                # Length of our result should match applying this slice to a range
                # of the length of our original array
                new_len = len(range(len(self))[key])
                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
            else:
                # Stepped slice: fall back to positional take.
                indices = np.arange(len(self), dtype=np.int32)[key]
                return self.take(indices)

        elif not is_list_like(key):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        else:
            if isinstance(key, SparseArray):
                # NOTE: If we guarantee that SparseDType(bool)
                # has only fill_value - true, false or nan
                # (see GH PR 44955)
                # we can apply mask very fast:
                if is_bool_dtype(key):
                    if isna(key.fill_value):
                        return self.take(key.sp_index.indices[key.sp_values])
                    if not key.fill_value:
                        return self.take(key.sp_index.indices)
                    n = len(self)
                    # NOTE(review): np.bool8 is a deprecated NumPy alias
                    # (removed in NumPy >= 2.0); consider np.bool_.
                    mask = np.full(n, True, dtype=np.bool8)
                    mask[key.sp_index.indices] = False
                    return self.take(np.arange(n)[mask])
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                # mypy doesn't know we have an array here
                key = cast(np.ndarray, key)
                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        # Only the tuple branch falls through to here.
        return type(self)(data_slice, kind=self.kind)
    def _get_val_at(self, loc):
        """Return the scalar at position ``loc`` (negative indices allowed)."""
        loc = validate_insert_loc(loc, len(self))

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            # Position is a gap: it holds the fill value.
            return self.fill_value
        else:
            val = self.sp_values[sp_loc]
            # Box datetimelike scalars (e.g. np.datetime64 -> Timestamp).
            val = maybe_box_datetimelike(val, self.sp_values.dtype)
            return val
    def take(
        self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
    ) -> SparseArrayT:
        """
        Take elements at ``indices``.

        With ``allow_fill=True``, delegates to ``_take_with_fill`` so that
        ``-1`` entries yield ``fill_value``; otherwise delegates to
        ``_take_without_fill``.
        """
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        dtype = None
        if indices.size == 0:
            # Empty take: result is empty but keeps our dtype.
            result = np.array([], dtype="object")
            dtype = self.dtype
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
        else:
            return self._take_without_fill(indices)

        return type(self)(
            result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
        )
    def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
        """
        ``take`` with ``allow_fill=True`` semantics: -1 entries in ``indices``
        are replaced with ``fill_value`` (defaulting to this dtype's NA value).

        Returns a dense ndarray; the caller re-wraps it as a SparseArray.
        """
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        # sp_indexer may be -1 for two reasons
        # 1.) we took for an index of -1 (new)
        # 2.) we took a value that was self.fill_value (old)
        sp_indexer = self.sp_index.lookup_array(indices)
        new_fill_indices = indices == -1
        old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

        if self.sp_index.npoints == 0 and old_fill_indices.all():
            # We've looked up all valid points on an all-sparse array.
            taken = np.full(
                sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
            )

        elif self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
        else:
            taken = self.sp_values.take(sp_indexer)

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                # Widen the dtype so self.fill_value fits, then write it in.
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                # Same widening for the user-requested fill_value.
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken
    def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
        """
        ``take`` with ``allow_fill=False`` semantics: negative ``indices``
        count from the end, as with ``ndarray.take``.
        """
        to_shift = indices < 0

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            else:
                raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            # Normalize negative indices; copy first to avoid mutating input.
            indices = indices.copy()
            indices[to_shift] += n

        sp_indexer = self.sp_index.lookup_array(indices)
        value_mask = sp_indexer != -1
        # Only positions that hit stored points contribute sparse values.
        new_sp_values = self.sp_values[sp_indexer[value_mask]]

        value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

        new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
        return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
1185 def searchsorted(
1186 self,
1187 v: ArrayLike | object,
1188 side: Literal["left", "right"] = "left",
1189 sorter: NumpySorter = None,
1190 ) -> npt.NDArray[np.intp] | np.intp:
1192 msg = "searchsorted requires high memory usage."
1193 warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
1194 if not is_scalar(v):
1195 v = np.asarray(v)
1196 v = np.asarray(v)
1197 return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
1199 def copy(self: SparseArrayT) -> SparseArrayT:
1200 values = self.sp_values.copy()
1201 return self._simple_new(values, self.sp_index, self.dtype)
    @classmethod
    def _concat_same_type(
        cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
    ) -> SparseArrayT:
        """
        Concatenate multiple SparseArrays into one.

        The fill value and sparse kind of the first array determine the
        result's fill value and kind.
        """
        fill_value = to_concat[0].fill_value

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        sp_index: SparseIndex
        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                # Shift each array's indices past everything already placed.
                int_idx = arr.sp_index.indices.copy()
                int_idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(int_idx)

            data = np.concatenate(values)
            indices_arr = np.concatenate(indices)
            # error: Argument 2 to "IntIndex" has incompatible type
            # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
            # expected "Sequence[int]"
            sp_index = IntIndex(length, indices_arr)  # type: ignore[arg-type]

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                block_idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(block_idx.blocs.copy() + length)
                blengths.append(block_idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs_arr = np.concatenate(blocs)
            blengths_arr = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs_arr, blengths_arr)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)
    def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(SparseDtype(np.dtype('int32')))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(SparseDtype(np.dtype('float64')))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a SparseDtype, you can also change the fill value as well.

        >>> arr.astype(SparseDtype("float64", fill_value=0.0))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0.0, 0.0, 1.0, 2.0]
        Fill: 0.0
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        if is_dtype_equal(dtype, self._dtype):
            # Same dtype: short-circuit, honoring the copy request.
            if not copy:
                return self
            else:
                return self.copy()

        future_dtype = pandas_dtype(dtype)
        if not isinstance(future_dtype, SparseDtype):
            # GH#34457
            warnings.warn(
                "The behavior of .astype from SparseDtype to a non-sparse dtype "
                "is deprecated. In a future version, this will return a non-sparse "
                "array with the requested dtype. To retain the old behavior, use "
                "`obj.astype(SparseDtype(dtype))`",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        dtype = self.dtype.update_dtype(dtype)
        subtype = pandas_dtype(dtype._subtype_with_str)
        # Only the stored values need converting; the index is unchanged.
        sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)

        # error: Argument 1 to "_simple_new" of "SparseArray" has incompatible type
        # "ExtensionArray"; expected "ndarray"
        return self._simple_new(
            sp_values, self.sp_index, dtype  # type: ignore[arg-type]
        )
1346 def map(self: SparseArrayT, mapper) -> SparseArrayT:
1347 """
1348 Map categories using an input mapping or function.
1350 Parameters
1351 ----------
1352 mapper : dict, Series, callable
1353 The correspondence from old values to new.
1355 Returns
1356 -------
1357 SparseArray
1358 The output array will have the same density as the input.
1359 The output fill value will be the result of applying the
1360 mapping to ``self.fill_value``
1362 Examples
1363 --------
1364 >>> arr = pd.arrays.SparseArray([0, 1, 2])
1365 >>> arr.map(lambda x: x + 10)
1366 [10, 11, 12]
1367 Fill: 10
1368 IntIndex
1369 Indices: array([1, 2], dtype=int32)
1371 >>> arr.map({0: 10, 1: 11, 2: 12})
1372 [10, 11, 12]
1373 Fill: 10
1374 IntIndex
1375 Indices: array([1, 2], dtype=int32)
1377 >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
1378 [10, 11, 12]
1379 Fill: 10
1380 IntIndex
1381 Indices: array([1, 2], dtype=int32)
1382 """
1383 # this is used in apply.
1384 # We get hit since we're an "is_extension_type" but regular extension
1385 # types are not hit. This may be worth adding to the interface.
1386 if isinstance(mapper, ABCSeries):
1387 mapper = mapper.to_dict()
1389 if isinstance(mapper, abc.Mapping):
1390 fill_value = mapper.get(self.fill_value, self.fill_value)
1391 sp_values = [mapper.get(x, None) for x in self.sp_values]
1392 else:
1393 fill_value = mapper(self.fill_value)
1394 sp_values = [mapper(x) for x in self.sp_values]
1396 return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)
1398 def to_dense(self) -> np.ndarray:
1399 """
1400 Convert SparseArray to a NumPy array.
1402 Returns
1403 -------
1404 arr : NumPy array
1405 """
1406 return np.asarray(self, dtype=self.sp_values.dtype)
    def _where(self, mask, value):
        # NB: may not preserve dtype, e.g. result may be Sparse[float64]
        # while self is Sparse[int64]
        # Densify via np.where, then re-sparsify keeping our fill_value.
        naive_implementation = np.where(mask, self, value)
        dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
        result = type(self)._from_sequence(naive_implementation, dtype=dtype)
        return result
1416 # ------------------------------------------------------------------------
1417 # IO
1418 # ------------------------------------------------------------------------
    def __setstate__(self, state) -> None:
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            # Legacy pickles stored (ndarray state, (fill_value, sp_index)).
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            # Modern pickles store the instance __dict__ directly.
            self.__dict__.update(state)
1433 def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
1434 if self.fill_value == 0:
1435 return (self.sp_index.indices,)
1436 else:
1437 return (self.sp_index.indices[self.sp_values != 0],)
1439 # ------------------------------------------------------------------------
1440 # Reductions
1441 # ------------------------------------------------------------------------
    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """Dispatch a named reduction (e.g. "sum", "mean") to the matching method."""
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            # NOTE(review): dropping NAs on the skipna=False path reads as
            # inverted — confirm against the individual reduction methods.
            arr = self.dropna()

        return getattr(arr, name)(**kwargs)
1456 def all(self, axis=None, *args, **kwargs):
1457 """
1458 Tests whether all elements evaluate True
1460 Returns
1461 -------
1462 all : bool
1464 See Also
1465 --------
1466 numpy.all
1467 """
1468 nv.validate_all(args, kwargs)
1470 values = self.sp_values
1472 if len(values) != len(self) and not np.all(self.fill_value):
1473 return False
1475 return values.all()
1477 def any(self, axis=0, *args, **kwargs):
1478 """
1479 Tests whether at least one of elements evaluate True
1481 Returns
1482 -------
1483 any : bool
1485 See Also
1486 --------
1487 numpy.any
1488 """
1489 nv.validate_any(args, kwargs)
1491 values = self.sp_values
1493 if len(values) != len(self) and np.any(self.fill_value):
1494 return True
1496 return values.any().item()
    def sum(
        self, axis: int = 0, min_count: int = 0, skipna: bool = True, *args, **kwargs
    ) -> Scalar:
        """
        Sum of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        min_count : int, default 0
            The required number of valid values to perform the summation. If fewer
            than ``min_count`` valid values are present, the result will be the missing
            value indicator for subarray type.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        # NA can only come from gaps when the fill value itself is not NA.
        has_na = self.sp_index.ngaps > 0 and not self._null_fill_value

        if has_na and not skipna:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

        if self._null_fill_value:
            # Gaps are NA: only the stored valid values count.
            if check_below_min_count(valid_vals.shape, None, min_count):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum
        else:
            # Gaps hold fill_value: they count as valid and contribute.
            nsparse = self.sp_index.ngaps
            if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum + self.fill_value * nsparse
    def cumsum(self, axis: int = 0, *args, **kwargs) -> SparseArray:
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any non-NA/null values will
        be skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            # Non-NA fill values contribute to the running sum, so densify.
            return SparseArray(self.to_dense()).cumsum()

        # NA fill: gaps are skipped, so summing the stored values suffices
        # and the sparse index can be reused.
        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )
1569 def mean(self, axis=0, *args, **kwargs):
1570 """
1571 Mean of non-NA/null values
1573 Returns
1574 -------
1575 mean : float
1576 """
1577 nv.validate_mean(args, kwargs)
1578 valid_vals = self._valid_sp_values
1579 sp_sum = valid_vals.sum()
1580 ct = len(valid_vals)
1582 if self._null_fill_value:
1583 return sp_sum / ct
1584 else:
1585 nsparse = self.sp_index.ngaps
1586 return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
1588 def max(self, *, axis: int | None = None, skipna: bool = True):
1589 """
1590 Max of array values, ignoring NA values if specified.
1592 Parameters
1593 ----------
1594 axis : int, default 0
1595 Not Used. NumPy compatibility.
1596 skipna : bool, default True
1597 Whether to ignore NA values.
1599 Returns
1600 -------
1601 scalar
1602 """
1603 nv.validate_minmax_axis(axis, self.ndim)
1604 return self._min_max("max", skipna=skipna)
1606 def min(self, *, axis: int | None = None, skipna: bool = True):
1607 """
1608 Min of array values, ignoring NA values if specified.
1610 Parameters
1611 ----------
1612 axis : int, default 0
1613 Not Used. NumPy compatibility.
1614 skipna : bool, default True
1615 Whether to ignore NA values.
1617 Returns
1618 -------
1619 scalar
1620 """
1621 nv.validate_minmax_axis(axis, self.ndim)
1622 return self._min_max("min", skipna=skipna)
    def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
        """
        Min/max of non-NA/null values

        Parameters
        ----------
        kind : {"min", "max"}
        skipna : bool

        Returns
        -------
        scalar
        """
        valid_vals = self._valid_sp_values
        # Gaps hold a real (non-NA) fill value that competes for min/max.
        has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0

        if len(valid_vals) > 0:
            sp_min_max = getattr(valid_vals, kind)()

            # If a non-null fill value is currently present, it might be the min/max
            if has_nonnull_fill_vals:
                func = max if kind == "max" else min
                return func(sp_min_max, self.fill_value)
            elif skipna:
                return sp_min_max
            elif self.sp_index.ngaps == 0:
                # No NAs present
                return sp_min_max
            else:
                # NAs present and not skipped: result is NA.
                return na_value_for_dtype(self.dtype.subtype, compat=False)
        elif has_nonnull_fill_vals:
            # No stored valid values; only the fill value remains.
            return self.fill_value
        else:
            return na_value_for_dtype(self.dtype.subtype, compat=False)
    def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
        """Shared implementation of argmin/argmax over dense positions."""
        values = self._sparse_values
        index = self._sparse_index.indices
        mask = np.asarray(isna(values))
        func = np.argmax if kind == "argmax" else np.argmin

        idx = np.arange(values.shape[0])
        non_nans = values[~mask]
        non_nan_idx = idx[~mask]

        # Best candidate among the stored (non-NA) values, translated back
        # to its dense position via the sparse index.
        _candidate = non_nan_idx[func(non_nans)]
        candidate = index[_candidate]

        if isna(self.fill_value):
            # NA fill can never win; the stored candidate stands.
            return candidate
        if kind == "argmin" and self[candidate] < self.fill_value:
            return candidate
        if kind == "argmax" and self[candidate] > self.fill_value:
            return candidate
        # Fill value ties/beats the candidate: report the first dense
        # position that holds fill_value, if such a position exists.
        _loc = self._first_fill_value_loc()
        if _loc == -1:
            # fill_value doesn't exist
            return candidate
        else:
            return _loc
1686 def argmax(self, skipna: bool = True) -> int:
1687 validate_bool_kwarg(skipna, "skipna")
1688 if not skipna and self._hasna:
1689 raise NotImplementedError
1690 return self._argmin_argmax("argmax")
1692 def argmin(self, skipna: bool = True) -> int:
1693 validate_bool_kwarg(skipna, "skipna")
1694 if not skipna and self._hasna:
1695 raise NotImplementedError
1696 return self._argmin_argmax("argmin")
    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    # Input types (besides SparseArray itself) that __array_ufunc__ accepts.
    _HANDLED_TYPES = (np.ndarray, numbers.Number)
    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        """NumPy ufunc protocol: keep single-input ufuncs sparse, densify otherwise."""
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
            res = arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )
            return res

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        if len(inputs) == 1:
            # No alignment necessary.
            # Apply the ufunc to stored values and fill value separately.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if ufunc.nout > 1:
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif method == "reduce":
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        # Multiple inputs: fall back to dense computation.
        new_inputs = tuple(np.asarray(x) for x in inputs)
        result = getattr(ufunc, method)(*new_inputs, **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if ufunc.nout > 1:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)
1770 # ------------------------------------------------------------------------
1771 # Ops
1772 # ------------------------------------------------------------------------
    def _arith_method(self, other, op):
        """Arithmetic ops (+, -, *, /, divmod, ...), dispatched via OpsMixin."""
        op_name = op.__name__

        if isinstance(other, SparseArray):
            return _sparse_array_op(self, other, op, op_name)

        elif is_scalar(other):
            with np.errstate(all="ignore"):
                # Apply op to the fill value and stored values separately.
                fill = op(_get_fill(self), np.asarray(other))
                result = op(self.sp_values, other)

            if op_name == "divmod":
                # divmod yields a (quotient, remainder) pair of arrays.
                left, right = result
                lfill, rfill = fill
                return (
                    _wrap_result(op_name, left, self.sp_index, lfill),
                    _wrap_result(op_name, right, self.sp_index, rfill),
                )

            return _wrap_result(op_name, result, self.sp_index, fill)

        else:
            # List-like: coerce to a SparseArray sharing our fill value.
            other = np.asarray(other)
            with np.errstate(all="ignore"):
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                if not isinstance(other, SparseArray):
                    dtype = getattr(other, "dtype", None)
                    other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
                return _sparse_array_op(self, other, op, op_name)
    def _cmp_method(self, other, op) -> SparseArray:
        """Comparison ops (==, <, ...), returning a boolean SparseArray."""
        if not is_scalar(other) and not isinstance(other, type(self)):
            # convert list-like to ndarray
            other = np.asarray(other)

        if isinstance(other, np.ndarray):
            # TODO: make this more flexible than just ndarray...
            other = SparseArray(other, fill_value=self.fill_value)

        if isinstance(other, SparseArray):
            if len(self) != len(other):
                raise ValueError(
                    f"operands have mismatched length {len(self)} and {len(other)}"
                )

            op_name = op.__name__.strip("_")
            return _sparse_array_op(self, other, op, op_name)
        else:
            # scalar
            with np.errstate(all="ignore"):
                # Start from the fill-value comparison, then overwrite the
                # positions that hold stored values.
                fill_value = op(self.fill_value, other)
                result = np.full(len(self), fill_value, dtype=np.bool_)
                result[self.sp_index.indices] = op(self.sp_values, other)

            return type(self)(
                result,
                fill_value=fill_value,
                dtype=np.bool_,
            )
    # Logical ops (&, |, ^) share the comparison implementation.
    _logical_method = _cmp_method
1839 def _unary_method(self, op) -> SparseArray:
1840 fill_value = op(np.array(self.fill_value)).item()
1841 dtype = SparseDtype(self.dtype.subtype, fill_value)
1842 # NOTE: if fill_value doesn't change
1843 # we just have to apply op to sp_values
1844 if isna(self.fill_value) or fill_value == self.fill_value:
1845 values = op(self.sp_values)
1846 return type(self)._simple_new(values, self.sp_index, self.dtype)
1847 # In the other case we have to recalc indexes
1848 return type(self)(op(self.to_dense()), dtype=dtype)
    def __pos__(self) -> SparseArray:
        # Elementwise unary plus.
        return self._unary_method(operator.pos)

    def __neg__(self) -> SparseArray:
        # Elementwise negation.
        return self._unary_method(operator.neg)

    def __invert__(self) -> SparseArray:
        # Elementwise bitwise/logical inversion.
        return self._unary_method(operator.invert)

    def __abs__(self) -> SparseArray:
        # Elementwise absolute value.
        return self._unary_method(operator.abs)
1862 # ----------
1863 # Formatting
1864 # -----------
1865 def __repr__(self) -> str:
1866 pp_str = printing.pprint_thing(self)
1867 pp_fill = printing.pprint_thing(self.fill_value)
1868 pp_index = printing.pprint_thing(self.sp_index)
1869 return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"
    def _formatter(self, boxed=False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None
def make_sparse(
    arr: np.ndarray,
    kind: SparseIndexKind = "block",
    fill_value=None,
    dtype: NpDtype | None = None,
):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    assert isinstance(arr, np.ndarray)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        # NA fill: stored points are exactly the non-NA positions.
        mask = notna(arr)
    else:
        # cast to object comparison to be safe
        if is_string_dtype(arr.dtype):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # element-wise equality check method in numpy doesn't treat
            # each element type, eg. 0, 0.0, and False are treated as
            # same. So we have to check the both of its type and value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = make_sparse_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        # error: Argument "dtype" to "astype_nansafe" has incompatible type "Union[str,
        # dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]"
        sparsified_values = astype_nansafe(
            sparsified_values, dtype=dtype  # type: ignore[arg-type]
        )
    # TODO: copy
    return sparsified_values, index, fill_value
# Typing-only overloads: the concrete return type follows ``kind``.
@overload
def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
    ...


@overload
def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
    ...
def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
    """Build a BlockIndex or IntIndex of total ``length`` from stored ``indices``."""
    index: SparseIndex
    if kind == "integer":
        index = IntIndex(length, indices)
    elif kind == "block":
        # Collapse consecutive runs of indices into (location, length) blocks.
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index