Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/sparse/array.py: 13%
803 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2SparseArray data structure
3"""
4from __future__ import annotations
6from collections import abc
7import numbers
8import operator
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13 Literal,
14 Sequence,
15 TypeVar,
16 cast,
17 overload,
18)
19import warnings
21import numpy as np
23from pandas._libs import lib
24import pandas._libs.sparse as splib
25from pandas._libs.sparse import (
26 BlockIndex,
27 IntIndex,
28 SparseIndex,
29)
30from pandas._libs.tslibs import NaT
31from pandas._typing import (
32 ArrayLike,
33 AstypeArg,
34 Dtype,
35 NpDtype,
36 PositionalIndexer,
37 Scalar,
38 ScalarIndexer,
39 SequenceIndexer,
40 npt,
41)
42from pandas.compat.numpy import function as nv
43from pandas.errors import PerformanceWarning
44from pandas.util._exceptions import find_stack_level
45from pandas.util._validators import (
46 validate_bool_kwarg,
47 validate_insert_loc,
48)
50from pandas.core.dtypes.astype import astype_nansafe
51from pandas.core.dtypes.cast import (
52 construct_1d_arraylike_from_scalar,
53 find_common_type,
54 maybe_box_datetimelike,
55)
56from pandas.core.dtypes.common import (
57 is_array_like,
58 is_bool_dtype,
59 is_datetime64_any_dtype,
60 is_datetime64tz_dtype,
61 is_dtype_equal,
62 is_integer,
63 is_list_like,
64 is_object_dtype,
65 is_scalar,
66 is_string_dtype,
67 pandas_dtype,
68)
69from pandas.core.dtypes.generic import (
70 ABCIndex,
71 ABCSeries,
72)
73from pandas.core.dtypes.missing import (
74 isna,
75 na_value_for_dtype,
76 notna,
77)
79from pandas.core import arraylike
80import pandas.core.algorithms as algos
81from pandas.core.array_algos.quantile import quantile_with_mask
82from pandas.core.arraylike import OpsMixin
83from pandas.core.arrays import ExtensionArray
84from pandas.core.arrays.sparse.dtype import SparseDtype
85from pandas.core.base import PandasObject
86import pandas.core.common as com
87from pandas.core.construction import (
88 extract_array,
89 sanitize_array,
90)
91from pandas.core.indexers import (
92 check_array_indexer,
93 unpack_tuple_and_ellipses,
94)
95from pandas.core.missing import interpolate_2d
96from pandas.core.nanops import check_below_min_count
97import pandas.core.ops as ops
99import pandas.io.formats.printing as printing
# See https://github.com/python/typing/issues/684
if TYPE_CHECKING:
    from enum import Enum

    # Shim so annotations can spell the type of a literal `...` key;
    # only type checkers ever see this Enum.
    class ellipsis(Enum):
        Ellipsis = "..."

    Ellipsis = ellipsis.Ellipsis

    from scipy.sparse import spmatrix

    from pandas._typing import (
        FillnaOptions,
        NumpySorter,
    )

    SparseIndexKind = Literal["integer", "block"]

    from pandas import Series

else:
    # At runtime `ellipsis` is simply the builtin type of `...`.
    ellipsis = type(Ellipsis)
# ----------------------------------------------------------------------------
# Array

# TypeVar bound to SparseArray so classmethods can return the subclass type.
SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")

# Substitution kwargs for docstrings shared across sparse accessors.
_sparray_doc_kwargs = {"klass": "SparseArray"}
133def _get_fill(arr: SparseArray) -> np.ndarray:
134 """
135 Create a 0-dim ndarray containing the fill value
137 Parameters
138 ----------
139 arr : SparseArray
141 Returns
142 -------
143 fill_value : ndarray
144 0-dim ndarray with just the fill value.
146 Notes
147 -----
148 coerce fill_value to arr dtype if possible
149 int64 SparseArray can have NaN as fill_value if there is no missing
150 """
151 try:
152 return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
153 except ValueError:
154 return np.asarray(arr.fill_value)
def _sparse_array_op(
    left: SparseArray, right: SparseArray, op: Callable, name: str
) -> SparseArray:
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : Union[SparseArray, ndarray]
    right : Union[SparseArray, ndarray]
    op : Callable
        The binary operation to perform
    name str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        # Coerce both operands to a common subtype before dispatching.
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        left = left.astype(ltype, copy=False)
        right = right.astype(rtype, copy=False)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        # At least one side is fully dense: operate on the dense values.
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        # Reuse the dense side's index, which covers every position.
        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # Same sparsity structure: operate on the stored values only.
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        # Mismatched sparse structure: dispatch to the cython sparse op.
        if name[0] == "r":
            # Reflected op (e.g. "radd"): swap operands, use the plain op.
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        if (
            name in ["floordiv", "mod"]
            and (right == 0).any()
            and left.dtype.kind in ["i", "u"]
        ):
            # Match the non-Sparse Series behavior
            opname = f"sparse_{name}_float64"
            left_sp_values = left_sp_values.astype("float64")
            right_sp_values = right_sp_values.astype("float64")

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if name == "divmod":
        # result is a 2-tuple
        # error: Incompatible return value type (got "Tuple[SparseArray,
        # SparseArray]", expected "SparseArray")
        return (  # type: ignore[return-value]
            _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
            _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
        )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
265def _wrap_result(
266 name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
267) -> SparseArray:
268 """
269 wrap op result to have correct dtype
270 """
271 if name.startswith("__"):
272 # e.g. __eq__ --> eq
273 name = name[2:-2]
275 if name in ("eq", "ne", "lt", "gt", "le", "ge"):
276 dtype = bool
278 fill_value = lib.item_from_zerodim(fill_value)
280 if is_bool_dtype(dtype):
281 # fill_value may be np.bool_
282 fill_value = bool(fill_value)
283 return SparseArray(
284 data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
285 )
288class SparseArray(OpsMixin, PandasObject, ExtensionArray):
289 """
290 An ExtensionArray for storing sparse data.
292 Parameters
293 ----------
294 data : array-like or scalar
295 A dense array of values to store in the SparseArray. This may contain
296 `fill_value`.
297 sparse_index : SparseIndex, optional
298 index : Index
300 .. deprecated:: 1.4.0
301 Use a function like `np.full` to construct an array with the desired
302 repeats of the scalar value instead.
304 fill_value : scalar, optional
305 Elements in data that are ``fill_value`` are not stored in the
306 SparseArray. For memory savings, this should be the most common value
307 in `data`. By default, `fill_value` depends on the dtype of `data`:
309 =========== ==========
310 data.dtype na_value
311 =========== ==========
312 float ``np.nan``
313 int ``0``
314 bool False
315 datetime64 ``pd.NaT``
316 timedelta64 ``pd.NaT``
317 =========== ==========
319 The fill value is potentially specified in three ways. In order of
320 precedence, these are
322 1. The `fill_value` argument
323 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
324 a ``SparseDtype``
325 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
326 is not a ``SparseDtype`` and `data` is a ``SparseArray``.
328 kind : str
329 Can be 'integer' or 'block', default is 'integer'.
330 The type of storage for sparse locations.
332 * 'block': Stores a `block` and `block_length` for each
333 contiguous *span* of sparse values. This is best when
334 sparse data tends to be clumped together, with large
335 regions of ``fill-value`` values between sparse values.
336 * 'integer': uses an integer to store the location of
337 each sparse value.
339 dtype : np.dtype or SparseDtype, optional
340 The dtype to use for the SparseArray. For numpy dtypes, this
341 determines the dtype of ``self.sp_values``. For SparseDtype,
342 this determines ``self.sp_values`` and ``self.fill_value``.
343 copy : bool, default False
344 Whether to explicitly copy the incoming `data` array.
346 Attributes
347 ----------
348 None
350 Methods
351 -------
352 None
354 Examples
355 --------
356 >>> from pandas.arrays import SparseArray
357 >>> arr = SparseArray([0, 0, 1, 2])
358 >>> arr
359 [0, 0, 1, 2]
360 Fill: 0
361 IntIndex
362 Indices: array([2, 3], dtype=int32)
363 """
365 _subtyp = "sparse_array" # register ABCSparseArray
366 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"])
367 _sparse_index: SparseIndex
368 _sparse_values: np.ndarray
369 _dtype: SparseDtype
    def __init__(
        self,
        data,
        sparse_index=None,
        index=None,
        fill_value=None,
        kind: SparseIndexKind = "integer",
        dtype: Dtype | None = None,
        copy: bool = False,
    ) -> None:
        # See the class docstring for the fill_value precedence rules
        # implemented below: explicit argument > SparseDtype.fill_value >
        # fill value inferred from `data`.

        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle use-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            # Work with the numpy subtype from here on.
            dtype = dtype.subtype

        if index is not None:
            warnings.warn(
                "The index argument has been deprecated and will be "
                "removed in a future version. Use a function like np.full "
                "to construct an array with the desired repeats of the "
                "scalar value instead.\n\n",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        if index is not None and not is_scalar(data):
            raise Exception("must only pass scalars with an index")

        if is_scalar(data):
            # Broadcast the scalar to the length implied by index /
            # sparse_index (default 1).
            if index is not None and data is None:
                data = np.nan

            if index is not None:
                npoints = len(index)
            elif sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
            dtype = data.dtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # TODO: What should the empty dtype be? Object or float?

            # error: Argument "dtype" to "array" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
            # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
            data = np.array([], dtype=dtype)  # type: ignore[arg-type]

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series

                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = object
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            # Infer a fill value from the (requested or actual) dtype.
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
            # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int],
            # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any,
            # Any]]]"
            sparse_values = np.asarray(
                data.sp_values, dtype=dtype  # type: ignore[arg-type]
            )
        elif sparse_index is None:
            # Dense input: build the sparse representation ourselves.
            data = extract_array(data, extract_numpy=True)
            if not isinstance(data, np.ndarray):
                # EA
                if is_datetime64tz_dtype(data.dtype):
                    warnings.warn(
                        f"Creating SparseArray from {data.dtype} data "
                        "loses timezone information. Cast to object before "
                        "sparse to retain timezone information.",
                        UserWarning,
                        stacklevel=find_stack_level(),
                    )
                    data = np.asarray(data, dtype="datetime64[ns]")
                    if fill_value is NaT:
                        fill_value = np.datetime64("NaT", "ns")
                data = np.asarray(data)
            sparse_values, sparse_index, fill_value = make_sparse(
                # error: Argument "dtype" to "make_sparse" has incompatible type
                # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
                # "Union[str, dtype[Any], None]"
                data,
                kind=kind,
                fill_value=fill_value,
                dtype=dtype,  # type: ignore[arg-type]
            )
        else:
            # Caller supplied a sparse_index: `data` must already be the
            # stored (non-fill) values and agree with the index length.
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], Type[object], None]"; expected
            # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int],
            # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any,
            # Any]]]"
            sparse_values = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
527 @classmethod
528 def _simple_new(
529 cls: type[SparseArrayT],
530 sparse_array: np.ndarray,
531 sparse_index: SparseIndex,
532 dtype: SparseDtype,
533 ) -> SparseArrayT:
534 new = object.__new__(cls)
535 new._sparse_index = sparse_index
536 new._sparse_values = sparse_array
537 new._dtype = dtype
538 return new
    @classmethod
    def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
        """
        Create a SparseArray from a scipy.sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.sp_matrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing. So we need to sort loc, and arr accordingly.
        data = data.tocsc()
        data.sort_indices()
        arr = data.data
        idx = data.indices

        # The fill value for data coming from a sparse matrix is always
        # zero (in the matrix's own dtype).
        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)
    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        # Densify: materialize a full ndarray with gaps set to fill_value.
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.indices] = self.sp_values
        return out
611 def __setitem__(self, key, value):
612 # I suppose we could allow setting of non-fill_value elements.
613 # TODO(SparseArray.__setitem__): remove special cases in
614 # ExtensionBlock.where
615 msg = "SparseArray does not support item assignment via setitem"
616 raise TypeError(msg)
    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        # ExtensionArray constructor hook. `copy` is accepted for API
        # compatibility but not forwarded; __init__ defaults to copy=False.
        return cls(scalars, dtype=dtype)
    @classmethod
    def _from_factorized(cls, values, original):
        # Rebuild from factorized values; `original` supplies the dtype
        # (and hence the fill value).
        return cls(values, dtype=original.dtype)
    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------

    @property
    def sp_index(self) -> SparseIndex:
        """
        The SparseIndex containing the location of non- ``fill_value`` points.
        """
        # Thin accessor over the backing _sparse_index attribute.
        return self._sparse_index
    @property
    def sp_values(self) -> np.ndarray:
        """
        An ndarray containing the non- ``fill_value`` values.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        # Thin accessor over the backing _sparse_values ndarray.
        return self._sparse_values
    @property
    def dtype(self) -> SparseDtype:
        """The SparseDtype describing this array's subtype and fill value."""
        return self._dtype
    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.
        """
        # The fill value lives on the dtype, not on the array itself.
        return self.dtype.fill_value
    @fill_value.setter
    def fill_value(self, value):
        # Only the dtype is rebuilt; sp_values and sp_index are untouched,
        # so stored values equal to the new fill value are NOT re-sparsified.
        self._dtype = SparseDtype(self.dtype.subtype, value)
666 @property
667 def kind(self) -> SparseIndexKind:
668 """
669 The kind of sparse index for this array. One of {'integer', 'block'}.
670 """
671 if isinstance(self.sp_index, IntIndex):
672 return "integer"
673 else:
674 return "block"
676 @property
677 def _valid_sp_values(self) -> np.ndarray:
678 sp_vals = self.sp_values
679 mask = notna(sp_vals)
680 return sp_vals[mask]
    def __len__(self) -> int:
        # Length of the logical (dense) array, not the number of stored values.
        return self.sp_index.length
    @property
    def _null_fill_value(self) -> bool:
        # True when the fill value is an NA value (delegated to the dtype).
        return self._dtype._is_na_fill_value
689 def _fill_value_matches(self, fill_value) -> bool:
690 if self._null_fill_value:
691 return isna(fill_value)
692 else:
693 return self.fill_value == fill_value
    @property
    def nbytes(self) -> int:
        # Footprint is the stored values plus the sparse index; the implicit
        # fill positions cost nothing.
        return self.sp_values.nbytes + self.sp_index.nbytes
699 @property
700 def density(self) -> float:
701 """
702 The percent of non- ``fill_value`` points, as decimal.
704 Examples
705 --------
706 >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
707 >>> s.density
708 0.6
709 """
710 return self.sp_index.npoints / self.sp_index.length
    @property
    def npoints(self) -> int:
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        # Delegates to the sparse index.
        return self.sp_index.npoints
725 def isna(self):
726 # If null fill value, we want SparseDtype[bool, true]
727 # to preserve the same memory usage.
728 dtype = SparseDtype(bool, self._null_fill_value)
729 if self._null_fill_value:
730 return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
731 mask = np.full(len(self), False, dtype=np.bool8)
732 mask[self.sp_index.indices] = isna(self.sp_values)
733 return type(self)(mask, fill_value=False, dtype=dtype)
    def fillna(
        self: SparseArrayT,
        value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> SparseArrayT:
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as all `fill_value` methods will be converted to
               an in-memory ndarray

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        # Exactly one of `value` / `method` must be given.
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        elif method is not None:
            # Method-based filling densifies the whole array.
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(
                msg,
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
            new_values = np.asarray(self)
            # interpolate_2d modifies new_values inplace
            interpolate_2d(new_values, method=method, limit=limit)
            return type(self)(new_values, fill_value=self.fill_value)

        else:
            # Value-based filling only touches the stored values.
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

        return self._simple_new(new_values, self._sparse_index, new_dtype)
    def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:
        """
        Shift values by ``periods`` positions, filling the vacated slots
        with ``fill_value`` (defaults to the dtype's NA value).
        """
        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        # Block of filler values to prepend/append; capped at len(self)
        # since shifting further than the length vacates everything.
        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])
    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
            Position of the first gap, or -1 if the array contains no
            fill values.
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            # Empty, or fully dense: no fill value present anywhere.
            return -1

        indices = self.sp_index.indices
        if not len(indices) or indices[0] > 0:
            # Position 0 is not a stored point, so it holds the fill value.
            return 0

        # a number larger than 1 should be appended to
        # the last in case of fill value only appears
        # in the tail of array
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1
    def unique(self: SparseArrayT) -> SparseArrayT:
        """Return unique values, preserving order of first appearance."""
        uniques = algos.unique(self.sp_values)
        if len(self.sp_values) != len(self):
            # There is at least one gap, so the fill value occurs in the
            # dense array and must be inserted at its first-appearance slot.
            fill_loc = self._first_fill_value_loc()
            # Inorder to align the behavior of pd.unique or
            # pd.Series.unique, we should keep the original
            # order, here we use unique again to find the
            # insertion place. Since the length of sp_values
            # is not large, maybe minor performance hurt
            # is worthwhile to the correctness.
            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
            uniques = np.insert(uniques, insert_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)
    def _values_for_factorize(self):
        """Return ``(dense values, na_value)`` as expected by factorize/hashing."""
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value
    def factorize(
        self,
        na_sentinel: int | lib.NoDefault = lib.no_default,
        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    ) -> tuple[np.ndarray, SparseArray]:
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(
            np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
        )
        if na_sentinel is lib.no_default:
            na_sentinel = -1
        if use_na_sentinel is lib.no_default or use_na_sentinel:
            # Remap the default -1 NA marker to the requested sentinel.
            codes[codes == -1] = na_sentinel
        uniques_sp = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques_sp
    def value_counts(self, dropna: bool = True) -> Series:
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import (
            Index,
            Series,
        )

        keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        # The gaps all hold the fill value; fold their count in (unless the
        # fill value is NA and dropna is requested).
        if fcounts > 0 and (not self._null_fill_value or not dropna):
            mask = isna(keys) if self._null_fill_value else keys == self.fill_value
            if mask.any():
                counts[mask] += fcounts
            else:
                # error: Argument 1 to "insert" has incompatible type "Union[
                # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
                # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
                # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
                # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
                # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
                keys = np.insert(keys, 0, self.fill_value)  # type: ignore[arg-type]
                counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndex):
            index = Index(keys)
        else:
            index = keys
        return Series(counts, index=index)
    def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str):
        """Compute quantiles, avoiding densification when possible."""
        if self._null_fill_value or self.sp_index.ngaps == 0:
            # We can avoid densifying
            npvalues = self.sp_values
            mask = np.zeros(npvalues.shape, dtype=bool)
        else:
            npvalues = self.to_numpy()
            # NOTE(review): this is a SparseArray, not an ndarray, while the
            # other branch produces an ndarray mask — confirm that
            # quantile_with_mask accepts both.
            mask = self.isna()

        fill_value = na_value_for_dtype(npvalues.dtype, compat=False)
        res_values = quantile_with_mask(
            npvalues,
            mask,
            fill_value,
            qs,
            interpolation,
        )

        # Special case: the returned array isn't _really_ sparse, so we don't
        # wrap it in a SparseArray
        return res_values
    # --------
    # Indexing
    # --------

    # Overloads: a scalar key returns a scalar; a sequence key (or a tuple
    # containing an ellipsis) returns a SparseArray.
    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self: SparseArrayT,
        key: SequenceIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT:
        ...
    def __getitem__(
        self: SparseArrayT,
        key: PositionalIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT | Any:
        # Dispatch on key type: integer -> scalar; tuple -> densify; slice ->
        # sparse fastpath; array-like/SparseArray -> take.

        if isinstance(key, tuple):
            key = unpack_tuple_and_ellipses(key)
            # Non-overlapping identity check (left operand type:
            # "Union[Union[Union[int, integer[Any]], Union[slice, List[int],
            # ndarray[Any, Any]]], Tuple[Union[int, ellipsis], ...]]",
            # right operand type: "ellipsis")
            if key is Ellipsis:  # type: ignore[comparison-overlap]
                raise ValueError("Cannot slice with Ellipsis")

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
            # for "ndarray[Any, Any]"; expected type
            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
            # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
            # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
            # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
            # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
            # _NestedSequence[Union[bool, int]]], ...]]"
            data_slice = self.to_dense()[key]  # type: ignore[index]
        elif isinstance(key, slice):

            # Avoid densifying when handling contiguous slices
            if key.step is None or key.step == 1:
                # Normalize negative bounds to absolute positions.
                start = 0 if key.start is None else key.start
                if start < 0:
                    start += len(self)

                end = len(self) if key.stop is None else key.stop
                if end < 0:
                    end += len(self)

                # Keep only the stored points falling inside [start, end).
                indices = self.sp_index.indices
                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
                sp_vals = self.sp_values[keep_inds]

                sp_index = indices[keep_inds].copy()

                # If we've sliced to not include the start of the array, all our indices
                # should be shifted. NB: here we are careful to also not shift by a
                # negative value for a case like [0, 1][-100:] where the start index
                # should be treated like 0
                if start > 0:
                    sp_index -= start

                # Length of our result should match applying this slice to a range
                # of the length of our original array
                new_len = len(range(len(self))[key])
                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
            else:
                # Stepped slice: fall back to positional take.
                indices = np.arange(len(self), dtype=np.int32)[key]
                return self.take(indices)

        elif not is_list_like(key):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        else:
            if isinstance(key, SparseArray):
                # NOTE: If we guarantee that SparseDType(bool)
                # has only fill_value - true, false or nan
                # (see GH PR 44955)
                # we can apply mask very fast:
                if is_bool_dtype(key):
                    if isna(key.fill_value):
                        return self.take(key.sp_index.indices[key.sp_values])
                    if not key.fill_value:
                        return self.take(key.sp_index.indices)
                    n = len(self)
                    # NOTE(review): np.bool8 is a deprecated NumPy alias
                    # (removed in NumPy >= 2.0); consider np.bool_.
                    mask = np.full(n, True, dtype=np.bool8)
                    mask[key.sp_index.indices] = False
                    return self.take(np.arange(n)[mask])
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                # mypy doesn't know we have an array here
                key = cast(np.ndarray, key)
                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        # Only the tuple branch falls through to here.
        return type(self)(data_slice, kind=self.kind)
    def _get_val_at(self, loc):
        """Return the scalar at position ``loc`` (negative indices allowed)."""
        loc = validate_insert_loc(loc, len(self))

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            # Position is a gap: it holds the fill value.
            return self.fill_value
        else:
            val = self.sp_values[sp_loc]
            # Box datetimelike scalars (e.g. np.datetime64 -> Timestamp).
            val = maybe_box_datetimelike(val, self.sp_values.dtype)
            return val
    def take(
        self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
    ) -> SparseArrayT:
        """
        Take elements at ``indices``.

        With ``allow_fill=True``, delegates to ``_take_with_fill`` so that
        ``-1`` entries yield ``fill_value``; otherwise delegates to
        ``_take_without_fill``.
        """
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        dtype = None
        if indices.size == 0:
            # Empty take: result is empty but keeps our dtype.
            result = np.array([], dtype="object")
            dtype = self.dtype
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
        else:
            return self._take_without_fill(indices)

        return type(self)(
            result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
        )
    def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
        """
        ``take`` with ``allow_fill=True`` semantics: -1 entries in ``indices``
        are replaced with ``fill_value`` (defaulting to this dtype's NA value).

        Returns a dense ndarray; the caller re-wraps it as a SparseArray.
        """
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        # sp_indexer may be -1 for two reasons
        # 1.) we took for an index of -1 (new)
        # 2.) we took a value that was self.fill_value (old)
        sp_indexer = self.sp_index.lookup_array(indices)
        new_fill_indices = indices == -1
        old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

        if self.sp_index.npoints == 0 and old_fill_indices.all():
            # We've looked up all valid points on an all-sparse array.
            taken = np.full(
                sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
            )

        elif self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
        else:
            taken = self.sp_values.take(sp_indexer)

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                # Widen the dtype so self.fill_value fits, then write it in.
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                # Same widening for the user-requested fill_value.
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken
    def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
        """
        ``take`` with ``allow_fill=False`` semantics: negative ``indices``
        count from the end, as with ``ndarray.take``.
        """
        to_shift = indices < 0

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            else:
                raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            # Normalize negative indices; copy first to avoid mutating input.
            indices = indices.copy()
            indices[to_shift] += n

        sp_indexer = self.sp_index.lookup_array(indices)
        value_mask = sp_indexer != -1
        # Only positions that hit stored points contribute sparse values.
        new_sp_values = self.sp_values[sp_indexer[value_mask]]

        value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

        new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
        return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
1185 def searchsorted(
1186 self,
1187 v: ArrayLike | object,
1188 side: Literal["left", "right"] = "left",
1189 sorter: NumpySorter = None,
1190 ) -> npt.NDArray[np.intp] | np.intp:
1192 msg = "searchsorted requires high memory usage."
1193 warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
1194 if not is_scalar(v):
1195 v = np.asarray(v)
1196 v = np.asarray(v)
1197 return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
1199 def copy(self: SparseArrayT) -> SparseArrayT:
1200 values = self.sp_values.copy()
1201 return self._simple_new(values, self.sp_index, self.dtype)
    @classmethod
    def _concat_same_type(
        cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
    ) -> SparseArrayT:
        """
        Concatenate multiple SparseArrays into one.

        The fill value and sparse kind of the first array determine the
        result's fill value and kind.
        """
        fill_value = to_concat[0].fill_value

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        sp_index: SparseIndex
        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                # Shift each array's indices past everything already placed.
                int_idx = arr.sp_index.indices.copy()
                int_idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(int_idx)

            data = np.concatenate(values)
            indices_arr = np.concatenate(indices)
            # error: Argument 2 to "IntIndex" has incompatible type
            # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
            # expected "Sequence[int]"
            sp_index = IntIndex(length, indices_arr)  # type: ignore[arg-type]

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                block_idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(block_idx.blocs.copy() + length)
                blengths.append(block_idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs_arr = np.concatenate(blocs)
            blengths_arr = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs_arr, blengths_arr)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)
    def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(SparseDtype(np.dtype('int32')))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(SparseDtype(np.dtype('float64')))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a SparseDtype, you can also change the fill value as well.

        >>> arr.astype(SparseDtype("float64", fill_value=0.0))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0.0, 0.0, 1.0, 2.0]
        Fill: 0.0
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        if is_dtype_equal(dtype, self._dtype):
            # Same dtype: short-circuit, honoring the copy request.
            if not copy:
                return self
            else:
                return self.copy()

        future_dtype = pandas_dtype(dtype)
        if not isinstance(future_dtype, SparseDtype):
            # GH#34457
            warnings.warn(
                "The behavior of .astype from SparseDtype to a non-sparse dtype "
                "is deprecated. In a future version, this will return a non-sparse "
                "array with the requested dtype. To retain the old behavior, use "
                "`obj.astype(SparseDtype(dtype))`",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        dtype = self.dtype.update_dtype(dtype)
        subtype = pandas_dtype(dtype._subtype_with_str)
        # Only the stored values need converting; the index is unchanged.
        sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)

        # error: Argument 1 to "_simple_new" of "SparseArray" has incompatible type
        # "ExtensionArray"; expected "ndarray"
        return self._simple_new(
            sp_values, self.sp_index, dtype  # type: ignore[arg-type]
        )
1346 def map(self: SparseArrayT, mapper) -> SparseArrayT:
1347 """
1348 Map categories using an input mapping or function.
1350 Parameters
1351 ----------
1352 mapper : dict, Series, callable
1353 The correspondence from old values to new.
1355 Returns
1356 -------
1357 SparseArray
1358 The output array will have the same density as the input.
1359 The output fill value will be the result of applying the
1360 mapping to ``self.fill_value``
1362 Examples
1363 --------
1364 >>> arr = pd.arrays.SparseArray([0, 1, 2])
1365 >>> arr.map(lambda x: x + 10)
1366 [10, 11, 12]
1367 Fill: 10
1368 IntIndex
1369 Indices: array([1, 2], dtype=int32)
1371 >>> arr.map({0: 10, 1: 11, 2: 12})
1372 [10, 11, 12]
1373 Fill: 10
1374 IntIndex
1375 Indices: array([1, 2], dtype=int32)
1377 >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
1378 [10, 11, 12]
1379 Fill: 10
1380 IntIndex
1381 Indices: array([1, 2], dtype=int32)
1382 """
1383 # this is used in apply.
1384 # We get hit since we're an "is_extension_type" but regular extension
1385 # types are not hit. This may be worth adding to the interface.
1386 if isinstance(mapper, ABCSeries):
1387 mapper = mapper.to_dict()
1389 if isinstance(mapper, abc.Mapping):
1390 fill_value = mapper.get(self.fill_value, self.fill_value)
1391 sp_values = [mapper.get(x, None) for x in self.sp_values]
1392 else:
1393 fill_value = mapper(self.fill_value)
1394 sp_values = [mapper(x) for x in self.sp_values]
1396 return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)
1398 def to_dense(self) -> np.ndarray:
1399 """
1400 Convert SparseArray to a NumPy array.
1402 Returns
1403 -------
1404 arr : NumPy array
1405 """
1406 return np.asarray(self, dtype=self.sp_values.dtype)
    def _where(self, mask, value):
        # NB: may not preserve dtype, e.g. result may be Sparse[float64]
        # while self is Sparse[int64]
        # Densify via np.where, then re-sparsify keeping our fill_value.
        naive_implementation = np.where(mask, self, value)
        dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
        result = type(self)._from_sequence(naive_implementation, dtype=dtype)
        return result
1416 # ------------------------------------------------------------------------
1417 # IO
1418 # ------------------------------------------------------------------------
    def __setstate__(self, state) -> None:
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            # Legacy pickles stored (ndarray state, (fill_value, sp_index)).
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            # Modern pickles store the instance __dict__ directly.
            self.__dict__.update(state)
1433 def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
1434 if self.fill_value == 0:
1435 return (self.sp_index.indices,)
1436 else:
1437 return (self.sp_index.indices[self.sp_values != 0],)
1439 # ------------------------------------------------------------------------
1440 # Reductions
1441 # ------------------------------------------------------------------------
    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """Dispatch a named reduction (e.g. "sum", "mean") to the matching method."""
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            # NOTE(review): dropping NAs on the skipna=False path reads as
            # inverted — confirm against the individual reduction methods.
            arr = self.dropna()

        return getattr(arr, name)(**kwargs)
1456 def all(self, axis=None, *args, **kwargs):
1457 """
1458 Tests whether all elements evaluate True
1460 Returns
1461 -------
1462 all : bool
1464 See Also
1465 --------
1466 numpy.all
1467 """
1468 nv.validate_all(args, kwargs)
1470 values = self.sp_values
1472 if len(values) != len(self) and not np.all(self.fill_value):
1473 return False
1475 return values.all()
1477 def any(self, axis=0, *args, **kwargs):
1478 """
1479 Tests whether at least one of elements evaluate True
1481 Returns
1482 -------
1483 any : bool
1485 See Also
1486 --------
1487 numpy.any
1488 """
1489 nv.validate_any(args, kwargs)
1491 values = self.sp_values
1493 if len(values) != len(self) and np.any(self.fill_value):
1494 return True
1496 return values.any().item()
    def sum(
        self, axis: int = 0, min_count: int = 0, skipna: bool = True, *args, **kwargs
    ) -> Scalar:
        """
        Sum of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        min_count : int, default 0
            The required number of valid values to perform the summation. If fewer
            than ``min_count`` valid values are present, the result will be the missing
            value indicator for subarray type.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        # NA can only come from gaps when the fill value itself is not NA.
        has_na = self.sp_index.ngaps > 0 and not self._null_fill_value

        if has_na and not skipna:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

        if self._null_fill_value:
            # Gaps are NA: only the stored valid values count.
            if check_below_min_count(valid_vals.shape, None, min_count):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum
        else:
            # Gaps hold fill_value: they count as valid and contribute.
            nsparse = self.sp_index.ngaps
            if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum + self.fill_value * nsparse
    def cumsum(self, axis: int = 0, *args, **kwargs) -> SparseArray:
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any non-NA/null values will
        be skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            # Non-NA fill values contribute to the running sum, so densify.
            return SparseArray(self.to_dense()).cumsum()

        # NA fill: gaps are skipped, so summing the stored values suffices
        # and the sparse index can be reused.
        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )
1569 def mean(self, axis=0, *args, **kwargs):
1570 """
1571 Mean of non-NA/null values
1573 Returns
1574 -------
1575 mean : float
1576 """
1577 nv.validate_mean(args, kwargs)
1578 valid_vals = self._valid_sp_values
1579 sp_sum = valid_vals.sum()
1580 ct = len(valid_vals)
1582 if self._null_fill_value:
1583 return sp_sum / ct
1584 else:
1585 nsparse = self.sp_index.ngaps
1586 return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
1588 def max(self, *, axis: int | None = None, skipna: bool = True):
1589 """
1590 Max of array values, ignoring NA values if specified.
1592 Parameters
1593 ----------
1594 axis : int, default 0
1595 Not Used. NumPy compatibility.
1596 skipna : bool, default True
1597 Whether to ignore NA values.
1599 Returns
1600 -------
1601 scalar
1602 """
1603 nv.validate_minmax_axis(axis, self.ndim)
1604 return self._min_max("max", skipna=skipna)
1606 def min(self, *, axis: int | None = None, skipna: bool = True):
1607 """
1608 Min of array values, ignoring NA values if specified.
1610 Parameters
1611 ----------
1612 axis : int, default 0
1613 Not Used. NumPy compatibility.
1614 skipna : bool, default True
1615 Whether to ignore NA values.
1617 Returns
1618 -------
1619 scalar
1620 """
1621 nv.validate_minmax_axis(axis, self.ndim)
1622 return self._min_max("min", skipna=skipna)
    def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
        """
        Min/max of non-NA/null values

        Parameters
        ----------
        kind : {"min", "max"}
        skipna : bool

        Returns
        -------
        scalar
        """
        valid_vals = self._valid_sp_values
        # Gaps hold a real (non-NA) fill value that competes for min/max.
        has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0

        if len(valid_vals) > 0:
            sp_min_max = getattr(valid_vals, kind)()

            # If a non-null fill value is currently present, it might be the min/max
            if has_nonnull_fill_vals:
                func = max if kind == "max" else min
                return func(sp_min_max, self.fill_value)
            elif skipna:
                return sp_min_max
            elif self.sp_index.ngaps == 0:
                # No NAs present
                return sp_min_max
            else:
                # NAs present and not skipped: result is NA.
                return na_value_for_dtype(self.dtype.subtype, compat=False)
        elif has_nonnull_fill_vals:
            # No stored valid values; only the fill value remains.
            return self.fill_value
        else:
            return na_value_for_dtype(self.dtype.subtype, compat=False)
    def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
        """Shared implementation of argmin/argmax over dense positions."""
        values = self._sparse_values
        index = self._sparse_index.indices
        mask = np.asarray(isna(values))
        func = np.argmax if kind == "argmax" else np.argmin

        idx = np.arange(values.shape[0])
        non_nans = values[~mask]
        non_nan_idx = idx[~mask]

        # Best candidate among the stored (non-NA) values, translated back
        # to its dense position via the sparse index.
        _candidate = non_nan_idx[func(non_nans)]
        candidate = index[_candidate]

        if isna(self.fill_value):
            # NA fill can never win; the stored candidate stands.
            return candidate
        if kind == "argmin" and self[candidate] < self.fill_value:
            return candidate
        if kind == "argmax" and self[candidate] > self.fill_value:
            return candidate
        # Fill value ties/beats the candidate: report the first dense
        # position that holds fill_value, if such a position exists.
        _loc = self._first_fill_value_loc()
        if _loc == -1:
            # fill_value doesn't exist
            return candidate
        else:
            return _loc
1686 def argmax(self, skipna: bool = True) -> int:
1687 validate_bool_kwarg(skipna, "skipna")
1688 if not skipna and self._hasna:
1689 raise NotImplementedError
1690 return self._argmin_argmax("argmax")
1692 def argmin(self, skipna: bool = True) -> int:
1693 validate_bool_kwarg(skipna, "skipna")
1694 if not skipna and self._hasna:
1695 raise NotImplementedError
1696 return self._argmin_argmax("argmin")
    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    # Input types (besides SparseArray itself) that __array_ufunc__ accepts.
    _HANDLED_TYPES = (np.ndarray, numbers.Number)
    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        """NumPy ufunc protocol: keep single-input ufuncs sparse, densify otherwise."""
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
            res = arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )
            return res

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        if len(inputs) == 1:
            # No alignment necessary.
            # Apply the ufunc to stored values and fill value separately.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if ufunc.nout > 1:
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif method == "reduce":
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        # Multiple inputs: fall back to dense computation.
        new_inputs = tuple(np.asarray(x) for x in inputs)
        result = getattr(ufunc, method)(*new_inputs, **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if ufunc.nout > 1:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)
1770 # ------------------------------------------------------------------------
1771 # Ops
1772 # ------------------------------------------------------------------------
    def _arith_method(self, other, op):
        """Arithmetic ops (+, -, *, /, divmod, ...), dispatched via OpsMixin."""
        op_name = op.__name__

        if isinstance(other, SparseArray):
            return _sparse_array_op(self, other, op, op_name)

        elif is_scalar(other):
            with np.errstate(all="ignore"):
                # Apply op to the fill value and stored values separately.
                fill = op(_get_fill(self), np.asarray(other))
                result = op(self.sp_values, other)

            if op_name == "divmod":
                # divmod yields a (quotient, remainder) pair of arrays.
                left, right = result
                lfill, rfill = fill
                return (
                    _wrap_result(op_name, left, self.sp_index, lfill),
                    _wrap_result(op_name, right, self.sp_index, rfill),
                )

            return _wrap_result(op_name, result, self.sp_index, fill)

        else:
            # List-like: coerce to a SparseArray sharing our fill value.
            other = np.asarray(other)
            with np.errstate(all="ignore"):
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                if not isinstance(other, SparseArray):
                    dtype = getattr(other, "dtype", None)
                    other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
                return _sparse_array_op(self, other, op, op_name)
    def _cmp_method(self, other, op) -> SparseArray:
        """Comparison ops (==, <, ...), returning a boolean SparseArray."""
        if not is_scalar(other) and not isinstance(other, type(self)):
            # convert list-like to ndarray
            other = np.asarray(other)

        if isinstance(other, np.ndarray):
            # TODO: make this more flexible than just ndarray...
            other = SparseArray(other, fill_value=self.fill_value)

        if isinstance(other, SparseArray):
            if len(self) != len(other):
                raise ValueError(
                    f"operands have mismatched length {len(self)} and {len(other)}"
                )

            op_name = op.__name__.strip("_")
            return _sparse_array_op(self, other, op, op_name)
        else:
            # scalar
            with np.errstate(all="ignore"):
                # Start from the fill-value comparison, then overwrite the
                # positions that hold stored values.
                fill_value = op(self.fill_value, other)
                result = np.full(len(self), fill_value, dtype=np.bool_)
                result[self.sp_index.indices] = op(self.sp_values, other)

            return type(self)(
                result,
                fill_value=fill_value,
                dtype=np.bool_,
            )
    # Logical ops (&, |, ^) share the comparison implementation.
    _logical_method = _cmp_method
1839 def _unary_method(self, op) -> SparseArray:
1840 fill_value = op(np.array(self.fill_value)).item()
1841 dtype = SparseDtype(self.dtype.subtype, fill_value)
1842 # NOTE: if fill_value doesn't change
1843 # we just have to apply op to sp_values
1844 if isna(self.fill_value) or fill_value == self.fill_value:
1845 values = op(self.sp_values)
1846 return type(self)._simple_new(values, self.sp_index, self.dtype)
1847 # In the other case we have to recalc indexes
1848 return type(self)(op(self.to_dense()), dtype=dtype)
    def __pos__(self) -> SparseArray:
        # Elementwise unary plus.
        return self._unary_method(operator.pos)

    def __neg__(self) -> SparseArray:
        # Elementwise negation.
        return self._unary_method(operator.neg)

    def __invert__(self) -> SparseArray:
        # Elementwise bitwise/logical inversion.
        return self._unary_method(operator.invert)

    def __abs__(self) -> SparseArray:
        # Elementwise absolute value.
        return self._unary_method(operator.abs)
1862 # ----------
1863 # Formatting
1864 # -----------
1865 def __repr__(self) -> str:
1866 pp_str = printing.pprint_thing(self)
1867 pp_fill = printing.pprint_thing(self.fill_value)
1868 pp_index = printing.pprint_thing(self.sp_index)
1869 return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"
    def _formatter(self, boxed=False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None
def make_sparse(
    arr: np.ndarray,
    kind: SparseIndexKind = "block",
    fill_value=None,
    dtype: NpDtype | None = None,
):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    assert isinstance(arr, np.ndarray)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        # NA fill: stored points are exactly the non-NA positions.
        mask = notna(arr)
    else:
        # cast to object comparison to be safe
        if is_string_dtype(arr.dtype):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # element-wise equality check method in numpy doesn't treat
            # each element type, eg. 0, 0.0, and False are treated as
            # same. So we have to check the both of its type and value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = make_sparse_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        # error: Argument "dtype" to "astype_nansafe" has incompatible type "Union[str,
        # dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]"
        sparsified_values = astype_nansafe(
            sparsified_values, dtype=dtype  # type: ignore[arg-type]
        )
    # TODO: copy
    return sparsified_values, index, fill_value
# Typing-only overloads: the concrete return type follows ``kind``.
@overload
def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
    ...


@overload
def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
    ...
def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
    """Build a BlockIndex or IntIndex of total ``length`` from stored ``indices``."""
    index: SparseIndex
    if kind == "integer":
        index = IntIndex(length, indices)
    elif kind == "block":
        # Collapse consecutive runs of indices into (location, length) blocks.
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index