Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/masked.py: 16%
537 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from typing import (
4 TYPE_CHECKING,
5 Any,
6 Literal,
7 Sequence,
8 TypeVar,
9 overload,
10)
11import warnings
13import numpy as np
15from pandas._libs import (
16 lib,
17 missing as libmissing,
18)
19from pandas._typing import (
20 ArrayLike,
21 AstypeArg,
22 DtypeObj,
23 NpDtype,
24 PositionalIndexer,
25 Scalar,
26 ScalarIndexer,
27 SequenceIndexer,
28 Shape,
29 npt,
30)
31from pandas.errors import AbstractMethodError
32from pandas.util._decorators import doc
33from pandas.util._validators import validate_fillna_kwargs
35from pandas.core.dtypes.astype import astype_nansafe
36from pandas.core.dtypes.base import ExtensionDtype
37from pandas.core.dtypes.common import (
38 is_bool,
39 is_bool_dtype,
40 is_datetime64_dtype,
41 is_dtype_equal,
42 is_float_dtype,
43 is_integer_dtype,
44 is_list_like,
45 is_object_dtype,
46 is_scalar,
47 is_string_dtype,
48 pandas_dtype,
49)
50from pandas.core.dtypes.dtypes import BaseMaskedDtype
51from pandas.core.dtypes.inference import is_array_like
52from pandas.core.dtypes.missing import (
53 array_equivalent,
54 is_valid_na_for_dtype,
55 isna,
56 notna,
57)
59from pandas.core import (
60 algorithms as algos,
61 arraylike,
62 missing,
63 nanops,
64 ops,
65)
66from pandas.core.algorithms import (
67 factorize_array,
68 isin,
69 take,
70)
71from pandas.core.array_algos import masked_reductions
72from pandas.core.array_algos.quantile import quantile_with_mask
73from pandas.core.arraylike import OpsMixin
74from pandas.core.arrays import ExtensionArray
75from pandas.core.construction import ensure_wrapped_if_datetimelike
76from pandas.core.indexers import check_array_indexer
77from pandas.core.ops import invalid_comparison
79if TYPE_CHECKING: 79 ↛ 80line 79 didn't jump to line 80, because the condition on line 79 was never true
80 from pandas import Series
81 from pandas.core.arrays import BooleanArray
82 from pandas._typing import (
83 NumpySorter,
84 NumpyValueArrayLike,
85 )
87from pandas.compat.numpy import function as nv
89BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
92class BaseMaskedArray(OpsMixin, ExtensionArray):
93 """
94 Base class for masked arrays (which use _data and _mask to store the data).
96 numpy based
97 """
99 # The value used to fill '_data' to avoid upcasting
100 _internal_fill_value: Scalar
101 # our underlying data and mask are each ndarrays
102 _data: np.ndarray
103 _mask: npt.NDArray[np.bool_]
105 # Fill values used for any/all
106 _truthy_value = Scalar # bool(_truthy_value) = True
107 _falsey_value = Scalar # bool(_falsey_value) = False
def __init__(
    self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
) -> None:
    """
    Store ``values`` and ``mask`` as the backing ``_data`` / ``_mask`` arrays.

    Parameters
    ----------
    values : np.ndarray
        The data; expected to be validated by the subclass beforehand.
    mask : np.ndarray of bool
        Must be a boolean ndarray with the same shape as ``values``.
    copy : bool, default False
        If True, both arrays are copied before being stored.

    Raises
    ------
    TypeError
        If ``mask`` is not a boolean numpy array.
    ValueError
        If ``values.shape`` differs from ``mask.shape``.
    """
    # values is supposed to already be validated in the subclass
    mask_is_bool_ndarray = isinstance(mask, np.ndarray) and mask.dtype == np.bool_
    if not mask_is_bool_ndarray:
        raise TypeError(
            "mask should be boolean numpy array. Use "
            "the 'pd.array' function instead"
        )
    if values.shape != mask.shape:
        raise ValueError("values.shape must match mask.shape")

    self._data = values.copy() if copy else values
    self._mask = mask.copy() if copy else mask
@classmethod
def _from_sequence(
    cls: type[BaseMaskedArrayT], scalars, *, dtype=None, copy: bool = False
) -> BaseMaskedArrayT:
    """
    Construct an instance from ``scalars``.

    The conversion is delegated to ``cls._coerce_to_array``, whose
    (values, mask) result is passed to the constructor.
    """
    coerced = cls._coerce_to_array(scalars, dtype=dtype, copy=copy)
    data, mask = coerced
    return cls(data, mask)
@property
def dtype(self) -> BaseMaskedDtype:
    """Abstract here: always raises ``AbstractMethodError``."""
    error = AbstractMethodError(self)
    raise error
@overload
def __getitem__(self, item: ScalarIndexer) -> Any:
    ...


@overload
def __getitem__(self: BaseMaskedArrayT, item: SequenceIndexer) -> BaseMaskedArrayT:
    ...


def __getitem__(
    self: BaseMaskedArrayT, item: PositionalIndexer
) -> BaseMaskedArrayT | Any:
    """
    Select by position.

    When indexing the mask yields a single boolean, the result is a scalar:
    ``self.dtype.na_value`` if that boolean is True, else the data element.
    Otherwise a new array of the same type is built from the selected data
    and mask.
    """
    item = check_array_indexer(self, item)

    selected_mask = self._mask[item]
    if not is_bool(selected_mask):
        # Non-scalar selection: wrap the selected data and mask.
        return type(self)(self._data[item], selected_mask)

    # This is a scalar indexing
    return self.dtype.na_value if selected_mask else self._data[item]
@doc(ExtensionArray.fillna)
def fillna(
    self: BaseMaskedArrayT, value=None, method=None, limit=None
) -> BaseMaskedArrayT:
    value, method = validate_fillna_kwargs(value, method)

    na_mask = self._mask

    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                f"Length of 'value' does not match. Got ({len(value)}) "
                f" expected {len(self)}"
            )
        # Keep only the replacement entries that line up with masked slots.
        value = value[na_mask]

    if not na_mask.any():
        # Nothing is masked; a copy is still returned.
        return self.copy()

    if method is None:
        # fill with value
        filled = self.copy()
        filled[na_mask] = value
        return filled

    # Method-based fill: the fill function mutates the transposed copies.
    fill_func = missing.get_fill_func(method, ndim=self.ndim)
    data_t = self._data.copy().T
    mask_t = na_mask.copy().T
    fill_func(data_t, limit=limit, mask=mask_t)
    return type(self)(data_t.T, mask_t.T)
@classmethod
def _coerce_to_array(
    cls, values, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """Abstract here: always raises ``AbstractMethodError``."""
    error = AbstractMethodError(cls)
    raise error
def _validate_setitem_value(self, value):
    """
    Check if we have a scalar that we can cast losslessly.

    The accepted scalars depend on ``self.dtype.kind``: booleans for "b";
    integers or floats for "f"; for any other kind, integers or floats for
    which ``value.is_integer()`` is true.

    Returns
    -------
    The unchanged ``value`` when it is accepted.

    Raises
    ------
    TypeError
        If ``value`` is not accepted for this dtype.
    """
    kind = self.dtype.kind
    # TODO: get this all from np_can_hold_element?
    if kind == "b":
        acceptable = lib.is_bool(value)
    elif kind == "f":
        acceptable = lib.is_integer(value) or lib.is_float(value)
    else:
        # TODO: unsigned checks
        acceptable = lib.is_integer(value) or (
            lib.is_float(value) and value.is_integer()
        )

    if acceptable:
        return value

    # Note: without the "str" here, the f-string rendering raises in
    # py38 builds.
    raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}")
def __setitem__(self, key, value) -> None:
    """
    Assign ``value`` at ``key``, updating both data and mask.

    A scalar that is a valid NA for this dtype only sets the mask to True.
    Any other scalar is validated, written to the data, and unmasked.
    Non-scalar values are coerced to a (values, mask) pair first.
    """
    key = check_array_indexer(self, key)

    if not is_scalar(value):
        new_data, new_mask = self._coerce_to_array(value, dtype=self.dtype)
        self._data[key] = new_data
        self._mask[key] = new_mask
        return

    if is_valid_na_for_dtype(value, self.dtype):
        self._mask[key] = True
        return

    self._data[key] = self._validate_setitem_value(value)
    self._mask[key] = False
def __iter__(self):
    """
    Iterate over the elements.

    For 1-dimensional arrays, yields ``self.dtype.na_value`` for masked
    positions and the underlying data element otherwise. For other
    dimensionalities, yields ``self[i]`` for each ``i`` in ``range(len(self))``.
    """
    if self.ndim != 1:
        for i in range(len(self)):
            yield self[i]
        return

    # Look the NA value up once instead of on every masked element, and
    # walk mask and data in lockstep rather than indexing by position.
    na_value = self.dtype.na_value
    for is_missing, element in zip(self._mask, self._data):
        yield na_value if is_missing else element
def __len__(self) -> int:
    """Return ``len`` of the underlying data array."""
    data = self._data
    return len(data)
@property
def shape(self) -> Shape:
    """Shape of the underlying data array."""
    data = self._data
    return data.shape
@property
def ndim(self) -> int:
    """Number of dimensions of the underlying data array."""
    data = self._data
    return data.ndim
def swapaxes(self: BaseMaskedArrayT, axis1, axis2) -> BaseMaskedArrayT:
    """Return a new array with ``axis1`` and ``axis2`` swapped in data and mask."""
    swapped_data = self._data.swapaxes(axis1, axis2)
    swapped_mask = self._mask.swapaxes(axis1, axis2)
    klass = type(self)
    return klass(swapped_data, swapped_mask)
def delete(self: BaseMaskedArrayT, loc, axis: int = 0) -> BaseMaskedArrayT:
    """Return a new array with the entries at ``loc`` removed along ``axis``."""
    kept_data = np.delete(self._data, loc, axis=axis)
    kept_mask = np.delete(self._mask, loc, axis=axis)
    klass = type(self)
    return klass(kept_data, kept_mask)
def reshape(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT:
    """Return a new array whose data and mask are reshaped with the same arguments."""
    reshaped_data = self._data.reshape(*args, **kwargs)
    reshaped_mask = self._mask.reshape(*args, **kwargs)
    klass = type(self)
    return klass(reshaped_data, reshaped_mask)
def ravel(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT:
    """Return a new array whose data and mask are raveled with the same arguments."""
    # TODO: need to make sure we have the same order for data/mask
    flat_data = self._data.ravel(*args, **kwargs)
    flat_mask = self._mask.ravel(*args, **kwargs)
    klass = type(self)
    return klass(flat_data, flat_mask)
@property
def T(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """New array built from the transposed data and transposed mask."""
    transposed_data = self._data.T
    transposed_mask = self._mask.T
    return type(self)(transposed_data, transposed_mask)
def round(self, decimals: int = 0, *args, **kwargs):
    """
    Round each value in the array to the given number of decimals.

    Parameters
    ----------
    decimals : int, default 0
        Number of decimal places to round to. If decimals is negative,
        it specifies the number of positions to the left of the decimal point.
    *args, **kwargs
        Additional arguments and keywords have no effect but might be
        accepted for compatibility with NumPy.

    Returns
    -------
    NumericArray
        Rounded values of the NumericArray.

    See Also
    --------
    numpy.around : Round values of an np.array.
    DataFrame.round : Round values of a DataFrame.
    Series.round : Round values of a Series.
    """
    nv.validate_round(args, kwargs)
    rounded = np.round(self._data, decimals=decimals, **kwargs)

    # Usually we'll get same type as self, but ndarray[bool] casts to float
    result_mask = self._mask.copy()
    return self._maybe_mask_result(rounded, result_mask)
319 # ------------------------------------------------------------------
320 # Unary Methods
def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """Return a new array holding ``~`` of the data and a copy of the mask."""
    inverted = ~self._data
    mask_copy = self._mask.copy()
    return type(self)(inverted, mask_copy)
def __neg__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """Return a new array holding the negated data and a copy of the mask."""
    negated = -self._data
    mask_copy = self._mask.copy()
    return type(self)(negated, mask_copy)
def __pos__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """Unary plus: return ``self.copy()``."""
    duplicate = self.copy()
    return duplicate
def __abs__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """Return a new array holding ``abs`` of the data and a copy of the mask."""
    magnitudes = abs(self._data)
    mask_copy = self._mask.copy()
    return type(self)(magnitudes, mask_copy)
334 # ------------------------------------------------------------------
def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value: object = lib.no_default,
) -> np.ndarray:
    """
    Convert to a NumPy Array.

    By default converts to an object-dtype NumPy array. Specify the `dtype` and
    `na_value` keywords to customize the conversion.

    Parameters
    ----------
    dtype : dtype, default object
        The numpy dtype to convert to.
    copy : bool, default False
        Whether to ensure that the returned value is not a view on
        the array. Note that ``copy=False`` does not *ensure* that
        ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
        a copy is made, even if not strictly necessary. Avoiding a copy
        is typically only possible when no missing values are present
        and `dtype` is the equivalent numpy dtype.
    na_value : scalar, optional
        Scalar missing value indicator to use in numpy array. Defaults
        to the native missing value indicator of this array (pd.NA).

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        If missing values are present, `dtype` is neither an object nor a
        string dtype, and `na_value` is left as ``pd.NA``.

    Examples
    --------
    An object-dtype is the default result

    >>> a = pd.array([True, False, pd.NA], dtype="boolean")
    >>> a.to_numpy()
    array([True, False, <NA>], dtype=object)

    When no missing values are present, an equivalent dtype can be used.

    >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
    array([ True, False])
    >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
    array([1, 2])

    However, requesting such dtype will raise a ValueError if
    missing values are present and the default missing value :attr:`NA`
    is used.

    >>> a = pd.array([True, False, pd.NA], dtype="boolean")
    >>> a
    <BooleanArray>
    [True, False, <NA>]
    Length: 3, dtype: boolean

    >>> a.to_numpy(dtype="bool")
    Traceback (most recent call last):
    ...
    ValueError: cannot convert to 'bool'-dtype NumPy array with missing values. Specify an appropriate 'na_value' for this dtype.

    Specify a valid `na_value` instead

    >>> a.to_numpy(dtype="bool", na_value=False)
    array([ True, False, False])
    """
    if na_value is lib.no_default:
        na_value = libmissing.NA
    if dtype is None:
        dtype = object

    if not self._hasna:
        return self._data.astype(dtype, copy=copy)

    # pd.NA can only be stored in object or string results.
    holds_na = is_object_dtype(dtype) or is_string_dtype(dtype)
    if not holds_na and na_value is libmissing.NA:
        raise ValueError(
            f"cannot convert to '{dtype}'-dtype NumPy array "
            "with missing values. Specify an appropriate 'na_value' "
            "for this dtype."
        )
    # don't pass copy to astype -> always need a copy since we are mutating
    data = self._data.astype(dtype)
    data[self._mask] = na_value
    return data
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
    ...


@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
    ...


@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
    ...


def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
    """
    Cast to ``dtype``.

    An equal dtype returns ``self`` (or a copy when ``copy`` is True).
    Another masked dtype is built directly from the cast data and the mask.
    Any other extension dtype goes through its array type's
    ``_from_sequence``. Remaining dtypes are converted via ``to_numpy``.

    Raises
    ------
    ValueError
        If the target is an integer or bool dtype and missing values are
        present.
    """
    dtype = pandas_dtype(dtype)

    if is_dtype_equal(dtype, self.dtype):
        return self.copy() if copy else self

    # if we are astyping to another nullable masked dtype, we can fastpath
    if isinstance(dtype, BaseMaskedDtype):
        # TODO deal with NaNs for FloatingArray case
        new_data = self._data.astype(dtype.numpy_dtype, copy=copy)
        # mask is copied depending on whether the data was copied, and
        # not directly depending on the `copy` keyword
        if new_data is self._data:
            new_mask = self._mask
        else:
            new_mask = self._mask.copy()
        masked_cls = dtype.construct_array_type()
        return masked_cls(new_data, new_mask, copy=False)

    if isinstance(dtype, ExtensionDtype):
        target_cls = dtype.construct_array_type()
        return target_cls._from_sequence(self, dtype=dtype, copy=copy)

    na_value: float | np.datetime64 | lib.NoDefault

    # coerce
    if is_float_dtype(dtype):
        # In astype, we consider dtype=float to also mean na_value=np.nan
        na_value = np.nan
    elif is_datetime64_dtype(dtype):
        na_value = np.datetime64("NaT")
    else:
        na_value = lib.no_default

    # to_numpy will also raise, but we get somewhat nicer exception messages here
    if is_integer_dtype(dtype) and self._hasna:
        raise ValueError("cannot convert NA to integer")
    if is_bool_dtype(dtype) and self._hasna:
        # careful: astype_nansafe converts np.nan to True
        raise ValueError("cannot convert float NaN to bool")

    converted = self.to_numpy(dtype=dtype, na_value=na_value, copy=copy)
    if self.dtype.kind != "f":
        return converted
    # TODO: make this consistent between IntegerArray/FloatingArray,
    # see test_astype_str
    return astype_nansafe(converted, dtype, copy=False)
483 __array_priority__ = 1000 # higher than ndarray so ops dispatch to us
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    The array interface: return the values via ``to_numpy``.

    With the default ``dtype`` an object array is returned, which
    preserves our scalar values.
    """
    as_ndarray = self.to_numpy(dtype=dtype)
    return as_ndarray
492 _HANDLED_TYPES: tuple[type, ...]
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    Apply a NumPy ufunc: run it on the underlying data and mask the result.

    Returns ``NotImplemented`` when any input or ``out`` operand is neither
    one of ``self._HANDLED_TYPES`` nor a ``BaseMaskedArray``. Dispatch to the
    dunder ops, to the ``out=`` handler and to the reduction handler is
    tried, in that order, before the generic path.
    """
    out = kwargs.get("out", ())

    handled = self._HANDLED_TYPES + (BaseMaskedArray,)
    if any(not isinstance(operand, handled) for operand in inputs + out):
        return NotImplemented

    # for binary ops, use our custom dunder methods
    dispatched = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if dispatched is not NotImplemented:
        return dispatched

    if "out" in kwargs:
        # e.g. test_ufunc_with_out
        return arraylike.dispatch_ufunc_with_out(
            self, ufunc, method, *inputs, **kwargs
        )

    is_reduce = method == "reduce"
    if is_reduce:
        reduced = arraylike.dispatch_reduction_ufunc(
            self, ufunc, method, *inputs, **kwargs
        )
        if reduced is not NotImplemented:
            return reduced

    # Combined mask of all masked inputs; masked inputs contribute their data.
    mask = np.zeros(len(self), dtype=bool)
    raw_inputs = []
    for operand in inputs:
        if isinstance(operand, BaseMaskedArray):
            mask |= operand._mask
            raw_inputs.append(operand._data)
        else:
            raw_inputs.append(operand)

    def reconstruct(values):
        # we don't worry about scalar `values` here, since we
        # raise for reduce up above.
        from pandas.core.arrays import (
            BooleanArray,
            FloatingArray,
            IntegerArray,
        )

        if is_bool_dtype(values.dtype):
            return BooleanArray(values, mask.copy())
        if is_integer_dtype(values.dtype):
            return IntegerArray(values, mask.copy())
        if is_float_dtype(values.dtype):
            new_mask = mask.copy()
            if values.dtype == np.float16:
                # reached in e.g. np.sqrt on BooleanArray
                # we don't support float16
                values = values.astype(np.float32)
            return FloatingArray(values, new_mask)
        values[mask] = np.nan
        return values

    result = getattr(ufunc, method)(*raw_inputs, **kwargs)
    if ufunc.nout > 1:
        # e.g. np.divmod
        return tuple(reconstruct(part) for part in result)
    if is_reduce:
        # e.g. np.add.reduce; test_ufunc_reduce_raises
        if self._mask.any():
            return self._na_value
        return result
    return reconstruct(result)
def __arrow_array__(self, type=None):
    """
    Convert myself into a pyarrow Array.

    The data, the mask and the optional ``type`` are passed to ``pyarrow.array``.
    """
    import pyarrow as pa

    arrow_array = pa.array(self._data, mask=self._mask, type=type)
    return arrow_array
@property
def _hasna(self) -> bool:
    """Whether any entry of the mask is True."""
    # Note: this is expensive right now! The hope is that we can
    # make this faster by having an optional mask, but not have to change
    # source code using it..
    any_masked = self._mask.any()

    # error: Incompatible return value type (got "bool_", expected "bool")
    return any_masked  # type: ignore[return-value]
def _propagate_mask(
    self, mask: npt.NDArray[np.bool_] | None, other
) -> npt.NDArray[np.bool_]:
    """
    Compute the result mask for an operation with ``other``.

    With a ``mask`` given, the result is ``self._mask | mask``. Without
    one, it is a copy of ``self._mask``, turned all-True when ``other`` is NA.
    """
    if mask is not None:
        return self._mask | mask

    own = self._mask.copy()  # TODO: need test for BooleanArray needing a copy
    if other is libmissing.NA:
        # GH#45421 don't alter inplace
        own = own | True
    return own
def _arith_method(self, other, op):
    """
    Apply the arithmetic operator ``op`` between this array and ``other``.

    The operation runs on the underlying data; the result mask comes from
    ``_propagate_mask`` and is relaxed for ``pow``/``rpow`` where the result
    is 1 regardless of the other operand. The outcome is wrapped by
    ``_maybe_mask_result``.

    Raises
    ------
    NotImplementedError
        For list-like ``other`` with more than one dimension, and for
        division/power operators on bool dtype when ``other`` is NA.
    """
    op_name = op.__name__
    other_mask = None

    if isinstance(other, BaseMaskedArray):
        other, other_mask = other._data, other._mask

    elif is_list_like(other):
        if not isinstance(other, ExtensionArray):
            other = np.asarray(other)
        if other.ndim > 1:
            raise NotImplementedError("can only perform ops with 1-d structures")

    # We wrap the non-masked arithmetic logic used for numpy dtypes
    # in Series/Index arithmetic ops.
    other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
    array_op = ops.get_array_op(op)
    other = ensure_wrapped_if_datetimelike(other)

    if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
        # Avoid DeprecationWarning: In future, it will be an error
        # for 'np.bool_' scalars to be interpreted as an index
        # e.g. test_array_scalar_like_equivalence
        other = bool(other)

    mask = self._propagate_mask(other_mask, other)
    other_is_na = other is libmissing.NA

    if other_is_na:
        result = np.ones_like(self._data)
        if self.dtype.kind == "b":
            unsupported_for_bool = {
                "floordiv",
                "rfloordiv",
                "pow",
                "rpow",
                "truediv",
                "rtruediv",
            }
            if op_name in unsupported_for_bool:
                # GH#41165 Try to match non-masked Series behavior
                # This is still imperfect GH#46043
                raise NotImplementedError(
                    f"operator '{op_name}' not implemented for bool dtypes"
                )
            elif op_name in {"mod", "rmod"}:
                dtype = "int8"
            else:
                dtype = "bool"
            result = result.astype(dtype)
        elif "truediv" in op_name and self.dtype.kind != "f":
            # The actual data here doesn't matter since the mask
            # will be all-True, but since this is division, we want
            # to end up with floating dtype.
            result = result.astype(np.float64)
    else:
        # Make sure we do this before the "pow" mask checks
        # to get an expected exception message on shape mismatch.
        if self.dtype.kind in ["i", "u"] and op_name in ["floordiv", "mod"]:
            # TODO(GH#30188) ATM we don't match the behavior of non-masked
            # types with respect to floordiv-by-zero
            array_op = op

        with np.errstate(all="ignore"):
            result = array_op(self._data, other)

    if op_name == "pow":
        # 1 ** x is 1.
        mask = np.where((self._data == 1) & ~self._mask, False, mask)
        # x ** 0 is 1.
        if other_mask is not None:
            mask = np.where((other == 0) & ~other_mask, False, mask)
        elif not other_is_na:
            mask = np.where(other == 0, False, mask)

    elif op_name == "rpow":
        # 1 ** x is 1.
        if other_mask is not None:
            mask = np.where((other == 1) & ~other_mask, False, mask)
        elif not other_is_na:
            mask = np.where(other == 1, False, mask)
        # x ** 0 is 1.
        mask = np.where((self._data == 0) & ~self._mask, False, mask)

    return self._maybe_mask_result(result, mask)


_logical_method = _arith_method
def _cmp_method(self, other, op) -> BooleanArray:
    """
    Compare this array with ``other`` using ``op``; returns a ``BooleanArray``.

    Raises
    ------
    NotImplementedError
        If a list-like ``other`` has more than one dimension.
    ValueError
        If a list-like ``other`` has a different length than this array.
    """
    from pandas.core.arrays import BooleanArray

    other_mask = None

    if isinstance(other, BaseMaskedArray):
        other, other_mask = other._data, other._mask

    elif is_list_like(other):
        other = np.asarray(other)
        if other.ndim > 1:
            raise NotImplementedError("can only perform ops with 1-d structures")
        if len(self) != len(other):
            raise ValueError("Lengths must match to compare")

    if other is libmissing.NA:
        # numpy does not handle pd.NA well as "other" scalar (it returns
        # a scalar False instead of an array)
        # This may be fixed by NA.__array_ufunc__. Revisit this check
        # once that's implemented.
        data_shape = self._data.shape
        result = np.zeros(data_shape, dtype="bool")
        other_mask = np.ones(data_shape, dtype="bool")
    else:
        with warnings.catch_warnings():
            # numpy may show a FutureWarning:
            #     elementwise comparison failed; returning scalar instead,
            #     but in the future will perform elementwise comparison
            # before returning NotImplemented. We fall back to the correct
            # behavior today, so that should be fine to ignore.
            warnings.filterwarnings("ignore", "elementwise", FutureWarning)
            with np.errstate(all="ignore"):
                compare = getattr(self._data, f"__{op.__name__}__")
                result = compare(other)

        if result is NotImplemented:
            result = invalid_comparison(self._data, other, op)

    result_mask = self._propagate_mask(other_mask, other)
    return BooleanArray(result, result_mask, copy=False)
def _maybe_mask_result(self, result, mask):
    """
    Wrap ``result`` in the array type matching its dtype.

    Parameters
    ----------
    result : array-like or tuple[array-like]
    mask : array-like bool

    Returns
    -------
    A FloatingArray, BooleanArray or IntegerArray for float, bool or
    integer results; for ``timedelta64[ns]`` a TimedeltaArray with NaT
    written at masked positions; otherwise ``result`` with NaN written at
    masked positions. A tuple input is handled element-wise.
    """
    if isinstance(result, tuple):
        # i.e. divmod
        quotient, remainder = result
        return (
            self._maybe_mask_result(quotient, mask),
            self._maybe_mask_result(remainder, mask),
        )

    result_dtype = result.dtype

    if is_float_dtype(result_dtype):
        from pandas.core.arrays import FloatingArray

        return FloatingArray(result, mask, copy=False)

    if is_bool_dtype(result_dtype):
        from pandas.core.arrays import BooleanArray

        return BooleanArray(result, mask, copy=False)

    if result_dtype == "timedelta64[ns]":
        # e.g. test_numeric_arr_mul_tdscalar_numexpr_path
        from pandas.core.arrays import TimedeltaArray

        if not isinstance(result, TimedeltaArray):
            result = TimedeltaArray._simple_new(result)

        result[mask] = result.dtype.type("NaT")
        return result

    if is_integer_dtype(result_dtype):
        from pandas.core.arrays import IntegerArray

        return IntegerArray(result, mask, copy=False)

    result[mask] = np.nan
    return result
def isna(self) -> np.ndarray:
    """Return a copy of the mask."""
    mask_copy = self._mask.copy()
    return mask_copy
@property
def _na_value(self):
    """The ``na_value`` of this array's dtype."""
    own_dtype = self.dtype
    return own_dtype.na_value
@property
def nbytes(self) -> int:
    """Combined ``nbytes`` of the data array and the mask array."""
    data_bytes = self._data.nbytes
    mask_bytes = self._mask.nbytes
    return data_bytes + mask_bytes
@classmethod
def _concat_same_type(
    cls: type[BaseMaskedArrayT],
    to_concat: Sequence[BaseMaskedArrayT],
    axis: int = 0,
) -> BaseMaskedArrayT:
    """Concatenate the data and the masks of ``to_concat`` along ``axis``."""
    data_parts = [arr._data for arr in to_concat]
    mask_parts = [arr._mask for arr in to_concat]
    joined_data = np.concatenate(data_parts, axis=axis)
    joined_mask = np.concatenate(mask_parts, axis=axis)
    return cls(joined_data, joined_mask)
def take(
    self: BaseMaskedArrayT,
    indexer,
    *,
    allow_fill: bool = False,
    fill_value: Scalar | None = None,
    axis: int = 0,
) -> BaseMaskedArrayT:
    """
    Take elements by position from both data and mask.

    When ``allow_fill`` is True and ``fill_value`` is not NA, positions where
    the indexer equals -1 receive ``fill_value`` and are unmasked.
    """
    # we always fill with 1 internally
    # to avoid upcasting
    if isna(fill_value):
        data_fill_value = self._internal_fill_value
    else:
        data_fill_value = fill_value

    taken_data = algos.take(
        self._data,
        indexer,
        fill_value=data_fill_value,
        allow_fill=allow_fill,
        axis=axis,
    )
    taken_mask = algos.take(
        self._mask, indexer, fill_value=True, allow_fill=allow_fill, axis=axis
    )

    # if we are filling
    # we only fill where the indexer is null
    # not existing missing values
    # TODO(jreback) what if we have a non-na float as a fill value?
    if allow_fill and notna(fill_value):
        filled_positions = np.asarray(indexer) == -1
        taken_data[filled_positions] = fill_value
        taken_mask = taken_mask ^ filled_positions

    return type(self)(taken_data, taken_mask, copy=False)
# error: Return type "BooleanArray" of "isin" incompatible with return type
# "ndarray" in supertype "ExtensionArray"
def isin(self, values) -> BooleanArray:  # type: ignore[override]
    """
    Test membership of each element in ``values``.

    The returned ``BooleanArray`` has an all-False mask; masked positions
    are reported as True only when ``values`` is object-dtype and contains
    this dtype's NA value.
    """
    from pandas.core.arrays import BooleanArray

    # algorithms.isin will eventually convert values to an ndarray, so no extra
    # cost to doing it here first
    values_arr = np.asarray(values)
    membership = algos.isin(self._data, values_arr)

    if self._hasna:
        na_value = self.dtype.na_value
        values_have_NA = is_object_dtype(values_arr.dtype) and any(
            candidate is na_value for candidate in values_arr
        )

        # For now, NA does not propagate so set result according to presence of NA,
        # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
        membership[self._mask] = values_have_NA

    no_mask = np.zeros(self._data.shape, dtype=bool)
    return BooleanArray(membership, no_mask, copy=False)
def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """Return a new array built from copies of the data and the mask."""
    data_copy = self._data.copy()
    mask_copy = self._mask.copy()
    return type(self)(data_copy, mask_copy, copy=False)
def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """
    Compute the BaseMaskedArray of unique values.

    Returns
    -------
    uniques : BaseMaskedArray
    """
    unique_data, unique_mask = algos.unique_with_mask(self._data, self._mask)
    klass = type(self)
    return klass(unique_data, unique_mask, copy=False)
@doc(ExtensionArray.searchsorted)
def searchsorted(
    self,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
    has_missing = self._hasna
    if has_missing:
        raise ValueError(
            "searchsorted requires array to be sorted, which is impossible "
            "with NAs present."
        )
    if isinstance(value, ExtensionArray):
        value = value.astype(object)
    # Base class searchsorted would cast to object, which is *much* slower.
    data = self._data
    return data.searchsorted(value, side=side, sorter=sorter)
882 @doc(ExtensionArray.factorize)
883 def factorize(
884 self,
885 na_sentinel: int | lib.NoDefault = lib.no_default,
886 use_na_sentinel: bool | lib.NoDefault = lib.no_default,
887 ) -> tuple[np.ndarray, ExtensionArray]:
888 resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel)
889 arr = self._data
890 mask = self._mask
892 # Pass non-None na_sentinel; recode and add NA to uniques if necessary below
893 na_sentinel_arg = -1 if resolved_na_sentinel is None else resolved_na_sentinel
894 codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask)
896 # check that factorize_array correctly preserves dtype.
897 assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
899 has_na = mask.any()
900 if resolved_na_sentinel is not None or not has_na:
901 size = len(uniques)
902 else:
903 # Make room for an NA value
904 size = len(uniques) + 1
905 uniques_mask = np.zeros(size, dtype=bool)
906 if resolved_na_sentinel is None and has_na:
907 na_index = mask.argmax()
908 # Insert na with the proper code
909 if na_index == 0:
910 na_code = np.intp(0)
911 else:
912 # mypy error: Slice index must be an integer or None
913 # https://github.com/python/mypy/issues/2410
914 na_code = codes[:na_index].max() + 1 # type: ignore[misc]
915 codes[codes >= na_code] += 1
916 codes[codes == -1] = na_code
917 # dummy value for uniques; not used since uniques_mask will be True
918 uniques = np.insert(uniques, na_code, 0)
919 uniques_mask[na_code] = True
920 uniques_ea = type(self)(uniques, uniques_mask)
922 return codes, uniques_ea
@doc(ExtensionArray._values_for_argsort)
def _values_for_argsort(self) -> np.ndarray:
    # The raw data array is handed back as-is (no copy).
    data = self._data
    return data
def value_counts(self, dropna: bool = True) -> Series:
    """
    Returns a Series containing counts of each unique value.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of missing values.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        Index,
        Series,
    )
    from pandas.arrays import IntegerArray

    if dropna:
        keys, counts = algos.value_counts_arraylike(
            self._data, dropna=True, mask=self._mask
        )
        res = Series(counts, index=keys)
        res.index = res.index.astype(self.dtype)
        res = res.astype("Int64")
        return res

    # From here on dropna is False: the dropna=True case returned above, so
    # the former second `if dropna:` branch could never run and is removed.

    # compute counts on the data with no nans
    data = self._data[~self._mask]
    non_na_counts = Index(data).value_counts()

    # append the number of masked entries as the count for NA
    counts = np.empty(len(non_na_counts) + 1, dtype="int64")
    counts[:-1] = non_na_counts
    counts[-1] = self._mask.sum()

    index = non_na_counts.index
    index = index.insert(len(index), self.dtype.na_value)
    index = index.astype(self.dtype)

    mask = np.zeros(len(counts), dtype="bool")
    counts_array = IntegerArray(counts, mask)

    return Series(counts_array, index=index)
@doc(ExtensionArray.equals)
def equals(self, other) -> bool:
    if type(self) != type(other) or other.dtype != self.dtype:
        return False

    # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT
    # equal.
    masks_match = np.array_equal(self._mask, other._mask)
    if not masks_match:
        return False

    # Only the unmasked values take part in the comparison.
    own_valid = self._data[~self._mask]
    other_valid = other._data[~other._mask]
    return array_equivalent(own_valid, other_valid, dtype_equal=True)
def _quantile(
    self, qs: npt.NDArray[np.float64], interpolation: str
) -> BaseMaskedArray:
    """
    Dispatch to quantile_with_mask, needed because we do not have
    _from_factorized.

    Notes
    -----
    We assume that all impacted cases are 1D-only.
    """
    res = quantile_with_mask(
        self._data,
        mask=self._mask,
        # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
        #  instead of np.nan
        fill_value=np.nan,
        qs=qs,
        interpolation=interpolation,
    )

    has_missing = self._hasna
    if has_missing and self.ndim == 2:
        # I think this should be out_mask=self.isna().all(axis=1)
        #  but am holding off until we have tests
        raise NotImplementedError

    # Our result mask is all-False unless we are all-NA, in which
    # case it is all-True.
    if has_missing and self.isna().all():
        out_mask = np.ones(res.shape, dtype=bool)
    else:
        out_mask = np.zeros(res.shape, dtype=bool)
    return self._maybe_mask_result(res, mask=out_mask)
1035 # ------------------------------------------------------------------
1036 # Reductions
def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
    """
    Perform the reduction called ``name``.

    any/all/min/max/sum/prod go to the method of the same name, "mean" to
    ``masked_reductions.mean``, and everything else to the matching
    ``nanops.nan<name>`` function. A NaN result of the latter is returned
    as NA.
    """
    own_methods = {"any", "all", "min", "max", "sum", "prod"}
    if name in own_methods:
        return getattr(self, name)(skipna=skipna, **kwargs)

    data = self._data
    mask = self._mask

    if name == "mean":
        mean_op = getattr(masked_reductions, name)
        return mean_op(data, mask, skipna=skipna, **kwargs)

    # coerce to a nan-aware float if needed
    # (we explicitly use NaN within reductions)
    if self._hasna:
        data = self.to_numpy("float64", na_value=np.nan)

    # median, var, std, skew, kurt, idxmin, idxmax
    nan_op = getattr(nanops, "nan" + name)
    result = nan_op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

    return libmissing.NA if np.isnan(result) else result
def _wrap_reduction_result(self, name: str, result, skipna, **kwargs):
    """
    Wrap an ndarray reduction result together with a reduced mask.

    Non-ndarray results are returned unchanged. For ndarrays the mask is
    reduced along ``kwargs["axis"]`` with ``all`` when ``skipna`` is true
    and with ``any`` otherwise.
    """
    if not isinstance(result, np.ndarray):
        return result

    axis = kwargs["axis"]
    if skipna:
        # we only retain mask for all-NA rows/columns
        reduced_mask = self._mask.all(axis=axis)
    else:
        reduced_mask = self._mask.any(axis=axis)

    return self._maybe_mask_result(result, reduced_mask)
def sum(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs):
    """
    Sum via ``masked_reductions.sum``, wrapped by ``_wrap_reduction_result``.

    Raises
    ------
    NotImplementedError
        If a non-None ``out`` keyword is passed.
    """
    nv.validate_sum((), kwargs)

    # TODO: do this in validate_sum?
    if "out" in kwargs:
        # np.sum; test_floating_array_numpy_sum
        if kwargs["out"] is not None:
            raise NotImplementedError
        del kwargs["out"]

    total = masked_reductions.sum(
        self._data,
        self._mask,
        skipna=skipna,
        min_count=min_count,
        axis=axis,
    )
    return self._wrap_reduction_result(
        "sum", total, skipna=skipna, axis=axis, **kwargs
    )
def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs):
    """
    Multiply the array's elements via ``masked_reductions.prod``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.prod``.
    min_count : int, default 0
        Forwarded to ``masked_reductions.prod``.
    axis : int or None, default 0
        Forwarded to ``masked_reductions.prod``.
    **kwargs
        Checked by ``nv.validate_prod`` and then passed on to
        ``_wrap_reduction_result``.

    Returns
    -------
    object
        The reduction result after ``_wrap_reduction_result``.
    """
    nv.validate_prod((), kwargs)

    values, mask = self._data, self._mask
    product = masked_reductions.prod(
        values, mask, skipna=skipna, min_count=min_count, axis=axis
    )
    return self._wrap_reduction_result(
        "prod", product, skipna=skipna, axis=axis, **kwargs
    )
def min(self, *, skipna=True, axis: int | None = 0, **kwargs):
    """
    Return the minimum computed by ``masked_reductions.min``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.min``.
    axis : int or None, default 0
        Forwarded to ``masked_reductions.min``.
    **kwargs
        Checked by ``nv.validate_min``; otherwise unused.
    """
    nv.validate_min((), kwargs)

    values, mask = self._data, self._mask
    return masked_reductions.min(values, mask, skipna=skipna, axis=axis)
def max(self, *, skipna=True, axis: int | None = 0, **kwargs):
    """
    Return the maximum computed by ``masked_reductions.max``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.max``.
    axis : int or None, default 0
        Forwarded to ``masked_reductions.max``.
    **kwargs
        Checked by ``nv.validate_max``; otherwise unused.
    """
    nv.validate_max((), kwargs)

    values, mask = self._data, self._mask
    return masked_reductions.max(values, mask, skipna=skipna, axis=axis)
def any(self, *, skipna: bool = True, **kwargs):
    """
    Report whether at least one element is truthy.

    NA positions are treated as falsey while reducing. With the default
    ``skipna=True`` the reduced value is returned as is. With
    ``skipna=False`` and missing values present, the outcome follows
    :ref:`Kleene logic <boolean.kleene>`, as for logical operations.

    .. versionchanged:: 1.4.0

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA values. An all-NA array then gives False, the same as
        an empty array. When False, a truthy element still gives True;
        if there is none and NAs are present, NA is returned.
    **kwargs : any, default None
        Additional keywords have no effect but might be accepted for
        compatibility with NumPy.

    Returns
    -------
    bool or :attr:`pandas.NA`

    See Also
    --------
    numpy.any : Numpy version of this method.
    BaseMaskedArray.all : Return whether all elements are truthy.

    Examples
    --------
    The result indicates whether any element is truthy (and by default
    skips NAs):

    >>> pd.array([True, False, True]).any()
    True
    >>> pd.array([True, False, pd.NA]).any()
    True
    >>> pd.array([False, False, pd.NA]).any()
    False
    >>> pd.array([], dtype="boolean").any()
    False
    >>> pd.array([pd.NA], dtype="boolean").any()
    False
    >>> pd.array([pd.NA], dtype="Float64").any()
    False

    With ``skipna=False``, the result can be NA if this is logically
    required (whether ``pd.NA`` is True or False influences the result):

    >>> pd.array([True, False, pd.NA]).any(skipna=False)
    True
    >>> pd.array([1, 0, pd.NA]).any(skipna=False)
    True
    >>> pd.array([False, False, pd.NA]).any(skipna=False)
    <NA>
    >>> pd.array([0, 0, pd.NA]).any(skipna=False)
    <NA>
    """
    kwargs.pop("axis", None)
    nv.validate_any((), kwargs)

    # Work on a copy so the stored data is untouched; NA slots get the
    # falsey value and therefore cannot turn the reduction True.
    filled = self._data.copy()
    # mypy reports the third argument as "object", hence the ignore.
    np.putmask(filled, self._mask, self._falsey_value)  # type: ignore[arg-type]
    outcome = filled.any()

    # The reduced value is final when NAs are skipped, when a truthy element
    # was found, when the array is empty, or when nothing is masked.
    if skipna or outcome or len(self) == 0 or not self._mask.any():
        return outcome
    return self.dtype.na_value
def all(self, *, skipna: bool = True, **kwargs):
    """
    Report whether every element is truthy.

    NA positions are treated as truthy while reducing. With the default
    ``skipna=True`` the reduced value is returned as is. With
    ``skipna=False`` and missing values present, the outcome follows
    :ref:`Kleene logic <boolean.kleene>`, as for logical operations.

    .. versionchanged:: 1.4.0

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA values. An all-NA array then gives True, the same as
        an empty array. When False, a falsey element still gives False;
        if there is none and NAs are present, NA is returned.
    **kwargs : any, default None
        Additional keywords have no effect but might be accepted for
        compatibility with NumPy.

    Returns
    -------
    bool or :attr:`pandas.NA`

    See Also
    --------
    numpy.all : Numpy version of this method.
    BaseMaskedArray.any : Return whether any element is truthy.

    Examples
    --------
    The result indicates whether all elements are truthy (and by default
    skips NAs):

    >>> pd.array([True, True, pd.NA]).all()
    True
    >>> pd.array([1, 1, pd.NA]).all()
    True
    >>> pd.array([True, False, pd.NA]).all()
    False
    >>> pd.array([], dtype="boolean").all()
    True
    >>> pd.array([pd.NA], dtype="boolean").all()
    True
    >>> pd.array([pd.NA], dtype="Float64").all()
    True

    With ``skipna=False``, the result can be NA if this is logically
    required (whether ``pd.NA`` is True or False influences the result):

    >>> pd.array([True, True, pd.NA]).all(skipna=False)
    <NA>
    >>> pd.array([1, 1, pd.NA]).all(skipna=False)
    <NA>
    >>> pd.array([True, False, pd.NA]).all(skipna=False)
    False
    >>> pd.array([1, 0, pd.NA]).all(skipna=False)
    False
    """
    kwargs.pop("axis", None)
    nv.validate_all((), kwargs)

    # Work on a copy so the stored data is untouched; NA slots get the
    # truthy value and therefore cannot turn the reduction False.
    filled = self._data.copy()
    # mypy reports the third argument as "object", hence the ignore.
    np.putmask(filled, self._mask, self._truthy_value)  # type: ignore[arg-type]
    outcome = filled.all()

    # The reduced value is final when NAs are skipped, when a falsey element
    # was found, when the array is empty, or when nothing is masked.
    if skipna or not outcome or len(self) == 0 or not self._mask.any():
        return outcome
    return self.dtype.na_value