1"""
2Base and utility classes for pandas objects.
3"""
5from __future__ import annotations
7import textwrap
8from typing import (
9 TYPE_CHECKING,
10 Any,
11 Generic,
12 Hashable,
13 Literal,
14 TypeVar,
15 cast,
16 final,
17 overload,
18)
19import warnings
21import numpy as np
23import pandas._libs.lib as lib
24from pandas._typing import (
25 ArrayLike,
26 DtypeObj,
27 IndexLabel,
28 NDFrameT,
29 Shape,
30 npt,
31)
32from pandas.compat import PYPY
33from pandas.compat.numpy import function as nv
34from pandas.errors import AbstractMethodError
35from pandas.util._decorators import (
36 cache_readonly,
37 doc,
38)
39from pandas.util._exceptions import find_stack_level
41from pandas.core.dtypes.common import (
42 is_categorical_dtype,
43 is_dict_like,
44 is_extension_array_dtype,
45 is_object_dtype,
46 is_scalar,
47)
48from pandas.core.dtypes.generic import (
49 ABCDataFrame,
50 ABCIndex,
51 ABCSeries,
52)
53from pandas.core.dtypes.missing import (
54 isna,
55 remove_na_arraylike,
56)
58from pandas.core import (
59 algorithms,
60 nanops,
61 ops,
62)
63from pandas.core.accessor import DirNamesMixin
64from pandas.core.algorithms import (
65 duplicated,
66 unique1d,
67 value_counts,
68)
69from pandas.core.arraylike import OpsMixin
70from pandas.core.arrays import ExtensionArray
71from pandas.core.construction import (
72 create_series_with_explicit_dtype,
73 ensure_wrapped_if_datetimelike,
74 extract_array,
75)
77if TYPE_CHECKING: 77 ↛ 79line 77 didn't jump to line 79, because the condition on line 77 was never true
79 from pandas._typing import (
80 NumpySorter,
81 NumpyValueArrayLike,
82 ScalarLike_co,
83 )
85 from pandas import (
86 Categorical,
87 Series,
88 )
91_shared_docs: dict[str, str] = {}
92_indexops_doc_kwargs = {
93 "klass": "IndexOpsMixin",
94 "inplace": "",
95 "unique": "IndexOpsMixin",
96 "duplicated": "IndexOpsMixin",
97}
99_T = TypeVar("_T", bound="IndexOpsMixin")
102class PandasObject(DirNamesMixin):
103 """
104 Baseclass for various pandas objects.
105 """
107 # results from calls to methods decorated with cache_readonly get added to _cache
108 _cache: dict[str, Any]

    @property
    def _constructor(self):
        """
        Class constructor (for this class it's just `__class__`).
        """
        return type(self)

    def __repr__(self) -> str:
        """
        Return a string representation for a particular object.
        """
        # Should be overwritten by base classes
        return object.__repr__(self)

    def _reset_cache(self, key: str | None = None) -> None:
        """
        Reset cached properties. If ``key`` is passed, only clears that key.
        """
        if not hasattr(self, "_cache"):
            return
        if key is None:
            self._cache.clear()
        else:
            self._cache.pop(key, None)

    def __sizeof__(self) -> int:
        """
        Generates the total memory usage for an object that returns
        either a value or a Series of values.
        """
        memory_usage = getattr(self, "memory_usage", None)
        if memory_usage:
            mem = memory_usage(deep=True)
            return int(mem if is_scalar(mem) else mem.sum())

        # no memory_usage attribute, so fall back to object's 'sizeof'
        return super().__sizeof__()
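

# A minimal sketch of the delegation above (illustrative only, not part of
# pandas): ``sys.getsizeof`` invokes ``__sizeof__``, which reports
# ``memory_usage(deep=True)`` when the object defines ``memory_usage`` and
# otherwise falls back to ``object.__sizeof__``. Exact byte counts are
# platform-dependent, hence the skip directive.
#
#     >>> import sys
#     >>> import pandas as pd
#     >>> s = pd.Series(["a", "bb", "ccc"])
#     >>> sys.getsizeof(s) >= int(s.memory_usage(deep=True))  # doctest: +SKIP
#     True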


class NoNewAttributesMixin:
    """
    Mixin which prevents adding new attributes.

    Prevents additional attributes via xxx.attribute = "something" after a
    call to `self._freeze()`. Mainly used to prevent the user from using
    wrong attributes on an accessor (`Series.cat/.str/.dt`).

    If you really want to add a new attribute at a later time, you need to use
    `object.__setattr__(self, key, value)`.
    """

    def _freeze(self):
        """
        Prevents setting additional attributes.
        """
        object.__setattr__(self, "__frozen", True)

    # prevent adding any attribute via s.xxx.new_attribute = ...
    def __setattr__(self, key: str, value) -> None:
        # _cache is used by a decorator
        # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
        # because
        # 1.) getattr is false for attributes that raise errors
        # 2.) cls.__dict__ doesn't traverse into base classes
        if getattr(self, "__frozen", False) and not (
            key == "_cache"
            or key in type(self).__dict__
            or getattr(self, key, None) is not None
        ):
            raise AttributeError(f"You cannot add any new attribute '{key}'")
        object.__setattr__(self, key, value)
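

# A minimal usage sketch (illustrative only, not part of pandas): accessors
# such as ``Series.str`` call ``_freeze()`` after construction, so assigning
# an unknown attribute raises instead of silently creating it, which catches
# typos on the accessor.
#
#     >>> import pandas as pd
#     >>> s = pd.Series(["a", "b"])
#     >>> s.str.not_a_real_attribute = 1  # doctest: +SKIP
#     AttributeError: You cannot add any new attribute 'not_a_real_attribute'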


class SelectionMixin(Generic[NDFrameT]):
    """
    Mixin implementing the selection & aggregation interface on a group-like
    object; sub-classes need to define: obj, exclusions.
    """

    obj: NDFrameT
    _selection: IndexLabel | None = None
    exclusions: frozenset[Hashable]
    _internal_names = ["_cache", "__setstate__"]
    _internal_names_set = set(_internal_names)

    @final
    @property
    def _selection_list(self):
        if not isinstance(
            self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray)
        ):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @final
    @cache_readonly
    def ndim(self) -> int:
        return self._selected_obj.ndim

    @final
    @cache_readonly
    def _obj_with_exclusions(self):
        if self._selection is not None and isinstance(self.obj, ABCDataFrame):
            return self.obj[self._selection_list]

        if len(self.exclusions) > 0:
            # equivalent to `self.obj.drop(self.exclusions, axis=1)`
            # but this avoids consolidating and making a copy
            # TODO: following GH#45287 can we now use .drop directly without
            #  making a copy?
            return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True)
        else:
            return self.obj

    def __getitem__(self, key):
        if self._selection is not None:
            raise IndexError(f"Column(s) {self._selection} already selected")

        if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(set(key)):
                bad_keys = list(set(key).difference(self.obj.columns))
                raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
            return self._gotitem(list(key), ndim=2)

        elif not getattr(self, "as_index", False):
            if key not in self.obj.columns:
                raise KeyError(f"Column not found: {key}")
            return self._gotitem(key, ndim=2)

        else:
            if key not in self.obj:
                raise KeyError(f"Column not found: {key}")
            subset = self.obj[key]
            ndim = subset.ndim
            return self._gotitem(key, ndim=ndim, subset=subset)
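
    # A minimal sketch of the selection interface implemented above
    # (illustrative only, not part of pandas): indexing a GroupBy object
    # routes through ``SelectionMixin.__getitem__``, so a scalar key yields a
    # 1-dim selection and a list key a 2-dim one.
    #
    #     >>> import pandas as pd
    #     >>> df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3], "y": [4, 5, 6]})
    #     >>> df.groupby("g")["x"].sum()  # scalar key -> ndim=1 selection
    #     g
    #     a    3
    #     b    3
    #     Name: x, dtype: int64
    #     >>> df.groupby("g")[["x", "y"]].sum().columns.tolist()  # list key -> ndim=2
    #     ['x', 'y']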

    def _gotitem(self, key, ndim: int, subset=None):
        """
        sub-classes to define
        return a sliced object

        Parameters
        ----------
        key : str / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        raise AbstractMethodError(self)

    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    agg = aggregate


class IndexOpsMixin(OpsMixin):
    """
    Common ops mixin to support a unified interface / docs for Series / Index
    """

    # ndarray compatibility
    __array_priority__ = 1000
    _hidden_attrs: frozenset[str] = frozenset(
        ["tolist"]  # tolist is not deprecated, just suppressed in the __dir__
    )

    @property
    def dtype(self) -> DtypeObj:
        # must be defined here as a property for mypy
        raise AbstractMethodError(self)

    @property
    def _values(self) -> ExtensionArray | np.ndarray:
        # must be defined here as a property for mypy
        raise AbstractMethodError(self)

    def transpose(self: _T, *args, **kwargs) -> _T:
        """
        Return the transpose, which is by definition self.

        Returns
        -------
        %(klass)s
        """
        nv.validate_transpose(args, kwargs)
        return self

    T = property(
        transpose,
        doc="""
        Return the transpose, which is by definition self.
        """,
    )

    @property
    def shape(self) -> Shape:
        """
        Return a tuple of the shape of the underlying data.
        """
        return self._values.shape

    def __len__(self) -> int:
        # We need this defined here for mypy
        raise AbstractMethodError(self)

    @property
    def ndim(self) -> Literal[1]:
        """
        Number of dimensions of the underlying data, by definition 1.
        """
        return 1

    def item(self):
        """
        Return the first element of the underlying data as a Python scalar.

        Returns
        -------
        scalar
            The first element of %(klass)s.

        Raises
        ------
        ValueError
            If the data is not length-1.
        """
        if len(self) == 1:
            return next(iter(self))
        raise ValueError("can only convert an array of size 1 to a Python scalar")
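
    # A small doctest-style sketch of ``item`` (illustrative only; the
    # docstring above has no Examples section):
    #
    #     >>> import pandas as pd
    #     >>> pd.Series([42]).item()
    #     42
    #     >>> pd.Index([42]).item()
    #     42
    #     >>> pd.Series([1, 2]).item()  # doctest: +SKIP
    #     ValueError: can only convert an array of size 1 to a Python scalar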

    @property
    def nbytes(self) -> int:
        """
        Return the number of bytes in the underlying data.
        """
        return self._values.nbytes

    @property
    def size(self) -> int:
        """
        Return the number of elements in the underlying data.
        """
        return len(self._values)

    @property
    def array(self) -> ExtensionArray:
        """
        The ExtensionArray of the data backing this Series or Index.

        Returns
        -------
        ExtensionArray
            An ExtensionArray of the values stored within. For extension
            types, this is the actual array. For NumPy native types, this
            is a thin (no copy) wrapper around :class:`numpy.ndarray`.
            ``.array`` differs from ``.values``, which may require converting
            the data to a different form.

        See Also
        --------
        Index.to_numpy : Similar method that always returns a NumPy array.
        Series.to_numpy : Similar method that always returns a NumPy array.

        Notes
        -----
        This table lays out the different array types for each extension
        dtype within pandas.

        ================== =============================
        dtype              array type
        ================== =============================
        category           Categorical
        period             PeriodArray
        interval           IntervalArray
        IntegerNA          IntegerArray
        string             StringArray
        boolean            BooleanArray
        datetime64[ns, tz] DatetimeArray
        ================== =============================

        For any 3rd-party extension types, the array type will be an
        ExtensionArray.

        For all remaining dtypes ``.array`` will be a
        :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray
        stored within. If you absolutely need a NumPy array (possibly with
        copying / coercing data), then use :meth:`Series.to_numpy` instead.

        Examples
        --------
        For regular NumPy types like int and float, a PandasArray
        is returned.

        >>> pd.Series([1, 2, 3]).array
        <PandasArray>
        [1, 2, 3]
        Length: 3, dtype: int64

        For extension types, like Categorical, the actual ExtensionArray
        is returned

        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.array
        ['a', 'b', 'a']
        Categories (2, object): ['a', 'b']
        """
        raise AbstractMethodError(self)

    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
        **kwargs,
    ) -> np.ndarray:
        """
        A NumPy ndarray representing the values in this Series or Index.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the type of the array.

            .. versionadded:: 1.0.0

        **kwargs
            Additional keywords passed through to the ``to_numpy`` method
            of the underlying array (for extension arrays).

            .. versionadded:: 1.0.0

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.array : Get the actual data stored within.
        Index.array : Get the actual data stored within.
        DataFrame.to_numpy : Similar method for DataFrame.

        Notes
        -----
        The returned array will be the same up to equality (values equal
        in `self` will be equal in the returned array; likewise for values
        that are not equal). When `self` contains an ExtensionArray, the
        dtype may be different. For example, for a category-dtype Series,
        ``to_numpy()`` will return a NumPy array and the categorical dtype
        will be lost.

        For NumPy dtypes, this will be a reference to the actual data stored
        in this Series or Index (assuming ``copy=False``). Modifying the result
        in place will modify the data stored in the Series or Index (not that
        we recommend doing that).

        For extension types, ``to_numpy()`` *may* require copying data and
        coercing the result to a NumPy type (possibly object), which may be
        expensive. When you need a no-copy reference to the underlying data,
        :attr:`Series.array` should be used instead.

        This table lays out the different dtypes and default return types of
        ``to_numpy()`` for various dtypes within pandas.

        ================== ================================
        dtype              array type
        ================== ================================
        category[T]        ndarray[T] (same dtype as input)
        period             ndarray[object] (Periods)
        interval           ndarray[object] (Intervals)
        IntegerNA          ndarray[object]
        datetime64[ns]     datetime64[ns]
        datetime64[ns, tz] ndarray[object] (Timestamps)
        ================== ================================

        Examples
        --------
        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.to_numpy()
        array(['a', 'b', 'a'], dtype=object)

        Specify the `dtype` to control how datetime-aware data is represented.
        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
        objects, each with the correct ``tz``.

        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
        >>> ser.to_numpy(dtype=object)
        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
               Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
              dtype=object)

        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
        datetime64 values. The values are converted to UTC and the timezone
        info is dropped.

        >>> ser.to_numpy(dtype="datetime64[ns]")
        ... # doctest: +ELLIPSIS
        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
              dtype='datetime64[ns]')
        """
        if is_extension_array_dtype(self.dtype):
            return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
        elif kwargs:
            bad_keys = list(kwargs.keys())[0]
            raise TypeError(
                f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
            )

        result = np.asarray(self._values, dtype=dtype)
        # TODO(GH-24345): Avoid potential double copy
        if copy or na_value is not lib.no_default:
            result = result.copy()
            if na_value is not lib.no_default:
                result[np.asanyarray(self.isna())] = na_value
        return result

    @property
    def empty(self) -> bool:
        return not self.size

    def max(self, axis=None, skipna: bool = True, *args, **kwargs):
        """
        Return the maximum value of the Index.

        Parameters
        ----------
        axis : int, optional
            For compatibility with NumPy. Only 0 or None are allowed.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.max()
        ('b', 2)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_max(args, kwargs)
        return nanops.nanmax(self._values, skipna=skipna)

    @doc(op="max", oppose="min", value="largest")
    def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
        """
        Return int position of the {value} value in the Series.

        If the {op}imum is achieved in multiple locations,
        the first row position is returned.

        Parameters
        ----------
        axis : {{None}}
            Unused. Parameter needed for compatibility with DataFrame.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        int
            Row position of the {op}imum value.

        See Also
        --------
        Series.arg{op} : Return position of the {op}imum value.
        Series.arg{oppose} : Return position of the {oppose}imum value.
        numpy.ndarray.arg{op} : Equivalent method for numpy arrays.
        Series.idxmax : Return index label of the maximum values.
        Series.idxmin : Return index label of the minimum values.

        Examples
        --------
        Consider dataset containing cereal calories

        >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0,
        ...                'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}})
        >>> s
        Corn Flakes              100.0
        Almond Delight           110.0
        Cinnamon Toast Crunch    120.0
        Cocoa Puff               110.0
        dtype: float64

        >>> s.argmax()
        2
        >>> s.argmin()
        0

        The maximum cereal calories is the third element and
        the minimum cereal calories is the first element,
        since series is zero-indexed.
        """
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
            if not skipna and delegate.isna().any():
                return -1
            else:
                return delegate.argmax()
        else:
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return nanops.nanargmax(  # type: ignore[return-value]
                delegate, skipna=skipna
            )

    def min(self, axis=None, skipna: bool = True, *args, **kwargs):
        """
        Return the minimum value of the Index.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.min()
        ('a', 1)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_min(args, kwargs)
        return nanops.nanmin(self._values, skipna=skipna)

    @doc(argmax, op="min", oppose="max", value="smallest")
    def argmin(self, axis=None, skipna=True, *args, **kwargs) -> int:
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
            if not skipna and delegate.isna().any():
                return -1
            else:
                return delegate.argmin()
        else:
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return nanops.nanargmin(  # type: ignore[return-value]
                delegate, skipna=skipna
            )

    def tolist(self):
        """
        Return a list of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        list

        See Also
        --------
        numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
            nested list of Python scalars.
        """
        return self._values.tolist()

    to_list = tolist

    def __iter__(self):
        """
        Return an iterator of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        iterator
        """
        # We are explicitly making element iterators.
        if not isinstance(self._values, np.ndarray):
            # Check type instead of dtype to catch DTA/TDA
            return iter(self._values)
        else:
            return map(self._values.item, range(self._values.size))

    @cache_readonly
    def hasnans(self) -> bool:
        """
        Return True if there are any NaNs.

        Enables various performance speedups.
        """
        # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]"
        # has no attribute "any"
        return bool(isna(self).any())  # type: ignore[union-attr]

    def isna(self) -> npt.NDArray[np.bool_]:
        return isna(self._values)

    def _reduce(
        self,
        op,
        name: str,
        *,
        axis=0,
        skipna=True,
        numeric_only=None,
        filter_type=None,
        **kwds,
    ):
        """
        Perform the reduction type operation if we can.
        """
        func = getattr(self, name, None)
        if func is None:
            raise TypeError(
                f"{type(self).__name__} cannot perform the operation {name}"
            )
        return func(skipna=skipna, **kwds)

    @final
    def _map_values(self, mapper, na_action=None):
        """
        An internal function that maps values using the input
        correspondence (which can be a dict, Series, or function).

        Parameters
        ----------
        mapper : function, dict, or Series
            The input correspondence object
        na_action : {None, 'ignore'}
            If 'ignore', propagate NA values, without passing them to the
            mapping function

        Returns
        -------
        Union[Index, MultiIndex], inferred
            The output of the mapping function applied to the index.
            If the function returns a tuple with more than one element
            a MultiIndex will be returned.
        """
        # we can fastpath dict/Series to an efficient map
        # as we know that we are not going to have to yield
        # python types
        if is_dict_like(mapper):
            if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
                # If a dictionary subclass defines a default value method,
                # convert mapper to a lookup function (GH #15999).
                dict_with_default = mapper
                mapper = lambda x: dict_with_default[x]
            else:
                # Dictionary does not have a default. Thus it's safe to
                # convert to a Series for efficiency.
                # we specify the keys here to handle the
                # possibility that they are tuples

                # The return value of mapping with an empty mapper is
                # expected to be pd.Series(np.nan, ...). As np.nan is
                # of dtype float64 the return value of this method should
                # be float64 as well
                mapper = create_series_with_explicit_dtype(
                    mapper, dtype_if_empty=np.float64
                )

        if isinstance(mapper, ABCSeries):
            if na_action not in (None, "ignore"):
                msg = (
                    "na_action must either be 'ignore' or None, "
                    f"{na_action} was passed"
                )
                raise ValueError(msg)

            if na_action == "ignore":
                mapper = mapper[mapper.index.notna()]

            # Since values were input this means we came from either
            # a dict or a series and mapper should be an index
            if is_categorical_dtype(self.dtype):
                # use the built in categorical series mapper which saves
                # time by mapping the categories instead of all values

                cat = cast("Categorical", self._values)
                return cat.map(mapper)

            values = self._values

            indexer = mapper.index.get_indexer(values)
            new_values = algorithms.take_nd(mapper._values, indexer)

            return new_values

        # we must convert to python types
        if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
            # GH#23179 some EAs do not have `map`
            values = self._values
            if na_action is not None:
                raise NotImplementedError
            map_f = lambda values, f: values.map(f)
        else:
            values = self._values.astype(object)
            if na_action == "ignore":
                map_f = lambda values, f: lib.map_infer_mask(
                    values, f, isna(values).view(np.uint8)
                )
            elif na_action is None:
                map_f = lib.map_infer
            else:
                msg = (
                    "na_action must either be 'ignore' or None, "
                    f"{na_action} was passed"
                )
                raise ValueError(msg)

        # mapper is a function
        new_values = map_f(values, mapper)

        return new_values
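
    # A minimal sketch of the ``__missing__`` fastpath above (illustrative
    # only, not part of pandas): a ``defaultdict`` defines ``__missing__``,
    # so the mapper is kept as a lookup function instead of being converted
    # to a Series, and unmapped keys get the default rather than NaN.
    #
    #     >>> from collections import defaultdict
    #     >>> import pandas as pd
    #     >>> mapping = defaultdict(lambda: "other", {"a": "A"})
    #     >>> pd.Series(["a", "b"]).map(mapping).tolist()
    #     ['A', 'other']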

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series:
        """
        Return a Series containing counts of unique values.

        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : bool, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        bins : int, optional
            Rather than count values, group them into half-open bins,
            a convenience for ``pd.cut``, only works with numeric data.
        dropna : bool, default True
            Don't include counts of NaN.

        Returns
        -------
        Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.count: Number of non-NA elements in a DataFrame.
        DataFrame.value_counts: Equivalent method on DataFrames.

        Examples
        --------
        >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
        >>> index.value_counts()
        3.0    2
        1.0    1
        2.0    1
        4.0    1
        dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
        >>> s.value_counts(normalize=True)
        3.0    0.4
        1.0    0.2
        2.0    0.2
        4.0    0.2
        dtype: float64

        **bins**

        Bins can be useful for going from a continuous variable to a
        categorical variable; instead of counting unique
        apparitions of values, divide the index in the specified
        number of half-open bins.

        >>> s.value_counts(bins=3)
        (0.996, 2.0]    2
        (2.0, 3.0]      2
        (3.0, 4.0]      1
        dtype: int64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> s.value_counts(dropna=False)
        3.0    2
        1.0    1
        2.0    1
        4.0    1
        NaN    1
        dtype: int64
        """
        return value_counts(
            self,
            sort=sort,
            ascending=ascending,
            normalize=normalize,
            bins=bins,
            dropna=dropna,
        )

    def unique(self):
        values = self._values

        if not isinstance(values, np.ndarray):
            result: ArrayLike = values.unique()
            if (
                isinstance(self.dtype, np.dtype) and self.dtype.kind in ["m", "M"]
            ) and isinstance(self, ABCSeries):
                # GH#31182 Series._values returns EA
                # unpack numpy datetime for backward-compat
                result = np.asarray(result)
        else:
            result = unique1d(values)

        return result
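
    # A small doctest-style sketch of ``unique`` (illustrative only; the
    # method has no docstring here): values come back in order of first
    # appearance, and extension dtypes are returned as their ExtensionArray.
    #
    #     >>> import pandas as pd
    #     >>> pd.Series([2, 1, 2]).unique()
    #     array([2, 1])
    #     >>> pd.Series(pd.Categorical(['b', 'a', 'b'])).unique()
    #     ['b', 'a']
    #     Categories (2, object): ['a', 'b']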

    def nunique(self, dropna: bool = True) -> int:
        """
        Return number of unique elements in the object.

        Excludes NA values by default.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the count.

        Returns
        -------
        int

        See Also
        --------
        DataFrame.nunique: Method nunique for DataFrame.
        Series.count: Count non-NA/null observations in the Series.

        Examples
        --------
        >>> s = pd.Series([1, 3, 5, 7, 7])
        >>> s
        0    1
        1    3
        2    5
        3    7
        4    7
        dtype: int64

        >>> s.nunique()
        4
        """
        uniqs = self.unique()
        if dropna:
            uniqs = remove_na_arraylike(uniqs)
        return len(uniqs)

    @property
    def is_unique(self) -> bool:
        """
        Return boolean if values in the object are unique.

        Returns
        -------
        bool
        """
        return self.nunique(dropna=False) == len(self)

    @property
    def is_monotonic(self) -> bool:
        """
        Return boolean if values in the object are monotonically increasing.

        .. deprecated:: 1.5.0
            is_monotonic is deprecated and will be removed in a future version.
            Use is_monotonic_increasing instead.

        Returns
        -------
        bool
        """
        warnings.warn(
            "is_monotonic is deprecated and will be removed in a future version. "
            "Use is_monotonic_increasing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self.is_monotonic_increasing

    @property
    def is_monotonic_increasing(self) -> bool:
        """
        Return boolean if values in the object are monotonically increasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic_increasing

    @property
    def is_monotonic_decreasing(self) -> bool:
        """
        Return boolean if values in the object are monotonically decreasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic_decreasing

    def _memory_usage(self, deep: bool = False) -> int:
        """
        Memory usage of the values.

        Parameters
        ----------
        deep : bool, default False
            Introspect the data deeply, interrogate
            `object` dtypes for system-level memory consumption.

        Returns
        -------
        bytes used

        See Also
        --------
        numpy.ndarray.nbytes : Total bytes consumed by the elements of the
            array.

        Notes
        -----
        Memory usage does not include memory consumed by elements that
        are not components of the array if deep=False or if used on PyPy
        """
        if hasattr(self.array, "memory_usage"):
            # https://github.com/python/mypy/issues/1424
            # error: "ExtensionArray" has no attribute "memory_usage"
            return self.array.memory_usage(deep=deep)  # type: ignore[attr-defined]

        v = self.array.nbytes
        if deep and is_object_dtype(self) and not PYPY:
            values = cast(np.ndarray, self._values)
            v += lib.memory_usage_of_objects(values)
        return v

    @doc(
        algorithms.factorize,
        values="",
        order="",
        size_hint="",
        sort=textwrap.dedent(
            """\
            sort : bool, default False
                Sort `uniques` and shuffle `codes` to maintain the
                relationship.
            """
        ),
    )
    def factorize(
        self,
        sort: bool = False,
        na_sentinel: int | lib.NoDefault = lib.no_default,
        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    ):
        return algorithms.factorize(
            self, sort=sort, na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
        )
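
    # A small sketch of what ``factorize`` returns (illustrative only, not
    # part of pandas): integer codes in order of first appearance plus the
    # unique values, so the original data can be reconstructed as
    # ``uniques.take(codes)``.
    #
    #     >>> import pandas as pd
    #     >>> codes, uniques = pd.Series(['b', 'a', 'b']).factorize()
    #     >>> codes
    #     array([0, 1, 0])
    #     >>> uniques
    #     Index(['b', 'a'], dtype='object')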

    _shared_docs[
        "searchsorted"
    ] = """
        Find indices where elements should be inserted to maintain order.

        Find the indices into a sorted {klass} `self` such that, if the
        corresponding elements in `value` were inserted before the indices,
        the order of `self` would be preserved.

        .. note::

            The {klass} *must* be monotonically sorted, otherwise
            wrong locations will likely be returned. Pandas does *not*
            check this for you.

        Parameters
        ----------
        value : array-like or scalar
            Values to insert into `self`.
        side : {{'left', 'right'}}, optional
            If 'left', the index of the first suitable location found is given.
            If 'right', return the last such index. If there is no suitable
            index, return either 0 or N (where N is the length of `self`).
        sorter : 1-D array-like, optional
            Optional array of integer indices that sort `self` into ascending
            order. They are typically the result of ``np.argsort``.

        Returns
        -------
        int or array of int
            A scalar or array of insertion points with the
            same shape as `value`.

        See Also
        --------
        sort_values : Sort by the values along either axis.
        numpy.searchsorted : Similar method from NumPy.

        Notes
        -----
        Binary search is used to find the required insertion points.

        Examples
        --------
        >>> ser = pd.Series([1, 2, 3])
        >>> ser
        0    1
        1    2
        2    3
        dtype: int64

        >>> ser.searchsorted(4)
        3

        >>> ser.searchsorted([0, 4])
        array([0, 3])

        >>> ser.searchsorted([1, 3], side='left')
        array([0, 2])

        >>> ser.searchsorted([1, 3], side='right')
        array([1, 3])

        >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']))
        >>> ser
        0   2000-03-11
        1   2000-03-12
        2   2000-03-13
        dtype: datetime64[ns]

        >>> ser.searchsorted('3/14/2000')
        3

        >>> ser = pd.Categorical(
        ...     ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
        ... )
        >>> ser
        ['apple', 'bread', 'bread', 'cheese', 'milk']
        Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']

        >>> ser.searchsorted('bread')
        1

        >>> ser.searchsorted(['bread'], side='right')
        array([3])

        If the values are not monotonically sorted, wrong locations
        may be returned:

        >>> ser = pd.Series([2, 1, 3])
        >>> ser
        0    2
        1    1
        2    3
        dtype: int64

        >>> ser.searchsorted(1)  # doctest: +SKIP
        0  # wrong result, correct would be 1
        """

    # This overload is needed so that the call to searchsorted in
    # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result

    @overload
    # The following ignore is also present in numpy/__init__.pyi
    # Possibly a mypy bug??
    # error: Overloaded function signatures 1 and 2 overlap with incompatible
    # return types [misc]
    def searchsorted(  # type: ignore[misc]
        self,
        value: ScalarLike_co,
        side: Literal["left", "right"] = ...,
        sorter: NumpySorter = ...,
    ) -> np.intp:
        ...

    @overload
    def searchsorted(
        self,
        value: npt.ArrayLike | ExtensionArray,
        side: Literal["left", "right"] = ...,
        sorter: NumpySorter = ...,
    ) -> npt.NDArray[np.intp]:
        ...

    @doc(_shared_docs["searchsorted"], klass="Index")
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:

        values = self._values
        if not isinstance(values, np.ndarray):
            # Going through EA.searchsorted directly improves performance GH#38083
            return values.searchsorted(value, side=side, sorter=sorter)

        return algorithms.searchsorted(
            values,
            value,
            side=side,
            sorter=sorter,
        )

    def drop_duplicates(self, keep="first"):
        duplicated = self._duplicated(keep=keep)
        # error: Value of type "IndexOpsMixin" is not indexable
        return self[~duplicated]  # type: ignore[index]

    @final
    def _duplicated(
        self, keep: Literal["first", "last", False] = "first"
    ) -> npt.NDArray[np.bool_]:
        return duplicated(self._values, keep=keep)
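
    # A small sketch of the ``keep`` semantics (illustrative only, not part
    # of pandas): ``_duplicated`` marks the rows to drop, and
    # ``drop_duplicates`` keeps the complement.
    #
    #     >>> import pandas as pd
    #     >>> s = pd.Series([1, 2, 1])
    #     >>> s.drop_duplicates(keep="first").tolist()
    #     [1, 2]
    #     >>> s.drop_duplicates(keep="last").tolist()
    #     [2, 1]
    #     >>> s.drop_duplicates(keep=False).tolist()
    #     [2]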

    def _arith_method(self, other, op):
        res_name = ops.get_op_result_name(self, other)

        lvalues = self._values
        rvalues = extract_array(other, extract_numpy=True, extract_range=True)
        rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape)
        rvalues = ensure_wrapped_if_datetimelike(rvalues)

        with np.errstate(all="ignore"):
            result = ops.arithmetic_op(lvalues, rvalues, op)

        return self._construct_result(result, name=res_name)

    def _construct_result(self, result, name):
        """
        Construct an appropriately-wrapped result from the ArrayLike result
        of an arithmetic-like operation.
        """
        raise AbstractMethodError(self)