Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/categorical.py: 19%

1from __future__ import annotations

3from csv import QUOTE_NONNUMERIC

4from functools import partial

5import operator

6from shutil import get_terminal_size

7from typing import (

8 TYPE_CHECKING,

9 Hashable,

10 Literal,

11 Sequence,

12 TypeVar,

13 Union,

14 cast,

15 overload,

16)

17from warnings import (

18 catch_warnings,

19 simplefilter,

20 warn,

21)

23import numpy as np

25from pandas._config import get_option

27from pandas._libs import (

28 NaT,

29 algos as libalgos,

30 lib,

31)

32from pandas._libs.arrays import NDArrayBacked

33from pandas._libs.lib import (

34 NoDefault,

35 no_default,

36)

37from pandas._typing import (

38 ArrayLike,

39 AstypeArg,

40 Dtype,

41 NpDtype,

42 Ordered,

43 Shape,

44 npt,

45 type_t,

46)

47from pandas.compat.numpy import function as nv

48from pandas.util._decorators import (

49 deprecate_kwarg,

50 deprecate_nonkeyword_arguments,

51)

52from pandas.util._exceptions import find_stack_level

53from pandas.util._validators import validate_bool_kwarg

55from pandas.core.dtypes.cast import coerce_indexer_dtype

56from pandas.core.dtypes.common import (

57 ensure_int64,

58 ensure_platform_int,

59 is_categorical_dtype,

60 is_datetime64_dtype,

61 is_dict_like,

62 is_dtype_equal,

63 is_extension_array_dtype,

64 is_hashable,

65 is_integer_dtype,

66 is_list_like,

67 is_scalar,

68 is_timedelta64_dtype,

69 needs_i8_conversion,

70 pandas_dtype,

71)

72from pandas.core.dtypes.dtypes import (

73 CategoricalDtype,

74 ExtensionDtype,

75)

76from pandas.core.dtypes.generic import (

77 ABCIndex,

78 ABCSeries,

79)

80from pandas.core.dtypes.missing import (

81 is_valid_na_for_dtype,

82 isna,

83 notna,

84)

86from pandas.core import (

87 arraylike,

88 ops,

89)

90from pandas.core.accessor import (

91 PandasDelegate,

92 delegate_names,

93)

94import pandas.core.algorithms as algorithms

95from pandas.core.algorithms import (

96 factorize,

97 take_nd,

98 unique1d,

99)

100from pandas.core.arrays._mixins import (

101 NDArrayBackedExtensionArray,

102 ravel_compat,

103)

104from pandas.core.base import (

105 ExtensionArray,

106 NoNewAttributesMixin,

107 PandasObject,

108)

109import pandas.core.common as com

110from pandas.core.construction import (

111 extract_array,

112 sanitize_array,

113)

114from pandas.core.ops.common import unpack_zerodim_and_defer

115from pandas.core.sorting import nargsort

116from pandas.core.strings.object_array import ObjectStringArrayMixin

117

118from pandas.io.formats import console

119

120if TYPE_CHECKING: 120 ↛ 121line 120 didn't jump to line 121, because the condition on line 120 was never true

121 from pandas import (

122 DataFrame,

123 Index,

124 Series,

125 )

126

127

128CategoricalT = TypeVar("CategoricalT", bound="Categorical")

129

130

def _cat_compare_op(op):
    """
    Build the comparison dunder method of ``Categorical`` for ``op``.

    Parameters
    ----------
    op : callable
        One of the comparison functions from :mod:`operator`; its
        ``__name__`` is used to derive the dunder name.

    Returns
    -------
    callable
        A method ``func(self, other)`` named ``__<op>__``.
    """
    opname = f"__{op.__name__}__"
    # Value written into positions holding the missing-value code (-1):
    # True only for ``!=``, False for every other comparison.
    fill_value = op is operator.ne

    @unpack_zerodim_and_defer(opname)
    def func(self, other):
        hashable = is_hashable(other)
        if is_list_like(other) and len(other) != len(self) and not hashable:
            # a hashable list-like (e.g. a tuple) may itself be a category,
            # so only unhashable list-likes are length-checked
            raise ValueError("Lengths must match.")

        if not self.ordered and opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
            raise TypeError("Unordered Categoricals can only compare equality or not")

        if isinstance(other, Categorical):
            # Categorical vs Categorical: the categories must be the same
            # (possibly in a different order when unordered).
            if not self._categories_match_up_to_permutation(other):
                raise TypeError(
                    "Categoricals can only be compared if 'categories' are the same."
                )

            same_layout = self.ordered or self.categories.equals(other.categories)
            if same_layout:
                other_codes = other._codes
            else:
                # both unordered with differently ordered categories:
                # express other's codes in terms of self.categories
                other_codes = recode_for_categories(
                    other.codes, other.categories, self.categories, copy=False
                )

            ret = op(self._codes, other_codes)
            mask = (self._codes == -1) | (other_codes == -1)
            if mask.any():
                ret[mask] = fill_value
            return ret

        if not hashable:
            # Categorical vs unhashable list-like: positional comparison,
            # permitted for equality / inequality only.
            if opname not in ["__eq__", "__ne__"]:
                raise TypeError(
                    f"Cannot compare a Categorical for op {opname} with "
                    f"type {type(other)}.\nIf you want to compare values, "
                    "use 'np.asarray(cat) <op> other'."
                )

            if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
                # We would return NotImplemented here, but that messes up
                # ExtensionIndex's wrapped methods
                return op(other, self)
            return getattr(np.array(self), opname)(np.array(other))

        # Categorical vs hashable scalar
        if other not in self.categories:
            return ops.invalid_comparison(self, other, op)

        i = self._unbox_scalar(other)
        ret = op(self._codes, i)

        if opname not in {"__eq__", "__ge__", "__gt__"}:
            # GH#29820 performance trick: for __eq__/__ge__/__gt__ the
            # masking step is skipped; for the remaining ops the positions
            # holding code -1 are overwritten with fill_value.
            mask = self._codes == -1
            ret[mask] = fill_value
        return ret

    func.__name__ = opname

    return func

202

203

def contains(cat, key, container) -> bool:
    """
    Check whether ``key`` is a member of ``cat``.

    Shared helper for :meth:`Categorical.__contains__` and
    :meth:`CategoricalIndex.__contains__`.

    The result is True only when ``key`` is found in ``cat.categories``
    *and* the position of ``key`` within the categories is found in
    ``container``.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container to check for membership in.

    Returns
    -------
    is_in : bool
        True if ``key`` is in ``cat.categories`` and the location of
        ``key`` in ``categories`` is in ``container``, else False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable (raised by ``hash(key)``).

    Notes
    -----
    NaN values are not handled here; check for them before calling.
    """
    # Called for its side effect: raises TypeError for unhashable keys.
    hash(key)

    # A key that cannot be located in the categories cannot be present.
    try:
        loc = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # ``loc`` is the position of ``key`` in the categories, which is the
    # value looked up in ``container``. A key can therefore be a category
    # without being present, e.g.
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(loc):
        # if categories is an IntervalIndex, loc is an array.
        return any(loc_ in container for loc_ in loc)
    return loc in container

253

254

255class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):

256 """

257 Represent a categorical variable in classic R / S-plus fashion.

258

259 `Categoricals` can take on only a limited, and usually fixed, number

260 of possible values (`categories`). In contrast to statistical categorical

261 variables, a `Categorical` might have an order, but numerical operations

262 (additions, divisions, ...) are not possible.

263

264 All values of the `Categorical` are either in `categories` or `np.nan`.

265 Assigning values outside of `categories` will raise a `ValueError`. Order

266 is defined by the order of the `categories`, not lexical order of the

267 values.

268

269 Parameters

270 ----------

271 values : list-like

272 The values of the categorical. If categories are given, values not in

273 categories will be replaced with NaN.

274 categories : Index-like (unique), optional

275 The unique categories for this categorical. If not given, the

276 categories are assumed to be the unique values of `values` (sorted, if

277 possible, otherwise in the order in which they appear).

278 ordered : bool, default False

279 Whether or not this categorical is treated as an ordered categorical.

280 If True, the resulting categorical will be ordered.

281 An ordered categorical respects, when sorted, the order of its

282 `categories` attribute (which in turn is the `categories` argument, if

283 provided).

284 dtype : CategoricalDtype

285 An instance of ``CategoricalDtype`` to use for this categorical.

286

287 Attributes

288 ----------

289 categories : Index

290 The categories of this categorical

291 codes : ndarray

292 The codes (integer positions, which point to the categories) of this

293 categorical, read only.

294 ordered : bool

295 Whether or not this Categorical is ordered.

296 dtype : CategoricalDtype

297 The instance of ``CategoricalDtype`` storing the ``categories``

298 and ``ordered``.

299

300 Methods

301 -------

302 from_codes

303 __array__

304

305 Raises

306 ------

307 ValueError

308 If the categories do not validate.

309 TypeError

310 If an explicit ``ordered=True`` is given but no `categories` and the

311 `values` are not sortable.

312

313 See Also

314 --------

315 CategoricalDtype : Type for categorical data.

316 CategoricalIndex : An Index with an underlying ``Categorical``.

317

318 Notes

319 -----

320 See the `user guide

321 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__

322 for more.

323

324 Examples

325 --------

326 >>> pd.Categorical([1, 2, 3, 1, 2, 3])

327 [1, 2, 3, 1, 2, 3]

328 Categories (3, int64): [1, 2, 3]

329

330 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])

331 ['a', 'b', 'c', 'a', 'b', 'c']

332 Categories (3, object): ['a', 'b', 'c']

333

334 Missing values are not included as a category.

335

336 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])

337 >>> c

338 [1, 2, 3, 1, 2, 3, NaN]

339 Categories (3, int64): [1, 2, 3]

340

341 However, their presence is indicated in the `codes` attribute

342 by code `-1`.

343

344 >>> c.codes

345 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8)

346

347 Ordered `Categoricals` can be sorted according to the custom order

348 of the categories and can have a min and max value.

349

350 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,

351 ... categories=['c', 'b', 'a'])

352 >>> c

353 ['a', 'b', 'c', 'a', 'b', 'c']

354 Categories (3, object): ['c' < 'b' < 'a']

355 >>> c.min()

356 'c'

357 """

358

359 # For comparisons, so that numpy uses our implementation of the compare

360 # ops, which raise

361 __array_priority__ = 1000

362 # tolist is not actually deprecated, just suppressed in the __dir__

363 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])

364 _typ = "categorical"

365

366 _dtype: CategoricalDtype

367

def __init__(
    self,
    values,
    categories=None,
    ordered=None,
    dtype: Dtype | None = None,
    fastpath: bool = False,
    copy: bool = True,
) -> None:
    """
    Build the codes array and dtype from ``values`` and store them.

    With ``fastpath=True``, ``values`` is passed straight to
    ``coerce_indexer_dtype`` as the codes and no sanitizing or
    factorization is performed.
    """
    # dtype is a CategoricalDtype from here on; dtype.categories may still
    # be None, in which case the categories are inferred by factorizing.
    dtype = CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype)

    if fastpath:
        codes = coerce_indexer_dtype(values, dtype.categories)
        dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
        super().__init__(codes, dtype)
        return

    if not is_list_like(values):
        # GH#38433
        warn(
            "Allowing scalars in the Categorical constructor is deprecated "
            "and will raise in a future version.  Use `[value]` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        values = [values]

    # Marks missing entries that are stripped before inference and put
    # back afterwards; only populated for plain list-like input.
    null_mask = np.array(False)

    # sanitize input
    if is_categorical_dtype(values):
        if dtype.categories is None:
            dtype = CategoricalDtype(values.categories, dtype.ordered)
    elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
        values = com.convert_to_list_like(values)
        if isinstance(values, list) and len(values) == 0:
            # By convention, empty lists result in object dtype:
            values = np.array([], dtype=object)
        elif isinstance(values, np.ndarray):
            if values.ndim > 1:
                # preempt sanitize_array from raising ValueError
                raise NotImplementedError(
                    "> 1 ndim Categorical are not supported at this time"
                )
            values = sanitize_array(values, None)
        else:
            # i.e. must be a list
            sanitized = sanitize_array(values, None)
            null_mask = isna(sanitized)
            if null_mask.any():
                # Drop the nulls now; they are re-inserted as -1 codes
                # further down (see "full_codes").
                kept = [values[idx] for idx in (~null_mask).nonzero()[0]]

                # GH#44900 Do not cast to float if we have only missing values
                keep_inference = kept or sanitized.dtype == "object"
                sanitize_dtype = None if keep_inference else sanitized.dtype

                sanitized = sanitize_array(kept, None, dtype=sanitize_dtype)
            values = sanitized

    if dtype.categories is None:
        # categories are inferred from the values
        try:
            codes, categories = factorize(values, sort=True)
        except TypeError as err:
            codes, categories = factorize(values, sort=False)
            if dtype.ordered:
                # the values cannot be sorted, so an ordered categorical
                # needs the caller to state the category order
                raise TypeError(
                    "'values' is not ordered, please "
                    "explicitly specify the categories order "
                    "by passing in a categories argument."
                ) from err

        dtype = CategoricalDtype(categories, dtype.ordered)

    elif is_categorical_dtype(values.dtype):
        old_codes = extract_array(values)._codes
        codes = recode_for_categories(
            old_codes, values.dtype.categories, dtype.categories, copy=copy
        )

    else:
        codes = _get_codes_for_values(values, dtype.categories)

    if null_mask.any():
        # Reinsert -1 placeholders for previously removed missing values
        full_codes = np.full(null_mask.shape, -1, dtype=codes.dtype)
        full_codes[~null_mask] = codes
        codes = full_codes

    dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
    arr = coerce_indexer_dtype(codes, dtype.categories)
    super().__init__(arr, dtype)

474

@property
def dtype(self) -> CategoricalDtype:
    """
    Return the :class:`~pandas.api.types.CategoricalDtype` of this array.
    """
    categorical_dtype = self._dtype
    return categorical_dtype

481

482 @property

483 def _internal_fill_value(self) -> int:

484 # using the specific numpy integer instead of python int to get

485 # the correct dtype back from _quantile in the all-NA case

486 dtype = self._ndarray.dtype

487 return dtype.type(-1)

488

@property
def _constructor(self) -> type[Categorical]:
    """
    Return the ``Categorical`` class itself.
    """
    return Categorical

492

493 @classmethod

494 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):

495 return Categorical(scalars, dtype=dtype, copy=copy)

496

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
    ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
    ...

@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
    ...

def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
    """
    Convert this array to ``dtype``.

    Parameters
    ----------
    dtype : numpy dtype or pandas type
    copy : bool, default True
        By default, astype always returns a newly allocated object.
        If copy is set to False and dtype is categorical, the original
        object is returned.

    Raises
    ------
    ValueError
        If an integer dtype is requested while missing values are present,
        or if the categories cannot be cast to ``dtype``.
    """
    dtype = pandas_dtype(dtype)

    if self.dtype is dtype:
        return self.copy() if copy else self

    if is_categorical_dtype(dtype):
        dtype = cast("Union[str, CategoricalDtype]", dtype)

        # GH 10696/18593/18630
        dtype = self.dtype.update_dtype(dtype)
        source = self.copy() if copy else self
        return source._set_dtype(dtype)

    if isinstance(dtype, ExtensionDtype):
        return super().astype(dtype, copy=copy)

    if is_integer_dtype(dtype) and self.isna().any():
        raise ValueError("Cannot convert float NaN to integer")

    if len(self.codes) == 0 or len(self.categories) == 0:
        return np.array(self, dtype=dtype, copy=copy)

    # GH8628 (PERF): astype category codes instead of astyping array
    new_cats = self.categories._values

    try:
        new_cats = new_cats.astype(dtype=dtype, copy=copy)
        fill_value = self.categories._na_value
        if not is_valid_na_for_dtype(fill_value, dtype):
            fill_value = lib.item_from_zerodim(
                np.array(self.categories._na_value).astype(dtype)
            )
    except (
        TypeError,  # downstream error msg for CategoricalIndex is misleading
        ValueError,
    ):
        raise ValueError(f"Cannot cast {self.categories.dtype} dtype to {dtype}")

    # Expand the converted categories back out through the codes.
    return take_nd(new_cats, ensure_platform_int(self._codes), fill_value=fill_value)

569

def to_list(self):
    """
    Return ``self.tolist()``; provided as an alias of ``tolist``.
    """
    as_list = self.tolist()
    return as_list

575

@classmethod
def _from_inferred_categories(
    cls, inferred_categories, inferred_codes, dtype, true_values=None
):
    """
    Construct a Categorical from inferred values.

    When ``dtype`` does not carry categories, the inferred categories are
    sorted (if not already monotonic increasing). When ``dtype`` carries
    categories, the inferred categories are converted to match them.

    Parameters
    ----------
    inferred_categories : Index
    inferred_codes : Index
    dtype : CategoricalDtype or 'category'
    true_values : list, optional
        If none are provided, the default ones are
        "True", "TRUE", and "true."

    Returns
    -------
    Categorical
    """
    from pandas import Index, to_datetime, to_numeric, to_timedelta

    cats = Index(inferred_categories)

    if isinstance(dtype, CategoricalDtype) and dtype.categories is not None:
        target = dtype.categories

        # Convert the inferred categories to the kind held by ``dtype``.
        if target.is_numeric():
            cats = to_numeric(inferred_categories, errors="coerce")
        elif is_datetime64_dtype(target):
            cats = to_datetime(inferred_categories, errors="coerce")
        elif is_timedelta64_dtype(target):
            cats = to_timedelta(inferred_categories, errors="coerce")
        elif target.is_boolean():
            if true_values is None:
                true_values = ["True", "TRUE", "true"]

            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Index")
            cats = cats.isin(true_values)  # type: ignore[assignment]

        # Recode from observation order to dtype.categories order.
        codes = recode_for_categories(inferred_codes, cats, target)
    elif not cats.is_monotonic_increasing:
        # Sort categories and recode for unknown categories.
        unsorted = cats.copy()
        categories = cats.sort_values()

        codes = recode_for_categories(inferred_codes, unsorted, categories)
        dtype = CategoricalDtype(categories, ordered=False)
    else:
        # already sorted: the inferred codes can be used as they are
        dtype = CategoricalDtype(cats, ordered=False)
        codes = inferred_codes

    return cls(codes, dtype=dtype, fastpath=True)

644

@classmethod
def from_codes(
    cls, codes, categories=None, ordered=None, dtype: Dtype | None = None
) -> Categorical:
    """
    Make a Categorical type from codes and categories or dtype.

    Use this constructor when codes and categories/dtype are already
    available, so that the (computation intensive) factorization step
    normally done by the constructor is not needed.

    If your data does not follow this convention, please use the normal
    constructor.

    Parameters
    ----------
    codes : array-like of int
        An integer array, where each integer points to a category in
        categories or dtype.categories, or else is -1 for NaN.
    categories : index-like, optional
        The categories for the categorical. Items need to be unique.
        If the categories are not given here, then they must be provided
        in `dtype`.
    ordered : bool, optional
        Whether or not this categorical is treated as an ordered
        categorical. If not given here or in `dtype`, the resulting
        categorical will be unordered.
    dtype : CategoricalDtype or "category", optional
        If :class:`CategoricalDtype`, cannot be used together with
        `categories` or `ordered`.

    Returns
    -------
    Categorical

    Raises
    ------
    ValueError
        If no categories are available, if ``codes`` contains NA values or
        is not integer typed, or if a code lies outside
        ``[-1, len(categories) - 1]``.

    Examples
    --------
    >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
    >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
    ['a', 'b', 'a', 'b']
    Categories (2, object): ['a' < 'b']
    """
    dtype = CategoricalDtype._from_values_or_dtype(
        categories=categories, ordered=ordered, dtype=dtype
    )
    if dtype.categories is None:
        raise ValueError(
            "The categories must be provided in 'categories' or "
            "'dtype'. Both were None."
        )

    if is_extension_array_dtype(codes) and is_integer_dtype(codes):
        # Avoid the implicit conversion of Int to object
        if isna(codes).any():
            raise ValueError("codes cannot contain NA values")
        codes = codes.to_numpy(dtype=np.int64)
    else:
        codes = np.asarray(codes)

    # Empty input skips both the dtype check and the range check.
    if len(codes) and not is_integer_dtype(codes):
        raise ValueError("codes need to be array-like integers")

    if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
        raise ValueError("codes need to be between -1 and len(categories)-1")

    return cls(codes, dtype=dtype, fastpath=True)

711

712 # ------------------------------------------------------------------

713 # Categories/Codes/Ordered

714

@property
def categories(self) -> Index:
    """
    The categories of this categorical.

    Setting assigns new values to each category (effectively a rename of
    each individual category).

    The assigned value has to be a list-like object. All items must be
    unique and the number of items in the new categories must be the same
    as the number of items in the old categories.

    Assigning to `categories` is an inplace operation!

    Raises
    ------
    ValueError
        If the new categories do not validate as categories or if the
        number of new categories is unequal to the number of old categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    # The categories are stored on the dtype.
    return self.dtype.categories

@categories.setter
def categories(self, categories) -> None:
    # In-place assignment is deprecated: warn first, then delegate.
    deprecation_message = (
        "Setting categories in-place is deprecated and will raise in a "
        "future version. Use rename_categories instead."
    )
    warn(deprecation_message, FutureWarning, stacklevel=find_stack_level())

    self._set_categories(categories)

756

@property
def ordered(self) -> Ordered:
    """
    Whether the categories have an ordered relationship.
    """
    # Orderedness is stored on the dtype.
    categorical_dtype = self.dtype
    return categorical_dtype.ordered

763

@property
def codes(self) -> np.ndarray:
    """
    The category codes of this categorical.

    Codes are an array of integers which are the positions of the actual
    values in the categories array.

    There is no setter, use the other categorical methods and the normal item
    setter to change values in the categorical.

    Returns
    -------
    ndarray[int]
        A non-writable view of the `codes` array.
    """
    # Return a view (not a copy) and mark only the view as read-only.
    readonly = self._codes.view()
    readonly.setflags(write=False)
    return readonly

783

784 def _set_categories(self, categories, fastpath=False):

785 """

786 Sets new categories inplace

787

788 Parameters

789 ----------

790 fastpath : bool, default False

791 Don't perform validation of the categories for uniqueness or nulls

792

793 Examples

794 --------

795 >>> c = pd.Categorical(['a', 'b'])

796 >>> c

797 ['a', 'b']

798 Categories (2, object): ['a', 'b']

799

800 >>> c._set_categories(pd.Index(['a', 'c']))

801 >>> c

802 ['a', 'c']

803 Categories (2, object): ['a', 'c']

804 """

805 if fastpath:

806 new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)

807 else:

808 new_dtype = CategoricalDtype(categories, ordered=self.ordered)

809 if (

810 not fastpath

811 and self.dtype.categories is not None

812 and len(new_dtype.categories) != len(self.dtype.categories)

813 ):

814 raise ValueError(

815 "new categories need to have the same number of "

816 "items as the old categories!"

817 )

818

819 super().__init__(self._ndarray, new_dtype)

820

def _set_dtype(self, dtype: CategoricalDtype) -> Categorical:
    """
    Internal method returning a new array of the same type with ``dtype``.

    Parameters
    ----------
    dtype : CategoricalDtype

    Notes
    -----
    No validation is done here; ``dtype`` is assumed to be a (valid)
    instance of `CategoricalDtype`.
    """
    # Translate the current codes so they index into dtype.categories.
    recoded = recode_for_categories(self.codes, self.categories, dtype.categories)
    return type(self)(recoded, dtype=dtype, fastpath=True)

836

@overload
def set_ordered(
    self, value, *, inplace: NoDefault | Literal[False] = ...
) -> Categorical:
    ...

@overload
def set_ordered(self, value, *, inplace: Literal[True]) -> None:
    ...

@overload
def set_ordered(self, value, *, inplace: bool) -> Categorical | None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"])
def set_ordered(
    self, value, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Set the ordered attribute to the boolean value.

    Parameters
    ----------
    value : bool
        Set whether this categorical is ordered (True) or not (False).
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to the value.

        .. deprecated:: 1.5.0

    Returns
    -------
    Categorical or None
        The modified copy, or None if ``inplace=True``.
    """
    if inplace is no_default:
        inplace = False
    else:
        # Any explicit value for ``inplace`` triggers the deprecation.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "set_ordered is deprecated and will be removed in "
            "a future version. setting ordered-ness on categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    new_dtype = CategoricalDtype(self.categories, ordered=value)
    target = self if inplace else self.copy()
    # Same backing array, new dtype carrying the requested orderedness.
    NDArrayBacked.__init__(target, target._ndarray, new_dtype)
    return None if inplace else target

888

@overload
def as_ordered(self, *, inplace: NoDefault | Literal[False] = ...) -> Categorical:
    ...

@overload
def as_ordered(self, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def as_ordered(self, inplace: bool | NoDefault = no_default) -> Categorical | None:
    """
    Set the Categorical to be ordered.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to True.

        .. deprecated:: 1.5.0

    Returns
    -------
    Categorical or None
        Ordered Categorical or None if ``inplace=True``.
    """
    # The ``no_default`` sentinel is forwarded untouched; only an explicit
    # value is validated here.
    forwarded = (
        inplace if inplace is no_default else validate_bool_kwarg(inplace, "inplace")
    )
    return self.set_ordered(True, inplace=forwarded)

918

@overload
def as_unordered(self, *, inplace: NoDefault | Literal[False] = ...) -> Categorical:
    ...

@overload
def as_unordered(self, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def as_unordered(
    self, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Set the Categorical to be unordered.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to False.

        .. deprecated:: 1.5.0

    Returns
    -------
    Categorical or None
        Unordered Categorical or None if ``inplace=True``.
    """
    # The ``no_default`` sentinel is forwarded untouched; only an explicit
    # value is validated here.
    forwarded = (
        inplace if inplace is no_default else validate_bool_kwarg(inplace, "inplace")
    )
    return self.set_ordered(False, inplace=forwarded)

950

def set_categories(
    self, new_categories, ordered=None, rename=False, inplace=no_default
):
    """
    Set the categories to the specified new_categories.

    `new_categories` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to NaN). If `rename==True`, the categories will simply be renamed
    (less or more items than in old categories will result in values set to
    NaN or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder), which
    can result in surprising changes, for example when using special string
    dtypes, which do not consider a S1 string equal to a single char
    python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.
    inplace : bool, default False
        Whether or not to reorder the categories in-place or return a copy
        of this categorical with reordered categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    Categorical with reordered categories or None if inplace.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    """
    if inplace is not no_default:
        # Any explicit value for ``inplace`` triggers the deprecation.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "set_categories is deprecated and will be removed in "
            "a future version. Setting categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        inplace = False

    inplace = validate_bool_kwarg(inplace, "inplace")
    if ordered is None:
        ordered = self.dtype.ordered
    new_dtype = CategoricalDtype(new_categories, ordered=ordered)

    cat = self if inplace else self.copy()
    if rename:
        # Rename keeps the codes positional: code i now refers to the i-th
        # new category.
        if cat.dtype.categories is not None and len(new_dtype.categories) < len(
            cat.dtype.categories
        ):
            # remove all _codes which are larger and set to -1/NaN
            cat._codes[cat._codes >= len(new_dtype.categories)] = -1
        codes = cat._codes
    else:
        # Translate the existing codes so they index into the new categories.
        codes = recode_for_categories(
            cat.codes, cat.categories, new_dtype.categories
        )
    NDArrayBacked.__init__(cat, codes, new_dtype)

    if not inplace:
        return cat
    return None

1039

@overload
def rename_categories(
    self, new_categories, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...


@overload
def rename_categories(self, new_categories, *, inplace: Literal[True]) -> None:
    ...


@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["self", "new_categories"]
)
def rename_categories(
    self, new_categories, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from
          old categories to new. Categories not contained in the mapping
          are passed through and extra categories in the mapping are
          ignored.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

    inplace : bool, default False
        Whether or not to rename the categories inplace or return a copy of
        this categorical with renamed categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with renamed categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items than the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    ['A', 'A', 'b']
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> c.rename_categories(lambda x: x.upper())
    ['A', 'A', 'B']
    Categories (2, object): ['A', 'B']
    """
    if inplace is not no_default:
        # Any explicit ``inplace`` (True or False) triggers the deprecation.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "rename_categories is deprecated and will be removed in "
            "a future version. Renaming categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        inplace = False

    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # Mappings and callables are expanded into a positional list that lines
    # up with the existing categories; list-likes are passed through as-is.
    if is_dict_like(new_categories):
        new_categories = [new_categories.get(item, item) for item in cat.categories]
    elif callable(new_categories):
        new_categories = [new_categories(item) for item in cat.categories]

    cat._set_categories(new_categories)
    if not inplace:
        return cat
    return None

1145

def reorder_categories(self, new_categories, ordered=None, inplace=no_default):
    """
    Reorder categories as specified in new_categories.

    `new_categories` need to include all old categories and no new category
    items.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as a ordered categorical.
        If not given, do not change the ordered information.
    inplace : bool, default False
        Whether or not to reorder the categories inplace or return a copy of
        this categorical with reordered categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with reordered categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    if inplace is not no_default:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "reorder_categories is deprecated and will be removed in "
            "a future version. Reordering categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        inplace = False

    inplace = validate_bool_kwarg(inplace, "inplace")
    # Only a permutation of the existing categories is accepted.
    if set(self.dtype.categories) != set(new_categories):
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )

    # ``inplace`` is forwarded explicitly, so warnings raised by the
    # delegated call are silenced here; the deprecation for this method has
    # already been issued above when applicable.
    with catch_warnings():
        simplefilter("ignore")
        return self.set_categories(new_categories, ordered=ordered, inplace=inplace)

1206

@overload
def add_categories(
    self, new_categories, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...


@overload
def add_categories(self, new_categories, *, inplace: Literal[True]) -> None:
    ...


@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["self", "new_categories"]
)
def add_categories(
    self, new_categories, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Add new categories.

    `new_categories` will be included at the last/highest place in the
    categories and will be unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.
    inplace : bool, default False
        Whether or not to add the categories inplace or return a copy of
        this categorical with added categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with new categories added or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """
    if inplace is not no_default:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "add_categories is deprecated and will be removed in "
            "a future version. Adding categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        inplace = False

    inplace = validate_bool_kwarg(inplace, "inplace")
    # A single scalar category is treated as a one-element list.
    if not is_list_like(new_categories):
        new_categories = [new_categories]
    already_included = set(new_categories) & set(self.dtype.categories)
    if len(already_included) != 0:
        raise ValueError(
            f"new categories must not include old categories: {already_included}"
        )
    # New categories are appended after the existing ones, so existing codes
    # keep their meaning.
    new_categories = list(self.dtype.categories) + list(new_categories)
    new_dtype = CategoricalDtype(new_categories, self.ordered)

    cat = self if inplace else self.copy()
    # Re-coerce the codes for the enlarged set of categories.
    codes = coerce_indexer_dtype(cat._ndarray, new_dtype.categories)
    NDArrayBacked.__init__(cat, codes, new_dtype)
    if not inplace:
        return cat
    return None

1298

def remove_categories(self, removals, inplace=no_default):
    """
    Remove the specified categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.
    inplace : bool, default False
        Whether or not to remove the categories inplace or return a copy of
        this categorical with removed categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with removed categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    if inplace is not no_default:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "remove_categories is deprecated and will be removed in "
            "a future version. Removing categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        inplace = False

    inplace = validate_bool_kwarg(inplace, "inplace")
    # A single scalar category is treated as a one-element list.
    if not is_list_like(removals):
        removals = [removals]

    removal_set = set(removals)
    not_included = removal_set - set(self.dtype.categories)
    new_categories = [c for c in self.dtype.categories if c not in removal_set]

    # GH 10156
    # NA entries in ``removals`` are not reported as missing categories, and
    # NA is dropped from the resulting category list.
    if any(isna(removals)):
        not_included = {x for x in not_included if notna(x)}
        new_categories = [x for x in new_categories if notna(x)]

    if len(not_included) != 0:
        raise ValueError(f"removals must all be in old categories: {not_included}")

    # ``inplace`` is forwarded explicitly, so warnings raised by the
    # delegated call are silenced here.
    with catch_warnings():
        simplefilter("ignore")
        return self.set_categories(
            new_categories, ordered=self.ordered, rename=False, inplace=inplace
        )

1378

@overload
def remove_unused_categories(
    self, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...


@overload
def remove_unused_categories(self, *, inplace: Literal[True]) -> None:
    ...


@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def remove_unused_categories(
    self, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Drop every category that no value currently refers to.

    Parameters
    ----------
    inplace : bool, default False
        If True, modify this categorical and return None; otherwise work
        on a copy and return it.

        .. deprecated:: 1.2.0

    Returns
    -------
    cat : Categorical or None
        Categorical with unused categories dropped or None if ``inplace=True``.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    if inplace is no_default:
        inplace = False
    else:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "remove_unused_categories is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    target = self if inplace else self.copy()

    # ``used_codes`` holds the sorted distinct codes; ``remapped`` gives, for
    # each element, the position of its code within ``used_codes``.
    used_codes, remapped = np.unique(target._codes, return_inverse=True)

    if used_codes.size != 0 and used_codes[0] == -1:  # na sentinel
        # Drop the sentinel from the used codes and shift the remapped
        # positions so that missing entries land on -1 again.
        used_codes = used_codes[1:]
        remapped = remapped - 1

    kept_categories = target.dtype.categories.take(used_codes)
    pruned_dtype = CategoricalDtype._from_fastpath(
        kept_categories, ordered=self.ordered
    )
    pruned_codes = coerce_indexer_dtype(remapped, pruned_dtype.categories)
    NDArrayBacked.__init__(target, pruned_codes, pruned_dtype)
    return None if inplace else target

1461

1462 # ------------------------------------------------------------------

1463

def map(self, mapper):
    """
    Map categories using an input mapping or function.

    The mapper is applied to the categories, not to each element. When the
    mapped categories can be used to build a new categorical, a
    :class:`~pandas.Categorical` with the same order property is returned;
    otherwise the result is a :class:`~pandas.Index`. NaN values are
    unaffected.

    If a `dict` or :class:`~pandas.Series` is used any unmapped category is
    mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
    will be returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.Categorical or pandas.Index
        Mapped categorical.

    See Also
    --------
    CategoricalIndex.map : Apply a mapping correspondence on a
        :class:`~pandas.CategoricalIndex`.
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> cat.map(lambda x: x.upper())
    ['A', 'B', 'C']
    Categories (3, object): ['A', 'B', 'C']
    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
    ['first', 'second', 'third']
    Categories (3, object): ['first', 'second', 'third']

    If the mapping is one-to-one the ordering of the categories is
    preserved:

    >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a' < 'b' < 'c']
    >>> cat.map({'a': 3, 'b': 2, 'c': 1})
    [3, 2, 1]
    Categories (3, int64): [3 < 2 < 1]

    If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    If a `dict` is used, all unmapped categories are mapped to `NaN` and
    the result is an :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    mapped = self.categories.map(mapper)
    try:
        codes_copy = self._codes.copy()
        return self.from_codes(codes_copy, categories=mapped, ordered=self.ordered)
    except ValueError:
        # Fall back to materialising the mapped values directly.
        # NA values are represented in self._codes with -1
        # np.take causes NA values to take final element in new_categories
        has_missing = (self._codes == -1).any()
        if has_missing:
            mapped = mapped.insert(len(mapped), np.nan)
        return np.take(mapped, self._codes)

1544

# Rich comparison dunders: each one is produced by handing the matching
# ``operator`` function to ``_cat_compare_op`` (a factory that is not defined
# in this part of the file).
__eq__ = _cat_compare_op(operator.eq)

__ne__ = _cat_compare_op(operator.ne)

__lt__ = _cat_compare_op(operator.lt)

__gt__ = _cat_compare_op(operator.gt)

__le__ = _cat_compare_op(operator.le)

__ge__ = _cat_compare_op(operator.ge)

1551

1552 # -------------------------------------------------------------

1553 # Validators; ideally these can be de-duplicated

1554

def _validate_setitem_value(self, value):
    """
    Validate a value that is about to be set, dispatching on hashability.

    Non-hashable values are handed to ``_validate_listlike``; hashable
    values are handed to ``_validate_scalar``. The result of the chosen
    validator is returned unchanged.
    """
    if not is_hashable(value):
        # non-hashable input (presumably list-likes such as lists or
        # ndarrays) goes through the list-like validation
        return self._validate_listlike(value)
    else:
        return self._validate_scalar(value)


# searchsorted values are validated exactly like setitem values
_validate_searchsorted_value = _validate_setitem_value

1563

def _validate_scalar(self, fill_value):
    """
    Translate a user-facing scalar into the code stored in the backing
    ndarray.

    Parameters
    ----------
    fill_value : object
        Either a value accepted as NA for the categories' dtype or one of
        the existing categories.

    Returns
    -------
    fill_value : int
        ``-1`` for an NA value, otherwise the result of
        ``self._unbox_scalar(fill_value)``.

    Raises
    ------
    TypeError
        If ``fill_value`` is neither a valid NA nor an existing category.
    """
    if is_valid_na_for_dtype(fill_value, self.categories.dtype):
        return -1

    if fill_value in self.categories:
        return self._unbox_scalar(fill_value)

    raise TypeError(
        "Cannot setitem on a Categorical with a new "
        f"category ({fill_value}), set the categories first"
    ) from None

1592

1593 # -------------------------------------------------------------

1594

@ravel_compat
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    The numpy array interface.

    Parameters
    ----------
    dtype : NpDtype, optional
        Requested dtype of the result. When omitted, or when it equals the
        dtype of the categories, no explicit cast is requested.

    Returns
    -------
    numpy.array
        A numpy array of either the specified dtype or,
        if dtype==None (default), the same dtype as
        categorical.categories.dtype.
    """
    ret = take_nd(self.categories._values, self._codes)
    # Test against None explicitly rather than relying on truthiness: a
    # dtype object passed by the caller may evaluate as falsy, which would
    # silently skip the requested cast.
    if dtype is not None and not is_dtype_equal(dtype, self.categories.dtype):
        return np.asarray(ret, dtype)
    # When we're a Categorical[ExtensionArray], like Interval,
    # we need to ensure __array__ gets all the way to an
    # ndarray.
    return np.asarray(ret)

1614

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    Handle a NumPy ufunc applied to this object.

    The handlers below are tried in order and the first one that does not
    return ``NotImplemented`` wins; if none applies a ``TypeError`` is
    raised.

    Raises
    ------
    TypeError
        If no handler produced a result for ``ufunc``.
    """
    # for binary ops, use our custom dunder methods
    result = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if result is not NotImplemented:
        return result

    if "out" in kwargs:
        # e.g. test_numpy_ufuncs_out
        # NOTE: the result of this dispatch is returned unconditionally,
        # unlike the other two handlers.
        return arraylike.dispatch_ufunc_with_out(
            self, ufunc, method, *inputs, **kwargs
        )

    if method == "reduce":
        # e.g. TestCategoricalAnalytics::test_min_max_ordered
        result = arraylike.dispatch_reduction_ufunc(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

    # for all other cases, raise for now (similarly as what happens in
    # Series.__array_prepare__)
    raise TypeError(
        f"Object with dtype {self.dtype} cannot perform "
        f"the numpy op {ufunc.__name__}"
    )

1643

def __setstate__(self, state) -> None:
    """
    Restore the object from pickled ``state``.

    Non-dict states are forwarded to the parent implementation untouched.
    Dict states are first normalised so that they carry ``_dtype`` and
    ``_ndarray`` entries, then forwarded to the parent implementation.
    """
    if not isinstance(state, dict):
        return super().__setstate__(state)

    if "_dtype" not in state:
        # rebuild the dtype from the separately stored categories/ordered
        # entries (presumably the layout of older pickles)
        state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])

    if "_codes" in state and "_ndarray" not in state:
        # backward compat, changed what is property vs attribute
        state["_ndarray"] = state.pop("_codes")

    super().__setstate__(state)

1657

@property
def nbytes(self) -> int:
    """
    Number of bytes taken by the codes plus the category values.
    """
    codes_size = self._codes.nbytes
    categories_size = self.dtype.categories.values.nbytes
    return codes_size + categories_size

1661

def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values

    The result is the byte size of the codes plus the memory usage
    reported by the categories.

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    codes_size = self._codes.nbytes
    categories_size = self.dtype.categories.memory_usage(deep=deep)
    return codes_size + categories_size

1686

def isna(self) -> np.ndarray:
    """
    Detect missing values

    An element is missing when its code is -1.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    missing = self._codes == -1
    return missing


isnull = isna

1707

def notna(self) -> np.ndarray:
    """
    Inverse of isna

    The result is the element-wise negation of ``self.isna()``.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing


notnull = notna

1729

def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    categories = self.categories
    n_categories = len(categories)
    valid = codes >= 0
    positions = np.arange(n_categories)
    no_missing = valid.all()

    if dropna or no_missing:
        observed = codes if no_missing else codes[valid]
        counts = np.bincount(observed, minlength=n_categories or 0)
    else:
        # Missing entries are counted in an extra trailing bin, which is
        # labelled with the -1 sentinel in the resulting index.
        counts = np.bincount(np.where(valid, codes, n_categories))
        positions = np.append(positions, -1)

    positions = coerce_indexer_dtype(positions, self.dtype.categories)
    index_values = self._from_backing_data(positions)

    return Series(counts, index=CategoricalIndex(index_values), dtype="int64")

1769

# error: Argument 2 of "_empty" is incompatible with supertype
# "NDArrayBackedExtensionArray"; supertype defines the argument type as
# "ExtensionDtype"
@classmethod
def _empty(  # type: ignore[override]
    cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
) -> Categorical:
    """
    Analogous to np.empty(shape, dtype=dtype)

    Parameters
    ----------
    shape : tuple[int]
    dtype : CategoricalDtype
    """
    # Build a zero-length instance first to learn which ndarray dtype backs
    # this categorical dtype.
    template = cls._from_sequence([], dtype=dtype)

    # We have to use np.zeros instead of np.empty otherwise the resulting
    # ndarray may contain codes not supported by this dtype, in which
    # case repr(result) could segfault.
    zero_codes = np.zeros(shape, dtype=template._ndarray.dtype)

    return template._from_backing_data(zero_codes)

1793

def _internal_get_values(self):
    """
    Return the values.

    For internal compatibility with pandas formatting.

    Returns
    -------
    np.ndarray or Index
        A numpy array of the same dtype as categorical.categories.dtype or
        Index if datetime / periods.
    """
    categories = self.categories

    # if we are a datetime and period index, return Index to keep metadata
    if needs_i8_conversion(categories.dtype):
        return categories.take(self._codes, fill_value=NaT)

    # integer categories with missing entries are taken from an object-dtype
    # view of the categories, using NaN as the fill value
    if is_integer_dtype(categories) and -1 in self._codes:
        return categories.astype("object").take(self._codes, fill_value=np.nan)

    return np.array(self)

1812

def check_for_ordered(self, op) -> None:
    """
    Raise ``TypeError`` naming ``op`` unless this categorical is ordered.
    """
    if self.ordered:
        return
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )

1821

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def argsort(self, ascending=True, kind="quicksort", **kwargs):
    """
    Return the indices that would sort the Categorical.

    The work is delegated to the parent class implementation, with
    ``ascending``, ``kind`` and any extra keyword arguments forwarded
    unchanged.

    .. versionchanged:: 0.25.0

       Changed to sort missing values at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    return super().argsort(ascending=ascending, kind=kind, **kwargs)

1875

@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Categorical:
    ...


@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...


@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def sort_values(
    self, inplace: bool = False, ascending: bool = True, na_position: str = "last"
) -> Categorical | None:
    """
    Sort the Categorical by category value returning a new
    Categorical by default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None

    Raises
    ------
    ValueError
        If ``na_position`` is neither ``'first'`` nor ``'last'``.

    See Also
    --------
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    Inplace sorting can be done as well:

    >>> c.sort_values(inplace=True)
    >>> c
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2, 2, NaN, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered_codes = self._codes[order]

    if inplace:
        # Overwrite the existing backing array element-wise.
        self._codes[:] = reordered_codes
        return None
    return self._from_backing_data(reordered_codes)

1980

def _rank(
    self,
    *,
    axis: int = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
):
    """
    See Series.rank.__doc__.

    Only ``axis=0`` is implemented; the values produced by
    ``self._values_for_rank()`` are passed to ``algorithms.rank``.
    """
    if axis != 0:
        raise NotImplementedError
    return algorithms.rank(
        self._values_for_rank(),
        axis=axis,
        method=method,
        na_option=na_option,
        ascending=ascending,
        pct=pct,
    )

2004

def _values_for_rank(self):
    """
    For correctly ranking ordered categorical data. See GH#15420

    Ordered categorical data should be ranked on the basis of
    codes with -1 translated to NaN.

    Returns
    -------
    numpy.array

    """
    from pandas import Series

    if self.ordered:
        values = self.codes
        missing = values == -1
        if missing.any():
            # NaN needs a float array, so cast before overwriting the -1s.
            values = values.astype("float64")
            values[missing] = np.nan
        return values

    if self.categories.is_numeric():
        return np.array(self)

    # reorder the categories (so rank can use the float codes)
    # instead of passing an object array to rank
    category_ranks = Series(self.categories).rank().values
    return np.array(self.rename_categories(category_ranks))

2034

def to_dense(self) -> np.ndarray:
    """
    Return the values as a plain NumPy array.

    Deprecated: a FutureWarning is emitted on every call, after which the
    result of ``np.asarray(self)`` is returned.

    Returns
    -------
    dense : array
    """
    message = (
        "Categorical.to_dense is deprecated and will be removed in "
        "a future version. Use np.asarray(cat) instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    dense = np.asarray(self)
    return dense

2052

2053 # ------------------------------------------------------------------

2054 # NDArrayBackedExtensionArray compat

2055

2056 @property

2057 def _codes(self) -> np.ndarray:

2058 return self._ndarray

2059

2060 @_codes.setter

2061 def _codes(self, value: np.ndarray):

2062 warn(

2063 "Setting the codes on a Categorical is deprecated and will raise in "

2064 "a future version. Create a new Categorical object instead",

2065 FutureWarning,

2066 stacklevel=find_stack_level(),

2067 ) # GH#40606

2068 NDArrayBacked.__init__(self, value, self.dtype)

2069

2070 def _box_func(self, i: int):

2071 if i == -1:

2072 return np.NaN

2073 return self.categories[i]

2074

2075 def _unbox_scalar(self, key) -> int:

2076 # searchsorted is very performance sensitive. By converting codes

2077 # to same dtype as self.codes, we get much faster performance.

2078 code = self.categories.get_loc(key)

2079 code = self._ndarray.dtype.type(code)

2080 return code

2081

2082 # ------------------------------------------------------------------

2083

def take_nd(
    self, indexer, allow_fill: bool = False, fill_value=None
) -> Categorical:
    """
    Deprecated alias of ``take``.

    Emits a FutureWarning and forwards all arguments to ``self.take``.
    """
    # GH#27745 deprecate alias that other EAs dont have
    message = "Categorical.take_nd is deprecated, use Categorical.take instead"
    warn(message, FutureWarning, stacklevel=find_stack_level())
    taken = self.take(indexer, allow_fill=allow_fill, fill_value=fill_value)
    return taken

2094

2095 def __iter__(self):

2096 """

2097 Returns an Iterator over the values of this Categorical.

2098 """

2099 if self.ndim == 1:

2100 return iter(self._internal_get_values().tolist())

2101 else:

2102 return (self[n] for n in range(len(self)))

2103

def __contains__(self, key) -> bool:
    """
    Return True if `key` is in this Categorical.

    A key that is a valid NA for the categories' dtype is considered
    contained whenever any element of the array is missing.
    """
    if not is_valid_na_for_dtype(key, self.categories.dtype):
        return contains(self, key, container=self._codes)

    # key is NaN-like: check if any NaN is in self.
    return bool(self.isna().any())

2113

2114 # ------------------------------------------------------------------

2115 # Rendering Methods

2116

2117 def _formatter(self, boxed: bool = False):

2118 # Defer to CategoricalFormatter's formatter.

2119 return None

2120

2121 def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str:

2122 """

2123 a short repr displaying only max_vals and an optional (but default

2124 footer)

2125 """

2126 num = max_vals // 2

2127 head = self[:num]._get_repr(length=False, footer=False)

2128 tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False)

2129

2130 result = f"{head[:-1]}, ..., {tail[1:]}"

2131 if footer:

2132 result = f"{result}\n{self._repr_footer()}"

2133

2134 return str(result)

2135

2136 def _repr_categories(self) -> list[str]:

2137 """

2138 return the base repr for the categories

2139 """

2140 max_categories = (

2141 10

2142 if get_option("display.max_categories") == 0

2143 else get_option("display.max_categories")

2144 )

2145 from pandas.io.formats import format as fmt

2146

2147 format_array = partial(

2148 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC

2149 )

2150 if len(self.categories) > max_categories:

2151 num = max_categories // 2

2152 head = format_array(self.categories[:num])

2153 tail = format_array(self.categories[-num:])

2154 category_strs = head + ["..."] + tail

2155 else:

2156 category_strs = format_array(self.categories)

2157

2158 # Strip all leading spaces, which format_array adds for columns...

2159 category_strs = [x.strip() for x in category_strs]

2160 return category_strs

2161

2162 def _repr_categories_info(self) -> str:

2163 """

2164 Returns a string representation of the footer.

2165 """

2166 category_strs = self._repr_categories()

2167 dtype = str(self.categories.dtype)

2168 levheader = f"Categories ({len(self.categories)}, {dtype}): "

2169 width, height = get_terminal_size()

2170 max_width = get_option("display.width") or width

2171 if console.in_ipython_frontend():

2172 # 0 = no breaks

2173 max_width = 0

2174 levstring = ""

2175 start = True

2176 cur_col_len = len(levheader) # header

2177 sep_len, sep = (3, " < ") if self.ordered else (2, ", ")

2178 linesep = sep.rstrip() + "\n" # remove whitespace

2179 for val in category_strs:

2180 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:

2181 levstring += linesep + (" " * (len(levheader) + 1))

2182 cur_col_len = len(levheader) + 1 # header + a whitespace

2183 elif not start:

2184 levstring += sep

2185 cur_col_len += len(val)

2186 levstring += val

2187 start = False

2188 # replace to simple save space by

2189 return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"

2190

2191 def _repr_footer(self) -> str:

2192 info = self._repr_categories_info()

2193 return f"Length: {len(self)}\n{info}"

2194

2195 def _get_repr(self, length: bool = True, na_rep="NaN", footer: bool = True) -> str:

2196 from pandas.io.formats import format as fmt

2197

2198 formatter = fmt.CategoricalFormatter(

2199 self, length=length, na_rep=na_rep, footer=footer

2200 )

2201 result = formatter.to_string()

2202 return str(result)

2203

2204 def __repr__(self) -> str:

2205 """

2206 String representation.

2207 """

2208 _maxlen = 10

2209 if len(self._codes) > _maxlen:

2210 result = self._tidy_repr(_maxlen)

2211 elif len(self._codes) > 0:

2212 result = self._get_repr(length=len(self) > _maxlen)

2213 else:

2214 msg = self._get_repr(length=False, footer=True).replace("\n", ", ")

2215 result = f"[], {msg}"

2216

2217 return result

2218

2219 # ------------------------------------------------------------------

2220

2221 def _validate_listlike(self, value):

2222 # NB: here we assume scalar-like tuples have already been excluded

2223 value = extract_array(value, extract_numpy=True)

2224

2225 # require identical categories set

2226 if isinstance(value, Categorical):

2227 if not is_dtype_equal(self.dtype, value.dtype):

2228 raise TypeError(

2229 "Cannot set a Categorical with another, "

2230 "without identical categories"

2231 )

2232 # is_dtype_equal implies categories_match_up_to_permutation

2233 value = self._encode_with_my_categories(value)

2234 return value._codes

2235

2236 from pandas import Index

2237

2238 # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914

2239 to_add = Index._with_infer(value, tupleize_cols=False).difference(

2240 self.categories

2241 )

2242

2243 # no assignments of values not in categories, but it's always ok to set

2244 # something to np.nan

2245 if len(to_add) and not isna(to_add).all():

2246 raise TypeError(

2247 "Cannot setitem on a Categorical with a new "

2248 "category, set the categories first"

2249 )

2250

2251 codes = self.categories.get_indexer(value)

2252 return codes.astype(self._ndarray.dtype, copy=False)

2253

2254 def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:

2255 """

2256 Compute the inverse of a categorical, returning

2257 a dict of categories -> indexers.

2258

2259 *This is an internal function*

2260

2261 Returns

2262 -------

2263 Dict[Hashable, np.ndarray[np.intp]]

2264 dict of categories -> indexers

2265

2266 Examples

2267 --------

2268 >>> c = pd.Categorical(list('aabca'))

2269 >>> c

2270 ['a', 'a', 'b', 'c', 'a']

2271 Categories (3, object): ['a', 'b', 'c']

2272 >>> c.categories

2273 Index(['a', 'b', 'c'], dtype='object')

2274 >>> c.codes

2275 array([0, 0, 1, 2, 0], dtype=int8)

2276 >>> c._reverse_indexer()

2277 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}

2278

2279 """

2280 categories = self.categories

2281 r, counts = libalgos.groupsort_indexer(

2282 ensure_platform_int(self.codes), categories.size

2283 )

2284 counts = ensure_int64(counts).cumsum()

2285 _result = (r[start:end] for start, end in zip(counts, counts[1:]))

2286 return dict(zip(categories, _result))

2287

2288 # ------------------------------------------------------------------

2289 # Reductions

2290

@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def min(self, *, skipna=True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    valid = codes != -1
    if valid.all():
        pointer = codes.min()
    elif skipna and valid.any():
        # ignore the missing entries and reduce over the rest
        pointer = codes[valid].min()
    else:
        # missing values present and either not skipped or nothing else left
        return np.nan
    return self._wrap_reduction_result(None, pointer)

2327

@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def max(self, *, skipna=True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    valid = codes != -1
    if valid.all():
        pointer = codes.max()
    elif skipna and valid.any():
        # ignore the missing entries and reduce over the rest
        pointer = codes[valid].max()
    else:
        # missing values present and either not skipped or nothing else left
        return np.nan
    return self._wrap_reduction_result(None, pointer)

2364

def mode(self, dropna: bool = True) -> Categorical:
    """
    Returns the mode(s) of the Categorical.

    Always returns `Categorical` even if only one value.

    Deprecated: a FutureWarning is emitted and the work is delegated to
    ``self._mode``.

    Parameters
    ----------
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

    Returns
    -------
    modes : `Categorical` (sorted)
    """
    message = (
        "Categorical.mode is deprecated and will be removed in a future version. "
        "Use Series.mode instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    modes = self._mode(dropna=dropna)
    return modes

2387

2388 def _mode(self, dropna: bool = True) -> Categorical:

2389 codes = self._codes

2390 mask = None

2391 if dropna:

2392 mask = self.isna()

2393

2394 res_codes = algorithms.mode(codes, mask=mask)

2395 res_codes = cast(np.ndarray, res_codes)

2396 assert res_codes.dtype == codes.dtype

2397 res = self._from_backing_data(res_codes)

2398 return res

2399

2400 # ------------------------------------------------------------------

2401 # ExtensionArray Interface

2402

def unique(self):
    """
    Return a ``Categorical`` holding each distinct value once.

    The distinct codes are computed with ``unique1d`` and wrapped with this
    array's dtype.

    .. versionchanged:: 1.3.0

        Previously, unused categories were dropped from the new categories.

    Returns
    -------
    Categorical

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.Categorical(list("baabc")).unique()
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
    ['b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']
    """
    distinct_codes = unique1d(self.codes)
    deduplicated = self._from_backing_data(distinct_codes)
    return deduplicated

2433

2434 def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:

2435 # make sure we have correct itemsize for resulting codes

2436 assert res_values.dtype == self._ndarray.dtype

2437 return res_values

2438

def equals(self, other: object) -> bool:
    """
    Returns True if categorical arrays are equal.

    ``other`` must be a `Categorical` whose categories match up to
    permutation; it is then re-encoded with this array's categories and the
    codes are compared.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
    """
    comparable = isinstance(
        other, Categorical
    ) and self._categories_match_up_to_permutation(other)
    if not comparable:
        return False

    recoded = self._encode_with_my_categories(other)
    return np.array_equal(self._codes, recoded._codes)

2457

@classmethod
def _concat_same_type(
    cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0
) -> CategoricalT:
    """
    Concatenate several Categoricals into one.

    Parameters
    ----------
    to_concat : sequence of Categorical
        Arrays to join; the first one determines the allowed ``axis`` range.
    axis : int, default 0
        Axis to concatenate along. ``axis=1`` requires every array to be
        2-dimensional.

    Returns
    -------
    Categorical

    Raises
    ------
    ValueError
        If ``axis`` is out of bounds for the first array, or if ``axis`` is
        1 and some array is not 2-dimensional.
    """
    from pandas.core.dtypes.concat import union_categoricals

    first = to_concat[0]
    if axis >= first.ndim:
        raise ValueError(
            f"axis {axis} is out of bounds for array of dimension {first.ndim}"
        )

    if axis == 1:
        # Flatten, concatenate then reshape
        if not all(x.ndim == 2 for x in to_concat):
            # Same exception type as before, now with a diagnostic message.
            raise ValueError(
                "all arrays must be 2-dimensional to concatenate along axis=1"
            )

        # pass correctly-shaped to union_categoricals
        tc_flat = [obj[:, i] for obj in to_concat for i in range(obj.shape[1])]

        res_flat = cls._concat_same_type(tc_flat, axis=0)

        return res_flat.reshape(len(first), -1, order="F")

    return union_categoricals(to_concat)

2487

2488 # ------------------------------------------------------------------

2489

2490 def _encode_with_my_categories(self, other: Categorical) -> Categorical:

2491 """

2492 Re-encode another categorical using this Categorical's categories.

2493

2494 Notes

2495 -----

2496 This assumes we have already checked

2497 self._categories_match_up_to_permutation(other).

2498 """

2499 # Indexing on codes is more efficient if categories are the same,

2500 # so we can apply some optimizations based on the degree of

2501 # dtype-matching.

2502 codes = recode_for_categories(

2503 other.codes, other.categories, self.categories, copy=False

2504 )

2505 return self._from_backing_data(codes)

2506

2507 def _categories_match_up_to_permutation(self, other: Categorical) -> bool:

2508 """

2509 Returns True if categoricals are the same dtype

2510 same categories, and same ordered

2511

2512 Parameters

2513 ----------

2514 other : Categorical

2515

2516 Returns

2517 -------

2518 bool

2519 """

2520 return hash(self.dtype) == hash(other.dtype)

2521

def is_dtype_equal(self, other) -> bool:
    """
    Deprecated check of whether ``other`` has matching categories.

    Emits a FutureWarning, then defers to
    ``self._categories_match_up_to_permutation``. An AttributeError or
    TypeError raised by that check is reported as ``False``.
    """
    message = (
        "Categorical.is_dtype_equal is deprecated and will be removed "
        "in a future version"
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())

    try:
        matches = self._categories_match_up_to_permutation(other)
    except (AttributeError, TypeError):
        return False
    return matches

2533

def describe(self) -> DataFrame:
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category.
    """
    from pandas import Index
    from pandas.core.reshape.concat import concat

    # missing values are counted as well (dropna=False)
    counts = self.value_counts(dropna=False)
    freqs = counts / counts.sum()

    summary = concat([counts, freqs], axis=1)
    summary.columns = Index(["counts", "freqs"])
    summary.index.name = "categories"
    return summary

2554

def isin(self, values) -> npt.NDArray[np.bool_]:
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    np.ndarray[bool]

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------
    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    if not is_list_like(values):
        values_type = type(values).__name__
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{values_type}]"
        )

    candidates = sanitize_array(values, None, None)
    is_null = np.asarray(isna(candidates))
    positions = self.categories.get_indexer(candidates)
    # Keep the codes of values that are actual categories, plus the entries
    # that came from null inputs so missing elements can match as well.
    wanted_codes = positions[is_null | (positions >= 0)]
    return algorithms.isin(self.codes, wanted_codes)

2607

@overload
def replace(
    self, to_replace, value, *, inplace: Literal[False] = ...
) -> Categorical:
    ...

@overload
def replace(self, to_replace, value, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"])
def replace(self, to_replace, value, inplace: bool = False) -> Categorical | None:
    """
    Replaces all instances of one value with another

    Deprecated: a FutureWarning is emitted and the work is delegated to
    ``self._replace``.

    Parameters
    ----------
    to_replace: object
        The value to be replaced

    value: object
        The value to replace it with

    inplace: bool
        Whether the operation is done in-place

    Returns
    -------
    None if inplace is True, otherwise the new Categorical after replacement


    Examples
    --------
    >>> s = pd.Categorical([1, 2, 1, 3])
    >>> s.replace(1, 3)
    [3, 2, 3, 3]
    Categories (2, int64): [2, 3]
    """
    # GH#44929 deprecation
    message = (
        "Categorical.replace is deprecated and will be removed in a future "
        "version. Use Series.replace directly instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    outcome = self._replace(to_replace=to_replace, value=value, inplace=inplace)
    return outcome

2654

def _replace(self, *, to_replace, value, inplace: bool = False):
    """
    Replace ``to_replace`` (a scalar or a list-like of scalars) by ``value``.

    Values that are not among the categories are ignored. Replacing by a
    missing value removes the category; replacing by an existing category
    re-points the codes and removes the old category; otherwise the category
    is renamed.

    Returns
    -------
    None if ``inplace`` is True, otherwise the modified copy.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    target = self.copy() if not inplace else self

    # build a dict of (to replace -> value) pairs
    if is_list_like(to_replace):
        # to_replace is list-like and value is scalar
        pairs = dict.fromkeys(to_replace, value)
    else:
        # both to_replace and value are scalar
        pairs = {to_replace: value}

    # other cases, like if both to_replace and value are list-like or if
    # to_replace is a dict, are handled separately in NDFrame
    for old, new in pairs.items():
        if new == old:
            continue
        if old not in target.categories:
            continue

        if isna(new):
            with catch_warnings():
                simplefilter("ignore")
                target.remove_categories(old, inplace=True)
            continue

        labels = target.categories.tolist()
        old_pos = labels.index(old)

        if new in target.categories:
            # merge: re-point codes of the old category, then drop it
            new_pos = labels.index(new)
            target._codes[target._codes == old_pos] = new_pos
            with catch_warnings():
                simplefilter("ignore")
                target.remove_categories(old, inplace=True)
        else:
            # plain rename of the category label
            labels[old_pos] = new
            with catch_warnings():
                simplefilter("ignore")
                target.rename_categories(labels, inplace=True)

    return None if inplace else target

2695

2696 # ------------------------------------------------------------------------

2697 # String methods interface

2698 def _str_map(

2699 self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True

2700 ):

2701 # Optimization to apply the callable `f` to the categories once

2702 # and rebuild the result by `take`ing from the result with the codes.

2703 # Returns the same type as the object-dtype implementation though.

2704 from pandas.core.arrays import PandasArray

2705

2706 categories = self.categories

2707 codes = self.codes

2708 result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype)

2709 return take_nd(result, codes, fill_value=na_value)

2710

2711 def _str_get_dummies(self, sep="|"):

2712 # sep may not be in categories. Just bail on this.

2713 from pandas.core.arrays import PandasArray

2714

2715 return PandasArray(self.astype(str))._str_get_dummies(sep)

2716

2717

2718# The Series.cat accessor

2719

2720

@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Be aware that assigning to `categories` is an inplace operation, while
    all methods return new categorical data per default (but can be called
    with `inplace=True`).

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        # Validate first so a wrong dtype fails before any state is stored.
        self._validate(data)
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless ``data`` has a categorical dtype."""
        if is_categorical_dtype(data.dtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """Read the delegated property ``name`` from the underlying values."""
        parent = self._parent
        return getattr(parent, name)

    def _delegate_property_set(self, name, new_values):
        """Assign the delegated property ``name`` on the underlying values."""
        parent = self._parent
        return setattr(parent, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.
        """
        from pandas import Series

        parent_codes = self._parent.codes
        return Series(parent_codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call the delegated method ``name`` on the underlying values.

        A non-None result is wrapped in a Series carrying the original index
        and name; a None result is passed through as None.
        """
        from pandas import Series

        bound = getattr(self._parent, name)
        outcome = bound(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self._index, name=self._name)

2882

2883

2884# utility routines

2885

2886

2887def _get_codes_for_values(values, categories: Index) -> np.ndarray:

2888 """

2889 utility routine to turn values into codes given the specified categories

2890

2891 If `values` is known to be a Categorical, use recode_for_categories instead.

2892 """

2893 if values.ndim > 1:

2894 flat = values.ravel()

2895 codes = _get_codes_for_values(flat, categories)

2896 return codes.reshape(values.shape)

2897

2898 codes = categories.get_indexer_for(values)

2899 return coerce_indexer_dtype(codes, categories)

2900

2901

def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        Codes referring to ``new_categories``; -1 is used as the fill value.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # Nothing to translate when there are no old categories (all null anyway,
    # so just retain the nulls) or when both category sets are equal.
    unchanged = len(old_categories) == 0 or new_categories.equals(old_categories)
    if unchanged:
        return codes.copy() if copy else codes

    # position of every old category within the new categories
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer(old_categories), new_categories
    )
    return take_nd(old_to_new, codes, fill_value=-1)

2943

2944

def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
    """
    Factorize an input `values` into `categories` and `codes`. Preserves
    categorical dtype in `categories`.

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index
        If `values` has a categorical dtype, then `categories` is
        a CategoricalIndex keeping the categories and order of `values`.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    from pandas import CategoricalIndex

    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    categories: Index
    if not is_categorical_dtype(values):
        # The value of ordered is irrelevant since we don't use cat as such,
        # but only the resulting categories, the order of which is independent
        # from ordered. Set ordered to False as default. See GH #15457
        plain = Categorical(values, ordered=False)
        return plain.codes, plain.categories

    values = extract_array(values)
    # The Categorical we want to build has the same categories
    # as values but its codes are by def [0, ..., len(n_categories) - 1]
    identity_codes = np.arange(len(values.categories), dtype=values.codes.dtype)
    identity = Categorical.from_codes(identity_codes, dtype=values.dtype)

    categories = CategoricalIndex(identity)
    return values.codes, categories

2984

2985

def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    A higher-level wrapper over `factorize_from_iterable`.

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return two empty lists.
        return [], []

    factorized = [factorize_from_iterable(it) for it in iterables]
    all_codes = [codes for codes, _ in factorized]
    all_categories = [categories for _, categories in factorized]
    return all_codes, all_categories