Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/categorical.py: 19%
864 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from csv import QUOTE_NONNUMERIC
4from functools import partial
5import operator
6from shutil import get_terminal_size
7from typing import (
8 TYPE_CHECKING,
9 Hashable,
10 Literal,
11 Sequence,
12 TypeVar,
13 Union,
14 cast,
15 overload,
16)
17from warnings import (
18 catch_warnings,
19 simplefilter,
20 warn,
21)
23import numpy as np
25from pandas._config import get_option
27from pandas._libs import (
28 NaT,
29 algos as libalgos,
30 lib,
31)
32from pandas._libs.arrays import NDArrayBacked
33from pandas._libs.lib import (
34 NoDefault,
35 no_default,
36)
37from pandas._typing import (
38 ArrayLike,
39 AstypeArg,
40 Dtype,
41 NpDtype,
42 Ordered,
43 Shape,
44 npt,
45 type_t,
46)
47from pandas.compat.numpy import function as nv
48from pandas.util._decorators import (
49 deprecate_kwarg,
50 deprecate_nonkeyword_arguments,
51)
52from pandas.util._exceptions import find_stack_level
53from pandas.util._validators import validate_bool_kwarg
55from pandas.core.dtypes.cast import coerce_indexer_dtype
56from pandas.core.dtypes.common import (
57 ensure_int64,
58 ensure_platform_int,
59 is_categorical_dtype,
60 is_datetime64_dtype,
61 is_dict_like,
62 is_dtype_equal,
63 is_extension_array_dtype,
64 is_hashable,
65 is_integer_dtype,
66 is_list_like,
67 is_scalar,
68 is_timedelta64_dtype,
69 needs_i8_conversion,
70 pandas_dtype,
71)
72from pandas.core.dtypes.dtypes import (
73 CategoricalDtype,
74 ExtensionDtype,
75)
76from pandas.core.dtypes.generic import (
77 ABCIndex,
78 ABCSeries,
79)
80from pandas.core.dtypes.missing import (
81 is_valid_na_for_dtype,
82 isna,
83 notna,
84)
86from pandas.core import (
87 arraylike,
88 ops,
89)
90from pandas.core.accessor import (
91 PandasDelegate,
92 delegate_names,
93)
94import pandas.core.algorithms as algorithms
95from pandas.core.algorithms import (
96 factorize,
97 take_nd,
98 unique1d,
99)
100from pandas.core.arrays._mixins import (
101 NDArrayBackedExtensionArray,
102 ravel_compat,
103)
104from pandas.core.base import (
105 ExtensionArray,
106 NoNewAttributesMixin,
107 PandasObject,
108)
109import pandas.core.common as com
110from pandas.core.construction import (
111 extract_array,
112 sanitize_array,
113)
114from pandas.core.ops.common import unpack_zerodim_and_defer
115from pandas.core.sorting import nargsort
116from pandas.core.strings.object_array import ObjectStringArrayMixin
118from pandas.io.formats import console
120if TYPE_CHECKING: 120 ↛ 121line 120 didn't jump to line 121, because the condition on line 120 was never true
121 from pandas import (
122 DataFrame,
123 Index,
124 Series,
125 )
128CategoricalT = TypeVar("CategoricalT", bound="Categorical")
131def _cat_compare_op(op):
132 opname = f"__{op.__name__}__"
133 fill_value = True if op is operator.ne else False
135 @unpack_zerodim_and_defer(opname)
136 def func(self, other):
137 hashable = is_hashable(other)
138 if is_list_like(other) and len(other) != len(self) and not hashable:
139 # in hashable case we may have a tuple that is itself a category
140 raise ValueError("Lengths must match.")
142 if not self.ordered:
143 if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
144 raise TypeError(
145 "Unordered Categoricals can only compare equality or not"
146 )
147 if isinstance(other, Categorical):
148 # Two Categoricals can only be compared if the categories are
149 # the same (maybe up to ordering, depending on ordered)
151 msg = "Categoricals can only be compared if 'categories' are the same."
152 if not self._categories_match_up_to_permutation(other):
153 raise TypeError(msg)
155 if not self.ordered and not self.categories.equals(other.categories):
156 # both unordered and different order
157 other_codes = recode_for_categories(
158 other.codes, other.categories, self.categories, copy=False
159 )
160 else:
161 other_codes = other._codes
163 ret = op(self._codes, other_codes)
164 mask = (self._codes == -1) | (other_codes == -1)
165 if mask.any():
166 ret[mask] = fill_value
167 return ret
169 if hashable:
170 if other in self.categories:
171 i = self._unbox_scalar(other)
172 ret = op(self._codes, i)
174 if opname not in {"__eq__", "__ge__", "__gt__"}:
175 # GH#29820 performance trick; get_loc will always give i>=0,
176 # so in the cases (__ne__, __le__, __lt__) the setting
177 # here is a no-op, so can be skipped.
178 mask = self._codes == -1
179 ret[mask] = fill_value
180 return ret
181 else:
182 return ops.invalid_comparison(self, other, op)
183 else:
184 # allow categorical vs object dtype array comparisons for equality
185 # these are only positional comparisons
186 if opname not in ["__eq__", "__ne__"]:
187 raise TypeError(
188 f"Cannot compare a Categorical for op {opname} with "
189 f"type {type(other)}.\nIf you want to compare values, "
190 "use 'np.asarray(cat) <op> other'."
191 )
193 if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
194 # We would return NotImplemented here, but that messes up
195 # ExtensionIndex's wrapped methods
196 return op(other, self)
197 return getattr(np.array(self), opname)(np.array(other))
199 func.__name__ = opname
201 return func
def contains(cat, key, container) -> bool:
    """
    Helper for membership check for ``key`` in ``cat``.

    This is a helper method for :meth:`__contains__`
    and :class:`CategoricalIndex.__contains__`.

    Returns True if ``key`` is in ``cat.categories`` and the
    location of ``key`` in ``categories`` is in ``container``.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container to check for membership in.

    Returns
    -------
    is_in : bool
        True if ``key`` is in ``cat.categories`` and location of
        ``key`` in ``categories`` is in ``container``, else False.

    Notes
    -----
    This method does not check for NaN values. Do that separately
    before calling this method.
    """
    # Unhashable keys raise TypeError here, before any lookup is attempted.
    hash(key)

    # Find where ``key`` sits in the categories. If the lookup fails the key
    # is not a category, so it cannot be in ``container`` either.
    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # ``position`` is the location of key in categories, but also the *value*
    # stored for key in container. So ``key`` may be a category and still be
    # absent from ``container``. Example ('b' in categories, not in values):
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # if categories is an IntervalIndex, the location is an array.
        return any(candidate in container for candidate in position)
    return position in container
255class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):
256 """
257 Represent a categorical variable in classic R / S-plus fashion.
259 `Categoricals` can only take on only a limited, and usually fixed, number
260 of possible values (`categories`). In contrast to statistical categorical
261 variables, a `Categorical` might have an order, but numerical operations
262 (additions, divisions, ...) are not possible.
264 All values of the `Categorical` are either in `categories` or `np.nan`.
265 Assigning values outside of `categories` will raise a `ValueError`. Order
266 is defined by the order of the `categories`, not lexical order of the
267 values.
269 Parameters
270 ----------
271 values : list-like
272 The values of the categorical. If categories are given, values not in
273 categories will be replaced with NaN.
274 categories : Index-like (unique), optional
275 The unique categories for this categorical. If not given, the
276 categories are assumed to be the unique values of `values` (sorted, if
277 possible, otherwise in the order in which they appear).
278 ordered : bool, default False
279 Whether or not this categorical is treated as a ordered categorical.
280 If True, the resulting categorical will be ordered.
281 An ordered categorical respects, when sorted, the order of its
282 `categories` attribute (which in turn is the `categories` argument, if
283 provided).
284 dtype : CategoricalDtype
285 An instance of ``CategoricalDtype`` to use for this categorical.
287 Attributes
288 ----------
289 categories : Index
290 The categories of this categorical
291 codes : ndarray
292 The codes (integer positions, which point to the categories) of this
293 categorical, read only.
294 ordered : bool
295 Whether or not this Categorical is ordered.
296 dtype : CategoricalDtype
297 The instance of ``CategoricalDtype`` storing the ``categories``
298 and ``ordered``.
300 Methods
301 -------
302 from_codes
303 __array__
305 Raises
306 ------
307 ValueError
308 If the categories do not validate.
309 TypeError
310 If an explicit ``ordered=True`` is given but no `categories` and the
311 `values` are not sortable.
313 See Also
314 --------
315 CategoricalDtype : Type for categorical data.
316 CategoricalIndex : An Index with an underlying ``Categorical``.
318 Notes
319 -----
320 See the `user guide
321 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__
322 for more.
324 Examples
325 --------
326 >>> pd.Categorical([1, 2, 3, 1, 2, 3])
327 [1, 2, 3, 1, 2, 3]
328 Categories (3, int64): [1, 2, 3]
330 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
331 ['a', 'b', 'c', 'a', 'b', 'c']
332 Categories (3, object): ['a', 'b', 'c']
334 Missing values are not included as a category.
336 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])
337 >>> c
338 [1, 2, 3, 1, 2, 3, NaN]
339 Categories (3, int64): [1, 2, 3]
341 However, their presence is indicated in the `codes` attribute
342 by code `-1`.
344 >>> c.codes
345 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8)
347 Ordered `Categoricals` can be sorted according to the custom order
348 of the categories and can have a min and max value.
350 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
351 ... categories=['c', 'b', 'a'])
352 >>> c
353 ['a', 'b', 'c', 'a', 'b', 'c']
354 Categories (3, object): ['c' < 'b' < 'a']
355 >>> c.min()
356 'c'
357 """
359 # For comparisons, so that numpy uses our implementation if the compare
360 # ops, which raise
361 __array_priority__ = 1000
362 # tolist is not actually deprecated, just suppressed in the __dir__
363 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
364 _typ = "categorical"
366 _dtype: CategoricalDtype
def __init__(
    self,
    values,
    categories=None,
    ordered=None,
    dtype: Dtype | None = None,
    fastpath: bool = False,
    copy: bool = True,
) -> None:
    """
    Compute the integer codes and final dtype for ``values`` and store them.

    Parameters
    ----------
    values : list-like
        Data to encode. A non-list-like value triggers a FutureWarning and
        is wrapped in a one-element list.
    categories : optional
        Forwarded to ``CategoricalDtype._from_values_or_dtype``.
    ordered : optional
        Forwarded to ``CategoricalDtype._from_values_or_dtype``.
    dtype : optional
        Forwarded to ``CategoricalDtype._from_values_or_dtype``.
    fastpath : bool, default False
        If True, ``values`` are treated as codes and only coerced to an
        indexer dtype; no inference or sanitizing happens.
    copy : bool, default True
        Passed to ``recode_for_categories`` when ``values`` is already
        categorical and explicit categories are known.

    Raises
    ------
    NotImplementedError
        If ``values`` converts to an ndarray with more than one dimension.
    TypeError
        If categories must be inferred, ``values`` cannot be sorted and the
        dtype is ordered.
    """
    dtype = CategoricalDtype._from_values_or_dtype(
        values, categories, ordered, dtype
    )
    # From here on dtype is always a CategoricalDtype, but its categories
    # may still be None; in that case they are inferred by factorizing below.

    if fastpath:
        fast_codes = coerce_indexer_dtype(values, dtype.categories)
        dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
        super().__init__(fast_codes, dtype)
        return

    if not is_list_like(values):
        # GH#38433
        warn(
            "Allowing scalars in the Categorical constructor is deprecated "
            "and will raise in a future version.  Use `[value]` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        values = [values]

    # null_mask marks the missing values to leave out of category inference.
    # It is only populated for plain list input (not arrays/ndframes).
    null_mask = np.array(False)

    # sanitize input
    if is_categorical_dtype(values):
        if dtype.categories is None:
            dtype = CategoricalDtype(values.categories, dtype.ordered)
    elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
        values = com.convert_to_list_like(values)
        if isinstance(values, list) and not values:
            # By convention, empty lists result in object dtype:
            values = np.array([], dtype=object)
        elif isinstance(values, np.ndarray):
            if values.ndim > 1:
                # preempt sanitize_array from raising ValueError
                raise NotImplementedError(
                    "> 1 ndim Categorical are not supported at this time"
                )
            values = sanitize_array(values, None)
        else:
            # i.e. must be a list
            sanitized = sanitize_array(values, None)
            null_mask = isna(sanitized)
            if null_mask.any():
                # Drop the nulls for now; they are put back as -1 codes
                # further down (see "full_codes").
                present = [values[idx] for idx in np.where(~null_mask)[0]]

                # GH#44900 Do not cast to float if we have only missing values
                keep_inferred = present or sanitized.dtype == "object"
                sanitize_dtype = None if keep_inferred else sanitized.dtype

                sanitized = sanitize_array(present, None, dtype=sanitize_dtype)
            values = sanitized

    if dtype.categories is None:
        try:
            codes, categories = factorize(values, sort=True)
        except TypeError as err:
            codes, categories = factorize(values, sort=False)
            if dtype.ordered:
                # The data is not sortable, so an ordered categorical needs
                # the caller to spell out the category order explicitly.
                raise TypeError(
                    "'values' is not ordered, please "
                    "explicitly specify the categories order "
                    "by passing in a categories argument."
                ) from err

        # we're inferring from values
        dtype = CategoricalDtype(categories, dtype.ordered)

    elif is_categorical_dtype(values.dtype):
        old_codes = extract_array(values)._codes
        codes = recode_for_categories(
            old_codes, values.dtype.categories, dtype.categories, copy=copy
        )

    else:
        codes = _get_codes_for_values(values, dtype.categories)

    if null_mask.any():
        # Reinsert -1 placeholders for previously removed missing values
        full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
        full_codes[~null_mask] = codes
        codes = full_codes

    dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
    final_codes = coerce_indexer_dtype(codes, dtype.categories)
    super().__init__(final_codes, dtype)
@property
def dtype(self) -> CategoricalDtype:
    """
    Return the :class:`~pandas.api.types.CategoricalDtype` held by this array.
    """
    categorical_dtype = self._dtype
    return categorical_dtype
482 @property
483 def _internal_fill_value(self) -> int:
484 # using the specific numpy integer instead of python int to get
485 # the correct dtype back from _quantile in the all-NA case
486 dtype = self._ndarray.dtype
487 return dtype.type(-1)
489 @property
490 def _constructor(self) -> type[Categorical]:
491 return Categorical
493 @classmethod
494 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
495 return Categorical(scalars, dtype=dtype, copy=copy)
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
    ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
    ...

@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
    ...

def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
    """
    Coerce this type to another dtype

    Parameters
    ----------
    dtype : numpy dtype or pandas type
    copy : bool, default True
        By default, astype always returns a newly allocated object.
        If copy is set to False and dtype is categorical, the original
        object is returned.

    Returns
    -------
    ndarray or ExtensionArray
        The converted values.

    Raises
    ------
    ValueError
        If an integer dtype is requested while missing values are present,
        or if the categories cannot be cast to ``dtype``. In the latter case
        the underlying error is attached as ``__cause__``.
    """
    dtype = pandas_dtype(dtype)
    if self.dtype is dtype:
        result = self.copy() if copy else self

    elif is_categorical_dtype(dtype):
        dtype = cast("Union[str, CategoricalDtype]", dtype)

        # GH 10696/18593/18630
        dtype = self.dtype.update_dtype(dtype)
        source = self.copy() if copy else self
        result = source._set_dtype(dtype)

    elif isinstance(dtype, ExtensionDtype):
        return super().astype(dtype, copy=copy)

    elif is_integer_dtype(dtype) and self.isna().any():
        raise ValueError("Cannot convert float NaN to integer")

    elif len(self.codes) == 0 or len(self.categories) == 0:
        result = np.array(
            self,
            dtype=dtype,
            copy=copy,
        )

    else:
        # GH8628 (PERF): astype category codes instead of astyping array
        new_cats = self.categories._values

        try:
            new_cats = new_cats.astype(dtype=dtype, copy=copy)
            fill_value = self.categories._na_value
            if not is_valid_na_for_dtype(fill_value, dtype):
                fill_value = lib.item_from_zerodim(
                    np.array(self.categories._na_value).astype(dtype)
                )
        except (
            TypeError,  # downstream error msg for CategoricalIndex is misleading
            ValueError,
        ) as err:
            msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
            # Chain explicitly so the original conversion failure stays
            # visible as the direct cause of this error.
            raise ValueError(msg) from err

        result = take_nd(
            new_cats, ensure_platform_int(self._codes), fill_value=fill_value
        )

    return result
def to_list(self):
    """
    Return the result of :meth:`tolist`; provided as an alias.
    """
    as_list = self.tolist()
    return as_list
576 @classmethod
577 def _from_inferred_categories(
578 cls, inferred_categories, inferred_codes, dtype, true_values=None
579 ):
580 """
581 Construct a Categorical from inferred values.
583 For inferred categories (`dtype` is None) the categories are sorted.
584 For explicit `dtype`, the `inferred_categories` are cast to the
585 appropriate type.
587 Parameters
588 ----------
589 inferred_categories : Index
590 inferred_codes : Index
591 dtype : CategoricalDtype or 'category'
592 true_values : list, optional
593 If none are provided, the default ones are
594 "True", "TRUE", and "true."
596 Returns
597 -------
598 Categorical
599 """
600 from pandas import (
601 Index,
602 to_datetime,
603 to_numeric,
604 to_timedelta,
605 )
607 cats = Index(inferred_categories)
608 known_categories = (
609 isinstance(dtype, CategoricalDtype) and dtype.categories is not None
610 )
612 if known_categories:
613 # Convert to a specialized type with `dtype` if specified.
614 if dtype.categories.is_numeric():
615 cats = to_numeric(inferred_categories, errors="coerce")
616 elif is_datetime64_dtype(dtype.categories):
617 cats = to_datetime(inferred_categories, errors="coerce")
618 elif is_timedelta64_dtype(dtype.categories):
619 cats = to_timedelta(inferred_categories, errors="coerce")
620 elif dtype.categories.is_boolean():
621 if true_values is None:
622 true_values = ["True", "TRUE", "true"]
624 # error: Incompatible types in assignment (expression has type
625 # "ndarray", variable has type "Index")
626 cats = cats.isin(true_values) # type: ignore[assignment]
628 if known_categories:
629 # Recode from observation order to dtype.categories order.
630 categories = dtype.categories
631 codes = recode_for_categories(inferred_codes, cats, categories)
632 elif not cats.is_monotonic_increasing:
633 # Sort categories and recode for unknown categories.
634 unsorted = cats.copy()
635 categories = cats.sort_values()
637 codes = recode_for_categories(inferred_codes, unsorted, categories)
638 dtype = CategoricalDtype(categories, ordered=False)
639 else:
640 dtype = CategoricalDtype(cats, ordered=False)
641 codes = inferred_codes
643 return cls(codes, dtype=dtype, fastpath=True)
@classmethod
def from_codes(
    cls, codes, categories=None, ordered=None, dtype: Dtype | None = None
) -> Categorical:
    """
    Make a Categorical type from codes and categories or dtype.

    This constructor is useful if you already have codes and
    categories/dtype and so do not need the (computation intensive)
    factorization step, which is usually done on the constructor.

    If your data does not follow this convention, please use the normal
    constructor.

    Parameters
    ----------
    codes : array-like of int
        An integer array, where each integer points to a category in
        categories or dtype.categories, or else is -1 for NaN.
    categories : index-like, optional
        The categories for the categorical. Items need to be unique.
        If the categories are not given here, then they must be provided
        in `dtype`.
    ordered : bool, optional
        Whether or not this categorical is treated as an ordered
        categorical. If not given here or in `dtype`, the resulting
        categorical will be unordered.
    dtype : CategoricalDtype or "category", optional
        If :class:`CategoricalDtype`, cannot be used together with
        `categories` or `ordered`.

    Returns
    -------
    Categorical

    Raises
    ------
    ValueError
        If no categories are available, if ``codes`` contains NA values or
        is not integer-like, or if any code lies outside
        ``[-1, len(categories) - 1]``.

    Examples
    --------
    >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
    >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
    ['a', 'b', 'a', 'b']
    Categories (2, object): ['a' < 'b']
    """
    dtype = CategoricalDtype._from_values_or_dtype(
        categories=categories, ordered=ordered, dtype=dtype
    )
    if dtype.categories is None:
        raise ValueError(
            "The categories must be provided in 'categories' or "
            "'dtype'.  Both were None."
        )

    if is_extension_array_dtype(codes) and is_integer_dtype(codes):
        # Avoid the implicit conversion of Int to object
        if isna(codes).any():
            raise ValueError("codes cannot contain NA values")
        codes = codes.to_numpy(dtype=np.int64)
    else:
        codes = np.asarray(codes)

    has_codes = len(codes)
    if has_codes and not is_integer_dtype(codes):
        raise ValueError("codes need to be array-like integers")

    # Every code must either be -1 (missing) or index into the categories.
    if has_codes and (codes.max() >= len(dtype.categories) or codes.min() < -1):
        raise ValueError("codes need to be between -1 and len(categories)-1")

    return cls(codes, dtype=dtype, fastpath=True)
712 # ------------------------------------------------------------------
713 # Categories/Codes/Ordered
@property
def categories(self) -> Index:
    """
    The categories of this categorical.

    Setting assigns new values to each category (effectively a rename of
    each individual category).

    The assigned value has to be a list-like object. All items must be
    unique and the number of items in the new categories must be the same
    as the number of items in the old categories.

    Assigning to `categories` is an inplace operation!

    Raises
    ------
    ValueError
        If the new categories do not validate as categories or if the
        number of new categories is unequal to the number of old categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    current_dtype = self.dtype
    return current_dtype.categories

@categories.setter
def categories(self, categories) -> None:
    # In-place assignment is deprecated: warn first, then delegate to
    # _set_categories with the assigned value.
    deprecation_message = (
        "Setting categories in-place is deprecated and will raise in a "
        "future version. Use rename_categories instead."
    )
    warn(
        deprecation_message,
        FutureWarning,
        stacklevel=find_stack_level(),
    )

    self._set_categories(categories)
@property
def ordered(self) -> Ordered:
    """
    Whether the categories have an ordered relationship.
    """
    current_dtype = self.dtype
    return current_dtype.ordered
@property
def codes(self) -> np.ndarray:
    """
    The category codes of this categorical.

    Codes are an array of integers which are the positions of the actual
    values in the categories array.

    There is no setter, use the other categorical methods and the normal item
    setter to change values in the categorical.

    Returns
    -------
    ndarray[int]
        A non-writable view of the `codes` array.
    """
    # Hand out a view so that marking it read-only leaves ``_codes`` writable.
    readonly_view = self._codes.view()
    readonly_view.setflags(write=False)
    return readonly_view
def _set_categories(self, categories, fastpath=False):
    """
    Sets new categories inplace

    Parameters
    ----------
    categories : Index-like
        The categories to install on this object.
    fastpath : bool, default False
        Don't perform validation of the categories for uniqueness or nulls

    Raises
    ------
    ValueError
        If ``fastpath`` is False and the number of new categories differs
        from the number of existing categories.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'b'])
    >>> c
    ['a', 'b']
    Categories (2, object): ['a', 'b']

    >>> c._set_categories(pd.Index(['a', 'c']))
    >>> c
    ['a', 'c']
    Categories (2, object): ['a', 'c']
    """
    if fastpath:
        new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
    else:
        new_dtype = CategoricalDtype(categories, ordered=self.ordered)
        # The length check is only done on the validated (non-fastpath) route.
        if self.dtype.categories is not None and len(new_dtype.categories) != len(
            self.dtype.categories
        ):
            raise ValueError(
                "new categories need to have the same number of "
                "items as the old categories!"
            )

    # Re-initialise in place: same codes, new dtype.
    super().__init__(self._ndarray, new_dtype)
821 def _set_dtype(self, dtype: CategoricalDtype) -> Categorical:
822 """
823 Internal method for directly updating the CategoricalDtype
825 Parameters
826 ----------
827 dtype : CategoricalDtype
829 Notes
830 -----
831 We don't do any validation here. It's assumed that the dtype is
832 a (valid) instance of `CategoricalDtype`.
833 """
834 codes = recode_for_categories(self.codes, self.categories, dtype.categories)
835 return type(self)(codes, dtype=dtype, fastpath=True)
@overload
def set_ordered(
    self, value, *, inplace: NoDefault | Literal[False] = ...
) -> Categorical:
    ...

@overload
def set_ordered(self, value, *, inplace: Literal[True]) -> None:
    ...

@overload
def set_ordered(self, value, *, inplace: bool) -> Categorical | None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"])
def set_ordered(
    self, value, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Set the ordered attribute to the boolean value.

    Parameters
    ----------
    value : bool
        Set whether this categorical is ordered (True) or not (False).
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to the value.

        .. deprecated:: 1.5.0

    Returns
    -------
    Categorical or None
        The updated copy, or None if ``inplace=True``.
    """
    if inplace is no_default:
        inplace = False
    else:
        # Any explicitly passed value, including False, is deprecated.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "set_ordered is deprecated and will be removed in "
            "a future version. setting ordered-ness on categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    updated_dtype = CategoricalDtype(self.categories, ordered=value)
    target = self if inplace else self.copy()
    # Re-initialise the target with its own codes and the new dtype.
    NDArrayBacked.__init__(target, target._ndarray, updated_dtype)
    return None if inplace else target
@overload
def as_ordered(self, *, inplace: NoDefault | Literal[False] = ...) -> Categorical:
    ...

@overload
def as_ordered(self, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def as_ordered(self, inplace: bool | NoDefault = no_default) -> Categorical | None:
    """
    Set the Categorical to be ordered.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to True.

        .. deprecated:: 1.5.0

    Returns
    -------
    Categorical or None
        Ordered Categorical or None if ``inplace=True``.
    """
    # Only validate a value the caller actually passed; the sentinel is
    # forwarded untouched to set_ordered.
    flag = inplace if inplace is no_default else validate_bool_kwarg(inplace, "inplace")
    return self.set_ordered(True, inplace=flag)
@overload
def as_unordered(self, *, inplace: NoDefault | Literal[False] = ...) -> Categorical:
    ...

@overload
def as_unordered(self, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def as_unordered(
    self, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Set the Categorical to be unordered.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to set the ordered attribute in-place or return
        a copy of this categorical with ordered set to False.

        .. deprecated:: 1.5.0

    Returns
    -------
    Categorical or None
        Unordered Categorical or None if ``inplace=True``.
    """
    # Only validate a value the caller actually passed; the sentinel is
    # forwarded untouched to set_ordered.
    flag = inplace if inplace is no_default else validate_bool_kwarg(inplace, "inplace")
    return self.set_ordered(False, inplace=flag)
def set_categories(
    self, new_categories, ordered=None, rename=False, inplace=no_default
):
    """
    Set the categories to the specified new_categories.

    `new_categories` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to NaN). If `rename==True`, the categories will simply be renamed
    (less or more items than in old categories will result in values set to
    NaN or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder), which
    can result in surprising changes, for example when using special string
    dtypes, which do not consider a S1 string equal to a single char
    python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given (None), the ordered information is left unchanged.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.
    inplace : bool, default False
        Whether or not to reorder the categories in-place or return a copy
        of this categorical with reordered categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    Categorical with reordered categories or None if inplace.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    """
    if inplace is no_default:
        inplace = False
    else:
        # Any explicitly passed value, including False, is deprecated.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "set_categories is deprecated and will be removed in "
            "a future version. Removing unused categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    if ordered is None:
        ordered = self.dtype.ordered
    new_dtype = CategoricalDtype(new_categories, ordered=ordered)

    cat = self if inplace else self.copy()
    if rename:
        previous = cat.dtype.categories
        if previous is not None and len(new_dtype.categories) < len(previous):
            # Codes that no longer have a category to point at become -1/NaN.
            cat._codes[cat._codes >= len(new_dtype.categories)] = -1
        codes = cat._codes
    else:
        codes = recode_for_categories(
            cat.codes, cat.categories, new_dtype.categories
        )
    NDArrayBacked.__init__(cat, codes, new_dtype)

    return None if inplace else cat
@overload
def rename_categories(
    self, new_categories, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...


@overload
def rename_categories(self, new_categories, *, inplace: Literal[True]) -> None:
    ...


@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["self", "new_categories"]
)
def rename_categories(
    self, new_categories, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from
          old categories to new. Categories not contained in the mapping
          are passed through and extra categories in the mapping are
          ignored.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

    inplace : bool, default False
        Whether or not to rename the categories inplace or return a copy of
        this categorical with renamed categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with renamed categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    ['A', 'A', 'b']
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> c.rename_categories(lambda x: x.upper())
    ['A', 'A', 'B']
    Categories (2, object): ['A', 'B']
    """
    if inplace is not no_default:
        # Any explicit value for ``inplace`` (True or False) is deprecated.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "rename_categories is deprecated and will be removed in "
            "a future version. Renaming categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        inplace = False

    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # Normalise dict-like / callable inputs to a plain list that lines up
    # one-to-one with the existing categories.
    if is_dict_like(new_categories):
        new_categories = [new_categories.get(item, item) for item in cat.categories]
    elif callable(new_categories):
        new_categories = [new_categories(item) for item in cat.categories]

    cat._set_categories(new_categories)
    if not inplace:
        return cat
    return None
def reorder_categories(self, new_categories, ordered=None, inplace=no_default):
    """
    Reorder categories as specified in new_categories.

    `new_categories` must contain exactly the existing categories: every
    old category has to be present and no new category may be introduced.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    inplace : bool, default False
        Whether or not to reorder the categories inplace or return a copy of
        this categorical with reordered categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with reordered categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    if inplace is no_default:
        inplace = False
    else:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "reorder_categories is deprecated and will be removed in "
            "a future version. Reordering categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")

    existing = set(self.dtype.categories)
    requested = set(new_categories)
    if existing != requested:
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )

    # set_categories emits its own deprecation warning for ``inplace``;
    # silence it so the caller only sees the one raised above.
    with catch_warnings():
        simplefilter("ignore")
        return self.set_categories(new_categories, ordered=ordered, inplace=inplace)
@overload
def add_categories(
    self, new_categories, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...


@overload
def add_categories(self, new_categories, *, inplace: Literal[True]) -> None:
    ...


@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["self", "new_categories"]
)
def add_categories(
    self, new_categories, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Add new categories.

    `new_categories` will be included at the last/highest place in the
    categories and will be unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.
    inplace : bool, default False
        Whether or not to add the categories inplace or return a copy of
        this categorical with added categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with new categories added or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """
    if inplace is not no_default:
        # Any explicit value for ``inplace`` (True or False) is deprecated.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "add_categories is deprecated and will be removed in "
            "a future version. Adding categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        inplace = False

    inplace = validate_bool_kwarg(inplace, "inplace")
    if not is_list_like(new_categories):
        # a single scalar category is accepted as shorthand for [scalar]
        new_categories = [new_categories]
    already_included = set(new_categories) & set(self.dtype.categories)
    if len(already_included) != 0:
        raise ValueError(
            f"new categories must not include old categories: {already_included}"
        )
    new_categories = list(self.dtype.categories) + list(new_categories)
    new_dtype = CategoricalDtype(new_categories, self.ordered)

    cat = self if inplace else self.copy()
    # Existing codes stay valid because the old categories keep their
    # positions; they are passed through coerce_indexer_dtype together with
    # the enlarged category set before being installed on ``cat``.
    codes = coerce_indexer_dtype(cat._ndarray, new_dtype.categories)
    NDArrayBacked.__init__(cat, codes, new_dtype)
    if not inplace:
        return cat
    return None
def remove_categories(self, removals, inplace=no_default):
    """
    Remove the specified categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.
    inplace : bool, default False
        Whether or not to remove the categories inplace or return a copy of
        this categorical with removed categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with removed categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    if inplace is not no_default:
        # Any explicit value for ``inplace`` (True or False) is deprecated.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "remove_categories is deprecated and will be removed in "
            "a future version. Removing categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        inplace = False

    inplace = validate_bool_kwarg(inplace, "inplace")
    if not is_list_like(removals):
        # a single scalar category is accepted as shorthand for [scalar]
        removals = [removals]

    removal_set = set(removals)
    not_included = removal_set - set(self.dtype.categories)
    new_categories = [c for c in self.dtype.categories if c not in removal_set]

    # GH 10156
    # When NA is among the removals it is not reported as a missing
    # category, and NA entries are dropped from the remaining categories.
    if any(isna(removals)):
        not_included = {x for x in not_included if notna(x)}
        new_categories = [x for x in new_categories if notna(x)]

    if len(not_included) != 0:
        raise ValueError(f"removals must all be in old categories: {not_included}")

    # set_categories emits its own deprecation warning for ``inplace``;
    # silence it so the caller only sees the one raised above.
    with catch_warnings():
        simplefilter("ignore")
        return self.set_categories(
            new_categories, ordered=self.ordered, rename=False, inplace=inplace
        )
@overload
def remove_unused_categories(
    self, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...


@overload
def remove_unused_categories(self, *, inplace: Literal[True]) -> None:
    ...


@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def remove_unused_categories(
    self, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Remove categories which are not used.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to drop unused categories inplace or return a copy of
        this categorical with unused categories dropped.

        .. deprecated:: 1.2.0

    Returns
    -------
    cat : Categorical or None
        Categorical with unused categories dropped or None if ``inplace=True``.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    if inplace is no_default:
        inplace = False
    else:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "remove_unused_categories is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # ``used`` holds the sorted distinct codes, ``remapped`` the position of
    # each original code inside ``used``.
    used, remapped = np.unique(cat._codes, return_inverse=True)

    starts_with_na = used.size != 0 and used[0] == -1  # na sentinel
    if starts_with_na:
        # drop the sentinel and shift positions so NA maps back to -1
        used = used[1:]
        remapped = remapped - 1

    kept_categories = cat.dtype.categories.take(used)
    new_dtype = CategoricalDtype._from_fastpath(kept_categories, ordered=self.ordered)
    new_codes = coerce_indexer_dtype(remapped, new_dtype.categories)
    NDArrayBacked.__init__(cat, new_codes, new_dtype)
    if not inplace:
        return cat
    return None
1462 # ------------------------------------------------------------------
def map(self, mapper):
    """
    Map categories using an input mapping or function.

    The mapper is applied to the categories, not to the individual values.
    If the mapping correspondence is one-to-one the result is a
    :class:`~pandas.Categorical` which has the same order property as the
    original, otherwise a :class:`~pandas.Index` is returned. NaN values
    are unaffected.

    If a `dict` or :class:`~pandas.Series` is used any unmapped category is
    mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
    will be returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.Categorical or pandas.Index
        Mapped categorical.

    See Also
    --------
    CategoricalIndex.map : Apply a mapping correspondence on a
        :class:`~pandas.CategoricalIndex`.
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> cat.map(lambda x: x.upper())
    ['A', 'B', 'C']
    Categories (3, object): ['A', 'B', 'C']
    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
    ['first', 'second', 'third']
    Categories (3, object): ['first', 'second', 'third']

    If the mapping is one-to-one the ordering of the categories is
    preserved:

    >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a' < 'b' < 'c']
    >>> cat.map({'a': 3, 'b': 2, 'c': 1})
    [3, 2, 1]
    Categories (3, int64): [3 < 2 < 1]

    If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    If a `dict` is used, all unmapped categories are mapped to `NaN` and
    the result is an :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    mapped = self.categories.map(mapper)
    try:
        return self.from_codes(
            self._codes.copy(), categories=mapped, ordered=self.ordered
        )
    except ValueError:
        # The mapped values could not be used as categories; fall back to
        # materialising the values.
        # NA values are represented in self._codes with -1, and np.take
        # with -1 picks the final element, so append NaN as that element.
        has_missing = np.any(self._codes == -1)
        if has_missing:
            mapped = mapped.insert(len(mapped), np.nan)
        return np.take(mapped, self._codes)
# Rich comparison dunders: each one is built by handing the matching
# ``operator`` function to the ``_cat_compare_op`` factory.
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)
1552 # -------------------------------------------------------------
1553 # Validators; ideally these can be de-duplicated
1555 def _validate_setitem_value(self, value):
1556 if not is_hashable(value):
1557 # wrap scalars and hashable-listlikes in list
1558 return self._validate_listlike(value)
1559 else:
1560 return self._validate_scalar(value)
1562 _validate_searchsorted_value = _validate_setitem_value
1564 def _validate_scalar(self, fill_value):
1565 """
1566 Convert a user-facing fill_value to a representation to use with our
1567 underlying ndarray, raising TypeError if this is not possible.
1569 Parameters
1570 ----------
1571 fill_value : object
1573 Returns
1574 -------
1575 fill_value : int
1577 Raises
1578 ------
1579 TypeError
1580 """
1582 if is_valid_na_for_dtype(fill_value, self.categories.dtype):
1583 fill_value = -1
1584 elif fill_value in self.categories:
1585 fill_value = self._unbox_scalar(fill_value)
1586 else:
1587 raise TypeError(
1588 "Cannot setitem on a Categorical with a new "
1589 f"category ({fill_value}), set the categories first"
1590 ) from None
1591 return fill_value
1593 # -------------------------------------------------------------
1595 @ravel_compat
1596 def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
1597 """
1598 The numpy array interface.
1600 Returns
1601 -------
1602 numpy.array
1603 A numpy array of either the specified dtype or,
1604 if dtype==None (default), the same dtype as
1605 categorical.categories.dtype.
1606 """
1607 ret = take_nd(self.categories._values, self._codes)
1608 if dtype and not is_dtype_equal(dtype, self.categories.dtype):
1609 return np.asarray(ret, dtype)
1610 # When we're a Categorical[ExtensionArray], like Interval,
1611 # we need to ensure __array__ gets all the way to an
1612 # ndarray.
1613 return np.asarray(ret)
1615 def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
1616 # for binary ops, use our custom dunder methods
1617 result = ops.maybe_dispatch_ufunc_to_dunder_op(
1618 self, ufunc, method, *inputs, **kwargs
1619 )
1620 if result is not NotImplemented:
1621 return result
1623 if "out" in kwargs:
1624 # e.g. test_numpy_ufuncs_out
1625 return arraylike.dispatch_ufunc_with_out(
1626 self, ufunc, method, *inputs, **kwargs
1627 )
1629 if method == "reduce":
1630 # e.g. TestCategoricalAnalytics::test_min_max_ordered
1631 result = arraylike.dispatch_reduction_ufunc(
1632 self, ufunc, method, *inputs, **kwargs
1633 )
1634 if result is not NotImplemented:
1635 return result
1637 # for all other cases, raise for now (similarly as what happens in
1638 # Series.__array_prepare__)
1639 raise TypeError(
1640 f"Object with dtype {self.dtype} cannot perform "
1641 f"the numpy op {ufunc.__name__}"
1642 )
def __setstate__(self, state) -> None:
    """
    Restore pickled state.

    Non-dict state is handed straight to the parent class. Dict state is
    first upgraded in place: a missing ``_dtype`` is rebuilt from
    ``_categories`` / ``_ordered``, and ``_codes`` is renamed to
    ``_ndarray`` when the latter is absent.
    """
    if not isinstance(state, dict):
        return super().__setstate__(state)

    if "_dtype" not in state:
        categories = state["_categories"]
        ordered = state["_ordered"]
        state["_dtype"] = CategoricalDtype(categories, ordered)

    # backward compat, changed what is property vs attribute
    has_legacy_codes = "_ndarray" not in state and "_codes" in state
    if has_legacy_codes:
        state["_ndarray"] = state.pop("_codes")

    super().__setstate__(state)
@property
def nbytes(self) -> int:
    """Sum of the bytes used by the codes and by the categories' values."""
    codes_size = self._codes.nbytes
    categories_size = self.dtype.categories.values.nbytes
    return codes_size + categories_size
def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.memory_usage(deep=deep)
    return codes_bytes + categories_bytes
def isna(self) -> np.ndarray:
    """
    Detect missing values

    Missing values (-1 in .codes) are detected.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    missing_code = -1
    return self._codes == missing_code


isnull = isna
def notna(self) -> np.ndarray:
    """
    Inverse of isna

    An entry is reported as not-null exactly when ``isna`` reports it as
    not missing.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing


notnull = notna
def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    categories = self.categories
    n_categories = len(categories)
    valid = codes >= 0
    no_missing = valid.all()
    positions = np.arange(n_categories)

    if dropna or no_missing:
        observed = codes if no_missing else codes[valid]
        counts = np.bincount(observed, minlength=n_categories or 0)
    else:
        # count missing entries in an extra bucket past the last category
        # and label that bucket with the -1 sentinel
        counts = np.bincount(np.where(valid, codes, n_categories))
        positions = np.append(positions, -1)

    positions = coerce_indexer_dtype(positions, self.dtype.categories)
    labels = self._from_backing_data(positions)

    return Series(counts, index=CategoricalIndex(labels), dtype="int64")
1770 # error: Argument 2 of "_empty" is incompatible with supertype
1771 # "NDArrayBackedExtensionArray"; supertype defines the argument type as
1772 # "ExtensionDtype"
1773 @classmethod
1774 def _empty( # type: ignore[override]
1775 cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
1776 ) -> Categorical:
1777 """
1778 Analogous to np.empty(shape, dtype=dtype)
1780 Parameters
1781 ----------
1782 shape : tuple[int]
1783 dtype : CategoricalDtype
1784 """
1785 arr = cls._from_sequence([], dtype=dtype)
1787 # We have to use np.zeros instead of np.empty otherwise the resulting
1788 # ndarray may contain codes not supported by this dtype, in which
1789 # case repr(result) could segfault.
1790 backing = np.zeros(shape, dtype=arr._ndarray.dtype)
1792 return arr._from_backing_data(backing)
1794 def _internal_get_values(self):
1795 """
1796 Return the values.
1798 For internal compatibility with pandas formatting.
1800 Returns
1801 -------
1802 np.ndarray or Index
1803 A numpy array of the same dtype as categorical.categories.dtype or
1804 Index if datetime / periods.
1805 """
1806 # if we are a datetime and period index, return Index to keep metadata
1807 if needs_i8_conversion(self.categories.dtype):
1808 return self.categories.take(self._codes, fill_value=NaT)
1809 elif is_integer_dtype(self.categories) and -1 in self._codes:
1810 return self.categories.astype("object").take(self._codes, fill_value=np.nan)
1811 return np.array(self)
def check_for_ordered(self, op) -> None:
    """
    Assert that we are ordered.

    Raises
    ------
    TypeError
        If this Categorical is not ordered; ``op`` is named in the message.
    """
    if self.ordered:
        return
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )
@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def argsort(self, ascending=True, kind="quicksort", **kwargs):
    """
    Return the indices that would sort the Categorical.

    .. versionchanged:: 0.25.0

       Changed to sort missing values at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # The parent implementation does the work; this override exists for the
    # Categorical-specific docstring and the keyword-only deprecation.
    parent = super()
    return parent.argsort(ascending=ascending, kind=kind, **kwargs)
@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Categorical:
    ...


@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...


@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def sort_values(
    self, inplace: bool = False, ascending: bool = True, na_position: str = "last"
) -> Categorical | None:
    """
    Sort the Categorical by category value returning a new
    Categorical by default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'first' nor 'last'.

    See Also
    --------
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    Inplace sorting can be done as well:

    >>> c.sort_values(inplace=True)
    >>> c
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2, 2, NaN, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered_codes = self._codes[order]

    if inplace:
        # overwrite the existing backing array rather than rebinding it
        self._codes[:] = reordered_codes
        return None
    return self._from_backing_data(reordered_codes)
1981 def _rank(
1982 self,
1983 *,
1984 axis: int = 0,
1985 method: str = "average",
1986 na_option: str = "keep",
1987 ascending: bool = True,
1988 pct: bool = False,
1989 ):
1990 """
1991 See Series.rank.__doc__.
1992 """
1993 if axis != 0:
1994 raise NotImplementedError
1995 vff = self._values_for_rank()
1996 return algorithms.rank(
1997 vff,
1998 axis=axis,
1999 method=method,
2000 na_option=na_option,
2001 ascending=ascending,
2002 pct=pct,
2003 )
2005 def _values_for_rank(self):
2006 """
2007 For correctly ranking ordered categorical data. See GH#15420
2009 Ordered categorical data should be ranked on the basis of
2010 codes with -1 translated to NaN.
2012 Returns
2013 -------
2014 numpy.array
2016 """
2017 from pandas import Series
2019 if self.ordered:
2020 values = self.codes
2021 mask = values == -1
2022 if mask.any():
2023 values = values.astype("float64")
2024 values[mask] = np.nan
2025 elif self.categories.is_numeric():
2026 values = np.array(self)
2027 else:
2028 # reorder the categories (so rank can use the float codes)
2029 # instead of passing an object array to rank
2030 values = np.array(
2031 self.rename_categories(Series(self.categories).rank().values)
2032 )
2033 return values
def to_dense(self) -> np.ndarray:
    """
    Return my 'dense' representation

    For internal compatibility with numpy arrays.

    .. deprecated::
        Emits a FutureWarning; use ``np.asarray(cat)`` instead.

    Returns
    -------
    dense : array
    """
    message = (
        "Categorical.to_dense is deprecated and will be removed in "
        "a future version.  Use np.asarray(cat) instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    return np.asarray(self)
2053 # ------------------------------------------------------------------
2054 # NDArrayBackedExtensionArray compat
2056 @property
2057 def _codes(self) -> np.ndarray:
2058 return self._ndarray
2060 @_codes.setter
2061 def _codes(self, value: np.ndarray):
2062 warn(
2063 "Setting the codes on a Categorical is deprecated and will raise in "
2064 "a future version. Create a new Categorical object instead",
2065 FutureWarning,
2066 stacklevel=find_stack_level(),
2067 ) # GH#40606
2068 NDArrayBacked.__init__(self, value, self.dtype)
2070 def _box_func(self, i: int):
2071 if i == -1:
2072 return np.NaN
2073 return self.categories[i]
2075 def _unbox_scalar(self, key) -> int:
2076 # searchsorted is very performance sensitive. By converting codes
2077 # to same dtype as self.codes, we get much faster performance.
2078 code = self.categories.get_loc(key)
2079 code = self._ndarray.dtype.type(code)
2080 return code
2082 # ------------------------------------------------------------------
def take_nd(
    self, indexer, allow_fill: bool = False, fill_value=None
) -> Categorical:
    """
    Deprecated alias of ``take``.

    Emits a FutureWarning and forwards all arguments to ``self.take``.
    """
    # GH#27745 deprecate alias that other EAs dont have
    message = "Categorical.take_nd is deprecated, use Categorical.take instead"
    warn(message, FutureWarning, stacklevel=find_stack_level())
    return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value)
2095 def __iter__(self):
2096 """
2097 Returns an Iterator over the values of this Categorical.
2098 """
2099 if self.ndim == 1:
2100 return iter(self._internal_get_values().tolist())
2101 else:
2102 return (self[n] for n in range(len(self)))
2104 def __contains__(self, key) -> bool:
2105 """
2106 Returns True if `key` is in this Categorical.
2107 """
2108 # if key is a NaN, check if any NaN is in self.
2109 if is_valid_na_for_dtype(key, self.categories.dtype):
2110 return bool(self.isna().any())
2112 return contains(self, key, container=self._codes)
2114 # ------------------------------------------------------------------
2115 # Rendering Methods
2117 def _formatter(self, boxed: bool = False):
2118 # Defer to CategoricalFormatter's formatter.
2119 return None
2121 def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str:
2122 """
2123 a short repr displaying only max_vals and an optional (but default
2124 footer)
2125 """
2126 num = max_vals // 2
2127 head = self[:num]._get_repr(length=False, footer=False)
2128 tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False)
2130 result = f"{head[:-1]}, ..., {tail[1:]}"
2131 if footer:
2132 result = f"{result}\n{self._repr_footer()}"
2134 return str(result)
2136 def _repr_categories(self) -> list[str]:
2137 """
2138 return the base repr for the categories
2139 """
2140 max_categories = (
2141 10
2142 if get_option("display.max_categories") == 0
2143 else get_option("display.max_categories")
2144 )
2145 from pandas.io.formats import format as fmt
2147 format_array = partial(
2148 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
2149 )
2150 if len(self.categories) > max_categories:
2151 num = max_categories // 2
2152 head = format_array(self.categories[:num])
2153 tail = format_array(self.categories[-num:])
2154 category_strs = head + ["..."] + tail
2155 else:
2156 category_strs = format_array(self.categories)
2158 # Strip all leading spaces, which format_array adds for columns...
2159 category_strs = [x.strip() for x in category_strs]
2160 return category_strs
2162 def _repr_categories_info(self) -> str:
2163 """
2164 Returns a string representation of the footer.
2165 """
2166 category_strs = self._repr_categories()
2167 dtype = str(self.categories.dtype)
2168 levheader = f"Categories ({len(self.categories)}, {dtype}): "
2169 width, height = get_terminal_size()
2170 max_width = get_option("display.width") or width
2171 if console.in_ipython_frontend():
2172 # 0 = no breaks
2173 max_width = 0
2174 levstring = ""
2175 start = True
2176 cur_col_len = len(levheader) # header
2177 sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
2178 linesep = sep.rstrip() + "\n" # remove whitespace
2179 for val in category_strs:
2180 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
2181 levstring += linesep + (" " * (len(levheader) + 1))
2182 cur_col_len = len(levheader) + 1 # header + a whitespace
2183 elif not start:
2184 levstring += sep
2185 cur_col_len += len(val)
2186 levstring += val
2187 start = False
2188 # replace to simple save space by
2189 return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"
2191 def _repr_footer(self) -> str:
2192 info = self._repr_categories_info()
2193 return f"Length: {len(self)}\n{info}"
2195 def _get_repr(self, length: bool = True, na_rep="NaN", footer: bool = True) -> str:
2196 from pandas.io.formats import format as fmt
2198 formatter = fmt.CategoricalFormatter(
2199 self, length=length, na_rep=na_rep, footer=footer
2200 )
2201 result = formatter.to_string()
2202 return str(result)
2204 def __repr__(self) -> str:
2205 """
2206 String representation.
2207 """
2208 _maxlen = 10
2209 if len(self._codes) > _maxlen:
2210 result = self._tidy_repr(_maxlen)
2211 elif len(self._codes) > 0:
2212 result = self._get_repr(length=len(self) > _maxlen)
2213 else:
2214 msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
2215 result = f"[], {msg}"
2217 return result
2219 # ------------------------------------------------------------------
def _validate_listlike(self, value):
    """
    Convert a list-like ``value`` into codes for this Categorical.

    Parameters
    ----------
    value : list-like
        Values to be set; may itself be a Categorical.

    Returns
    -------
    np.ndarray
        Codes for ``value``.  For non-Categorical input they are looked up
        in ``self.categories`` and cast to the dtype of ``self._ndarray``.

    Raises
    ------
    TypeError
        If ``value`` is a Categorical whose dtype is not equal to
        ``self.dtype``, or if it contains non-null values that are not
        among ``self.categories``.
    """
    # NB: here we assume scalar-like tuples have already been excluded
    value = extract_array(value, extract_numpy=True)

    # require identical categories set
    if isinstance(value, Categorical):
        if not is_dtype_equal(self.dtype, value.dtype):
            raise TypeError(
                "Cannot set a Categorical with another, "
                "without identical categories"
            )
        # is_dtype_equal implies categories_match_up_to_permutation
        value = self._encode_with_my_categories(value)
        return value._codes

    from pandas import Index

    # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
    # to_add holds the entries of value that are not existing categories
    to_add = Index._with_infer(value, tupleize_cols=False).difference(
        self.categories
    )

    # no assignments of values not in categories, but it's always ok to set
    # something to np.nan
    if len(to_add) and not isna(to_add).all():
        raise TypeError(
            "Cannot setitem on a Categorical with a new "
            "category, set the categories first"
        )

    codes = self.categories.get_indexer(value)
    return codes.astype(self._ndarray.dtype, copy=False)
2254 def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
2255 """
2256 Compute the inverse of a categorical, returning
2257 a dict of categories -> indexers.
2259 *This is an internal function*
2261 Returns
2262 -------
2263 Dict[Hashable, np.ndarray[np.intp]]
2264 dict of categories -> indexers
2266 Examples
2267 --------
2268 >>> c = pd.Categorical(list('aabca'))
2269 >>> c
2270 ['a', 'a', 'b', 'c', 'a']
2271 Categories (3, object): ['a', 'b', 'c']
2272 >>> c.categories
2273 Index(['a', 'b', 'c'], dtype='object')
2274 >>> c.codes
2275 array([0, 0, 1, 2, 0], dtype=int8)
2276 >>> c._reverse_indexer()
2277 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
2279 """
2280 categories = self.categories
2281 r, counts = libalgos.groupsort_indexer(
2282 ensure_platform_int(self.codes), categories.size
2283 )
2284 counts = ensure_int64(counts).cumsum()
2285 _result = (r[start:end] for start, end in zip(counts, counts[1:]))
2286 return dict(zip(categories, _result))
2288 # ------------------------------------------------------------------
2289 # Reductions
@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def min(self, *, skipna=True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    # code -1 marks a missing value
    valid = codes != -1
    if valid.all():
        pointer = codes.min()
    elif skipna and valid.any():
        pointer = codes[valid].min()
    else:
        # missing values present and either not skipped or nothing else left
        return np.nan
    return self._wrap_reduction_result(None, pointer)
@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def max(self, *, skipna=True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    # code -1 marks a missing value
    valid = codes != -1
    if valid.all():
        pointer = codes.max()
    elif skipna and valid.any():
        pointer = codes[valid].max()
    else:
        # missing values present and either not skipped or nothing else left
        return np.nan
    return self._wrap_reduction_result(None, pointer)
def mode(self, dropna: bool = True) -> Categorical:
    """
    Returns the mode(s) of the Categorical.

    Deprecated: emits a ``FutureWarning`` and delegates to ``self._mode``.
    Always returns `Categorical` even if only one value.

    Parameters
    ----------
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

    Returns
    -------
    modes : `Categorical` (sorted)
    """
    message = (
        "Categorical.mode is deprecated and will be removed in a future version. "
        "Use Series.mode instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    return self._mode(dropna=dropna)
2388 def _mode(self, dropna: bool = True) -> Categorical:
2389 codes = self._codes
2390 mask = None
2391 if dropna:
2392 mask = self.isna()
2394 res_codes = algorithms.mode(codes, mask=mask)
2395 res_codes = cast(np.ndarray, res_codes)
2396 assert res_codes.dtype == codes.dtype
2397 res = self._from_backing_data(res_codes)
2398 return res
2400 # ------------------------------------------------------------------
2401 # ExtensionArray Interface
def unique(self):
    """
    Return a ``Categorical`` holding each distinct value once.

    The result is built from the unique codes, so the categories
    themselves are not changed.

    .. versionchanged:: 1.3.0

        Previously, unused categories were dropped from the new categories.

    Returns
    -------
    Categorical

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.Categorical(list("baabc")).unique()
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
    ['b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']
    """
    distinct_codes = unique1d(self.codes)
    return self._from_backing_data(distinct_codes)
2434 def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
2435 # make sure we have correct itemsize for resulting codes
2436 assert res_values.dtype == self._ndarray.dtype
2437 return res_values
def equals(self, other: object) -> bool:
    """
    Returns True if categorical arrays are equal.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
        False when ``other`` is not a Categorical or its categories do not
        match; otherwise whether the codes agree after ``other`` is
        re-encoded with this array's categories.
    """
    comparable = isinstance(
        other, Categorical
    ) and self._categories_match_up_to_permutation(other)
    if not comparable:
        return False
    recoded = self._encode_with_my_categories(other)
    return np.array_equal(self._codes, recoded._codes)
2458 @classmethod
2459 def _concat_same_type(
2460 cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0
2461 ) -> CategoricalT:
2462 from pandas.core.dtypes.concat import union_categoricals
2464 first = to_concat[0]
2465 if axis >= first.ndim:
2466 raise ValueError(
2467 f"axis {axis} is out of bounds for array of dimension {first.ndim}"
2468 )
2470 if axis == 1:
2471 # Flatten, concatenate then reshape
2472 if not all(x.ndim == 2 for x in to_concat):
2473 raise ValueError
2475 # pass correctly-shaped to union_categoricals
2476 tc_flat = []
2477 for obj in to_concat:
2478 tc_flat.extend([obj[:, i] for i in range(obj.shape[1])])
2480 res_flat = cls._concat_same_type(tc_flat, axis=0)
2482 result = res_flat.reshape(len(first), -1, order="F")
2483 return result
2485 result = union_categoricals(to_concat)
2486 return result
2488 # ------------------------------------------------------------------
def _encode_with_my_categories(self, other: Categorical) -> Categorical:
    """
    Express ``other`` in terms of this Categorical's categories.

    Notes
    -----
    The caller is expected to have already checked
    self._categories_match_up_to_permutation(other).
    """
    # Recoding is done on the codes; with copy=False the existing codes
    # may be reused when the categories are already the same.
    recoded = recode_for_categories(
        other.codes,
        other.categories,
        self.categories,
        copy=False,
    )
    return self._from_backing_data(recoded)
2507 def _categories_match_up_to_permutation(self, other: Categorical) -> bool:
2508 """
2509 Returns True if categoricals are the same dtype
2510 same categories, and same ordered
2512 Parameters
2513 ----------
2514 other : Categorical
2516 Returns
2517 -------
2518 bool
2519 """
2520 return hash(self.dtype) == hash(other.dtype)
def is_dtype_equal(self, other) -> bool:
    """
    Deprecated check of whether ``other`` has a matching categorical dtype.

    Emits a ``FutureWarning``.  Returns False when the comparison raises
    ``AttributeError`` or ``TypeError``.
    """
    message = (
        "Categorical.is_dtype_equal is deprecated and will be removed "
        "in a future version"
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    try:
        matched = self._categories_match_up_to_permutation(other)
    except (AttributeError, TypeError):
        return False
    return matched
def describe(self) -> DataFrame:
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category.
    """
    from pandas import Index
    from pandas.core.reshape.concat import concat

    # NA is counted too (dropna=False), so it contributes to the total
    counts = self.value_counts(dropna=False)
    freqs = counts / counts.sum()

    summary = concat([counts, freqs], axis=1)
    summary.columns = Index(["counts", "freqs"])
    summary.index.name = "categories"
    return summary
def isin(self, values) -> npt.NDArray[np.bool_]:
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    np.ndarray[bool]

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------
    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    candidates = sanitize_array(values, None, None)
    is_null = np.asarray(isna(candidates))
    looked_up = self.categories.get_indexer(candidates)
    # keep codes of known categories, plus the -1 of null candidates so
    # that missing elements match a null in `values`
    keep = is_null | (looked_up >= 0)
    return algorithms.isin(self.codes, looked_up[keep])
@overload
def replace(
    self, to_replace, value, *, inplace: Literal[False] = ...
) -> Categorical:
    ...


@overload
def replace(self, to_replace, value, *, inplace: Literal[True]) -> None:
    ...


@deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"])
def replace(self, to_replace, value, inplace: bool = False) -> Categorical | None:
    """
    Replaces all instances of one value with another

    Deprecated: emits a ``FutureWarning`` and delegates to
    ``self._replace``.

    Parameters
    ----------
    to_replace: object
        The value to be replaced

    value: object
        The value to replace it with

    inplace: bool
        Whether the operation is done in-place

    Returns
    -------
    None if inplace is True, otherwise the new Categorical after replacement


    Examples
    --------
    >>> s = pd.Categorical([1, 2, 1, 3])
    >>> s.replace(1, 3)
    [3, 2, 3, 3]
    Categories (2, int64): [2, 3]
    """
    # GH#44929 deprecation
    message = (
        "Categorical.replace is deprecated and will be removed in a future "
        "version. Use Series.replace directly instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    outcome = self._replace(to_replace=to_replace, value=value, inplace=inplace)
    return outcome
def _replace(self, *, to_replace, value, inplace: bool = False):
    """
    Replace occurrences of ``to_replace`` with ``value``.

    Parameters
    ----------
    to_replace : scalar or list-like
        Value(s) to be replaced; every entry of a list-like maps to the
        same ``value``.
    value : scalar
        Replacement value.
    inplace : bool, default False
        If True, modify this object; otherwise work on a copy.

    Returns
    -------
    Categorical or None
        The modified copy when ``inplace`` is False, otherwise None.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # build a dict of (to replace -> value) pairs
    if is_list_like(to_replace):
        # if to_replace is list-like and value is scalar
        replace_dict = {replace_value: value for replace_value in to_replace}
    else:
        # if both to_replace and value are scalar
        replace_dict = {to_replace: value}

    # other cases, like if both to_replace and value are list-like or if
    # to_replace is a dict, are handled separately in NDFrame
    for replace_value, new_value in replace_dict.items():
        if new_value == replace_value:
            # replacing a value with itself is a no-op
            continue
        if replace_value in cat.categories:
            if isna(new_value):
                # replacing with NA: just drop the category
                with catch_warnings():
                    simplefilter("ignore")
                    cat.remove_categories(replace_value, inplace=True)
                continue

            categories = cat.categories.tolist()
            index = categories.index(replace_value)

            if new_value in cat.categories:
                # target already exists: repoint the codes at it, then
                # drop the replaced category
                value_index = categories.index(new_value)
                cat._codes[cat._codes == index] = value_index
                with catch_warnings():
                    simplefilter("ignore")
                    cat.remove_categories(replace_value, inplace=True)
            else:
                # target is new: rename the category in place, codes
                # are left untouched
                categories[index] = new_value
                with catch_warnings():
                    simplefilter("ignore")
                    cat.rename_categories(categories, inplace=True)
    if not inplace:
        return cat
2696 # ------------------------------------------------------------------------
2697 # String methods interface
def _str_map(
    self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
):
    """
    Apply the string function ``f`` to every element.

    ``f`` is evaluated once per category and the per-element result is
    then rebuilt by taking from that result with the codes, using
    ``na_value`` as the fill value.  Returns the same type as the
    object-dtype implementation.  ``convert`` is not used here.
    """
    from pandas.core.arrays import PandasArray

    cats = self.categories
    codes = self.codes
    mapped_categories = PandasArray(cats.to_numpy())._str_map(f, na_value, dtype)
    return take_nd(mapped_categories, codes, fill_value=na_value)
def _str_get_dummies(self, sep="|"):
    """
    Compute string dummies by casting to ``str`` and delegating to the
    ``PandasArray`` implementation.
    """
    # sep may not be in categories. Just bail on this.
    from pandas.core.arrays import PandasArray

    as_strings = PandasArray(self.astype(str))
    return as_strings._str_get_dummies(sep)
2718# The Series.cat accessor
@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Be aware that assigning to `categories` is an inplace operation, while
    all methods return new categorical data per default (but can be called
    with `inplace=True`).

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        # validate first so that a non-categorical input fails before any
        # attribute is stored
        self._validate(data)
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless ``data`` has a categorical dtype."""
        if is_categorical_dtype(data.dtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """Read the delegated property ``name`` from the parent values."""
        parent = self._parent
        return getattr(parent, name)

    def _delegate_property_set(self, name, new_values):
        """Assign ``new_values`` to property ``name`` on the parent values."""
        parent = self._parent
        return setattr(parent, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.
        """
        from pandas import Series

        return Series(self._parent.codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call method ``name`` on the parent values.

        A non-None result is wrapped in a Series carrying the stored index
        and name; a None result yields None.
        """
        from pandas import Series

        outcome = getattr(self._parent, name)(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self._index, name=self._name)
2884# utility routines
2887def _get_codes_for_values(values, categories: Index) -> np.ndarray:
2888 """
2889 utility routine to turn values into codes given the specified categories
2891 If `values` is known to be a Categorical, use recode_for_categories instead.
2892 """
2893 if values.ndim > 1:
2894 flat = values.ravel()
2895 codes = _get_codes_for_values(flat, categories)
2896 return codes.reshape(values.shape)
2898 codes = categories.get_indexer_for(values)
2899 return coerce_indexer_dtype(codes, categories)
def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        Codes that refer to positions in `new_categories`; -1 where the
        old category is not present there.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # Nothing to recode when there were no categories at all (every code is
    # already null) or when the categories are unchanged.
    unchanged = len(old_categories) == 0 or new_categories.equals(old_categories)
    if unchanged:
        return codes.copy() if copy else codes

    # position of each old category within the new categories
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer(old_categories), new_categories
    )
    return take_nd(old_to_new, codes, fill_value=-1)
def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
    """
    Split `values` into `codes` and `categories`. A categorical dtype is
    preserved in `categories`.

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index
        If `values` has a categorical dtype, then `categories` is
        a CategoricalIndex keeping the categories and order of `values`.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    from pandas import CategoricalIndex

    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    categories: Index
    if not is_categorical_dtype(values):
        # Only the resulting categories are used, and their order does not
        # depend on `ordered`, so its value is irrelevant here; False is
        # used as the default. See GH #15457
        plain = Categorical(values, ordered=False)
        return plain.codes, plain.categories

    values = extract_array(values)
    # Build a Categorical with the same categories as values whose codes
    # are simply 0 .. n_categories - 1, i.e. one entry per category.
    n_categories = len(values.categories)
    one_per_category = Categorical.from_codes(
        np.arange(n_categories, dtype=values.codes.dtype), dtype=values.dtype
    )
    categories = CategoricalIndex(one_per_category)
    return values.codes, categories
def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    Apply `factorize_from_iterable` to each element of `iterables`.

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return two empty lists.
        return [], []

    pairs = [factorize_from_iterable(item) for item in iterables]
    codes, categories = zip(*pairs)
    return list(codes), list(categories)