Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/indexes/category.py: 25%
168 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from typing import (
4 Any,
5 Hashable,
6)
7import warnings
9import numpy as np
11from pandas._libs import index as libindex
12from pandas._typing import (
13 Dtype,
14 DtypeObj,
15 npt,
16)
17from pandas.util._decorators import (
18 cache_readonly,
19 doc,
20)
21from pandas.util._exceptions import find_stack_level
23from pandas.core.dtypes.common import (
24 is_categorical_dtype,
25 is_scalar,
26 pandas_dtype,
27)
28from pandas.core.dtypes.missing import (
29 is_valid_na_for_dtype,
30 isna,
31 notna,
32)
34from pandas.core.arrays.categorical import (
35 Categorical,
36 contains,
37)
38from pandas.core.construction import extract_array
39import pandas.core.indexes.base as ibase
40from pandas.core.indexes.base import (
41 Index,
42 maybe_extract_name,
43)
44from pandas.core.indexes.extension import (
45 NDArrayBackedExtensionIndex,
46 inherit_names,
47)
49from pandas.io.formats.printing import pprint_thing
# Copy of the base Index doc kwargs with ``target_klass`` overridden for this
# module. A new dict is built so ``ibase._index_doc_kwargs`` is not mutated.
_index_doc_kwargs: dict[str, str] = {
    **ibase._index_doc_kwargs,
    "target_klass": "CategoricalIndex",
}
55@inherit_names(
56 [
57 "argsort",
58 "tolist",
59 "codes",
60 "categories",
61 "ordered",
62 "_reverse_indexer",
63 "searchsorted",
64 "is_dtype_equal",
65 "min",
66 "max",
67 ],
68 Categorical,
69)
70@inherit_names(
71 [
72 "rename_categories",
73 "reorder_categories",
74 "add_categories",
75 "remove_categories",
76 "remove_unused_categories",
77 "set_categories",
78 "as_ordered",
79 "as_unordered",
80 ],
81 Categorical,
82 wrap=True,
83)
84class CategoricalIndex(NDArrayBackedExtensionIndex):
85 """
86 Index based on an underlying :class:`Categorical`.
88 CategoricalIndex, like Categorical, can only take on a limited,
89 and usually fixed, number of possible values (`categories`). Also,
90 like Categorical, it might have an order, but numerical operations
91 (additions, divisions, ...) are not possible.
93 Parameters
94 ----------
95 data : array-like (1-dimensional)
96 The values of the categorical. If `categories` are given, values not in
97 `categories` will be replaced with NaN.
98 categories : index-like, optional
99 The categories for the categorical. Items need to be unique.
100 If the categories are not given here (and also not in `dtype`), they
101 will be inferred from the `data`.
102 ordered : bool, optional
103 Whether or not this categorical is treated as an ordered
104 categorical. If not given here or in `dtype`, the resulting
105 categorical will be unordered.
106 dtype : CategoricalDtype or "category", optional
107 If :class:`CategoricalDtype`, cannot be used together with
108 `categories` or `ordered`.
109 copy : bool, default False
110 Make a copy of input ndarray.
111 name : object, optional
112 Name to be stored in the index.
114 Attributes
115 ----------
116 codes
117 categories
118 ordered
120 Methods
121 -------
122 rename_categories
123 reorder_categories
124 add_categories
125 remove_categories
126 remove_unused_categories
127 set_categories
128 as_ordered
129 as_unordered
130 map
132 Raises
133 ------
134 ValueError
135 If the categories do not validate.
136 TypeError
137 If an explicit ``ordered=True`` is given but no `categories` and the
138 `values` are not sortable.
140 See Also
141 --------
142 Index : The base pandas Index type.
143 Categorical : A categorical array.
144 CategoricalDtype : Type for categorical data.
146 Notes
147 -----
148 See the `user guide
149 <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`__
150 for more.
152 Examples
153 --------
154 >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])
155 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
156 categories=['a', 'b', 'c'], ordered=False, dtype='category')
158 ``CategoricalIndex`` can also be instantiated from a ``Categorical``:
160 >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"])
161 >>> pd.CategoricalIndex(c)
162 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
163 categories=['a', 'b', 'c'], ordered=False, dtype='category')
165 Ordered ``CategoricalIndex`` can have a min and max value.
167 >>> ci = pd.CategoricalIndex(
168 ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"]
169 ... )
170 >>> ci
171 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
172 categories=['c', 'b', 'a'], ordered=True, dtype='category')
173 >>> ci.min()
174 'c'
175 """
177 _typ = "categoricalindex"
178 _data_cls = Categorical
180 @property
181 def _can_hold_strings(self):
182 return self.categories._can_hold_strings
184 @cache_readonly
185 def _should_fallback_to_positional(self) -> bool:
186 return self.categories._should_fallback_to_positional
188 codes: np.ndarray
189 categories: Index
190 ordered: bool | None
191 _data: Categorical
192 _values: Categorical
194 @property
195 def _engine_type(self) -> type[libindex.IndexEngine]:
196 # self.codes can have dtype int8, int16, int32 or int64, so we need
197 # to return the corresponding engine type (libindex.Int8Engine, etc.).
198 return {
199 np.int8: libindex.Int8Engine,
200 np.int16: libindex.Int16Engine,
201 np.int32: libindex.Int32Engine,
202 np.int64: libindex.Int64Engine,
203 }[self.codes.dtype.type]
205 # --------------------------------------------------------------------
206 # Constructors
208 def __new__(
209 cls,
210 data=None,
211 categories=None,
212 ordered=None,
213 dtype: Dtype | None = None,
214 copy: bool = False,
215 name: Hashable = None,
216 ) -> CategoricalIndex:
218 name = maybe_extract_name(name, data, cls)
220 if data is None:
221 # GH#38944
222 warnings.warn(
223 "Constructing a CategoricalIndex without passing data is "
224 "deprecated and will raise in a future version. "
225 "Use CategoricalIndex([], ...) instead.",
226 FutureWarning,
227 stacklevel=find_stack_level(),
228 )
229 data = []
231 if is_scalar(data):
232 raise cls._scalar_data_error(data)
234 data = Categorical(
235 data, categories=categories, ordered=ordered, dtype=dtype, copy=copy
236 )
238 return cls._simple_new(data, name=name)
240 # --------------------------------------------------------------------
242 def _is_dtype_compat(self, other) -> Categorical:
243 """
244 *this is an internal non-public method*
246 provide a comparison between the dtype of self and other (coercing if
247 needed)
249 Parameters
250 ----------
251 other : Index
253 Returns
254 -------
255 Categorical
257 Raises
258 ------
259 TypeError if the dtypes are not compatible
260 """
261 if is_categorical_dtype(other):
262 other = extract_array(other)
263 if not other._categories_match_up_to_permutation(self):
264 raise TypeError(
265 "categories must match existing categories when appending"
266 )
268 elif other._is_multi:
269 # preempt raising NotImplementedError in isna call
270 raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex")
271 else:
272 values = other
274 cat = Categorical(other, dtype=self.dtype)
275 other = CategoricalIndex(cat)
276 if not other.isin(values).all():
277 raise TypeError(
278 "cannot append a non-category item to a CategoricalIndex"
279 )
280 other = other._values
282 if not ((other == values) | (isna(other) & isna(values))).all():
283 # GH#37667 see test_equals_non_category
284 raise TypeError(
285 "categories must match existing categories when appending"
286 )
288 return other
290 @doc(Index.astype)
291 def astype(self, dtype: Dtype, copy: bool = True) -> Index:
292 from pandas.core.api import NumericIndex
294 dtype = pandas_dtype(dtype)
296 categories = self.categories
297 # the super method always returns Int64Index, UInt64Index and Float64Index
298 # but if the categories are a NumericIndex with dtype float32, we want to
299 # return an index with the same dtype as self.categories.
300 if categories._is_backward_compat_public_numeric_index:
301 assert isinstance(categories, NumericIndex) # mypy complaint fix
302 try:
303 categories._validate_dtype(dtype)
304 except ValueError:
305 pass
306 else:
307 new_values = self._data.astype(dtype, copy=copy)
308 # pass copy=False because any copying has been done in the
309 # _data.astype call above
310 return categories._constructor(new_values, name=self.name, copy=False)
312 return super().astype(dtype, copy=copy)
314 def equals(self, other: object) -> bool:
315 """
316 Determine if two CategoricalIndex objects contain the same elements.
318 Returns
319 -------
320 bool
321 If two CategoricalIndex objects have equal elements True,
322 otherwise False.
323 """
324 if self.is_(other):
325 return True
327 if not isinstance(other, Index):
328 return False
330 try:
331 other = self._is_dtype_compat(other)
332 except (TypeError, ValueError):
333 return False
335 return self._data.equals(other)
337 # --------------------------------------------------------------------
338 # Rendering Methods
340 @property
341 def _formatter_func(self):
342 return self.categories._formatter_func
344 def _format_attrs(self):
345 """
346 Return a list of tuples of the (attr,formatted_value)
347 """
348 attrs: list[tuple[str, str | int | bool | None]]
350 attrs = [
351 (
352 "categories",
353 "[" + ", ".join(self._data._repr_categories()) + "]",
354 ),
355 ("ordered", self.ordered),
356 ]
357 extra = super()._format_attrs()
358 return attrs + extra
360 def _format_with_header(self, header: list[str], na_rep: str) -> list[str]:
361 result = [
362 pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep
363 for x in self._values
364 ]
365 return header + result
367 # --------------------------------------------------------------------
369 @property
370 def inferred_type(self) -> str:
371 return "categorical"
373 @doc(Index.__contains__)
374 def __contains__(self, key: Any) -> bool:
375 # if key is a NaN, check if any NaN is in self.
376 if is_valid_na_for_dtype(key, self.categories.dtype):
377 return self.hasnans
379 return contains(self, key, container=self._engine)
381 # TODO(2.0): remove reindex once non-unique deprecation is enforced
382 def reindex(
383 self, target, method=None, level=None, limit=None, tolerance=None
384 ) -> tuple[Index, npt.NDArray[np.intp] | None]:
385 """
386 Create index with target's values (move/add/delete values as necessary)
388 Returns
389 -------
390 new_index : pd.Index
391 Resulting index
392 indexer : np.ndarray[np.intp] or None
393 Indices of output values in original index
395 """
396 if method is not None:
397 raise NotImplementedError(
398 "argument method is not implemented for CategoricalIndex.reindex"
399 )
400 if level is not None:
401 raise NotImplementedError(
402 "argument level is not implemented for CategoricalIndex.reindex"
403 )
404 if limit is not None:
405 raise NotImplementedError(
406 "argument limit is not implemented for CategoricalIndex.reindex"
407 )
409 target = ibase.ensure_index(target)
411 if self.equals(target):
412 indexer = None
413 missing = np.array([], dtype=np.intp)
414 else:
415 indexer, missing = self.get_indexer_non_unique(target)
416 if not self.is_unique:
417 # GH#42568
418 warnings.warn(
419 "reindexing with a non-unique Index is deprecated and will "
420 "raise in a future version.",
421 FutureWarning,
422 stacklevel=find_stack_level(),
423 )
425 new_target: Index
426 if len(self) and indexer is not None:
427 new_target = self.take(indexer)
428 else:
429 new_target = target
431 # filling in missing if needed
432 if len(missing):
433 cats = self.categories.get_indexer(target)
435 if not isinstance(target, CategoricalIndex) or (cats == -1).any():
436 new_target, indexer, _ = super()._reindex_non_unique(target)
437 else:
438 # error: "Index" has no attribute "codes"
439 codes = new_target.codes.copy() # type: ignore[attr-defined]
440 codes[indexer == -1] = cats[missing]
441 cat = self._data._from_backing_data(codes)
442 new_target = type(self)._simple_new(cat, name=self.name)
444 # we always want to return an Index type here
445 # to be consistent with .reindex for other index types (e.g. they don't
446 # coerce based on the actual values, only on the dtype)
447 # unless we had an initial Categorical to begin with
448 # in which case we are going to conform to the passed Categorical
449 if is_categorical_dtype(target):
450 cat = Categorical(new_target, dtype=target.dtype)
451 new_target = type(self)._simple_new(cat, name=self.name)
452 else:
453 # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target
454 new_target_array = np.asarray(new_target)
455 new_target = Index._with_infer(new_target_array, name=self.name)
457 return new_target, indexer
459 # --------------------------------------------------------------------
460 # Indexing Methods
462 def _maybe_cast_indexer(self, key) -> int:
463 # GH#41933: we have to do this instead of self._data._validate_scalar
464 # because this will correctly get partial-indexing on Interval categories
465 try:
466 return self._data._unbox_scalar(key)
467 except KeyError:
468 if is_valid_na_for_dtype(key, self.categories.dtype):
469 return -1
470 raise
472 def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
473 if isinstance(values, CategoricalIndex):
474 values = values._data
475 if isinstance(values, Categorical):
476 # Indexing on codes is more efficient if categories are the same,
477 # so we can apply some optimizations based on the degree of
478 # dtype-matching.
479 cat = self._data._encode_with_my_categories(values)
480 codes = cat._codes
481 else:
482 codes = self.categories.get_indexer(values)
483 codes = codes.astype(self.codes.dtype, copy=False)
484 cat = self._data._from_backing_data(codes)
485 return type(self)._simple_new(cat)
487 # --------------------------------------------------------------------
489 def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
490 return self.categories._is_comparable_dtype(dtype)
492 def take_nd(self, *args, **kwargs) -> CategoricalIndex:
493 """Alias for `take`"""
494 warnings.warn(
495 "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take "
496 "instead.",
497 FutureWarning,
498 stacklevel=find_stack_level(),
499 )
500 return self.take(*args, **kwargs)
502 def map(self, mapper):
503 """
504 Map values using input an input mapping or function.
506 Maps the values (their categories, not the codes) of the index to new
507 categories. If the mapping correspondence is one-to-one the result is a
508 :class:`~pandas.CategoricalIndex` which has the same order property as
509 the original, otherwise an :class:`~pandas.Index` is returned.
511 If a `dict` or :class:`~pandas.Series` is used any unmapped category is
512 mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
513 will be returned.
515 Parameters
516 ----------
517 mapper : function, dict, or Series
518 Mapping correspondence.
520 Returns
521 -------
522 pandas.CategoricalIndex or pandas.Index
523 Mapped index.
525 See Also
526 --------
527 Index.map : Apply a mapping correspondence on an
528 :class:`~pandas.Index`.
529 Series.map : Apply a mapping correspondence on a
530 :class:`~pandas.Series`.
531 Series.apply : Apply more complex functions on a
532 :class:`~pandas.Series`.
534 Examples
535 --------
536 >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
537 >>> idx
538 CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
539 ordered=False, dtype='category')
540 >>> idx.map(lambda x: x.upper())
541 CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
542 ordered=False, dtype='category')
543 >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
544 CategoricalIndex(['first', 'second', 'third'], categories=['first',
545 'second', 'third'], ordered=False, dtype='category')
547 If the mapping is one-to-one the ordering of the categories is
548 preserved:
550 >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
551 >>> idx
552 CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
553 ordered=True, dtype='category')
554 >>> idx.map({'a': 3, 'b': 2, 'c': 1})
555 CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
556 dtype='category')
558 If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
560 >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
561 Index(['first', 'second', 'first'], dtype='object')
563 If a `dict` is used, all unmapped categories are mapped to `NaN` and
564 the result is an :class:`~pandas.Index`:
566 >>> idx.map({'a': 'first', 'b': 'second'})
567 Index(['first', 'second', nan], dtype='object')
568 """
569 mapped = self._values.map(mapper)
570 return Index(mapped, name=self.name)
572 def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
573 # if calling index is category, don't check dtype of others
574 try:
575 cat = Categorical._concat_same_type(
576 [self._is_dtype_compat(c) for c in to_concat]
577 )
578 except TypeError:
579 # not all to_concat elements are among our categories (or NA)
580 from pandas.core.dtypes.concat import concat_compat
582 res = concat_compat([x._values for x in to_concat])
583 return Index(res, name=name)
584 else:
585 return type(self)._simple_new(cat, name=name)