Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/categorical.py: 19%

864 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3from csv import QUOTE_NONNUMERIC 

4from functools import partial 

5import operator 

6from shutil import get_terminal_size 

7from typing import ( 

8 TYPE_CHECKING, 

9 Hashable, 

10 Literal, 

11 Sequence, 

12 TypeVar, 

13 Union, 

14 cast, 

15 overload, 

16) 

17from warnings import ( 

18 catch_warnings, 

19 simplefilter, 

20 warn, 

21) 

22 

23import numpy as np 

24 

25from pandas._config import get_option 

26 

27from pandas._libs import ( 

28 NaT, 

29 algos as libalgos, 

30 lib, 

31) 

32from pandas._libs.arrays import NDArrayBacked 

33from pandas._libs.lib import ( 

34 NoDefault, 

35 no_default, 

36) 

37from pandas._typing import ( 

38 ArrayLike, 

39 AstypeArg, 

40 Dtype, 

41 NpDtype, 

42 Ordered, 

43 Shape, 

44 npt, 

45 type_t, 

46) 

47from pandas.compat.numpy import function as nv 

48from pandas.util._decorators import ( 

49 deprecate_kwarg, 

50 deprecate_nonkeyword_arguments, 

51) 

52from pandas.util._exceptions import find_stack_level 

53from pandas.util._validators import validate_bool_kwarg 

54 

55from pandas.core.dtypes.cast import coerce_indexer_dtype 

56from pandas.core.dtypes.common import ( 

57 ensure_int64, 

58 ensure_platform_int, 

59 is_categorical_dtype, 

60 is_datetime64_dtype, 

61 is_dict_like, 

62 is_dtype_equal, 

63 is_extension_array_dtype, 

64 is_hashable, 

65 is_integer_dtype, 

66 is_list_like, 

67 is_scalar, 

68 is_timedelta64_dtype, 

69 needs_i8_conversion, 

70 pandas_dtype, 

71) 

72from pandas.core.dtypes.dtypes import ( 

73 CategoricalDtype, 

74 ExtensionDtype, 

75) 

76from pandas.core.dtypes.generic import ( 

77 ABCIndex, 

78 ABCSeries, 

79) 

80from pandas.core.dtypes.missing import ( 

81 is_valid_na_for_dtype, 

82 isna, 

83 notna, 

84) 

85 

86from pandas.core import ( 

87 arraylike, 

88 ops, 

89) 

90from pandas.core.accessor import ( 

91 PandasDelegate, 

92 delegate_names, 

93) 

94import pandas.core.algorithms as algorithms 

95from pandas.core.algorithms import ( 

96 factorize, 

97 take_nd, 

98 unique1d, 

99) 

100from pandas.core.arrays._mixins import ( 

101 NDArrayBackedExtensionArray, 

102 ravel_compat, 

103) 

104from pandas.core.base import ( 

105 ExtensionArray, 

106 NoNewAttributesMixin, 

107 PandasObject, 

108) 

109import pandas.core.common as com 

110from pandas.core.construction import ( 

111 extract_array, 

112 sanitize_array, 

113) 

114from pandas.core.ops.common import unpack_zerodim_and_defer 

115from pandas.core.sorting import nargsort 

116from pandas.core.strings.object_array import ObjectStringArrayMixin 

117 

118from pandas.io.formats import console 

119 

120if TYPE_CHECKING: 120 ↛ 121line 120 didn't jump to line 121, because the condition on line 120 was never true

121 from pandas import ( 

122 DataFrame, 

123 Index, 

124 Series, 

125 ) 

126 

127 

128CategoricalT = TypeVar("CategoricalT", bound="Categorical") 

129 

130 

131def _cat_compare_op(op): 

132 opname = f"__{op.__name__}__" 

133 fill_value = True if op is operator.ne else False 

134 

135 @unpack_zerodim_and_defer(opname) 

136 def func(self, other): 

137 hashable = is_hashable(other) 

138 if is_list_like(other) and len(other) != len(self) and not hashable: 

139 # in hashable case we may have a tuple that is itself a category 

140 raise ValueError("Lengths must match.") 

141 

142 if not self.ordered: 

143 if opname in ["__lt__", "__gt__", "__le__", "__ge__"]: 

144 raise TypeError( 

145 "Unordered Categoricals can only compare equality or not" 

146 ) 

147 if isinstance(other, Categorical): 

148 # Two Categoricals can only be compared if the categories are 

149 # the same (maybe up to ordering, depending on ordered) 

150 

151 msg = "Categoricals can only be compared if 'categories' are the same." 

152 if not self._categories_match_up_to_permutation(other): 

153 raise TypeError(msg) 

154 

155 if not self.ordered and not self.categories.equals(other.categories): 

156 # both unordered and different order 

157 other_codes = recode_for_categories( 

158 other.codes, other.categories, self.categories, copy=False 

159 ) 

160 else: 

161 other_codes = other._codes 

162 

163 ret = op(self._codes, other_codes) 

164 mask = (self._codes == -1) | (other_codes == -1) 

165 if mask.any(): 

166 ret[mask] = fill_value 

167 return ret 

168 

169 if hashable: 

170 if other in self.categories: 

171 i = self._unbox_scalar(other) 

172 ret = op(self._codes, i) 

173 

174 if opname not in {"__eq__", "__ge__", "__gt__"}: 

175 # GH#29820 performance trick; get_loc will always give i>=0, 

176 # so in the cases (__ne__, __le__, __lt__) the setting 

177 # here is a no-op, so can be skipped. 

178 mask = self._codes == -1 

179 ret[mask] = fill_value 

180 return ret 

181 else: 

182 return ops.invalid_comparison(self, other, op) 

183 else: 

184 # allow categorical vs object dtype array comparisons for equality 

185 # these are only positional comparisons 

186 if opname not in ["__eq__", "__ne__"]: 

187 raise TypeError( 

188 f"Cannot compare a Categorical for op {opname} with " 

189 f"type {type(other)}.\nIf you want to compare values, " 

190 "use 'np.asarray(cat) <op> other'." 

191 ) 

192 

193 if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype): 

194 # We would return NotImplemented here, but that messes up 

195 # ExtensionIndex's wrapped methods 

196 return op(other, self) 

197 return getattr(np.array(self), opname)(np.array(other)) 

198 

199 func.__name__ = opname 

200 

201 return func 

202 

203 

204def contains(cat, key, container) -> bool: 

205 """ 

206 Helper for membership check for ``key`` in ``cat``. 

207 

208 This is a helper method for :method:`__contains__` 

209 and :class:`CategoricalIndex.__contains__`. 

210 

211 Returns True if ``key`` is in ``cat.categories`` and the 

212 location of ``key`` in ``categories`` is in ``container``. 

213 

214 Parameters 

215 ---------- 

216 cat : :class:`Categorical`or :class:`categoricalIndex` 

217 key : a hashable object 

218 The key to check membership for. 

219 container : Container (e.g. list-like or mapping) 

220 The container to check for membership in. 

221 

222 Returns 

223 ------- 

224 is_in : bool 

225 True if ``key`` is in ``self.categories`` and location of 

226 ``key`` in ``categories`` is in ``container``, else False. 

227 

228 Notes 

229 ----- 

230 This method does not check for NaN values. Do that separately 

231 before calling this method. 

232 """ 

233 hash(key) 

234 

235 # get location of key in categories. 

236 # If a KeyError, the key isn't in categories, so logically 

237 # can't be in container either. 

238 try: 

239 loc = cat.categories.get_loc(key) 

240 except (KeyError, TypeError): 

241 return False 

242 

243 # loc is the location of key in categories, but also the *value* 

244 # for key in container. So, `key` may be in categories, 

245 # but still not in `container`. Example ('b' in categories, 

246 # but not in values): 

247 # 'b' in Categorical(['a'], categories=['a', 'b']) # False 

248 if is_scalar(loc): 

249 return loc in container 

250 else: 

251 # if categories is an IntervalIndex, loc is an array. 

252 return any(loc_ in container for loc_ in loc) 

253 

254 

255class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): 

256 """ 

257 Represent a categorical variable in classic R / S-plus fashion. 

258 

259 `Categoricals` can only take on only a limited, and usually fixed, number 

260 of possible values (`categories`). In contrast to statistical categorical 

261 variables, a `Categorical` might have an order, but numerical operations 

262 (additions, divisions, ...) are not possible. 

263 

264 All values of the `Categorical` are either in `categories` or `np.nan`. 

265 Assigning values outside of `categories` will raise a `ValueError`. Order 

266 is defined by the order of the `categories`, not lexical order of the 

267 values. 

268 

269 Parameters 

270 ---------- 

271 values : list-like 

272 The values of the categorical. If categories are given, values not in 

273 categories will be replaced with NaN. 

274 categories : Index-like (unique), optional 

275 The unique categories for this categorical. If not given, the 

276 categories are assumed to be the unique values of `values` (sorted, if 

277 possible, otherwise in the order in which they appear). 

278 ordered : bool, default False 

279 Whether or not this categorical is treated as a ordered categorical. 

280 If True, the resulting categorical will be ordered. 

281 An ordered categorical respects, when sorted, the order of its 

282 `categories` attribute (which in turn is the `categories` argument, if 

283 provided). 

284 dtype : CategoricalDtype 

285 An instance of ``CategoricalDtype`` to use for this categorical. 

286 

287 Attributes 

288 ---------- 

289 categories : Index 

290 The categories of this categorical 

291 codes : ndarray 

292 The codes (integer positions, which point to the categories) of this 

293 categorical, read only. 

294 ordered : bool 

295 Whether or not this Categorical is ordered. 

296 dtype : CategoricalDtype 

297 The instance of ``CategoricalDtype`` storing the ``categories`` 

298 and ``ordered``. 

299 

300 Methods 

301 ------- 

302 from_codes 

303 __array__ 

304 

305 Raises 

306 ------ 

307 ValueError 

308 If the categories do not validate. 

309 TypeError 

310 If an explicit ``ordered=True`` is given but no `categories` and the 

311 `values` are not sortable. 

312 

313 See Also 

314 -------- 

315 CategoricalDtype : Type for categorical data. 

316 CategoricalIndex : An Index with an underlying ``Categorical``. 

317 

318 Notes 

319 ----- 

320 See the `user guide 

321 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__ 

322 for more. 

323 

324 Examples 

325 -------- 

326 >>> pd.Categorical([1, 2, 3, 1, 2, 3]) 

327 [1, 2, 3, 1, 2, 3] 

328 Categories (3, int64): [1, 2, 3] 

329 

330 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) 

331 ['a', 'b', 'c', 'a', 'b', 'c'] 

332 Categories (3, object): ['a', 'b', 'c'] 

333 

334 Missing values are not included as a category. 

335 

336 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) 

337 >>> c 

338 [1, 2, 3, 1, 2, 3, NaN] 

339 Categories (3, int64): [1, 2, 3] 

340 

341 However, their presence is indicated in the `codes` attribute 

342 by code `-1`. 

343 

344 >>> c.codes 

345 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) 

346 

347 Ordered `Categoricals` can be sorted according to the custom order 

348 of the categories and can have a min and max value. 

349 

350 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, 

351 ... categories=['c', 'b', 'a']) 

352 >>> c 

353 ['a', 'b', 'c', 'a', 'b', 'c'] 

354 Categories (3, object): ['c' < 'b' < 'a'] 

355 >>> c.min() 

356 'c' 

357 """ 

358 

359 # For comparisons, so that numpy uses our implementation if the compare 

360 # ops, which raise 

361 __array_priority__ = 1000 

362 # tolist is not actually deprecated, just suppressed in the __dir__ 

363 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) 

364 _typ = "categorical" 

365 

366 _dtype: CategoricalDtype 

367 

368 def __init__( 

369 self, 

370 values, 

371 categories=None, 

372 ordered=None, 

373 dtype: Dtype | None = None, 

374 fastpath: bool = False, 

375 copy: bool = True, 

376 ) -> None: 

377 

378 dtype = CategoricalDtype._from_values_or_dtype( 

379 values, categories, ordered, dtype 

380 ) 

381 # At this point, dtype is always a CategoricalDtype, but 

382 # we may have dtype.categories be None, and we need to 

383 # infer categories in a factorization step further below 

384 

385 if fastpath: 

386 codes = coerce_indexer_dtype(values, dtype.categories) 

387 dtype = CategoricalDtype(ordered=False).update_dtype(dtype) 

388 super().__init__(codes, dtype) 

389 return 

390 

391 if not is_list_like(values): 

392 # GH#38433 

393 warn( 

394 "Allowing scalars in the Categorical constructor is deprecated " 

395 "and will raise in a future version. Use `[value]` instead", 

396 FutureWarning, 

397 stacklevel=find_stack_level(), 

398 ) 

399 values = [values] 

400 

401 # null_mask indicates missing values we want to exclude from inference. 

402 # This means: only missing values in list-likes (not arrays/ndframes). 

403 null_mask = np.array(False) 

404 

405 # sanitize input 

406 if is_categorical_dtype(values): 

407 if dtype.categories is None: 

408 dtype = CategoricalDtype(values.categories, dtype.ordered) 

409 elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)): 

410 values = com.convert_to_list_like(values) 

411 if isinstance(values, list) and len(values) == 0: 

412 # By convention, empty lists result in object dtype: 

413 values = np.array([], dtype=object) 

414 elif isinstance(values, np.ndarray): 

415 if values.ndim > 1: 

416 # preempt sanitize_array from raising ValueError 

417 raise NotImplementedError( 

418 "> 1 ndim Categorical are not supported at this time" 

419 ) 

420 values = sanitize_array(values, None) 

421 else: 

422 # i.e. must be a list 

423 arr = sanitize_array(values, None) 

424 null_mask = isna(arr) 

425 if null_mask.any(): 

426 # We remove null values here, then below will re-insert 

427 # them, grep "full_codes" 

428 arr_list = [values[idx] for idx in np.where(~null_mask)[0]] 

429 

430 # GH#44900 Do not cast to float if we have only missing values 

431 if arr_list or arr.dtype == "object": 

432 sanitize_dtype = None 

433 else: 

434 sanitize_dtype = arr.dtype 

435 

436 arr = sanitize_array(arr_list, None, dtype=sanitize_dtype) 

437 values = arr 

438 

439 if dtype.categories is None: 

440 try: 

441 codes, categories = factorize(values, sort=True) 

442 except TypeError as err: 

443 codes, categories = factorize(values, sort=False) 

444 if dtype.ordered: 

445 # raise, as we don't have a sortable data structure and so 

446 # the user should give us one by specifying categories 

447 raise TypeError( 

448 "'values' is not ordered, please " 

449 "explicitly specify the categories order " 

450 "by passing in a categories argument." 

451 ) from err 

452 

453 # we're inferring from values 

454 dtype = CategoricalDtype(categories, dtype.ordered) 

455 

456 elif is_categorical_dtype(values.dtype): 

457 old_codes = extract_array(values)._codes 

458 codes = recode_for_categories( 

459 old_codes, values.dtype.categories, dtype.categories, copy=copy 

460 ) 

461 

462 else: 

463 codes = _get_codes_for_values(values, dtype.categories) 

464 

465 if null_mask.any(): 

466 # Reinsert -1 placeholders for previously removed missing values 

467 full_codes = -np.ones(null_mask.shape, dtype=codes.dtype) 

468 full_codes[~null_mask] = codes 

469 codes = full_codes 

470 

471 dtype = CategoricalDtype(ordered=False).update_dtype(dtype) 

472 arr = coerce_indexer_dtype(codes, dtype.categories) 

473 super().__init__(arr, dtype) 

474 

475 @property 

476 def dtype(self) -> CategoricalDtype: 

477 """ 

478 The :class:`~pandas.api.types.CategoricalDtype` for this instance. 

479 """ 

480 return self._dtype 

481 

482 @property 

483 def _internal_fill_value(self) -> int: 

484 # using the specific numpy integer instead of python int to get 

485 # the correct dtype back from _quantile in the all-NA case 

486 dtype = self._ndarray.dtype 

487 return dtype.type(-1) 

488 

489 @property 

490 def _constructor(self) -> type[Categorical]: 

491 return Categorical 

492 

493 @classmethod 

494 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): 

495 return Categorical(scalars, dtype=dtype, copy=copy) 

496 

497 @overload 

498 def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: 

499 ... 

500 

501 @overload 

502 def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: 

503 ... 

504 

505 @overload 

506 def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: 

507 ... 

508 

509 def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: 

510 """ 

511 Coerce this type to another dtype 

512 

513 Parameters 

514 ---------- 

515 dtype : numpy dtype or pandas type 

516 copy : bool, default True 

517 By default, astype always returns a newly allocated object. 

518 If copy is set to False and dtype is categorical, the original 

519 object is returned. 

520 """ 

521 dtype = pandas_dtype(dtype) 

522 if self.dtype is dtype: 

523 result = self.copy() if copy else self 

524 

525 elif is_categorical_dtype(dtype): 

526 dtype = cast("Union[str, CategoricalDtype]", dtype) 

527 

528 # GH 10696/18593/18630 

529 dtype = self.dtype.update_dtype(dtype) 

530 self = self.copy() if copy else self 

531 result = self._set_dtype(dtype) 

532 

533 elif isinstance(dtype, ExtensionDtype): 

534 return super().astype(dtype, copy=copy) 

535 

536 elif is_integer_dtype(dtype) and self.isna().any(): 

537 raise ValueError("Cannot convert float NaN to integer") 

538 

539 elif len(self.codes) == 0 or len(self.categories) == 0: 

540 result = np.array( 

541 self, 

542 dtype=dtype, 

543 copy=copy, 

544 ) 

545 

546 else: 

547 # GH8628 (PERF): astype category codes instead of astyping array 

548 new_cats = self.categories._values 

549 

550 try: 

551 new_cats = new_cats.astype(dtype=dtype, copy=copy) 

552 fill_value = self.categories._na_value 

553 if not is_valid_na_for_dtype(fill_value, dtype): 

554 fill_value = lib.item_from_zerodim( 

555 np.array(self.categories._na_value).astype(dtype) 

556 ) 

557 except ( 

558 TypeError, # downstream error msg for CategoricalIndex is misleading 

559 ValueError, 

560 ): 

561 msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" 

562 raise ValueError(msg) 

563 

564 result = take_nd( 

565 new_cats, ensure_platform_int(self._codes), fill_value=fill_value 

566 ) 

567 

568 return result 

569 

570 def to_list(self): 

571 """ 

572 Alias for tolist. 

573 """ 

574 return self.tolist() 

575 

576 @classmethod 

577 def _from_inferred_categories( 

578 cls, inferred_categories, inferred_codes, dtype, true_values=None 

579 ): 

580 """ 

581 Construct a Categorical from inferred values. 

582 

583 For inferred categories (`dtype` is None) the categories are sorted. 

584 For explicit `dtype`, the `inferred_categories` are cast to the 

585 appropriate type. 

586 

587 Parameters 

588 ---------- 

589 inferred_categories : Index 

590 inferred_codes : Index 

591 dtype : CategoricalDtype or 'category' 

592 true_values : list, optional 

593 If none are provided, the default ones are 

594 "True", "TRUE", and "true." 

595 

596 Returns 

597 ------- 

598 Categorical 

599 """ 

600 from pandas import ( 

601 Index, 

602 to_datetime, 

603 to_numeric, 

604 to_timedelta, 

605 ) 

606 

607 cats = Index(inferred_categories) 

608 known_categories = ( 

609 isinstance(dtype, CategoricalDtype) and dtype.categories is not None 

610 ) 

611 

612 if known_categories: 

613 # Convert to a specialized type with `dtype` if specified. 

614 if dtype.categories.is_numeric(): 

615 cats = to_numeric(inferred_categories, errors="coerce") 

616 elif is_datetime64_dtype(dtype.categories): 

617 cats = to_datetime(inferred_categories, errors="coerce") 

618 elif is_timedelta64_dtype(dtype.categories): 

619 cats = to_timedelta(inferred_categories, errors="coerce") 

620 elif dtype.categories.is_boolean(): 

621 if true_values is None: 

622 true_values = ["True", "TRUE", "true"] 

623 

624 # error: Incompatible types in assignment (expression has type 

625 # "ndarray", variable has type "Index") 

626 cats = cats.isin(true_values) # type: ignore[assignment] 

627 

628 if known_categories: 

629 # Recode from observation order to dtype.categories order. 

630 categories = dtype.categories 

631 codes = recode_for_categories(inferred_codes, cats, categories) 

632 elif not cats.is_monotonic_increasing: 

633 # Sort categories and recode for unknown categories. 

634 unsorted = cats.copy() 

635 categories = cats.sort_values() 

636 

637 codes = recode_for_categories(inferred_codes, unsorted, categories) 

638 dtype = CategoricalDtype(categories, ordered=False) 

639 else: 

640 dtype = CategoricalDtype(cats, ordered=False) 

641 codes = inferred_codes 

642 

643 return cls(codes, dtype=dtype, fastpath=True) 

644 

645 @classmethod 

646 def from_codes( 

647 cls, codes, categories=None, ordered=None, dtype: Dtype | None = None 

648 ) -> Categorical: 

649 """ 

650 Make a Categorical type from codes and categories or dtype. 

651 

652 This constructor is useful if you already have codes and 

653 categories/dtype and so do not need the (computation intensive) 

654 factorization step, which is usually done on the constructor. 

655 

656 If your data does not follow this convention, please use the normal 

657 constructor. 

658 

659 Parameters 

660 ---------- 

661 codes : array-like of int 

662 An integer array, where each integer points to a category in 

663 categories or dtype.categories, or else is -1 for NaN. 

664 categories : index-like, optional 

665 The categories for the categorical. Items need to be unique. 

666 If the categories are not given here, then they must be provided 

667 in `dtype`. 

668 ordered : bool, optional 

669 Whether or not this categorical is treated as an ordered 

670 categorical. If not given here or in `dtype`, the resulting 

671 categorical will be unordered. 

672 dtype : CategoricalDtype or "category", optional 

673 If :class:`CategoricalDtype`, cannot be used together with 

674 `categories` or `ordered`. 

675 

676 Returns 

677 ------- 

678 Categorical 

679 

680 Examples 

681 -------- 

682 >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) 

683 >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) 

684 ['a', 'b', 'a', 'b'] 

685 Categories (2, object): ['a' < 'b'] 

686 """ 

687 dtype = CategoricalDtype._from_values_or_dtype( 

688 categories=categories, ordered=ordered, dtype=dtype 

689 ) 

690 if dtype.categories is None: 

691 msg = ( 

692 "The categories must be provided in 'categories' or " 

693 "'dtype'. Both were None." 

694 ) 

695 raise ValueError(msg) 

696 

697 if is_extension_array_dtype(codes) and is_integer_dtype(codes): 

698 # Avoid the implicit conversion of Int to object 

699 if isna(codes).any(): 

700 raise ValueError("codes cannot contain NA values") 

701 codes = codes.to_numpy(dtype=np.int64) 

702 else: 

703 codes = np.asarray(codes) 

704 if len(codes) and not is_integer_dtype(codes): 

705 raise ValueError("codes need to be array-like integers") 

706 

707 if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): 

708 raise ValueError("codes need to be between -1 and len(categories)-1") 

709 

710 return cls(codes, dtype=dtype, fastpath=True) 

711 

712 # ------------------------------------------------------------------ 

713 # Categories/Codes/Ordered 

714 

715 @property 

716 def categories(self) -> Index: 

717 """ 

718 The categories of this categorical. 

719 

720 Setting assigns new values to each category (effectively a rename of 

721 each individual category). 

722 

723 The assigned value has to be a list-like object. All items must be 

724 unique and the number of items in the new categories must be the same 

725 as the number of items in the old categories. 

726 

727 Assigning to `categories` is a inplace operation! 

728 

729 Raises 

730 ------ 

731 ValueError 

732 If the new categories do not validate as categories or if the 

733 number of new categories is unequal the number of old categories 

734 

735 See Also 

736 -------- 

737 rename_categories : Rename categories. 

738 reorder_categories : Reorder categories. 

739 add_categories : Add new categories. 

740 remove_categories : Remove the specified categories. 

741 remove_unused_categories : Remove categories which are not used. 

742 set_categories : Set the categories to the specified ones. 

743 """ 

744 return self.dtype.categories 

745 

746 @categories.setter 

747 def categories(self, categories) -> None: 

748 warn( 

749 "Setting categories in-place is deprecated and will raise in a " 

750 "future version. Use rename_categories instead.", 

751 FutureWarning, 

752 stacklevel=find_stack_level(), 

753 ) 

754 

755 self._set_categories(categories) 

756 

757 @property 

758 def ordered(self) -> Ordered: 

759 """ 

760 Whether the categories have an ordered relationship. 

761 """ 

762 return self.dtype.ordered 

763 

764 @property 

765 def codes(self) -> np.ndarray: 

766 """ 

767 The category codes of this categorical. 

768 

769 Codes are an array of integers which are the positions of the actual 

770 values in the categories array. 

771 

772 There is no setter, use the other categorical methods and the normal item 

773 setter to change values in the categorical. 

774 

775 Returns 

776 ------- 

777 ndarray[int] 

778 A non-writable view of the `codes` array. 

779 """ 

780 v = self._codes.view() 

781 v.flags.writeable = False 

782 return v 

783 

784 def _set_categories(self, categories, fastpath=False): 

785 """ 

786 Sets new categories inplace 

787 

788 Parameters 

789 ---------- 

790 fastpath : bool, default False 

791 Don't perform validation of the categories for uniqueness or nulls 

792 

793 Examples 

794 -------- 

795 >>> c = pd.Categorical(['a', 'b']) 

796 >>> c 

797 ['a', 'b'] 

798 Categories (2, object): ['a', 'b'] 

799 

800 >>> c._set_categories(pd.Index(['a', 'c'])) 

801 >>> c 

802 ['a', 'c'] 

803 Categories (2, object): ['a', 'c'] 

804 """ 

805 if fastpath: 

806 new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) 

807 else: 

808 new_dtype = CategoricalDtype(categories, ordered=self.ordered) 

809 if ( 

810 not fastpath 

811 and self.dtype.categories is not None 

812 and len(new_dtype.categories) != len(self.dtype.categories) 

813 ): 

814 raise ValueError( 

815 "new categories need to have the same number of " 

816 "items as the old categories!" 

817 ) 

818 

819 super().__init__(self._ndarray, new_dtype) 

820 

821 def _set_dtype(self, dtype: CategoricalDtype) -> Categorical: 

822 """ 

823 Internal method for directly updating the CategoricalDtype 

824 

825 Parameters 

826 ---------- 

827 dtype : CategoricalDtype 

828 

829 Notes 

830 ----- 

831 We don't do any validation here. It's assumed that the dtype is 

832 a (valid) instance of `CategoricalDtype`. 

833 """ 

834 codes = recode_for_categories(self.codes, self.categories, dtype.categories) 

835 return type(self)(codes, dtype=dtype, fastpath=True) 

836 

837 @overload 

838 def set_ordered( 

839 self, value, *, inplace: NoDefault | Literal[False] = ... 

840 ) -> Categorical: 

841 ... 

842 

843 @overload 

844 def set_ordered(self, value, *, inplace: Literal[True]) -> None: 

845 ... 

846 

847 @overload 

848 def set_ordered(self, value, *, inplace: bool) -> Categorical | None: 

849 ... 

850 

851 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) 

852 def set_ordered( 

853 self, value, inplace: bool | NoDefault = no_default 

854 ) -> Categorical | None: 

855 """ 

856 Set the ordered attribute to the boolean value. 

857 

858 Parameters 

859 ---------- 

860 value : bool 

861 Set whether this categorical is ordered (True) or not (False). 

862 inplace : bool, default False 

863 Whether or not to set the ordered attribute in-place or return 

864 a copy of this categorical with ordered set to the value. 

865 

866 .. deprecated:: 1.5.0 

867 

868 """ 

869 if inplace is not no_default: 

870 warn( 

871 "The `inplace` parameter in pandas.Categorical." 

872 "set_ordered is deprecated and will be removed in " 

873 "a future version. setting ordered-ness on categories will always " 

874 "return a new Categorical object.", 

875 FutureWarning, 

876 stacklevel=find_stack_level(), 

877 ) 

878 else: 

879 inplace = False 

880 

881 inplace = validate_bool_kwarg(inplace, "inplace") 

882 new_dtype = CategoricalDtype(self.categories, ordered=value) 

883 cat = self if inplace else self.copy() 

884 NDArrayBacked.__init__(cat, cat._ndarray, new_dtype) 

885 if not inplace: 

886 return cat 

887 return None 

888 

889 @overload 

890 def as_ordered(self, *, inplace: NoDefault | Literal[False] = ...) -> Categorical: 

891 ... 

892 

893 @overload 

894 def as_ordered(self, *, inplace: Literal[True]) -> None: 

895 ... 

896 

897 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) 

898 def as_ordered(self, inplace: bool | NoDefault = no_default) -> Categorical | None: 

899 """ 

900 Set the Categorical to be ordered. 

901 

902 Parameters 

903 ---------- 

904 inplace : bool, default False 

905 Whether or not to set the ordered attribute in-place or return 

906 a copy of this categorical with ordered set to True. 

907 

908 .. deprecated:: 1.5.0 

909 

910 Returns 

911 ------- 

912 Categorical or None 

913 Ordered Categorical or None if ``inplace=True``. 

914 """ 

915 if inplace is not no_default: 

916 inplace = validate_bool_kwarg(inplace, "inplace") 

917 return self.set_ordered(True, inplace=inplace) 

918 

919 @overload 

920 def as_unordered(self, *, inplace: NoDefault | Literal[False] = ...) -> Categorical: 

921 ... 

922 

923 @overload 

924 def as_unordered(self, *, inplace: Literal[True]) -> None: 

925 ... 

926 

927 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) 

928 def as_unordered( 

929 self, inplace: bool | NoDefault = no_default 

930 ) -> Categorical | None: 

931 """ 

932 Set the Categorical to be unordered. 

933 

934 Parameters 

935 ---------- 

936 inplace : bool, default False 

937 Whether or not to set the ordered attribute in-place or return 

938 a copy of this categorical with ordered set to False. 

939 

940 .. deprecated:: 1.5.0 

941 

942 Returns 

943 ------- 

944 Categorical or None 

945 Unordered Categorical or None if ``inplace=True``. 

946 """ 

947 if inplace is not no_default: 

948 inplace = validate_bool_kwarg(inplace, "inplace") 

949 return self.set_ordered(False, inplace=inplace) 

950 

def set_categories(
    self, new_categories, ordered=None, rename=False, inplace=no_default
):
    """
    Set the categories to the specified new_categories.

    `new_categories` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to NaN). If ``rename=True``, the categories will simply be renamed
    (fewer or more items than in the old categories will result in values
    set to NaN or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand, this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder), which
    can result in surprising changes, for example when using special string
    dtypes, which do not consider an S1 string equal to a single-char
    python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.
    inplace : bool, default False
        Whether or not to set the categories in-place or return a copy
        of this categorical with the new categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    Categorical or None
        Categorical with the new categories, or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    """
    if inplace is no_default:
        inplace = False
    else:
        # Any explicitly passed ``inplace`` (True or False) is deprecated.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "set_categories is deprecated and will be removed in "
            "a future version. Setting categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    if ordered is None:
        # keep the current ordered flag when the caller did not specify one
        ordered = self.dtype.ordered
    new_dtype = CategoricalDtype(new_categories, ordered=ordered)

    cat = self if inplace else self.copy()
    if rename:
        if cat.dtype.categories is not None and len(new_dtype.categories) < len(
            cat.dtype.categories
        ):
            # remove all _codes which are larger and set to -1/NaN
            cat._codes[cat._codes >= len(new_dtype.categories)] = -1
        codes = cat._codes
    else:
        codes = recode_for_categories(
            cat.codes, cat.categories, new_dtype.categories
        )
    NDArrayBacked.__init__(cat, codes, new_dtype)

    if not inplace:
        return cat
    return None

1039 

@overload
def rename_categories(
    self, new_categories, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...

@overload
def rename_categories(self, new_categories, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["self", "new_categories"]
)
def rename_categories(
    self, new_categories, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from
          old categories to new. Categories not contained in the mapping
          are passed through and extra categories in the mapping are
          ignored.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

    inplace : bool, default False
        Whether or not to rename the categories inplace or return a copy of
        this categorical with renamed categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with renamed categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    ['A', 'A', 'b']
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> c.rename_categories(lambda x: x.upper())
    ['A', 'A', 'B']
    Categories (2, object): ['A', 'B']
    """
    if inplace is no_default:
        inplace = False
    else:
        # Any explicitly passed ``inplace`` (True or False) is deprecated.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "rename_categories is deprecated and will be removed in "
            "a future version. Renaming categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # Turn a mapping or a callable into a concrete list that lines up
    # one-to-one with the existing categories.
    if is_dict_like(new_categories):
        new_categories = [new_categories.get(item, item) for item in cat.categories]
    elif callable(new_categories):
        new_categories = [new_categories(item) for item in cat.categories]

    cat._set_categories(new_categories)
    if not inplace:
        return cat
    return None

1145 

def reorder_categories(self, new_categories, ordered=None, inplace=no_default):
    """
    Put the existing categories into the order given by new_categories.

    `new_categories` must contain every old category and nothing else.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    inplace : bool, default False
        Whether or not to reorder the categories inplace or return a copy of
        this categorical with reordered categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with reordered categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    if inplace is no_default:
        inplace = False
    else:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "reorder_categories is deprecated and will be removed in "
            "a future version. Reordering categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")

    # Order is irrelevant for this check: only membership has to match.
    old_items = set(self.dtype.categories)
    if old_items != set(new_categories):
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )

    # Silence warnings raised while delegating to set_categories (which is
    # handed the ``inplace`` value and would warn about it again).
    with catch_warnings():
        simplefilter("ignore")
        result = self.set_categories(
            new_categories, ordered=ordered, inplace=inplace
        )
    return result

1206 

@overload
def add_categories(
    self, new_categories, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...

@overload
def add_categories(self, new_categories, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(
    version=None, allowed_args=["self", "new_categories"]
)
def add_categories(
    self, new_categories, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Add new categories.

    `new_categories` will be included at the last/highest place in the
    categories and will be unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.
    inplace : bool, default False
        Whether or not to add the categories inplace or return a copy of
        this categorical with added categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with new categories added or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """
    if inplace is no_default:
        inplace = False
    else:
        # Any explicitly passed ``inplace`` (True or False) is deprecated.
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "add_categories is deprecated and will be removed in "
            "a future version. Adding categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    if not is_list_like(new_categories):
        # a single category was passed; treat it as a one-element list
        new_categories = [new_categories]
    already_included = set(new_categories) & set(self.dtype.categories)
    if len(already_included) != 0:
        raise ValueError(
            f"new categories must not include old categories: {already_included}"
        )
    # Existing categories keep their positions; the new ones are appended.
    new_categories = list(self.dtype.categories) + list(new_categories)
    new_dtype = CategoricalDtype(new_categories, self.ordered)

    cat = self if inplace else self.copy()
    codes = coerce_indexer_dtype(cat._ndarray, new_dtype.categories)
    NDArrayBacked.__init__(cat, codes, new_dtype)
    if not inplace:
        return cat
    return None

1298 

def remove_categories(self, removals, inplace=no_default):
    """
    Drop the given categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN.

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.
    inplace : bool, default False
        Whether or not to remove the categories inplace or return a copy of
        this categorical with removed categories.

        .. deprecated:: 1.3.0

    Returns
    -------
    cat : Categorical or None
        Categorical with removed categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    if inplace is no_default:
        inplace = False
    else:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "remove_categories is deprecated and will be removed in "
            "a future version. Removing unused categories will always "
            "return a new Categorical object.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    if not is_list_like(removals):
        removals = [removals]

    to_remove = set(removals)
    current = self.dtype.categories
    unknown = to_remove - set(current)
    kept = [c for c in current if c not in to_remove]

    # GH 10156
    # When the removals contain a missing value, missing values are
    # filtered out of both the "unknown" set and the kept categories.
    if any(isna(removals)):
        unknown = {x for x in unknown if notna(x)}
        kept = [x for x in kept if notna(x)]

    if unknown:
        raise ValueError(f"removals must all be in old categories: {unknown}")

    # Silence warnings raised while delegating to set_categories (which is
    # handed the ``inplace`` value and would warn about it again).
    with catch_warnings():
        simplefilter("ignore")
        result = self.set_categories(
            kept, ordered=self.ordered, rename=False, inplace=inplace
        )
    return result

1378 

@overload
def remove_unused_categories(
    self, *, inplace: Literal[False] | NoDefault = ...
) -> Categorical:
    ...

@overload
def remove_unused_categories(self, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def remove_unused_categories(
    self, inplace: bool | NoDefault = no_default
) -> Categorical | None:
    """
    Drop every category that no value refers to.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to drop unused categories inplace or return a copy of
        this categorical with unused categories dropped.

        .. deprecated:: 1.2.0

    Returns
    -------
    cat : Categorical or None
        Categorical with unused categories dropped or None if ``inplace=True``.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    if inplace is no_default:
        inplace = False
    else:
        warn(
            "The `inplace` parameter in pandas.Categorical."
            "remove_unused_categories is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # ``used`` holds the distinct codes present; ``inverse`` maps every
    # element to its position within ``used``.
    used, inverse = np.unique(cat._codes, return_inverse=True)

    if used.size != 0 and used[0] == -1:  # na sentinel
        # Drop the sentinel from the used codes and shift positions down so
        # that missing values map to -1 again.
        used = used[1:]
        inverse = inverse - 1

    kept_categories = cat.dtype.categories.take(used)
    new_dtype = CategoricalDtype._from_fastpath(
        kept_categories, ordered=self.ordered
    )
    new_codes = coerce_indexer_dtype(inverse, new_dtype.categories)
    NDArrayBacked.__init__(cat, new_codes, new_dtype)
    return None if inplace else cat

1461 

1462 # ------------------------------------------------------------------ 

1463 

def map(self, mapper):
    """
    Map categories using an input mapping or function.

    Maps the categories to new categories. If the mapping correspondence is
    one-to-one the result is a :class:`~pandas.Categorical` which has the
    same order property as the original, otherwise a :class:`~pandas.Index`
    is returned. NaN values are unaffected.

    If a `dict` or :class:`~pandas.Series` is used any unmapped category is
    mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
    will be returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.Categorical or pandas.Index
        Mapped categorical.

    See Also
    --------
    CategoricalIndex.map : Apply a mapping correspondence on a
        :class:`~pandas.CategoricalIndex`.
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> cat.map(lambda x: x.upper())
    ['A', 'B', 'C']
    Categories (3, object): ['A', 'B', 'C']
    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
    ['first', 'second', 'third']
    Categories (3, object): ['first', 'second', 'third']

    If the mapping is one-to-one the ordering of the categories is
    preserved:

    >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a' < 'b' < 'c']
    >>> cat.map({'a': 3, 'b': 2, 'c': 1})
    [3, 2, 1]
    Categories (3, int64): [3 < 2 < 1]

    If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    If a `dict` is used, all unmapped categories are mapped to `NaN` and
    the result is an :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    mapped = self.categories.map(mapper)
    try:
        # Succeeds when the mapped values are acceptable as categories.
        return self.from_codes(
            self._codes.copy(), categories=mapped, ordered=self.ordered
        )
    except ValueError:
        # NA values are represented in self._codes with -1
        # np.take causes NA values to take final element in new_categories,
        # so append NaN as that final element when any NA is present.
        has_missing = np.any(self._codes == -1)
        if has_missing:
            mapped = mapped.insert(len(mapped), np.nan)
        return np.take(mapped, self._codes)

1544 

# Rich comparison methods: each one is produced by ``_cat_compare_op``
# from the matching function in the ``operator`` module.
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)

1551 

1552 # ------------------------------------------------------------- 

1553 # Validators; ideally these can be de-duplicated 

1554 

def _validate_setitem_value(self, value):
    """
    Validate a value that is about to be set on this Categorical.

    Hashable values go through ``_validate_scalar``; everything else is
    handed to ``_validate_listlike``.
    """
    if is_hashable(value):
        return self._validate_scalar(value)
    return self._validate_listlike(value)

# searchsorted values are validated exactly like setitem values
_validate_searchsorted_value = _validate_setitem_value

1563 

def _validate_scalar(self, fill_value):
    """
    Translate a user-facing fill_value into the representation used by
    the underlying ndarray, raising TypeError if this is not possible.

    Parameters
    ----------
    fill_value : object

    Returns
    -------
    fill_value : int

    Raises
    ------
    TypeError
    """
    # A valid NA for the categories' dtype is stored as -1.
    if is_valid_na_for_dtype(fill_value, self.categories.dtype):
        return -1

    # An existing category is converted by ``_unbox_scalar``.
    if fill_value in self.categories:
        return self._unbox_scalar(fill_value)

    raise TypeError(
        "Cannot setitem on a Categorical with a new "
        f"category ({fill_value}), set the categories first"
    ) from None

1592 

1593 # ------------------------------------------------------------- 

1594 

@ravel_compat
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    The numpy array interface.

    Returns
    -------
    numpy.array
        A numpy array of either the specified dtype or,
        if dtype==None (default), the same dtype as
        categorical.categories.dtype.
    """
    values = take_nd(self.categories._values, self._codes)

    # Cast only when a dtype was given and it differs from the
    # categories' own dtype.
    needs_cast = dtype and not is_dtype_equal(dtype, self.categories.dtype)
    if needs_cast:
        return np.asarray(values, dtype)

    # When we're a Categorical[ExtensionArray], like Interval,
    # we need to ensure __array__ gets all the way to an
    # ndarray.
    return np.asarray(values)

1614 

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    Handle a numpy ufunc applied to this Categorical.

    Three dispatch paths are tried in order: the dunder-op dispatcher, the
    ``out=`` dispatcher, and (for ``method == "reduce"``) the reduction
    dispatcher. If none of them produces a result, TypeError is raised.
    """
    # for binary ops, use our custom dunder methods
    dunder_result = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if dunder_result is not NotImplemented:
        return dunder_result

    if "out" in kwargs:
        # e.g. test_numpy_ufuncs_out
        return arraylike.dispatch_ufunc_with_out(
            self, ufunc, method, *inputs, **kwargs
        )

    if method == "reduce":
        # e.g. TestCategoricalAnalytics::test_min_max_ordered
        reduced = arraylike.dispatch_reduction_ufunc(
            self, ufunc, method, *inputs, **kwargs
        )
        if reduced is not NotImplemented:
            return reduced

    # for all other cases, raise for now (similarly as what happens in
    # Series.__array_prepare__)
    raise TypeError(
        f"Object with dtype {self.dtype} cannot perform "
        f"the numpy op {ufunc.__name__}"
    )

1643 

def __setstate__(self, state) -> None:
    """
    Restore this object from pickled state.

    Non-dict state is passed straight to the parent implementation. Dict
    state is first upgraded from older layouts: a missing ``_dtype`` is
    rebuilt from ``_categories`` and ``_ordered``, and a ``_codes`` entry
    is moved to ``_ndarray`` when the latter is absent.
    """
    if not isinstance(state, dict):
        return super().__setstate__(state)

    has_dtype = "_dtype" in state
    if not has_dtype:
        categories = state["_categories"]
        state["_dtype"] = CategoricalDtype(categories, state["_ordered"])

    if "_ndarray" not in state and "_codes" in state:
        # backward compat, changed what is property vs attribute
        state["_ndarray"] = state.pop("_codes")

    super().__setstate__(state)

1657 

@property
def nbytes(self) -> int:
    """Bytes used by the codes plus bytes used by the categories' values."""
    codes_size = self._codes.nbytes
    categories_size = self.dtype.categories.values.nbytes
    return codes_size + categories_size

1661 

def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values.

    The result is the byte size of the codes plus whatever the categories
    report from their own ``memory_usage(deep=deep)``.

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    codes_size = self._codes.nbytes
    categories_size = self.dtype.categories.memory_usage(deep=deep)
    return codes_size + categories_size

1686 

def isna(self) -> np.ndarray:
    """
    Detect missing values

    An element is missing when its code is -1.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    codes = self._codes
    return codes == -1

isnull = isna

1707 

def notna(self) -> np.ndarray:
    """
    Inverse of isna

    Returns the element-wise negation of :meth:`isna`.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing

notnull = notna

1729 

def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    categories = self.categories
    n_categories = len(categories)
    present = codes >= 0  # False where the code is the -1 NA sentinel
    index_codes = np.arange(n_categories)
    no_missing = present.all()

    if dropna or no_missing:
        observed = codes if no_missing else codes[present]
        counts = np.bincount(observed, minlength=n_categories or 0)
    else:
        # Count missing values in an extra trailing bin and label that
        # bin with the -1 sentinel.
        counts = np.bincount(np.where(present, codes, n_categories))
        index_codes = np.append(index_codes, -1)

    index_codes = coerce_indexer_dtype(index_codes, self.dtype.categories)
    index_values = self._from_backing_data(index_codes)

    return Series(counts, index=CategoricalIndex(index_values), dtype="int64")

1769 

# error: Argument 2 of "_empty" is incompatible with supertype
# "NDArrayBackedExtensionArray"; supertype defines the argument type as
# "ExtensionDtype"
@classmethod
def _empty(  # type: ignore[override]
    cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
) -> Categorical:
    """
    Analogous to np.empty(shape, dtype=dtype)

    Parameters
    ----------
    shape : tuple[int]
    dtype : CategoricalDtype
    """
    # An empty instance supplies the codes dtype and the constructor for
    # the result.
    template = cls._from_sequence([], dtype=dtype)

    # We have to use np.zeros instead of np.empty otherwise the resulting
    # ndarray may contain codes not supported by this dtype, in which
    # case repr(result) could segfault.
    zeroed_codes = np.zeros(shape, dtype=template._ndarray.dtype)

    return template._from_backing_data(zeroed_codes)

1793 

def _internal_get_values(self):
    """
    Return the values.

    For internal compatibility with pandas formatting.

    Returns
    -------
    np.ndarray or Index
        A numpy array of the same dtype as categorical.categories.dtype or
        Index if datetime / periods.
    """
    categories = self.categories

    # if we are a datetime and period index, return Index to keep metadata
    if needs_i8_conversion(categories.dtype):
        return categories.take(self._codes, fill_value=NaT)

    # Integer categories with missing values are taken as object so that
    # np.nan can be used as the fill value.
    if is_integer_dtype(categories) and -1 in self._codes:
        as_object = categories.astype("object")
        return as_object.take(self._codes, fill_value=np.nan)

    return np.array(self)

1812 

def check_for_ordered(self, op) -> None:
    """Raise TypeError naming ``op`` unless this Categorical is ordered."""
    if self.ordered:
        return

    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )

1821 

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def argsort(self, ascending=True, kind="quicksort", **kwargs):
    """
    Return the indices that would sort the Categorical.

    .. versionchanged:: 0.25.0

       Changed to sort missing values at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # The sorting itself is implemented by the parent class.
    sorter = super().argsort(ascending=ascending, kind=kind, **kwargs)
    return sorter

1875 

@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Categorical:
    ...

@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def sort_values(
    self, inplace: bool = False, ascending: bool = True, na_position: str = "last"
) -> Categorical | None:
    """
    Sort the Categorical by category value returning a new
    Categorical by default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'first' nor 'last'.

    See Also
    --------
    Categorical.sort
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    Inplace sorting can be done as well:

    >>> c.sort_values(inplace=True)
    >>> c
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2, 2, NaN, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)

    if inplace:
        # overwrite the existing codes buffer rather than rebinding it
        self._codes[:] = self._codes[order]
        return None

    sorted_codes = self._codes[order]
    return self._from_backing_data(sorted_codes)

1980 

1981 def _rank( 

1982 self, 

1983 *, 

1984 axis: int = 0, 

1985 method: str = "average", 

1986 na_option: str = "keep", 

1987 ascending: bool = True, 

1988 pct: bool = False, 

1989 ): 

1990 """ 

1991 See Series.rank.__doc__. 

1992 """ 

1993 if axis != 0: 

1994 raise NotImplementedError 

1995 vff = self._values_for_rank() 

1996 return algorithms.rank( 

1997 vff, 

1998 axis=axis, 

1999 method=method, 

2000 na_option=na_option, 

2001 ascending=ascending, 

2002 pct=pct, 

2003 ) 

2004 

2005 def _values_for_rank(self): 

2006 """ 

2007 For correctly ranking ordered categorical data. See GH#15420 

2008 

2009 Ordered categorical data should be ranked on the basis of 

2010 codes with -1 translated to NaN. 

2011 

2012 Returns 

2013 ------- 

2014 numpy.array 

2015 

2016 """ 

2017 from pandas import Series 

2018 

2019 if self.ordered: 

2020 values = self.codes 

2021 mask = values == -1 

2022 if mask.any(): 

2023 values = values.astype("float64") 

2024 values[mask] = np.nan 

2025 elif self.categories.is_numeric(): 

2026 values = np.array(self) 

2027 else: 

2028 # reorder the categories (so rank can use the float codes) 

2029 # instead of passing an object array to rank 

2030 values = np.array( 

2031 self.rename_categories(Series(self.categories).rank().values) 

2032 ) 

2033 return values 

2034 

def to_dense(self) -> np.ndarray:
    """
    Return the values as a plain numpy array (deprecated).

    Emits a FutureWarning and returns ``np.asarray(self)``, which is the
    recommended replacement.

    Returns
    -------
    dense : array
    """
    message = (
        "Categorical.to_dense is deprecated and will be removed in "
        "a future version. Use np.asarray(cat) instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    return np.asarray(self)

2052 

2053 # ------------------------------------------------------------------ 

2054 # NDArrayBackedExtensionArray compat 

2055 

2056 @property 

2057 def _codes(self) -> np.ndarray: 

2058 return self._ndarray 

2059 

2060 @_codes.setter 

2061 def _codes(self, value: np.ndarray): 

2062 warn( 

2063 "Setting the codes on a Categorical is deprecated and will raise in " 

2064 "a future version. Create a new Categorical object instead", 

2065 FutureWarning, 

2066 stacklevel=find_stack_level(), 

2067 ) # GH#40606 

2068 NDArrayBacked.__init__(self, value, self.dtype) 

2069 

2070 def _box_func(self, i: int): 

2071 if i == -1: 

2072 return np.NaN 

2073 return self.categories[i] 

2074 

2075 def _unbox_scalar(self, key) -> int: 

2076 # searchsorted is very performance sensitive. By converting codes 

2077 # to same dtype as self.codes, we get much faster performance. 

2078 code = self.categories.get_loc(key) 

2079 code = self._ndarray.dtype.type(code) 

2080 return code 

2081 

2082 # ------------------------------------------------------------------ 

2083 

def take_nd(
    self, indexer, allow_fill: bool = False, fill_value=None
) -> Categorical:
    """
    Deprecated alias of ``take``.

    Emits a FutureWarning and forwards `indexer`, `allow_fill` and
    `fill_value` to ``self.take``.
    """
    # GH#27745 deprecate alias that other EAs dont have
    message = "Categorical.take_nd is deprecated, use Categorical.take instead"
    warn(message, FutureWarning, stacklevel=find_stack_level())
    forwarded = {"allow_fill": allow_fill, "fill_value": fill_value}
    return self.take(indexer, **forwarded)

2094 

2095 def __iter__(self): 

2096 """ 

2097 Returns an Iterator over the values of this Categorical. 

2098 """ 

2099 if self.ndim == 1: 

2100 return iter(self._internal_get_values().tolist()) 

2101 else: 

2102 return (self[n] for n in range(len(self))) 

2103 

2104 def __contains__(self, key) -> bool: 

2105 """ 

2106 Returns True if `key` is in this Categorical. 

2107 """ 

2108 # if key is a NaN, check if any NaN is in self. 

2109 if is_valid_na_for_dtype(key, self.categories.dtype): 

2110 return bool(self.isna().any()) 

2111 

2112 return contains(self, key, container=self._codes) 

2113 

2114 # ------------------------------------------------------------------ 

2115 # Rendering Methods 

2116 

2117 def _formatter(self, boxed: bool = False): 

2118 # Defer to CategoricalFormatter's formatter. 

2119 return None 

2120 

2121 def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: 

2122 """ 

2123 a short repr displaying only max_vals and an optional (but default 

2124 footer) 

2125 """ 

2126 num = max_vals // 2 

2127 head = self[:num]._get_repr(length=False, footer=False) 

2128 tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) 

2129 

2130 result = f"{head[:-1]}, ..., {tail[1:]}" 

2131 if footer: 

2132 result = f"{result}\n{self._repr_footer()}" 

2133 

2134 return str(result) 

2135 

2136 def _repr_categories(self) -> list[str]: 

2137 """ 

2138 return the base repr for the categories 

2139 """ 

2140 max_categories = ( 

2141 10 

2142 if get_option("display.max_categories") == 0 

2143 else get_option("display.max_categories") 

2144 ) 

2145 from pandas.io.formats import format as fmt 

2146 

2147 format_array = partial( 

2148 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC 

2149 ) 

2150 if len(self.categories) > max_categories: 

2151 num = max_categories // 2 

2152 head = format_array(self.categories[:num]) 

2153 tail = format_array(self.categories[-num:]) 

2154 category_strs = head + ["..."] + tail 

2155 else: 

2156 category_strs = format_array(self.categories) 

2157 

2158 # Strip all leading spaces, which format_array adds for columns... 

2159 category_strs = [x.strip() for x in category_strs] 

2160 return category_strs 

2161 

2162 def _repr_categories_info(self) -> str: 

2163 """ 

2164 Returns a string representation of the footer. 

2165 """ 

2166 category_strs = self._repr_categories() 

2167 dtype = str(self.categories.dtype) 

2168 levheader = f"Categories ({len(self.categories)}, {dtype}): " 

2169 width, height = get_terminal_size() 

2170 max_width = get_option("display.width") or width 

2171 if console.in_ipython_frontend(): 

2172 # 0 = no breaks 

2173 max_width = 0 

2174 levstring = "" 

2175 start = True 

2176 cur_col_len = len(levheader) # header 

2177 sep_len, sep = (3, " < ") if self.ordered else (2, ", ") 

2178 linesep = sep.rstrip() + "\n" # remove whitespace 

2179 for val in category_strs: 

2180 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: 

2181 levstring += linesep + (" " * (len(levheader) + 1)) 

2182 cur_col_len = len(levheader) + 1 # header + a whitespace 

2183 elif not start: 

2184 levstring += sep 

2185 cur_col_len += len(val) 

2186 levstring += val 

2187 start = False 

2188 # replace to simple save space by 

2189 return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]" 

2190 

2191 def _repr_footer(self) -> str: 

2192 info = self._repr_categories_info() 

2193 return f"Length: {len(self)}\n{info}" 

2194 

2195 def _get_repr(self, length: bool = True, na_rep="NaN", footer: bool = True) -> str: 

2196 from pandas.io.formats import format as fmt 

2197 

2198 formatter = fmt.CategoricalFormatter( 

2199 self, length=length, na_rep=na_rep, footer=footer 

2200 ) 

2201 result = formatter.to_string() 

2202 return str(result) 

2203 

2204 def __repr__(self) -> str: 

2205 """ 

2206 String representation. 

2207 """ 

2208 _maxlen = 10 

2209 if len(self._codes) > _maxlen: 

2210 result = self._tidy_repr(_maxlen) 

2211 elif len(self._codes) > 0: 

2212 result = self._get_repr(length=len(self) > _maxlen) 

2213 else: 

2214 msg = self._get_repr(length=False, footer=True).replace("\n", ", ") 

2215 result = f"[], {msg}" 

2216 

2217 return result 

2218 

2219 # ------------------------------------------------------------------ 

2220 

2221 def _validate_listlike(self, value): 

2222 # NB: here we assume scalar-like tuples have already been excluded 

2223 value = extract_array(value, extract_numpy=True) 

2224 

2225 # require identical categories set 

2226 if isinstance(value, Categorical): 

2227 if not is_dtype_equal(self.dtype, value.dtype): 

2228 raise TypeError( 

2229 "Cannot set a Categorical with another, " 

2230 "without identical categories" 

2231 ) 

2232 # is_dtype_equal implies categories_match_up_to_permutation 

2233 value = self._encode_with_my_categories(value) 

2234 return value._codes 

2235 

2236 from pandas import Index 

2237 

2238 # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914 

2239 to_add = Index._with_infer(value, tupleize_cols=False).difference( 

2240 self.categories 

2241 ) 

2242 

2243 # no assignments of values not in categories, but it's always ok to set 

2244 # something to np.nan 

2245 if len(to_add) and not isna(to_add).all(): 

2246 raise TypeError( 

2247 "Cannot setitem on a Categorical with a new " 

2248 "category, set the categories first" 

2249 ) 

2250 

2251 codes = self.categories.get_indexer(value) 

2252 return codes.astype(self._ndarray.dtype, copy=False) 

2253 

2254 def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: 

2255 """ 

2256 Compute the inverse of a categorical, returning 

2257 a dict of categories -> indexers. 

2258 

2259 *This is an internal function* 

2260 

2261 Returns 

2262 ------- 

2263 Dict[Hashable, np.ndarray[np.intp]] 

2264 dict of categories -> indexers 

2265 

2266 Examples 

2267 -------- 

2268 >>> c = pd.Categorical(list('aabca')) 

2269 >>> c 

2270 ['a', 'a', 'b', 'c', 'a'] 

2271 Categories (3, object): ['a', 'b', 'c'] 

2272 >>> c.categories 

2273 Index(['a', 'b', 'c'], dtype='object') 

2274 >>> c.codes 

2275 array([0, 0, 1, 2, 0], dtype=int8) 

2276 >>> c._reverse_indexer() 

2277 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])} 

2278 

2279 """ 

2280 categories = self.categories 

2281 r, counts = libalgos.groupsort_indexer( 

2282 ensure_platform_int(self.codes), categories.size 

2283 ) 

2284 counts = ensure_int64(counts).cumsum() 

2285 _result = (r[start:end] for start, end in zip(counts, counts[1:])) 

2286 return dict(zip(categories, _result)) 

2287 

2288 # ------------------------------------------------------------------ 

2289 # Reductions 

2290 

@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def min(self, *, skipna=True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Parameters
    ----------
    skipna : bool, default True
        Ignore missing values. With ``skipna=False`` any missing value
        makes the result NaN.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    if len(self._codes) == 0:
        return self.dtype.na_value

    valid = self._codes != -1
    if valid.all():
        pointer = self._codes.min()
    elif skipna and valid.any():
        pointer = self._codes[valid].min()
    else:
        # all values missing, or a missing value with skipna=False
        return np.nan
    return self._wrap_reduction_result(None, pointer)

2327 

@deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
def max(self, *, skipna=True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    .. versionchanged:: 1.0.0

       Returns an NA value on empty arrays

    Parameters
    ----------
    skipna : bool, default True
        Ignore missing values. With ``skipna=False`` any missing value
        makes the result NaN.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    if len(self._codes) == 0:
        return self.dtype.na_value

    valid = self._codes != -1
    if valid.all():
        pointer = self._codes.max()
    elif skipna and valid.any():
        pointer = self._codes[valid].max()
    else:
        # all values missing, or a missing value with skipna=False
        return np.nan
    return self._wrap_reduction_result(None, pointer)

2364 

def mode(self, dropna: bool = True) -> Categorical:
    """
    Returns the mode(s) of the Categorical (deprecated).

    Emits a FutureWarning and delegates to ``self._mode``.
    Always returns `Categorical` even if only one value.

    Parameters
    ----------
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

    Returns
    -------
    modes : `Categorical` (sorted)
    """
    message = (
        "Categorical.mode is deprecated and will be removed in a future version. "
        "Use Series.mode instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    return self._mode(dropna=dropna)

2387 

2388 def _mode(self, dropna: bool = True) -> Categorical: 

2389 codes = self._codes 

2390 mask = None 

2391 if dropna: 

2392 mask = self.isna() 

2393 

2394 res_codes = algorithms.mode(codes, mask=mask) 

2395 res_codes = cast(np.ndarray, res_codes) 

2396 assert res_codes.dtype == codes.dtype 

2397 res = self._from_backing_data(res_codes) 

2398 return res 

2399 

2400 # ------------------------------------------------------------------ 

2401 # ExtensionArray Interface 

2402 

def unique(self):
    """
    Return the ``Categorical`` which ``categories`` and ``codes`` are
    unique.

    The distinct codes are computed with ``unique1d`` and wrapped with
    ``_from_backing_data``.

    .. versionchanged:: 1.3.0

        Previously, unused categories were dropped from the new categories.

    Returns
    -------
    Categorical

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.Categorical(list("baabc")).unique()
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
    ['b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']
    """
    distinct_codes = unique1d(self.codes)
    return self._from_backing_data(distinct_codes)

2433 

2434 def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: 

2435 # make sure we have correct itemsize for resulting codes 

2436 assert res_values.dtype == self._ndarray.dtype 

2437 return res_values 

2438 

def equals(self, other: object) -> bool:
    """
    Returns True if categorical arrays are equal.

    `other` must be a ``Categorical`` whose categories match ours up to
    permutation; it is then re-encoded with our categories and the codes
    are compared element-wise.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
    """
    comparable = isinstance(
        other, Categorical
    ) and self._categories_match_up_to_permutation(other)
    if not comparable:
        return False

    recoded = self._encode_with_my_categories(other)
    return np.array_equal(self._codes, recoded._codes)

2457 

2458 @classmethod 

2459 def _concat_same_type( 

2460 cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0 

2461 ) -> CategoricalT: 

2462 from pandas.core.dtypes.concat import union_categoricals 

2463 

2464 first = to_concat[0] 

2465 if axis >= first.ndim: 

2466 raise ValueError( 

2467 f"axis {axis} is out of bounds for array of dimension {first.ndim}" 

2468 ) 

2469 

2470 if axis == 1: 

2471 # Flatten, concatenate then reshape 

2472 if not all(x.ndim == 2 for x in to_concat): 

2473 raise ValueError 

2474 

2475 # pass correctly-shaped to union_categoricals 

2476 tc_flat = [] 

2477 for obj in to_concat: 

2478 tc_flat.extend([obj[:, i] for i in range(obj.shape[1])]) 

2479 

2480 res_flat = cls._concat_same_type(tc_flat, axis=0) 

2481 

2482 result = res_flat.reshape(len(first), -1, order="F") 

2483 return result 

2484 

2485 result = union_categoricals(to_concat) 

2486 return result 

2487 

2488 # ------------------------------------------------------------------ 

2489 

2490 def _encode_with_my_categories(self, other: Categorical) -> Categorical: 

2491 """ 

2492 Re-encode another categorical using this Categorical's categories. 

2493 

2494 Notes 

2495 ----- 

2496 This assumes we have already checked 

2497 self._categories_match_up_to_permutation(other). 

2498 """ 

2499 # Indexing on codes is more efficient if categories are the same, 

2500 # so we can apply some optimizations based on the degree of 

2501 # dtype-matching. 

2502 codes = recode_for_categories( 

2503 other.codes, other.categories, self.categories, copy=False 

2504 ) 

2505 return self._from_backing_data(codes) 

2506 

2507 def _categories_match_up_to_permutation(self, other: Categorical) -> bool: 

2508 """ 

2509 Returns True if categoricals are the same dtype 

2510 same categories, and same ordered 

2511 

2512 Parameters 

2513 ---------- 

2514 other : Categorical 

2515 

2516 Returns 

2517 ------- 

2518 bool 

2519 """ 

2520 return hash(self.dtype) == hash(other.dtype) 

2521 

def is_dtype_equal(self, other) -> bool:
    """
    Deprecated dtype comparison against `other`.

    Emits a FutureWarning, then returns the result of
    ``self._categories_match_up_to_permutation(other)``; an
    AttributeError or TypeError raised by that check yields False.
    """
    message = (
        "Categorical.is_dtype_equal is deprecated and will be removed "
        "in a future version"
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    try:
        matches = self._categories_match_up_to_permutation(other)
    except (AttributeError, TypeError):
        return False
    else:
        return matches

2533 

def describe(self) -> DataFrame:
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category. The columns
        are ``counts`` and ``freqs`` and the index is named
        ``categories``; missing values are counted as well.
    """
    from pandas import Index
    from pandas.core.reshape.concat import concat

    counts = self.value_counts(dropna=False)
    freqs = counts / counts.sum()

    summary = concat([counts, freqs], axis=1)
    summary.columns = Index(["counts", "freqs"])
    summary.index.name = "categories"
    return summary

2554 

def isin(self, values) -> npt.NDArray[np.bool_]:
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    np.ndarray[bool]

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------
    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    if not is_list_like(values):
        values_type = type(values).__name__
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{values_type}]"
        )

    values = sanitize_array(values, None, None)
    is_null = np.asarray(isna(values))
    candidate_codes = self.categories.get_indexer(values)
    # Keep codes of known categories, plus the -1 produced by missing
    # values; the -1 of unknown, non-missing values is dropped.
    wanted = is_null | (candidate_codes >= 0)
    return algorithms.isin(self.codes, candidate_codes[wanted])

2607 

@overload
def replace(
    self, to_replace, value, *, inplace: Literal[False] = ...
) -> Categorical:
    ...

@overload
def replace(self, to_replace, value, *, inplace: Literal[True]) -> None:
    ...

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"])
def replace(self, to_replace, value, inplace: bool = False) -> Categorical | None:
    """
    Replaces all instances of one value with another (deprecated).

    Emits a FutureWarning and delegates to ``self._replace``.

    Parameters
    ----------
    to_replace: object
        The value to be replaced

    value: object
        The value to replace it with

    inplace: bool
        Whether the operation is done in-place

    Returns
    -------
    None if inplace is True, otherwise the new Categorical after replacement


    Examples
    --------
    >>> s = pd.Categorical([1, 2, 1, 3])
    >>> s.replace(1, 3)
    [3, 2, 3, 3]
    Categories (2, int64): [2, 3]
    """
    # GH#44929 deprecation
    message = (
        "Categorical.replace is deprecated and will be removed in a future "
        "version. Use Series.replace directly instead."
    )
    warn(message, FutureWarning, stacklevel=find_stack_level())
    return self._replace(to_replace=to_replace, value=value, inplace=inplace)

2654 

2655 def _replace(self, *, to_replace, value, inplace: bool = False): 

2656 inplace = validate_bool_kwarg(inplace, "inplace") 

2657 cat = self if inplace else self.copy() 

2658 

2659 # build a dict of (to replace -> value) pairs 

2660 if is_list_like(to_replace): 

2661 # if to_replace is list-like and value is scalar 

2662 replace_dict = {replace_value: value for replace_value in to_replace} 

2663 else: 

2664 # if both to_replace and value are scalar 

2665 replace_dict = {to_replace: value} 

2666 

2667 # other cases, like if both to_replace and value are list-like or if 

2668 # to_replace is a dict, are handled separately in NDFrame 

2669 for replace_value, new_value in replace_dict.items(): 

2670 if new_value == replace_value: 

2671 continue 

2672 if replace_value in cat.categories: 

2673 if isna(new_value): 

2674 with catch_warnings(): 

2675 simplefilter("ignore") 

2676 cat.remove_categories(replace_value, inplace=True) 

2677 continue 

2678 

2679 categories = cat.categories.tolist() 

2680 index = categories.index(replace_value) 

2681 

2682 if new_value in cat.categories: 

2683 value_index = categories.index(new_value) 

2684 cat._codes[cat._codes == index] = value_index 

2685 with catch_warnings(): 

2686 simplefilter("ignore") 

2687 cat.remove_categories(replace_value, inplace=True) 

2688 else: 

2689 categories[index] = new_value 

2690 with catch_warnings(): 

2691 simplefilter("ignore") 

2692 cat.rename_categories(categories, inplace=True) 

2693 if not inplace: 

2694 return cat 

2695 

2696 # ------------------------------------------------------------------------ 

2697 # String methods interface 

def _str_map(
    self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
):
    """
    Apply the string callable `f` once per category and expand the
    result to one entry per element by taking with the codes.

    `na_value` is used as the fill value for missing codes. `convert` is
    accepted but not used by this implementation.
    """
    # Optimization to apply the callable `f` to the categories once
    # and rebuild the result by `take`ing from the result with the codes.
    # Returns the same type as the object-dtype implementation though.
    from pandas.core.arrays import PandasArray

    category_values = PandasArray(self.categories.to_numpy())
    mapped = category_values._str_map(f, na_value, dtype)
    return take_nd(mapped, self.codes, fill_value=na_value)

2710 

def _str_get_dummies(self, sep="|"):
    """
    Compute string dummies by casting to ``str`` and delegating to
    ``PandasArray._str_get_dummies``.
    """
    # sep may not be in categories. Just bail on this.
    from pandas.core.arrays import PandasArray

    as_strings = PandasArray(self.astype(str))
    return as_strings._str_get_dummies(sep)

2716 

2717 

2718# The Series.cat accessor 

2719 

2720 

@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Be aware that assigning to `categories` is an inplace operation, while
    all methods return new categorical data per default (but can be called
    with `inplace=True`).

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        self._validate(data)
        # Keep the underlying values plus what is needed to re-wrap
        # method results as a Series.
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless `data` has a categorical dtype."""
        if is_categorical_dtype(data.dtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """Read attribute `name` from the wrapped values."""
        parent = self._parent
        return getattr(parent, name)

    def _delegate_property_set(self, name, new_values):
        """Assign `new_values` to attribute `name` of the wrapped values."""
        parent = self._parent
        return setattr(parent, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.
        """
        from pandas import Series

        parent_codes = self._parent.codes
        return Series(parent_codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call method `name` on the wrapped values.

        A non-None result is wrapped in a Series carrying the original
        index and name; a None result is passed through as None.
        """
        from pandas import Series

        result = getattr(self._parent, name)(*args, **kwargs)
        if result is None:
            return None
        return Series(result, index=self._index, name=self._name)

2882 

2883 

2884# utility routines 

2885 

2886 

2887def _get_codes_for_values(values, categories: Index) -> np.ndarray: 

2888 """ 

2889 utility routine to turn values into codes given the specified categories 

2890 

2891 If `values` is known to be a Categorical, use recode_for_categories instead. 

2892 """ 

2893 if values.ndim > 1: 

2894 flat = values.ravel() 

2895 codes = _get_codes_for_values(flat, categories) 

2896 return codes.reshape(values.shape) 

2897 

2898 codes = categories.get_indexer_for(values) 

2899 return coerce_indexer_dtype(codes, categories) 

2900 

2901 

def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories.

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        When `old_categories` is empty or equal to `new_categories` the
        input codes are returned (copied if `copy`). Otherwise the codes
        are remapped, with -1 for old categories that are absent from
        `new_categories`; the dtype of the remapped codes is chosen by
        ``coerce_indexer_dtype``.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # Nothing to recode when everything is null anyway (no old
    # categories) or when the categories are the same.
    unchanged = len(old_categories) == 0 or new_categories.equals(old_categories)
    if unchanged:
        return codes.copy() if copy else codes

    old_to_new = new_categories.get_indexer(old_categories)
    indexer = coerce_indexer_dtype(old_to_new, new_categories)
    return take_nd(indexer, codes, fill_value=-1)

2943 

2944 

def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
    """
    Factorize an input `values` into `categories` and `codes`. Preserves
    categorical dtype in `categories`.

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index
        If `values` has a categorical dtype, then `categories` is
        a CategoricalIndex keeping the categories and order of `values`.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    from pandas import CategoricalIndex

    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    categories: Index
    if not is_categorical_dtype(values):
        # The value of ordered is irrelevant since we don't use cat as such,
        # but only the resulting categories, the order of which is independent
        # from ordered. Set ordered to False as default. See GH #15457
        cat = Categorical(values, ordered=False)
        return cat.codes, cat.categories

    values = extract_array(values)
    # The Categorical we want to build has the same categories
    # as values but its codes are by def [0, ..., len(n_categories) - 1]
    positions = np.arange(len(values.categories), dtype=values.codes.dtype)
    categories = CategoricalIndex(
        Categorical.from_codes(positions, dtype=values.dtype)
    )
    return values.codes, categories

2984 

2985 

def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    A higher-level wrapper over `factorize_from_iterable`.

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return two empty lists.
        return [], []

    factorized = [factorize_from_iterable(it) for it in iterables]
    # transpose the (codes, categories) pairs into two parallel sequences
    codes, categories = zip(*factorized)
    return list(codes), list(categories)