Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/indexes/category.py: 25%

1from __future__ import annotations

3from typing import (

4 Any,

5 Hashable,

7import warnings

9import numpy as np

11from pandas._libs import index as libindex

12from pandas._typing import (

13 Dtype,

14 DtypeObj,

15 npt,

16)

17from pandas.util._decorators import (

18 cache_readonly,

19 doc,

20)

21from pandas.util._exceptions import find_stack_level

23from pandas.core.dtypes.common import (

24 is_categorical_dtype,

25 is_scalar,

26 pandas_dtype,

27)

28from pandas.core.dtypes.missing import (

29 is_valid_na_for_dtype,

30 isna,

31 notna,

32)

34from pandas.core.arrays.categorical import (

35 Categorical,

36 contains,

37)

38from pandas.core.construction import extract_array

39import pandas.core.indexes.base as ibase

40from pandas.core.indexes.base import (

41 Index,

42 maybe_extract_name,

43)

44from pandas.core.indexes.extension import (

45 NDArrayBackedExtensionIndex,

46 inherit_names,

47)

49from pandas.io.formats.printing import pprint_thing

51_index_doc_kwargs: dict[str, str] = dict(ibase._index_doc_kwargs)

52_index_doc_kwargs.update({"target_klass": "CategoricalIndex"})

55@inherit_names(

56 [

57 "argsort",

58 "tolist",

59 "codes",

60 "categories",

61 "ordered",

62 "_reverse_indexer",

63 "searchsorted",

64 "is_dtype_equal",

65 "min",

66 "max",

67 ],

68 Categorical,

69)

70@inherit_names(

71 [

72 "rename_categories",

73 "reorder_categories",

74 "add_categories",

75 "remove_categories",

76 "remove_unused_categories",

77 "set_categories",

78 "as_ordered",

79 "as_unordered",

80 ],

81 Categorical,

82 wrap=True,

83)

84class CategoricalIndex(NDArrayBackedExtensionIndex):

85 """

86 Index based on an underlying :class:`Categorical`.

88 CategoricalIndex, like Categorical, can only take on a limited,

89 and usually fixed, number of possible values (`categories`). Also,

90 like Categorical, it might have an order, but numerical operations

91 (additions, divisions, ...) are not possible.

93 Parameters

94 ----------

95 data : array-like (1-dimensional)

96 The values of the categorical. If `categories` are given, values not in

97 `categories` will be replaced with NaN.

98 categories : index-like, optional

99 The categories for the categorical. Items need to be unique.

100 If the categories are not given here (and also not in `dtype`), they

101 will be inferred from the `data`.

102 ordered : bool, optional

103 Whether or not this categorical is treated as an ordered

104 categorical. If not given here or in `dtype`, the resulting

105 categorical will be unordered.

106 dtype : CategoricalDtype or "category", optional

107 If :class:`CategoricalDtype`, cannot be used together with

108 `categories` or `ordered`.

109 copy : bool, default False

110 Make a copy of input ndarray.

111 name : object, optional

112 Name to be stored in the index.

113

114 Attributes

115 ----------

116 codes

117 categories

118 ordered

119

120 Methods

121 -------

122 rename_categories

123 reorder_categories

124 add_categories

125 remove_categories

126 remove_unused_categories

127 set_categories

128 as_ordered

129 as_unordered

130 map

131

132 Raises

133 ------

134 ValueError

135 If the categories do not validate.

136 TypeError

137 If an explicit ``ordered=True`` is given but no `categories` and the

138 `values` are not sortable.

139

140 See Also

141 --------

142 Index : The base pandas Index type.

143 Categorical : A categorical array.

144 CategoricalDtype : Type for categorical data.

145

146 Notes

147 -----

148 See the `user guide

149 <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`__

150 for more.

151

152 Examples

153 --------

154 >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])

155 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],

156 categories=['a', 'b', 'c'], ordered=False, dtype='category')

157

158 ``CategoricalIndex`` can also be instantiated from a ``Categorical``:

159

160 >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"])

161 >>> pd.CategoricalIndex(c)

162 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],

163 categories=['a', 'b', 'c'], ordered=False, dtype='category')

164

165 Ordered ``CategoricalIndex`` can have a min and max value.

166

167 >>> ci = pd.CategoricalIndex(

168 ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"]

169 ... )

170 >>> ci

171 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],

172 categories=['c', 'b', 'a'], ordered=True, dtype='category')

173 >>> ci.min()

174 'c'

175 """

176

177 _typ = "categoricalindex"

178 _data_cls = Categorical

179

180 @property

181 def _can_hold_strings(self):

182 return self.categories._can_hold_strings

183

184 @cache_readonly

185 def _should_fallback_to_positional(self) -> bool:

186 return self.categories._should_fallback_to_positional

187

188 codes: np.ndarray

189 categories: Index

190 ordered: bool | None

191 _data: Categorical

192 _values: Categorical

193

194 @property

195 def _engine_type(self) -> type[libindex.IndexEngine]:

196 # self.codes can have dtype int8, int16, int32 or int64, so we need

197 # to return the corresponding engine type (libindex.Int8Engine, etc.).

198 return {

199 np.int8: libindex.Int8Engine,

200 np.int16: libindex.Int16Engine,

201 np.int32: libindex.Int32Engine,

202 np.int64: libindex.Int64Engine,

203 }[self.codes.dtype.type]

204

205 # --------------------------------------------------------------------

206 # Constructors

207

208 def __new__(

209 cls,

210 data=None,

211 categories=None,

212 ordered=None,

213 dtype: Dtype | None = None,

214 copy: bool = False,

215 name: Hashable = None,

216 ) -> CategoricalIndex:

217

218 name = maybe_extract_name(name, data, cls)

219

220 if data is None:

221 # GH#38944

222 warnings.warn(

223 "Constructing a CategoricalIndex without passing data is "

224 "deprecated and will raise in a future version. "

225 "Use CategoricalIndex([], ...) instead.",

226 FutureWarning,

227 stacklevel=find_stack_level(),

228 )

229 data = []

230

231 if is_scalar(data):

232 raise cls._scalar_data_error(data)

233

234 data = Categorical(

235 data, categories=categories, ordered=ordered, dtype=dtype, copy=copy

236 )

237

238 return cls._simple_new(data, name=name)

239

240 # --------------------------------------------------------------------

241

242 def _is_dtype_compat(self, other) -> Categorical:

243 """

244 *this is an internal non-public method*

245

246 provide a comparison between the dtype of self and other (coercing if

247 needed)

248

249 Parameters

250 ----------

251 other : Index

252

253 Returns

254 -------

255 Categorical

256

257 Raises

258 ------

259 TypeError if the dtypes are not compatible

260 """

261 if is_categorical_dtype(other):

262 other = extract_array(other)

263 if not other._categories_match_up_to_permutation(self):

264 raise TypeError(

265 "categories must match existing categories when appending"

266 )

267

268 elif other._is_multi:

269 # preempt raising NotImplementedError in isna call

270 raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex")

271 else:

272 values = other

273

274 cat = Categorical(other, dtype=self.dtype)

275 other = CategoricalIndex(cat)

276 if not other.isin(values).all():

277 raise TypeError(

278 "cannot append a non-category item to a CategoricalIndex"

279 )

280 other = other._values

281

282 if not ((other == values) | (isna(other) & isna(values))).all():

283 # GH#37667 see test_equals_non_category

284 raise TypeError(

285 "categories must match existing categories when appending"

286 )

287

288 return other

289

@doc(Index.astype)
def astype(self, dtype: Dtype, copy: bool = True) -> Index:
    from pandas.core.api import NumericIndex

    dtype = pandas_dtype(dtype)
    categories = self.categories

    # the super method always returns Int64Index, UInt64Index and Float64Index
    # but if the categories are a NumericIndex with dtype float32, we want to
    # return an index with the same dtype as self.categories.
    keep_categories_type = False
    if categories._is_backward_compat_public_numeric_index:
        assert isinstance(categories, NumericIndex)  # mypy complaint fix
        try:
            categories._validate_dtype(dtype)
        except ValueError:
            # dtype not accepted by the categories' index type; use the
            # generic conversion below
            keep_categories_type = False
        else:
            keep_categories_type = True

    if keep_categories_type:
        converted = self._data.astype(dtype, copy=copy)
        # pass copy=False because any copying has been done in the
        # _data.astype call above
        return categories._constructor(converted, name=self.name, copy=False)

    return super().astype(dtype, copy=copy)

313

def equals(self, other: object) -> bool:
    """
    Determine if two CategoricalIndex objects contain the same elements.

    Returns
    -------
    bool
        True when ``other`` is the same object, or is an Index that is
        dtype-compatible with this one and whose values equal ours;
        otherwise False.
    """
    if self.is_(other):
        return True

    if isinstance(other, Index):
        try:
            compatible = self._is_dtype_compat(other)
        except (TypeError, ValueError):
            # incompatible dtype or categories -> cannot be equal
            return False
        return self._data.equals(compatible)

    return False

336

337 # --------------------------------------------------------------------

338 # Rendering Methods

339

340 @property

341 def _formatter_func(self):

342 return self.categories._formatter_func

343

def _format_attrs(self):
    """
    Return a list of (attr, formatted_value) tuples.

    The ``categories`` and ``ordered`` entries come first, followed by
    whatever the parent class reports.
    """
    categories_repr = ", ".join(self._data._repr_categories())

    attrs: list[tuple[str, str | int | bool | None]] = [
        ("categories", f"[{categories_repr}]"),
        ("ordered", self.ordered),
    ]
    return attrs + super()._format_attrs()

359

360 def _format_with_header(self, header: list[str], na_rep: str) -> list[str]:

361 result = [

362 pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep

363 for x in self._values

364 ]

365 return header + result

366

367 # --------------------------------------------------------------------

368

@property
def inferred_type(self) -> str:
    """Return the constant string ``"categorical"``."""
    kind = "categorical"
    return kind

372

373 @doc(Index.__contains__)

374 def __contains__(self, key: Any) -> bool:

375 # if key is a NaN, check if any NaN is in self.

376 if is_valid_na_for_dtype(key, self.categories.dtype):

377 return self.hasnans

378

379 return contains(self, key, container=self._engine)

380

381 # TODO(2.0): remove reindex once non-unique deprecation is enforced

def reindex(
    self, target, method=None, level=None, limit=None, tolerance=None
) -> tuple[Index, npt.NDArray[np.intp] | None]:
    """
    Create index with target's values (move/add/delete values as necessary)

    ``method``, ``level`` and ``limit`` are not supported and raise
    ``NotImplementedError`` when given.

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray[np.intp] or None
        Indices of output values in original index

    """
    if method is not None:
        raise NotImplementedError(
            "argument method is not implemented for CategoricalIndex.reindex"
        )
    if level is not None:
        raise NotImplementedError(
            "argument level is not implemented for CategoricalIndex.reindex"
        )
    if limit is not None:
        raise NotImplementedError(
            "argument limit is not implemented for CategoricalIndex.reindex"
        )

    target = ibase.ensure_index(target)

    if not self.equals(target):
        indexer, missing = self.get_indexer_non_unique(target)
        if not self.is_unique:
            # GH#42568
            warnings.warn(
                "reindexing with a non-unique Index is deprecated and will "
                "raise in a future version.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
    else:
        # identical contents: nothing to look up and nothing is missing
        indexer = None
        missing = np.array([], dtype=np.intp)

    new_target: Index
    new_target = self.take(indexer) if len(self) and indexer is not None else target

    # filling in missing if needed
    if len(missing):
        cats = self.categories.get_indexer(target)

        codes_are_usable = (
            isinstance(target, CategoricalIndex) and not (cats == -1).any()
        )
        if codes_are_usable:
            # error: "Index" has no attribute "codes"
            filled_codes = new_target.codes.copy()  # type: ignore[attr-defined]
            filled_codes[indexer == -1] = cats[missing]
            filled = self._data._from_backing_data(filled_codes)
            new_target = type(self)._simple_new(filled, name=self.name)
        else:
            new_target, indexer, _ = super()._reindex_non_unique(target)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    if is_categorical_dtype(target):
        conformed = Categorical(new_target, dtype=target.dtype)
        new_target = type(self)._simple_new(conformed, name=self.name)
    else:
        # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target
        plain_values = np.asarray(new_target)
        new_target = Index._with_infer(plain_values, name=self.name)

    return new_target, indexer

458

459 # --------------------------------------------------------------------

460 # Indexing Methods

461

462 def _maybe_cast_indexer(self, key) -> int:

463 # GH#41933: we have to do this instead of self._data._validate_scalar

464 # because this will correctly get partial-indexing on Interval categories

465 try:

466 return self._data._unbox_scalar(key)

467 except KeyError:

468 if is_valid_na_for_dtype(key, self.categories.dtype):

469 return -1

470 raise

471

472 def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:

473 if isinstance(values, CategoricalIndex):

474 values = values._data

475 if isinstance(values, Categorical):

476 # Indexing on codes is more efficient if categories are the same,

477 # so we can apply some optimizations based on the degree of

478 # dtype-matching.

479 cat = self._data._encode_with_my_categories(values)

480 codes = cat._codes

481 else:

482 codes = self.categories.get_indexer(values)

483 codes = codes.astype(self.codes.dtype, copy=False)

484 cat = self._data._from_backing_data(codes)

485 return type(self)._simple_new(cat)

486

487 # --------------------------------------------------------------------

488

489 def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:

490 return self.categories._is_comparable_dtype(dtype)

491

def take_nd(self, *args, **kwargs) -> CategoricalIndex:
    """Deprecated alias for `take`; emits a ``FutureWarning`` and forwards."""
    deprecation_msg = (
        "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take "
        "instead."
    )
    warnings.warn(deprecation_msg, FutureWarning, stacklevel=find_stack_level())
    return self.take(*args, **kwargs)

501

def map(self, mapper):
    """
    Map values using an input mapping or function.

    Maps the values (their categories, not the codes) of the index to new
    categories. If the mapping correspondence is one-to-one the result is a
    :class:`~pandas.CategoricalIndex` which has the same order property as
    the original, otherwise an :class:`~pandas.Index` is returned.

    If a `dict` or :class:`~pandas.Series` is used any unmapped category is
    mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
    will be returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.CategoricalIndex or pandas.Index
        Mapped index, carrying the same name as the original.

    See Also
    --------
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
    >>> idx
    CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
                      ordered=False, dtype='category')
    >>> idx.map(lambda x: x.upper())
    CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
                     ordered=False, dtype='category')
    >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
    CategoricalIndex(['first', 'second', 'third'], categories=['first',
                     'second', 'third'], ordered=False, dtype='category')

    If the mapping is one-to-one the ordering of the categories is
    preserved:

    >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
    >>> idx
    CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
                     ordered=True, dtype='category')
    >>> idx.map({'a': 3, 'b': 2, 'c': 1})
    CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
                     dtype='category')

    If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

    >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    If a `dict` is used, all unmapped categories are mapped to `NaN` and
    the result is an :class:`~pandas.Index`:

    >>> idx.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    # The underlying Categorical does the mapping; Index() then picks the
    # resulting index type from whatever came back.
    values = self._values
    mapped = values.map(mapper)
    return Index(mapped, name=self.name)

571

572 def _concat(self, to_concat: list[Index], name: Hashable) -> Index:

573 # if calling index is category, don't check dtype of others

574 try:

575 cat = Categorical._concat_same_type(

576 [self._is_dtype_compat(c) for c in to_concat]

577 )

578 except TypeError:

579 # not all to_concat elements are among our categories (or NA)

580 from pandas.core.dtypes.concat import concat_compat

581

582 res = concat_compat([x._values for x in to_concat])

583 return Index(res, name=name)

584 else:

585 return type(self)._simple_new(cat, name=name)