Coverage report for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/dtypes/dtypes.py — 28% of 614 statements covered.
Generated by coverage.py v6.4.4 on 2023-07-17 14:22 -0600.
1"""
2Define extension dtypes.
3"""
4from __future__ import annotations
6import re
7from typing import (
8 TYPE_CHECKING,
9 Any,
10 MutableMapping,
11 cast,
12)
14import numpy as np
15import pytz
17from pandas._libs import missing as libmissing
18from pandas._libs.interval import Interval
19from pandas._libs.properties import cache_readonly
20from pandas._libs.tslibs import (
21 BaseOffset,
22 NaT,
23 NaTType,
24 Period,
25 Timestamp,
26 dtypes,
27 timezones,
28 to_offset,
29 tz_compare,
30)
31from pandas._typing import (
32 Dtype,
33 DtypeObj,
34 Ordered,
35 npt,
36 type_t,
37)
39from pandas.core.dtypes.base import (
40 ExtensionDtype,
41 register_extension_dtype,
42)
43from pandas.core.dtypes.generic import (
44 ABCCategoricalIndex,
45 ABCIndex,
46)
47from pandas.core.dtypes.inference import (
48 is_bool,
49 is_list_like,
50)
52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true
53 from datetime import tzinfo
55 import pyarrow
57 from pandas import (
58 Categorical,
59 Index,
60 )
61 from pandas.core.arrays import (
62 BaseMaskedArray,
63 DatetimeArray,
64 IntervalArray,
65 PandasArray,
66 PeriodArray,
67 )
69str_type = str
72class PandasExtensionDtype(ExtensionDtype):
73 """
74 A np.dtype duck-typed class, suitable for holding a custom dtype.
76 THIS IS NOT A REAL NUMPY DTYPE
77 """
79 type: Any
80 kind: Any
81 # The Any type annotations above are here only because mypy seems to have a
82 # problem dealing with multiple inheritance from PandasExtensionDtype
83 # and ExtensionDtype's @properties in the subclasses below. The kind and
84 # type variables in those subclasses are explicitly typed below.
85 subdtype = None
86 str: str_type
87 num = 100
88 shape: tuple[int, ...] = ()
89 itemsize = 8
90 base: DtypeObj | None = None
91 isbuiltin = 0
92 isnative = 0
93 _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
95 def __repr__(self) -> str_type:
96 """
97 Return a string representation for a particular object.
98 """
99 return str(self)
101 def __hash__(self) -> int:
102 raise NotImplementedError("sub-classes should implement an __hash__ method")
104 def __getstate__(self) -> dict[str_type, Any]:
105 # pickle support; we don't want to pickle the cache
106 return {k: getattr(self, k, None) for k in self._metadata}
108 @classmethod
109 def reset_cache(cls) -> None:
110 """clear the cache"""
111 cls._cache_dtypes = {}
114class CategoricalDtypeType(type):
115 """
116 the type of CategoricalDtype, this metaclass determines subclass ability
117 """
119 pass
@register_extension_dtype
class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
    """
    Type for categorical data with the categories and orderedness.

    Parameters
    ----------
    categories : sequence, optional
        Must be unique, and must not contain any nulls.
        The categories are stored in an Index,
        and if an index is provided the dtype of that index will be used.
    ordered : bool or None, default False
        Whether or not this categorical is treated as a ordered categorical.
        None can be used to maintain the ordered value of existing categoricals when
        used in operations that combine categoricals, e.g. astype, and will resolve to
        False if there is no existing ordered to maintain.

    Attributes
    ----------
    categories
    ordered

    Methods
    -------
    None

    See Also
    --------
    Categorical : Represent a categorical variable in classic R / S-plus fashion.

    Notes
    -----
    This class is useful for specifying the type of a ``Categorical``
    independent of the values. See :ref:`categorical.categoricaldtype`
    for more.

    Examples
    --------
    >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True)
    >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
    0      a
    1      b
    2      a
    3    NaN
    dtype: category
    Categories (2, object): ['b' < 'a']

    An empty CategoricalDtype with a specific dtype can be created
    by providing an empty index. As follows,

    >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype
    dtype('<M8[ns]')
    """

    # TODO: Document public vs. private API
    name = "category"
    type: type[CategoricalDtypeType] = CategoricalDtypeType
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    _metadata = ("categories", "ordered")
    _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}

    def __init__(self, categories=None, ordered: Ordered = False) -> None:
        self._finalize(categories, ordered, fastpath=False)

    @classmethod
    def _from_fastpath(
        cls, categories=None, ordered: bool | None = None
    ) -> CategoricalDtype:
        # Construct without running __init__ validation; used internally when
        # the caller already knows categories/ordered are valid.
        self = cls.__new__(cls)
        self._finalize(categories, ordered, fastpath=True)
        return self

    @classmethod
    def _from_categorical_dtype(
        cls, dtype: CategoricalDtype, categories=None, ordered: Ordered = None
    ) -> CategoricalDtype:
        # Derive a dtype from `dtype`, overriding categories/ordered with any
        # explicitly-passed (non-None) values.
        if categories is ordered is None:
            return dtype
        if categories is None:
            categories = dtype.categories
        if ordered is None:
            ordered = dtype.ordered
        return cls(categories, ordered)

    @classmethod
    def _from_values_or_dtype(
        cls,
        values=None,
        categories=None,
        ordered: bool | None = None,
        dtype: Dtype | None = None,
    ) -> CategoricalDtype:
        """
        Construct dtype from the input parameters used in :class:`Categorical`.

        This constructor method specifically does not do the factorization
        step, if that is needed to find the categories. This constructor may
        therefore return ``CategoricalDtype(categories=None, ordered=None)``,
        which may not be useful. Additional steps may therefore have to be
        taken to create the final dtype.

        The return dtype is specified from the inputs in this prioritized
        order:
        1. if dtype is a CategoricalDtype, return dtype
        2. if dtype is the string 'category', create a CategoricalDtype from
           the supplied categories and ordered parameters, and return that.
        3. if values is a categorical, use value.dtype, but override it with
           categories and ordered if either/both of those are not None.
        4. if dtype is None and values is not a categorical, construct the
           dtype from categories and ordered, even if either of those is None.

        Parameters
        ----------
        values : list-like, optional
            The list-like must be 1-dimensional.
        categories : list-like, optional
            Categories for the CategoricalDtype.
        ordered : bool, optional
            Designating if the categories are ordered.
        dtype : CategoricalDtype or the string "category", optional
            If ``CategoricalDtype``, cannot be used together with
            `categories` or `ordered`.

        Returns
        -------
        CategoricalDtype

        Examples
        --------
        >>> pd.CategoricalDtype._from_values_or_dtype()
        CategoricalDtype(categories=None, ordered=None)
        >>> pd.CategoricalDtype._from_values_or_dtype(
        ...     categories=['a', 'b'], ordered=True
        ... )
        CategoricalDtype(categories=['a', 'b'], ordered=True)
        >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True)
        >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False)
        >>> c = pd.Categorical([0, 1], dtype=dtype1, fastpath=True)
        >>> pd.CategoricalDtype._from_values_or_dtype(
        ...     c, ['x', 'y'], ordered=True, dtype=dtype2
        ... )
        Traceback (most recent call last):
        ...
        ValueError: Cannot specify `categories` or `ordered` together with
        `dtype`.

        The supplied dtype takes precedence over values' dtype:

        >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
        CategoricalDtype(categories=['x', 'y'], ordered=False)
        """

        if dtype is not None:
            # The dtype argument takes precedence over values.dtype (if any)
            if isinstance(dtype, str):
                if dtype == "category":
                    dtype = CategoricalDtype(categories, ordered)
                else:
                    raise ValueError(f"Unknown dtype {repr(dtype)}")
            elif categories is not None or ordered is not None:
                raise ValueError(
                    "Cannot specify `categories` or `ordered` together with `dtype`."
                )
            elif not isinstance(dtype, CategoricalDtype):
                # BUGFIX: message previously contained a double negative
                # ("Cannot not construct ...").
                raise ValueError(f"Cannot construct CategoricalDtype from {dtype}")
        elif cls.is_dtype(values):
            # If no "dtype" was passed, use the one from "values", but honor
            # the "ordered" and "categories" arguments
            dtype = values.dtype._from_categorical_dtype(
                values.dtype, categories, ordered
            )
        else:
            # If dtype=None and values is not categorical, create a new dtype.
            # Note: This could potentially have categories=None and
            # ordered=None.
            dtype = CategoricalDtype(categories, ordered)

        return cast(CategoricalDtype, dtype)

    @classmethod
    def construct_from_string(cls, string: str_type) -> CategoricalDtype:
        """
        Construct a CategoricalDtype from a string.

        Parameters
        ----------
        string : str
            Must be the string "category" in order to be successfully constructed.

        Returns
        -------
        CategoricalDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a CategoricalDtype cannot be constructed from the input.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string != cls.name:
            raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'")

        # need ordered=None to ensure that operations specifying dtype="category" don't
        # override the ordered value for existing categoricals
        return cls(ordered=None)

    def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None:
        # Shared tail of __init__/_from_fastpath: validate (unless fastpath)
        # and store the two defining attributes.
        if ordered is not None:
            self.validate_ordered(ordered)

        if categories is not None:
            categories = self.validate_categories(categories, fastpath=fastpath)

        self._categories = categories
        self._ordered = ordered

    def __setstate__(self, state: MutableMapping[str_type, Any]) -> None:
        # for pickle compat. __get_state__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._categories = state.pop("categories", None)
        self._ordered = state.pop("ordered", False)

    def __hash__(self) -> int:
        # _hash_categories returns a uint64, so use the negative
        # space for when we have unknown categories to avoid a conflict
        if self.categories is None:
            if self.ordered:
                return -1
            else:
                return -2
        # We *do* want to include the real self.ordered here
        return int(self._hash_categories)

    def __eq__(self, other: Any) -> bool:
        """
        Rules for CDT equality:
        1) Any CDT is equal to the string 'category'
        2) Any CDT is equal to itself
        3) Any CDT is equal to a CDT with categories=None regardless of ordered
        4) A CDT with ordered=True is only equal to another CDT with
           ordered=True and identical categories in the same order
        5) A CDT with ordered={False, None} is only equal to another CDT with
           ordered={False, None} and identical categories, but same order is
           not required. There is no distinction between False/None.
        6) Any other comparison returns False
        """
        if isinstance(other, str):
            return other == self.name
        elif other is self:
            return True
        elif not (hasattr(other, "ordered") and hasattr(other, "categories")):
            return False
        elif self.categories is None or other.categories is None:
            # For non-fully-initialized dtypes, these are only equal to
            # - the string "category" (handled above)
            # - other CategoricalDtype with categories=None
            return self.categories is other.categories
        elif self.ordered or other.ordered:
            # At least one has ordered=True; equal if both have ordered=True
            # and the same values for categories in the same order.
            return (self.ordered == other.ordered) and self.categories.equals(
                other.categories
            )
        else:
            # Neither has ordered=True; equal if both have the same categories,
            # but same order is not necessary. There is no distinction between
            # ordered=False and ordered=None: CDT(., False) and CDT(., None)
            # will be equal if they have the same categories.
            left = self.categories
            right = other.categories

            # GH#36280 the ordering of checks here is for performance
            if not left.dtype == right.dtype:
                return False

            if len(left) != len(right):
                return False

            if self.categories.equals(other.categories):
                # Check and see if they happen to be identical categories
                return True

            if left.dtype != object:
                # Faster than calculating hash
                indexer = left.get_indexer(right)
                # Because left and right have the same length and are unique,
                # `indexer` not having any -1s implies that there is a
                # bijection between `left` and `right`.
                return (indexer != -1).all()

            # With object-dtype we need a comparison that identifies
            # e.g. int(2) as distinct from float(2)
            return hash(self) == hash(other)

    def __repr__(self) -> str_type:
        if self.categories is None:
            data = "None"
        else:
            data = self.categories._format_data(name=type(self).__name__)
            if data is None:
                # self.categories is RangeIndex
                data = str(self.categories._range)
            data = data.rstrip(", ")
        return f"CategoricalDtype(categories={data}, ordered={self.ordered})"

    @cache_readonly
    def _hash_categories(self) -> int:
        from pandas.core.util.hashing import (
            combine_hash_arrays,
            hash_array,
            hash_tuples,
        )

        categories = self.categories
        ordered = self.ordered

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all our. ATM
            # I don't really want to support just some of the categories being
            # tuples.
            cat_list = list(categories)  # breaks if a np.array of categories
            cat_array = hash_tuples(cat_list)
        else:
            if categories.dtype == "O" and len({type(x) for x in categories}) != 1:
                # TODO: hash_array doesn't handle mixed types. It casts
                # everything to a str first, which means we treat
                # {'1', '2'} the same as {'1', 2}
                # find a better solution
                hashed = hash((tuple(categories), ordered))
                return hashed

            if DatetimeTZDtype.is_dtype(categories.dtype):
                # Avoid future warning.
                categories = categories.view("datetime64[ns]")

            cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            cat_array = np.vstack(
                [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
            )
        else:
            cat_array = np.array([cat_array])
        combined_hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
        return np.bitwise_xor.reduce(combined_hashed)

    @classmethod
    def construct_array_type(cls) -> type_t[Categorical]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas import Categorical

        return Categorical

    @staticmethod
    def validate_ordered(ordered: Ordered) -> None:
        """
        Validates that we have a valid ordered parameter. If
        it is not a boolean, a TypeError will be raised.

        Parameters
        ----------
        ordered : object
            The parameter to be verified.

        Raises
        ------
        TypeError
            If 'ordered' is not a boolean.
        """
        if not is_bool(ordered):
            raise TypeError("'ordered' must either be 'True' or 'False'")

    @staticmethod
    def validate_categories(categories, fastpath: bool = False) -> Index:
        """
        Validates that we have good categories

        Parameters
        ----------
        categories : array-like
        fastpath : bool
            Whether to skip nan and uniqueness checks

        Returns
        -------
        categories : Index
        """
        from pandas.core.indexes.base import Index

        if not fastpath and not is_list_like(categories):
            raise TypeError(
                f"Parameter 'categories' must be list-like, was {repr(categories)}"
            )
        elif not isinstance(categories, ABCIndex):
            categories = Index._with_infer(categories, tupleize_cols=False)

        if not fastpath:

            if categories.hasnans:
                raise ValueError("Categorical categories cannot be null")

            if not categories.is_unique:
                raise ValueError("Categorical categories must be unique")

        if isinstance(categories, ABCCategoricalIndex):
            categories = categories.categories

        return categories

    def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype:
        """
        Returns a CategoricalDtype with categories and ordered taken from dtype
        if specified, otherwise falling back to self if unspecified

        Parameters
        ----------
        dtype : CategoricalDtype

        Returns
        -------
        new_dtype : CategoricalDtype
        """
        if isinstance(dtype, str) and dtype == "category":
            # dtype='category' should not change anything
            return self
        elif not self.is_dtype(dtype):
            raise ValueError(
                f"a CategoricalDtype must be passed to perform an update, "
                f"got {repr(dtype)}"
            )
        else:
            # from here on, dtype is a CategoricalDtype
            dtype = cast(CategoricalDtype, dtype)

        # update categories/ordered unless they've been explicitly passed as None
        new_categories = (
            dtype.categories if dtype.categories is not None else self.categories
        )
        new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered

        return CategoricalDtype(new_categories, new_ordered)

    @property
    def categories(self) -> Index:
        """
        An ``Index`` containing the unique categories allowed.
        """
        return self._categories

    @property
    def ordered(self) -> Ordered:
        """
        Whether the categories have an ordered relationship.
        """
        return self._ordered

    @property
    def _is_boolean(self) -> bool:
        from pandas.core.dtypes.common import is_bool_dtype

        return is_bool_dtype(self.categories)

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        from pandas.core.arrays.sparse import SparseDtype

        # check if we have all categorical dtype with identical categories
        if all(isinstance(x, CategoricalDtype) for x in dtypes):
            first = dtypes[0]
            if all(first == other for other in dtypes[1:]):
                return first

        # special case non-initialized categorical
        # TODO we should figure out the expected return value in general
        non_init_cats = [
            isinstance(x, CategoricalDtype) and x.categories is None for x in dtypes
        ]
        if all(non_init_cats):
            return self
        elif any(non_init_cats):
            return None

        # categorical is aware of Sparse -> extract sparse subdtypes
        dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
        # extract the categories' dtype
        non_cat_dtypes = [
            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
        ]
        # TODO should categorical always give an answer?
        from pandas.core.dtypes.cast import find_common_type

        return find_common_type(non_cat_dtypes)
628@register_extension_dtype
629class DatetimeTZDtype(PandasExtensionDtype):
630 """
631 An ExtensionDtype for timezone-aware datetime data.
633 **This is not an actual numpy dtype**, but a duck type.
635 Parameters
636 ----------
637 unit : str, default "ns"
638 The precision of the datetime data. Currently limited
639 to ``"ns"``.
640 tz : str, int, or datetime.tzinfo
641 The timezone.
643 Attributes
644 ----------
645 unit
646 tz
648 Methods
649 -------
650 None
652 Raises
653 ------
654 pytz.UnknownTimeZoneError
655 When the requested timezone cannot be found.
657 Examples
658 --------
659 >>> pd.DatetimeTZDtype(tz='UTC')
660 datetime64[ns, UTC]
662 >>> pd.DatetimeTZDtype(tz='dateutil/US/Central')
663 datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')]
664 """
666 type: type[Timestamp] = Timestamp
667 kind: str_type = "M"
668 num = 101
669 base = np.dtype("M8[ns]") # TODO: depend on reso?
670 _metadata = ("unit", "tz")
671 _match = re.compile(r"(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
672 _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
674 @property
675 def na_value(self) -> NaTType:
676 return NaT
678 @cache_readonly
679 def str(self):
680 return f"|M8[{self._unit}]"
682 def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None:
683 if isinstance(unit, DatetimeTZDtype):
684 # error: "str" has no attribute "tz"
685 unit, tz = unit.unit, unit.tz # type: ignore[attr-defined]
687 if unit != "ns":
688 if isinstance(unit, str) and tz is None:
689 # maybe a string like datetime64[ns, tz], which we support for
690 # now.
691 result = type(self).construct_from_string(unit)
692 unit = result.unit
693 tz = result.tz
694 msg = (
695 f"Passing a dtype alias like 'datetime64[ns, {tz}]' "
696 "to DatetimeTZDtype is no longer supported. Use "
697 "'DatetimeTZDtype.construct_from_string()' instead."
698 )
699 raise ValueError(msg)
700 if unit not in ["s", "ms", "us", "ns"]:
701 raise ValueError("DatetimeTZDtype only supports s, ms, us, ns units")
703 if tz:
704 tz = timezones.maybe_get_tz(tz)
705 tz = timezones.tz_standardize(tz)
706 elif tz is not None:
707 raise pytz.UnknownTimeZoneError(tz)
708 if tz is None:
709 raise TypeError("A 'tz' is required.")
711 self._unit = unit
712 self._tz = tz
714 @cache_readonly
715 def _reso(self) -> int:
716 """
717 The NPY_DATETIMEUNIT corresponding to this dtype's resolution.
718 """
719 reso = {
720 "s": dtypes.NpyDatetimeUnit.NPY_FR_s,
721 "ms": dtypes.NpyDatetimeUnit.NPY_FR_ms,
722 "us": dtypes.NpyDatetimeUnit.NPY_FR_us,
723 "ns": dtypes.NpyDatetimeUnit.NPY_FR_ns,
724 }[self._unit]
725 return reso.value
727 @property
728 def unit(self) -> str_type:
729 """
730 The precision of the datetime data.
731 """
732 return self._unit
734 @property
735 def tz(self) -> tzinfo:
736 """
737 The timezone.
738 """
739 return self._tz
741 @classmethod
742 def construct_array_type(cls) -> type_t[DatetimeArray]:
743 """
744 Return the array type associated with this dtype.
746 Returns
747 -------
748 type
749 """
750 from pandas.core.arrays import DatetimeArray
752 return DatetimeArray
754 @classmethod
755 def construct_from_string(cls, string: str_type) -> DatetimeTZDtype:
756 """
757 Construct a DatetimeTZDtype from a string.
759 Parameters
760 ----------
761 string : str
762 The string alias for this DatetimeTZDtype.
763 Should be formatted like ``datetime64[ns, <tz>]``,
764 where ``<tz>`` is the timezone name.
766 Examples
767 --------
768 >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]')
769 datetime64[ns, UTC]
770 """
771 if not isinstance(string, str): 771 ↛ 772line 771 didn't jump to line 772, because the condition on line 771 was never true
772 raise TypeError(
773 f"'construct_from_string' expects a string, got {type(string)}"
774 )
776 msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'"
777 match = cls._match.match(string)
778 if match: 778 ↛ 779line 778 didn't jump to line 779, because the condition on line 778 was never true
779 d = match.groupdict()
780 try:
781 return cls(unit=d["unit"], tz=d["tz"])
782 except (KeyError, TypeError, ValueError) as err:
783 # KeyError if maybe_get_tz tries and fails to get a
784 # pytz timezone (actually pytz.UnknownTimeZoneError).
785 # TypeError if we pass a nonsense tz;
786 # ValueError if we pass a unit other than "ns"
787 raise TypeError(msg) from err
788 raise TypeError(msg)
790 def __str__(self) -> str_type:
791 return f"datetime64[{self.unit}, {self.tz}]"
793 @property
794 def name(self) -> str_type:
795 """A string representation of the dtype."""
796 return str(self)
798 def __hash__(self) -> int:
799 # make myself hashable
800 # TODO: update this.
801 return hash(str(self))
803 def __eq__(self, other: Any) -> bool:
804 if isinstance(other, str):
805 if other.startswith("M8["):
806 other = "datetime64[" + other[3:]
807 return other == self.name
809 return (
810 isinstance(other, DatetimeTZDtype)
811 and self.unit == other.unit
812 and tz_compare(self.tz, other.tz)
813 )
815 def __setstate__(self, state) -> None:
816 # for pickle compat. __get_state__ is defined in the
817 # PandasExtensionDtype superclass and uses the public properties to
818 # pickle -> need to set the settable private ones here (see GH26067)
819 self._tz = state["tz"]
820 self._unit = state["unit"]
823@register_extension_dtype
824class PeriodDtype(dtypes.PeriodDtypeBase, PandasExtensionDtype):
825 """
826 An ExtensionDtype for Period data.
828 **This is not an actual numpy dtype**, but a duck type.
830 Parameters
831 ----------
832 freq : str or DateOffset
833 The frequency of this PeriodDtype.
835 Attributes
836 ----------
837 freq
839 Methods
840 -------
841 None
843 Examples
844 --------
845 >>> pd.PeriodDtype(freq='D')
846 period[D]
848 >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd())
849 period[M]
850 """
852 type: type[Period] = Period
853 kind: str_type = "O"
854 str = "|O08"
855 base = np.dtype("O")
856 num = 102
857 _metadata = ("freq",)
858 _match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]")
859 _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
861 def __new__(cls, freq=None):
862 """
863 Parameters
864 ----------
865 freq : frequency
866 """
867 if isinstance(freq, PeriodDtype):
868 return freq
870 elif freq is None:
871 # empty constructor for pickle compat
872 # -10_000 corresponds to PeriodDtypeCode.UNDEFINED
873 u = dtypes.PeriodDtypeBase.__new__(cls, -10_000)
874 u._freq = None
875 return u
877 if not isinstance(freq, BaseOffset):
878 freq = cls._parse_dtype_strict(freq)
880 try:
881 return cls._cache_dtypes[freq.freqstr]
882 except KeyError:
883 dtype_code = freq._period_dtype_code
884 u = dtypes.PeriodDtypeBase.__new__(cls, dtype_code)
885 u._freq = freq
886 cls._cache_dtypes[freq.freqstr] = u
887 return u
889 def __reduce__(self):
890 return type(self), (self.freq,)
892 @property
893 def freq(self):
894 """
895 The frequency object of this PeriodDtype.
896 """
897 return self._freq
899 @classmethod
900 def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset:
901 if isinstance(freq, str): # note: freq is already of type str!
902 if freq.startswith("period[") or freq.startswith("Period["):
903 m = cls._match.search(freq)
904 if m is not None:
905 freq = m.group("freq")
907 freq_offset = to_offset(freq)
908 if freq_offset is not None:
909 return freq_offset
911 raise ValueError("could not construct PeriodDtype")
913 @classmethod
914 def construct_from_string(cls, string: str_type) -> PeriodDtype:
915 """
916 Strict construction from a string, raise a TypeError if not
917 possible
918 """
919 if ( 919 ↛ 926line 919 didn't jump to line 926
920 isinstance(string, str)
921 and (string.startswith("period[") or string.startswith("Period["))
922 or isinstance(string, BaseOffset)
923 ):
924 # do not parse string like U as period[U]
925 # avoid tuple to be regarded as freq
926 try:
927 return cls(freq=string)
928 except ValueError:
929 pass
930 if isinstance(string, str): 930 ↛ 933line 930 didn't jump to line 933, because the condition on line 930 was never false
931 msg = f"Cannot construct a 'PeriodDtype' from '{string}'"
932 else:
933 msg = f"'construct_from_string' expects a string, got {type(string)}"
934 raise TypeError(msg)
936 def __str__(self) -> str_type:
937 return self.name
939 @property
940 def name(self) -> str_type:
941 return f"period[{self.freq.freqstr}]"
943 @property
944 def na_value(self) -> NaTType:
945 return NaT
947 def __hash__(self) -> int:
948 # make myself hashable
949 return hash(str(self))
951 def __eq__(self, other: Any) -> bool:
952 if isinstance(other, str):
953 return other in [self.name, self.name.title()]
955 elif isinstance(other, PeriodDtype):
957 # For freqs that can be held by a PeriodDtype, this check is
958 # equivalent to (and much faster than) self.freq == other.freq
959 sfreq = self.freq
960 ofreq = other.freq
961 return (
962 sfreq.n == ofreq.n
963 and sfreq._period_dtype_code == ofreq._period_dtype_code
964 )
966 return False
968 def __ne__(self, other: Any) -> bool:
969 return not self.__eq__(other)
971 def __setstate__(self, state) -> None:
972 # for pickle compat. __getstate__ is defined in the
973 # PandasExtensionDtype superclass and uses the public properties to
974 # pickle -> need to set the settable private ones here (see GH26067)
975 self._freq = state["freq"]
977 @classmethod
978 def is_dtype(cls, dtype: object) -> bool:
979 """
980 Return a boolean if we if the passed type is an actual dtype that we
981 can match (via string or type)
982 """
983 if isinstance(dtype, str):
984 # PeriodDtype can be instantiated from freq string like "U",
985 # but doesn't regard freq str like "U" as dtype.
986 if dtype.startswith("period[") or dtype.startswith("Period["):
987 try:
988 if cls._parse_dtype_strict(dtype) is not None:
989 return True
990 else:
991 return False
992 except ValueError:
993 return False
994 else:
995 return False
996 return super().is_dtype(dtype)
998 @classmethod
999 def construct_array_type(cls) -> type_t[PeriodArray]:
1000 """
1001 Return the array type associated with this dtype.
1003 Returns
1004 -------
1005 type
1006 """
1007 from pandas.core.arrays import PeriodArray
1009 return PeriodArray
1011 def __from_arrow__(
1012 self, array: pyarrow.Array | pyarrow.ChunkedArray
1013 ) -> PeriodArray:
1014 """
1015 Construct PeriodArray from pyarrow Array/ChunkedArray.
1016 """
1017 import pyarrow
1019 from pandas.core.arrays import PeriodArray
1020 from pandas.core.arrays.arrow._arrow_utils import (
1021 pyarrow_array_to_numpy_and_mask,
1022 )
1024 if isinstance(array, pyarrow.Array):
1025 chunks = [array]
1026 else:
1027 chunks = array.chunks
1029 results = []
1030 for arr in chunks:
1031 data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=np.dtype(np.int64))
1032 parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
1033 # error: Invalid index type "ndarray[Any, dtype[bool_]]" for "PeriodArray";
1034 # expected type "Union[int, Sequence[int], Sequence[bool], slice]"
1035 parr[~mask] = NaT # type: ignore[index]
1036 results.append(parr)
1038 if not results:
1039 return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False)
1040 return PeriodArray._concat_same_type(results)
@register_extension_dtype
class IntervalDtype(PandasExtensionDtype):
    """
    An ExtensionDtype for Interval data.

    **This is not an actual numpy dtype**, but a duck type.

    Parameters
    ----------
    subtype : str, np.dtype
        The dtype of the Interval bounds.

    Attributes
    ----------
    subtype

    Methods
    -------
    None

    Examples
    --------
    >>> pd.IntervalDtype(subtype='int64', closed='both')
    interval[int64, both]
    """

    name = "interval"
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    num = 103
    # Attributes that identify an instance; used by the PandasExtensionDtype
    # base class for pickling (__getstate__) and by equality/hash helpers.
    _metadata = (
        "subtype",
        "closed",
    )

    # Parses strings like "interval[int64]" or "Interval[datetime64[ns], right]";
    # the ", closed" part is optional.
    _match = re.compile(
        r"(I|i)nterval\[(?P<subtype>[^,]+(\[.+\])?)"
        r"(, (?P<closed>(right|left|both|neither)))?\]"
    )

    # Cache of instances keyed by str(subtype) + str(closed), so repeated
    # construction of the same dtype returns the same object.
    _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}

    def __new__(cls, subtype=None, closed: str_type | None = None):
        from pandas.core.dtypes.common import (
            is_string_dtype,
            pandas_dtype,
        )

        if closed is not None and closed not in {"right", "left", "both", "neither"}:
            raise ValueError("closed must be one of 'right', 'left', 'both', 'neither'")

        if isinstance(subtype, IntervalDtype):
            # Re-wrapping an existing instance: return it as-is, but reject a
            # conflicting 'closed' keyword.
            if closed is not None and closed != subtype.closed:
                raise ValueError(
                    "dtype.closed and 'closed' do not match. "
                    "Try IntervalDtype(dtype.subtype, closed) instead."
                )
            return subtype
        elif subtype is None:
            # we are called as an empty constructor
            # generally for pickle compat
            u = object.__new__(cls)
            u._subtype = None
            u._closed = closed
            return u
        elif isinstance(subtype, str) and subtype.lower() == "interval":
            # Generic "interval" string: no specific bounds dtype.
            subtype = None
        else:
            if isinstance(subtype, str):
                # Parse "interval[<subtype>, <closed>]" into its components.
                m = cls._match.search(subtype)
                if m is not None:
                    gd = m.groupdict()
                    subtype = gd["subtype"]
                    if gd.get("closed", None) is not None:
                        # A 'closed' embedded in the string must agree with an
                        # explicitly passed 'closed' keyword, if any.
                        if closed is not None:
                            if closed != gd["closed"]:
                                raise ValueError(
                                    "'closed' keyword does not match value "
                                    "specified in dtype string"
                                )
                        closed = gd["closed"]

            try:
                subtype = pandas_dtype(subtype)
            except TypeError as err:
                raise TypeError("could not construct IntervalDtype") from err

        if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype):
            # GH 19016
            msg = (
                "category, object, and string subtypes are not supported "
                "for IntervalDtype"
            )
            raise TypeError(msg)

        # Look up / populate the instance cache keyed by subtype+closed.
        key = str(subtype) + str(closed)
        try:
            return cls._cache_dtypes[key]
        except KeyError:
            u = object.__new__(cls)
            u._subtype = subtype
            u._closed = closed
            cls._cache_dtypes[key] = u
            return u

    @cache_readonly
    def _can_hold_na(self) -> bool:
        # Whether this dtype can store missing values; integer bound dtypes
        # cannot represent NaN/NaT.
        subtype = self._subtype
        if subtype is None:
            # partially-initialized
            raise NotImplementedError(
                "_can_hold_na is not defined for partially-initialized IntervalDtype"
            )
        if subtype.kind in ["i", "u"]:
            return False
        return True

    @property
    def closed(self):
        # Side(s) on which the intervals are closed: "right", "left", "both",
        # "neither", or None for a partially-initialized instance.
        return self._closed

    @property
    def subtype(self):
        """
        The dtype of the Interval bounds.
        """
        return self._subtype

    @classmethod
    def construct_array_type(cls) -> type[IntervalArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import IntervalArray

        return IntervalArray

    @classmethod
    def construct_from_string(cls, string: str_type) -> IntervalDtype:
        """
        attempt to construct this type from a string, raise a TypeError
        if its not possible
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        if string.lower() == "interval" or cls._match.search(string) is not None:
            return cls(string)

        msg = (
            f"Cannot construct a 'IntervalDtype' from '{string}'.\n\n"
            "Incorrectly formatted string passed to constructor. "
            "Valid formats include Interval or Interval[dtype] "
            "where dtype is numeric, datetime, or timedelta"
        )
        raise TypeError(msg)

    @property
    def type(self) -> type[Interval]:
        # The scalar type for elements of this dtype.
        return Interval

    def __str__(self) -> str_type:
        if self.subtype is None:
            return "interval"
        if self.closed is None:
            # Only partially initialized GH#38394
            return f"interval[{self.subtype}]"
        return f"interval[{self.subtype}, {self.closed}]"

    def __hash__(self) -> int:
        # make myself hashable
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            # Compare against string spellings, e.g. "interval[int64, right]".
            return other.lower() in (self.name.lower(), str(self).lower())
        elif not isinstance(other, IntervalDtype):
            return False
        elif self.subtype is None or other.subtype is None:
            # None should match any subtype
            return True
        elif self.closed != other.closed:
            return False
        else:
            from pandas.core.dtypes.common import is_dtype_equal

            return is_dtype_equal(self.subtype, other.subtype)

    def __setstate__(self, state) -> None:
        # for pickle compat. __get_state__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._subtype = state["subtype"]

        # backward-compat older pickles won't have "closed" key
        self._closed = state.pop("closed", None)

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Return a boolean if we if the passed type is an actual dtype that we
        can match (via string or type)
        """
        if isinstance(dtype, str):
            if dtype.lower().startswith("interval"):
                try:
                    # Delegate string validation to construct_from_string.
                    if cls.construct_from_string(dtype) is not None:
                        return True
                    else:
                        return False
                except (ValueError, TypeError):
                    return False
            else:
                return False
        return super().is_dtype(dtype)

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> IntervalArray:
        """
        Construct IntervalArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays import IntervalArray

        # Normalize to a list of chunks so both Array and ChunkedArray inputs
        # take the same code path.
        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            chunks = array.chunks

        results = []
        for arr in chunks:
            if isinstance(arr, pyarrow.ExtensionArray):
                # Unwrap to the underlying struct storage with left/right fields.
                arr = arr.storage
            left = np.asarray(arr.field("left"), dtype=self.subtype)
            right = np.asarray(arr.field("right"), dtype=self.subtype)
            iarr = IntervalArray.from_arrays(left, right, closed=self.closed)
            results.append(iarr)

        if not results:
            # No chunks: return an empty IntervalArray of this dtype.
            return IntervalArray.from_arrays(
                np.array([], dtype=self.subtype),
                np.array([], dtype=self.subtype),
                closed=self.closed,
            )
        return IntervalArray._concat_same_type(results)

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        # Common dtype for concatenation: all inputs must be IntervalDtype,
        # otherwise defer to other dtypes' implementations.
        if not all(isinstance(x, IntervalDtype) for x in dtypes):
            return None

        closed = cast("IntervalDtype", dtypes[0]).closed
        if not all(cast("IntervalDtype", x).closed == closed for x in dtypes):
            # Mixed 'closed' values cannot be represented by one IntervalDtype.
            return np.dtype(object)

        from pandas.core.dtypes.cast import find_common_type

        common = find_common_type([cast("IntervalDtype", x).subtype for x in dtypes])
        if common == object:
            return np.dtype(object)
        return IntervalDtype(common, closed=closed)
class PandasDtype(ExtensionDtype):
    """
    An ExtensionDtype that wraps a plain NumPy dtype.

    Exists mainly so NumPy-backed arrays can participate in the pandas
    extension-array machinery; it is rarely useful on its own.

    Parameters
    ----------
    dtype : object
        Anything ``np.dtype`` can interpret, or another ``PandasDtype``.

    See Also
    --------
    numpy.dtype
    """

    # Attribute used by the ExtensionDtype base class for equality/pickling.
    _metadata = ("_dtype",)

    def __init__(self, dtype: npt.DTypeLike | PandasDtype | None) -> None:
        # Unwrap an existing PandasDtype so the constructor is idempotent.
        unwrapped = dtype.numpy_dtype if isinstance(dtype, PandasDtype) else dtype
        self._dtype = np.dtype(unwrapped)

    def __repr__(self) -> str:
        return f"PandasDtype({self.name!r})"

    @property
    def numpy_dtype(self) -> np.dtype:
        """
        The NumPy dtype this PandasDtype wraps.
        """
        return self._dtype

    @property
    def name(self) -> str:
        """
        A bit-width name for this data-type.
        """
        return self._dtype.name

    @property
    def type(self) -> type[np.generic]:
        """
        The type object used to instantiate a scalar of this NumPy data-type.
        """
        return self._dtype.type

    @property
    def kind(self) -> str:
        """
        A character code (one of 'biufcmMOSUV') identifying the general kind of data.
        """
        return self._dtype.kind

    @property
    def itemsize(self) -> int:
        """
        The element size of this data-type object.
        """
        return self._dtype.itemsize

    @property
    def _is_numeric(self) -> bool:
        # bool, int, uint, float, complex — excludes object, str, unicode, void.
        return self.kind in frozenset("biufc")

    @property
    def _is_boolean(self) -> bool:
        return self.kind == "b"

    @classmethod
    def construct_from_string(cls, string: str) -> PandasDtype:
        """
        Build a PandasDtype from a dtype string.

        Raises
        ------
        TypeError
            If *string* is not a string, or is one numpy cannot interpret.
        """
        try:
            np_dtype = np.dtype(string)
        except TypeError as exc:
            # Distinguish "wrong type" from "unparseable dtype string".
            if isinstance(string, str):
                msg = f"Cannot construct a 'PandasDtype' from '{string}'"
            else:
                msg = f"'construct_from_string' expects a string, got {type(string)}"
            raise TypeError(msg) from exc
        return cls(np_dtype)

    @classmethod
    def construct_array_type(cls) -> type_t[PandasArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import PandasArray

        return PandasArray
class BaseMaskedDtype(ExtensionDtype):
    """
    Base class for dtypes for BaseMaskedArray subclasses.
    """

    # Subclasses set the dtype name and the scalar type they wrap.
    name: str
    base = None
    type: type

    @property
    def na_value(self) -> libmissing.NAType:
        # All masked dtypes share pd.NA as their missing-value sentinel.
        return libmissing.NA

    @cache_readonly
    def numpy_dtype(self) -> np.dtype:
        """Return an instance of our numpy dtype"""
        return np.dtype(self.type)

    @cache_readonly
    def kind(self) -> str:
        return self.numpy_dtype.kind

    @cache_readonly
    def itemsize(self) -> int:
        """Return the number of bytes in this dtype"""
        return self.numpy_dtype.itemsize

    @classmethod
    def construct_array_type(cls) -> type_t[BaseMaskedArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        # Abstract: each concrete masked dtype supplies its array class.
        raise NotImplementedError

    @classmethod
    def from_numpy_dtype(cls, dtype: np.dtype) -> BaseMaskedDtype:
        """
        Construct the MaskedDtype corresponding to the given numpy dtype.
        """
        kind = dtype.kind
        if kind == "b":
            from pandas.core.arrays.boolean import BooleanDtype

            return BooleanDtype()
        if kind in "iu":
            from pandas.core.arrays.integer import INT_STR_TO_DTYPE

            return INT_STR_TO_DTYPE[dtype.name]
        if kind == "f":
            from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

            return FLOAT_STR_TO_DTYPE[dtype.name]
        # Anything but bool/int/uint/float has no masked counterpart.
        raise NotImplementedError(dtype)

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        # Strip the masks, find the common plain numpy dtype, then re-mask it.
        from pandas.core.dtypes.cast import find_common_type

        unmasked = [
            dtype.numpy_dtype if isinstance(dtype, BaseMaskedDtype) else dtype
            for dtype in dtypes
        ]
        new_dtype = find_common_type(unmasked)
        if not isinstance(new_dtype, np.dtype):
            # If we ever support e.g. Masked[DatetimeArray] then this will change
            return None
        try:
            return type(self).from_numpy_dtype(new_dtype)
        except (KeyError, NotImplementedError):
            # No masked counterpart for the common dtype.
            return None