Coverage report for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/dtypes/dtypes.py — 28% of 614 statements covered.
Generated by coverage.py v6.4.4 on 2023-07-17 14:22 -0600.
1"""
2Define extension dtypes.
3"""
4from __future__ import annotations
6import re
7from typing import (
8 TYPE_CHECKING,
9 Any,
10 MutableMapping,
11 cast,
12)
14import numpy as np
15import pytz
17from pandas._libs import missing as libmissing
18from pandas._libs.interval import Interval
19from pandas._libs.properties import cache_readonly
20from pandas._libs.tslibs import (
21 BaseOffset,
22 NaT,
23 NaTType,
24 Period,
25 Timestamp,
26 dtypes,
27 timezones,
28 to_offset,
29 tz_compare,
30)
31from pandas._typing import (
32 Dtype,
33 DtypeObj,
34 Ordered,
35 npt,
36 type_t,
37)
39from pandas.core.dtypes.base import (
40 ExtensionDtype,
41 register_extension_dtype,
42)
43from pandas.core.dtypes.generic import (
44 ABCCategoricalIndex,
45 ABCIndex,
46)
47from pandas.core.dtypes.inference import (
48 is_bool,
49 is_list_like,
50)
52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true
53 from datetime import tzinfo
55 import pyarrow
57 from pandas import (
58 Categorical,
59 Index,
60 )
61 from pandas.core.arrays import (
62 BaseMaskedArray,
63 DatetimeArray,
64 IntervalArray,
65 PandasArray,
66 PeriodArray,
67 )
69str_type = str
72class PandasExtensionDtype(ExtensionDtype):
73 """
74 A np.dtype duck-typed class, suitable for holding a custom dtype.
76 THIS IS NOT A REAL NUMPY DTYPE
77 """
79 type: Any
80 kind: Any
81 # The Any type annotations above are here only because mypy seems to have a
82 # problem dealing with multiple inheritance from PandasExtensionDtype
83 # and ExtensionDtype's @properties in the subclasses below. The kind and
84 # type variables in those subclasses are explicitly typed below.
85 subdtype = None
86 str: str_type
87 num = 100
88 shape: tuple[int, ...] = ()
89 itemsize = 8
90 base: DtypeObj | None = None
91 isbuiltin = 0
92 isnative = 0
93 _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
95 def __repr__(self) -> str_type:
96 """
97 Return a string representation for a particular object.
98 """
99 return str(self)
101 def __hash__(self) -> int:
102 raise NotImplementedError("sub-classes should implement an __hash__ method")
104 def __getstate__(self) -> dict[str_type, Any]:
105 # pickle support; we don't want to pickle the cache
106 return {k: getattr(self, k, None) for k in self._metadata}
108 @classmethod
109 def reset_cache(cls) -> None:
110 """clear the cache"""
111 cls._cache_dtypes = {}
114class CategoricalDtypeType(type):
115 """
116 the type of CategoricalDtype, this metaclass determines subclass ability
117 """
119 pass
@register_extension_dtype
class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
    """
    Type for categorical data with the categories and orderedness.

    Parameters
    ----------
    categories : sequence, optional
        Must be unique, and must not contain any nulls.
        The categories are stored in an Index,
        and if an index is provided the dtype of that index will be used.
    ordered : bool or None, default False
        Whether or not this categorical is treated as a ordered categorical.
        None can be used to maintain the ordered value of existing categoricals when
        used in operations that combine categoricals, e.g. astype, and will resolve to
        False if there is no existing ordered to maintain.

    Attributes
    ----------
    categories
    ordered

    Methods
    -------
    None

    See Also
    --------
    Categorical : Represent a categorical variable in classic R / S-plus fashion.

    Notes
    -----
    This class is useful for specifying the type of a ``Categorical``
    independent of the values. See :ref:`categorical.categoricaldtype`
    for more.

    Examples
    --------
    >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True)
    >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
    0      a
    1      b
    2      a
    3    NaN
    dtype: category
    Categories (2, object): ['b' < 'a']

    An empty CategoricalDtype with a specific dtype can be created
    by providing an empty index. As follows,

    >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype
    dtype('<M8[ns]')
    """

    # TODO: Document public vs. private API
    name = "category"
    type: type[CategoricalDtypeType] = CategoricalDtypeType
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    _metadata = ("categories", "ordered")
    _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}

    def __init__(self, categories=None, ordered: Ordered = False) -> None:
        self._finalize(categories, ordered, fastpath=False)

    @classmethod
    def _from_fastpath(
        cls, categories=None, ordered: bool | None = None
    ) -> CategoricalDtype:
        # Construct without running __init__ validation; used internally when
        # the caller already knows categories/ordered are valid.
        self = cls.__new__(cls)
        self._finalize(categories, ordered, fastpath=True)
        return self

    @classmethod
    def _from_categorical_dtype(
        cls, dtype: CategoricalDtype, categories=None, ordered: Ordered = None
    ) -> CategoricalDtype:
        # Derive a dtype from `dtype`, overriding categories/ordered with any
        # explicitly-passed (non-None) values.
        if categories is ordered is None:
            return dtype
        if categories is None:
            categories = dtype.categories
        if ordered is None:
            ordered = dtype.ordered
        return cls(categories, ordered)

    @classmethod
    def _from_values_or_dtype(
        cls,
        values=None,
        categories=None,
        ordered: bool | None = None,
        dtype: Dtype | None = None,
    ) -> CategoricalDtype:
        """
        Construct dtype from the input parameters used in :class:`Categorical`.

        This constructor method specifically does not do the factorization
        step, if that is needed to find the categories. This constructor may
        therefore return ``CategoricalDtype(categories=None, ordered=None)``,
        which may not be useful. Additional steps may therefore have to be
        taken to create the final dtype.

        The return dtype is specified from the inputs in this prioritized
        order:
        1. if dtype is a CategoricalDtype, return dtype
        2. if dtype is the string 'category', create a CategoricalDtype from
           the supplied categories and ordered parameters, and return that.
        3. if values is a categorical, use value.dtype, but override it with
           categories and ordered if either/both of those are not None.
        4. if dtype is None and values is not a categorical, construct the
           dtype from categories and ordered, even if either of those is None.

        Parameters
        ----------
        values : list-like, optional
            The list-like must be 1-dimensional.
        categories : list-like, optional
            Categories for the CategoricalDtype.
        ordered : bool, optional
            Designating if the categories are ordered.
        dtype : CategoricalDtype or the string "category", optional
            If ``CategoricalDtype``, cannot be used together with
            `categories` or `ordered`.

        Returns
        -------
        CategoricalDtype

        Examples
        --------
        >>> pd.CategoricalDtype._from_values_or_dtype()
        CategoricalDtype(categories=None, ordered=None)
        >>> pd.CategoricalDtype._from_values_or_dtype(
        ...     categories=['a', 'b'], ordered=True
        ... )
        CategoricalDtype(categories=['a', 'b'], ordered=True)
        >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True)
        >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False)
        >>> c = pd.Categorical([0, 1], dtype=dtype1, fastpath=True)
        >>> pd.CategoricalDtype._from_values_or_dtype(
        ...     c, ['x', 'y'], ordered=True, dtype=dtype2
        ... )
        Traceback (most recent call last):
        ...
        ValueError: Cannot specify `categories` or `ordered` together with
        `dtype`.

        The supplied dtype takes precedence over values' dtype:

        >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
        CategoricalDtype(categories=['x', 'y'], ordered=False)
        """

        if dtype is not None:
            # The dtype argument takes precedence over values.dtype (if any)
            if isinstance(dtype, str):
                if dtype == "category":
                    dtype = CategoricalDtype(categories, ordered)
                else:
                    raise ValueError(f"Unknown dtype {repr(dtype)}")
            elif categories is not None or ordered is not None:
                raise ValueError(
                    "Cannot specify `categories` or `ordered` together with `dtype`."
                )
            elif not isinstance(dtype, CategoricalDtype):
                # BUGFIX: message previously contained a double negative
                # ("Cannot not construct ...").
                raise ValueError(f"Cannot construct CategoricalDtype from {dtype}")
        elif cls.is_dtype(values):
            # If no "dtype" was passed, use the one from "values", but honor
            # the "ordered" and "categories" arguments
            dtype = values.dtype._from_categorical_dtype(
                values.dtype, categories, ordered
            )
        else:
            # If dtype=None and values is not categorical, create a new dtype.
            # Note: This could potentially have categories=None and
            # ordered=None.
            dtype = CategoricalDtype(categories, ordered)

        return cast(CategoricalDtype, dtype)

    @classmethod
    def construct_from_string(cls, string: str_type) -> CategoricalDtype:
        """
        Construct a CategoricalDtype from a string.

        Parameters
        ----------
        string : str
            Must be the string "category" in order to be successfully constructed.

        Returns
        -------
        CategoricalDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a CategoricalDtype cannot be constructed from the input.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string != cls.name:
            raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'")

        # need ordered=None to ensure that operations specifying dtype="category" don't
        # override the ordered value for existing categoricals
        return cls(ordered=None)

    def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None:
        # Shared tail of __init__/_from_fastpath: validate (unless fastpath)
        # and store the two defining attributes.
        if ordered is not None:
            self.validate_ordered(ordered)

        if categories is not None:
            categories = self.validate_categories(categories, fastpath=fastpath)

        self._categories = categories
        self._ordered = ordered

    def __setstate__(self, state: MutableMapping[str_type, Any]) -> None:
        # for pickle compat. __get_state__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._categories = state.pop("categories", None)
        self._ordered = state.pop("ordered", False)

    def __hash__(self) -> int:
        # _hash_categories returns a uint64, so use the negative
        # space for when we have unknown categories to avoid a conflict
        if self.categories is None:
            if self.ordered:
                return -1
            else:
                return -2
        # We *do* want to include the real self.ordered here
        return int(self._hash_categories)

    def __eq__(self, other: Any) -> bool:
        """
        Rules for CDT equality:
        1) Any CDT is equal to the string 'category'
        2) Any CDT is equal to itself
        3) Any CDT is equal to a CDT with categories=None regardless of ordered
        4) A CDT with ordered=True is only equal to another CDT with
           ordered=True and identical categories in the same order
        5) A CDT with ordered={False, None} is only equal to another CDT with
           ordered={False, None} and identical categories, but same order is
           not required. There is no distinction between False/None.
        6) Any other comparison returns False
        """
        if isinstance(other, str):
            return other == self.name
        elif other is self:
            return True
        elif not (hasattr(other, "ordered") and hasattr(other, "categories")):
            return False
        elif self.categories is None or other.categories is None:
            # For non-fully-initialized dtypes, these are only equal to
            # - the string "category" (handled above)
            # - other CategoricalDtype with categories=None
            return self.categories is other.categories
        elif self.ordered or other.ordered:
            # At least one has ordered=True; equal if both have ordered=True
            # and the same values for categories in the same order.
            return (self.ordered == other.ordered) and self.categories.equals(
                other.categories
            )
        else:
            # Neither has ordered=True; equal if both have the same categories,
            # but same order is not necessary. There is no distinction between
            # ordered=False and ordered=None: CDT(., False) and CDT(., None)
            # will be equal if they have the same categories.
            left = self.categories
            right = other.categories

            # GH#36280 the ordering of checks here is for performance
            if not left.dtype == right.dtype:
                return False

            if len(left) != len(right):
                return False

            if self.categories.equals(other.categories):
                # Check and see if they happen to be identical categories
                return True

            if left.dtype != object:
                # Faster than calculating hash
                indexer = left.get_indexer(right)
                # Because left and right have the same length and are unique,
                # `indexer` not having any -1s implies that there is a
                # bijection between `left` and `right`.
                return (indexer != -1).all()

            # With object-dtype we need a comparison that identifies
            # e.g. int(2) as distinct from float(2)
            return hash(self) == hash(other)

    def __repr__(self) -> str_type:
        if self.categories is None:
            data = "None"
        else:
            data = self.categories._format_data(name=type(self).__name__)
            if data is None:
                # self.categories is RangeIndex
                data = str(self.categories._range)
            data = data.rstrip(", ")
        return f"CategoricalDtype(categories={data}, ordered={self.ordered})"

    @cache_readonly
    def _hash_categories(self) -> int:
        from pandas.core.util.hashing import (
            combine_hash_arrays,
            hash_array,
            hash_tuples,
        )

        categories = self.categories
        ordered = self.ordered

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all our. ATM
            # I don't really want to support just some of the categories being
            # tuples.
            cat_list = list(categories)  # breaks if a np.array of categories
            cat_array = hash_tuples(cat_list)
        else:
            if categories.dtype == "O" and len({type(x) for x in categories}) != 1:
                # TODO: hash_array doesn't handle mixed types. It casts
                # everything to a str first, which means we treat
                # {'1', '2'} the same as {'1', 2}
                # find a better solution
                hashed = hash((tuple(categories), ordered))
                return hashed

            if DatetimeTZDtype.is_dtype(categories.dtype):
                # Avoid future warning.
                categories = categories.view("datetime64[ns]")

            cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            cat_array = np.vstack(
                [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
            )
        else:
            cat_array = np.array([cat_array])
        combined_hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
        return np.bitwise_xor.reduce(combined_hashed)

    @classmethod
    def construct_array_type(cls) -> type_t[Categorical]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas import Categorical

        return Categorical

    @staticmethod
    def validate_ordered(ordered: Ordered) -> None:
        """
        Validates that we have a valid ordered parameter. If
        it is not a boolean, a TypeError will be raised.

        Parameters
        ----------
        ordered : object
            The parameter to be verified.

        Raises
        ------
        TypeError
            If 'ordered' is not a boolean.
        """
        if not is_bool(ordered):
            raise TypeError("'ordered' must either be 'True' or 'False'")

    @staticmethod
    def validate_categories(categories, fastpath: bool = False) -> Index:
        """
        Validates that we have good categories

        Parameters
        ----------
        categories : array-like
        fastpath : bool
            Whether to skip nan and uniqueness checks

        Returns
        -------
        categories : Index
        """
        from pandas.core.indexes.base import Index

        if not fastpath and not is_list_like(categories):
            raise TypeError(
                f"Parameter 'categories' must be list-like, was {repr(categories)}"
            )
        elif not isinstance(categories, ABCIndex):
            categories = Index._with_infer(categories, tupleize_cols=False)

        if not fastpath:

            if categories.hasnans:
                raise ValueError("Categorical categories cannot be null")

            if not categories.is_unique:
                raise ValueError("Categorical categories must be unique")

        if isinstance(categories, ABCCategoricalIndex):
            categories = categories.categories

        return categories

    def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype:
        """
        Returns a CategoricalDtype with categories and ordered taken from dtype
        if specified, otherwise falling back to self if unspecified

        Parameters
        ----------
        dtype : CategoricalDtype

        Returns
        -------
        new_dtype : CategoricalDtype
        """
        if isinstance(dtype, str) and dtype == "category":
            # dtype='category' should not change anything
            return self
        elif not self.is_dtype(dtype):
            raise ValueError(
                f"a CategoricalDtype must be passed to perform an update, "
                f"got {repr(dtype)}"
            )
        else:
            # from here on, dtype is a CategoricalDtype
            dtype = cast(CategoricalDtype, dtype)

        # update categories/ordered unless they've been explicitly passed as None
        new_categories = (
            dtype.categories if dtype.categories is not None else self.categories
        )
        new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered

        return CategoricalDtype(new_categories, new_ordered)

    @property
    def categories(self) -> Index:
        """
        An ``Index`` containing the unique categories allowed.
        """
        return self._categories

    @property
    def ordered(self) -> Ordered:
        """
        Whether the categories have an ordered relationship.
        """
        return self._ordered

    @property
    def _is_boolean(self) -> bool:
        from pandas.core.dtypes.common import is_bool_dtype

        return is_bool_dtype(self.categories)

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        from pandas.core.arrays.sparse import SparseDtype

        # check if we have all categorical dtype with identical categories
        if all(isinstance(x, CategoricalDtype) for x in dtypes):
            first = dtypes[0]
            if all(first == other for other in dtypes[1:]):
                return first

        # special case non-initialized categorical
        # TODO we should figure out the expected return value in general
        non_init_cats = [
            isinstance(x, CategoricalDtype) and x.categories is None for x in dtypes
        ]
        if all(non_init_cats):
            return self
        elif any(non_init_cats):
            return None

        # categorical is aware of Sparse -> extract sparse subdtypes
        dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
        # extract the categories' dtype
        non_cat_dtypes = [
            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
        ]
        # TODO should categorical always give an answer?
        from pandas.core.dtypes.cast import find_common_type

        return find_common_type(non_cat_dtypes)
628@register_extension_dtype
629class DatetimeTZDtype(PandasExtensionDtype):
630 """
631 An ExtensionDtype for timezone-aware datetime data.
633 **This is not an actual numpy dtype**, but a duck type.
635 Parameters
636 ----------
637 unit : str, default "ns"
638 The precision of the datetime data. Currently limited
639 to ``"ns"``.
640 tz : str, int, or datetime.tzinfo
641 The timezone.
643 Attributes
644 ----------
645 unit
646 tz
648 Methods
649 -------
650 None
652 Raises
653 ------
654 pytz.UnknownTimeZoneError
655 When the requested timezone cannot be found.
657 Examples
658 --------
659 >>> pd.DatetimeTZDtype(tz='UTC')
660 datetime64[ns, UTC]
662 >>> pd.DatetimeTZDtype(tz='dateutil/US/Central')
663 datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')]
664 """
666 type: type[Timestamp] = Timestamp
667 kind: str_type = "M"
668 num = 101
669 base = np.dtype("M8[ns]") # TODO: depend on reso?
670 _metadata = ("unit", "tz")
671 _match = re.compile(r"(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
672 _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
674 @property
675 def na_value(self) -> NaTType:
676 return NaT
678 @cache_readonly
679 def str(self):
680 return f"|M8[{self._unit}]"
682 def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None:
683 if isinstance(unit, DatetimeTZDtype):
684 # error: "str" has no attribute "tz"
685 unit, tz = unit.unit, unit.tz # type: ignore[attr-defined]
687 if unit != "ns":
688 if isinstance(unit, str) and tz is None:
689 # maybe a string like datetime64[ns, tz], which we support for
690 # now.
691 result = type(self).construct_from_string(unit)
692 unit = result.unit
693 tz = result.tz
694 msg = (
695 f"Passing a dtype alias like 'datetime64[ns, {tz}]' "
696 "to DatetimeTZDtype is no longer supported. Use "
697 "'DatetimeTZDtype.construct_from_string()' instead."
698 )
699 raise ValueError(msg)
700 if unit not in ["s", "ms", "us", "ns"]:
701 raise ValueError("DatetimeTZDtype only supports s, ms, us, ns units")
703 if tz:
704 tz = timezones.maybe_get_tz(tz)
705 tz = timezones.tz_standardize(tz)
706 elif tz is not None:
707 raise pytz.UnknownTimeZoneError(tz)
708 if tz is None:
709 raise TypeError("A 'tz' is required.")
711 self._unit = unit
712 self._tz = tz
714 @cache_readonly
715 def _reso(self) -> int:
716 """
717 The NPY_DATETIMEUNIT corresponding to this dtype's resolution.
718 """
719 reso = {
720 "s": dtypes.NpyDatetimeUnit.NPY_FR_s,
721 "ms": dtypes.NpyDatetimeUnit.NPY_FR_ms,
722 "us": dtypes.NpyDatetimeUnit.NPY_FR_us,
723 "ns": dtypes.NpyDatetimeUnit.NPY_FR_ns,
724 }[self._unit]
725 return reso.value
727 @property
728 def unit(self) -> str_type:
729 """
730 The precision of the datetime data.
731 """
732 return self._unit
734 @property
735 def tz(self) -> tzinfo:
736 """
737 The timezone.
738 """
739 return self._tz
741 @classmethod
742 def construct_array_type(cls) -> type_t[DatetimeArray]:
743 """
744 Return the array type associated with this dtype.
746 Returns
747 -------
748 type
749 """
750 from pandas.core.arrays import DatetimeArray
752 return DatetimeArray
754 @classmethod
755 def construct_from_string(cls, string: str_type) -> DatetimeTZDtype:
756 """
757 Construct a DatetimeTZDtype from a string.
759 Parameters
760 ----------
761 string : str
762 The string alias for this DatetimeTZDtype.
763 Should be formatted like ``datetime64[ns, <tz>]``,
764 where ``<tz>`` is the timezone name.
766 Examples
767 --------
768 >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]')
769 datetime64[ns, UTC]
770 """
771 if not isinstance(string, str): 771 ↛ 772line 771 didn't jump to line 772, because the condition on line 771 was never true
772 raise TypeError(
773 f"'construct_from_string' expects a string, got {type(string)}"
774 )
776 msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'"
777 match = cls._match.match(string)
778 if match: 778 ↛ 779line 778 didn't jump to line 779, because the condition on line 778 was never true
779 d = match.groupdict()
780 try:
781 return cls(unit=d["unit"], tz=d["tz"])
782 except (KeyError, TypeError, ValueError) as err:
783 # KeyError if maybe_get_tz tries and fails to get a
784 # pytz timezone (actually pytz.UnknownTimeZoneError).
785 # TypeError if we pass a nonsense tz;
786 # ValueError if we pass a unit other than "ns"
787 raise TypeError(msg) from err
788 raise TypeError(msg)
790 def __str__(self) -> str_type:
791 return f"datetime64[{self.unit}, {self.tz}]"
793 @property
794 def name(self) -> str_type:
795 """A string representation of the dtype."""
796 return str(self)
798 def __hash__(self) -> int:
799 # make myself hashable
800 # TODO: update this.
801 return hash(str(self))
803 def __eq__(self, other: Any) -> bool:
804 if isinstance(other, str):
805 if other.startswith("M8["):
806 other = "datetime64[" + other[3:]
807 return other == self.name
809 return (
810 isinstance(other, DatetimeTZDtype)
811 and self.unit == other.unit
812 and tz_compare(self.tz, other.tz)
813 )
815 def __setstate__(self, state) -> None:
816 # for pickle compat. __get_state__ is defined in the
817 # PandasExtensionDtype superclass and uses the public properties to
818 # pickle -> need to set the settable private ones here (see GH26067)
819 self._tz = state["tz"]
820 self._unit = state["unit"]
823@register_extension_dtype
824class PeriodDtype(dtypes.PeriodDtypeBase, PandasExtensionDtype):
825 """
826 An ExtensionDtype for Period data.
828 **This is not an actual numpy dtype**, but a duck type.
830 Parameters
831 ----------
832 freq : str or DateOffset
833 The frequency of this PeriodDtype.
835 Attributes
836 ----------
837 freq
839 Methods
840 -------
841 None
843 Examples
844 --------
845 >>> pd.PeriodDtype(freq='D')
846 period[D]
848 >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd())
849 period[M]
850 """
852 type: type[Period] = Period
853 kind: str_type = "O"
854 str = "|O08"
855 base = np.dtype("O")
856 num = 102
857 _metadata = ("freq",)
858 _match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]")
859 _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
861 def __new__(cls, freq=None):
862 """
863 Parameters
864 ----------
865 freq : frequency
866 """
867 if isinstance(freq, PeriodDtype):
868 return freq
870 elif freq is None:
871 # empty constructor for pickle compat
872 # -10_000 corresponds to PeriodDtypeCode.UNDEFINED
873 u = dtypes.PeriodDtypeBase.__new__(cls, -10_000)
874 u._freq = None
875 return u
877 if not isinstance(freq, BaseOffset):
878 freq = cls._parse_dtype_strict(freq)
880 try:
881 return cls._cache_dtypes[freq.freqstr]
882 except KeyError:
883 dtype_code = freq._period_dtype_code
884 u = dtypes.PeriodDtypeBase.__new__(cls, dtype_code)
885 u._freq = freq
886 cls._cache_dtypes[freq.freqstr] = u
887 return u
889 def __reduce__(self):
890 return type(self), (self.freq,)
892 @property
893 def freq(self):
894 """
895 The frequency object of this PeriodDtype.
896 """
897 return self._freq
899 @classmethod
900 def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset:
901 if isinstance(freq, str): # note: freq is already of type str!
902 if freq.startswith("period[") or freq.startswith("Period["):
903 m = cls._match.search(freq)
904 if m is not None:
905 freq = m.group("freq")
907 freq_offset = to_offset(freq)
908 if freq_offset is not None:
909 return freq_offset
911 raise ValueError("could not construct PeriodDtype")
913 @classmethod
914 def construct_from_string(cls, string: str_type) -> PeriodDtype:
915 """
916 Strict construction from a string, raise a TypeError if not
917 possible
918 """
919 if ( 919 ↛ 926line 919 didn't jump to line 926
920 isinstance(string, str)
921 and (string.startswith("period[") or string.startswith("Period["))
922 or isinstance(string, BaseOffset)
923 ):
924 # do not parse string like U as period[U]
925 # avoid tuple to be regarded as freq
926 try:
927 return cls(freq=string)
928 except ValueError:
929 pass
930 if isinstance(string, str): 930 ↛ 933line 930 didn't jump to line 933, because the condition on line 930 was never false
931 msg = f"Cannot construct a 'PeriodDtype' from '{string}'"
932 else:
933 msg = f"'construct_from_string' expects a string, got {type(string)}"
934 raise TypeError(msg)
936 def __str__(self) -> str_type:
937 return self.name
939 @property
940 def name(self) -> str_type:
941 return f"period[{self.freq.freqstr}]"
943 @property
944 def na_value(self) -> NaTType:
945 return NaT
947 def __hash__(self) -> int:
948 # make myself hashable
949 return hash(str(self))
951 def __eq__(self, other: Any) -> bool:
952 if isinstance(other, str):
953 return other in [self.name, self.name.title()]
955 elif isinstance(other, PeriodDtype):
957 # For freqs that can be held by a PeriodDtype, this check is
958 # equivalent to (and much faster than) self.freq == other.freq
959 sfreq = self.freq
960 ofreq = other.freq
961 return (
962 sfreq.n == ofreq.n
963 and sfreq._period_dtype_code == ofreq._period_dtype_code
964 )
966 return False
968 def __ne__(self, other: Any) -> bool:
969 return not self.__eq__(other)
971 def __setstate__(self, state) -> None:
972 # for pickle compat. __getstate__ is defined in the
973 # PandasExtensionDtype superclass and uses the public properties to
974 # pickle -> need to set the settable private ones here (see GH26067)
975 self._freq = state["freq"]
977 @classmethod
978 def is_dtype(cls, dtype: object) -> bool:
979 """
980 Return a boolean if we if the passed type is an actual dtype that we
981 can match (via string or type)
982 """
983 if isinstance(dtype, str):
984 # PeriodDtype can be instantiated from freq string like "U",
985 # but doesn't regard freq str like "U" as dtype.
986 if dtype.startswith("period[") or dtype.startswith("Period["):
987 try:
988 if cls._parse_dtype_strict(dtype) is not None:
989 return True
990 else:
991 return False
992 except ValueError:
993 return False
994 else:
995 return False
996 return super().is_dtype(dtype)
998 @classmethod
999 def construct_array_type(cls) -> type_t[PeriodArray]:
1000 """
1001 Return the array type associated with this dtype.
1003 Returns
1004 -------
1005 type
1006 """
1007 from pandas.core.arrays import PeriodArray
1009 return PeriodArray
1011 def __from_arrow__(
1012 self, array: pyarrow.Array | pyarrow.ChunkedArray
1013 ) -> PeriodArray:
1014 """
1015 Construct PeriodArray from pyarrow Array/ChunkedArray.
1016 """
1017 import pyarrow
1019 from pandas.core.arrays import PeriodArray
1020 from pandas.core.arrays.arrow._arrow_utils import (
1021 pyarrow_array_to_numpy_and_mask,
1022 )
1024 if isinstance(array, pyarrow.Array):
1025 chunks = [array]
1026 else:
1027 chunks = array.chunks
1029 results = []
1030 for arr in chunks:
1031 data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=np.dtype(np.int64))
1032 parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
1033 # error: Invalid index type "ndarray[Any, dtype[bool_]]" for "PeriodArray";
1034 # expected type "Union[int, Sequence[int], Sequence[bool], slice]"
1035 parr[~mask] = NaT # type: ignore[index]
1036 results.append(parr)
1038 if not results:
1039 return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False)
1040 return PeriodArray._concat_same_type(results)
@register_extension_dtype
class IntervalDtype(PandasExtensionDtype):
    """
    An ExtensionDtype for Interval data.

    **This is not an actual numpy dtype**, but a duck type.

    Parameters
    ----------
    subtype : str, np.dtype
        The dtype of the Interval bounds.

    Attributes
    ----------
    subtype

    Methods
    -------
    None

    Examples
    --------
    >>> pd.IntervalDtype(subtype='int64', closed='both')
    interval[int64, both]
    """

    name = "interval"
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    num = 103
    # Attributes that identify an instance; used by the PandasExtensionDtype
    # base class for pickling (__getstate__) and by equality/hash helpers.
    _metadata = (
        "subtype",
        "closed",
    )

    # Parses strings like "interval[int64]" or "Interval[datetime64[ns], right]";
    # the ", closed" part is optional.
    _match = re.compile(
        r"(I|i)nterval\[(?P<subtype>[^,]+(\[.+\])?)"
        r"(, (?P<closed>(right|left|both|neither)))?\]"
    )

    # Cache of instances keyed by str(subtype) + str(closed), so repeated
    # construction of the same dtype returns the same object.
    _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}

    def __new__(cls, subtype=None, closed: str_type | None = None):
        from pandas.core.dtypes.common import (
            is_string_dtype,
            pandas_dtype,
        )

        if closed is not None and closed not in {"right", "left", "both", "neither"}:
            raise ValueError("closed must be one of 'right', 'left', 'both', 'neither'")

        if isinstance(subtype, IntervalDtype):
            # Re-wrapping an existing instance: return it as-is, but reject a
            # conflicting 'closed' keyword.
            if closed is not None and closed != subtype.closed:
                raise ValueError(
                    "dtype.closed and 'closed' do not match. "
                    "Try IntervalDtype(dtype.subtype, closed) instead."
                )
            return subtype
        elif subtype is None:
            # we are called as an empty constructor
            # generally for pickle compat
            u = object.__new__(cls)
            u._subtype = None
            u._closed = closed
            return u
        elif isinstance(subtype, str) and subtype.lower() == "interval":
            # Generic "interval" string: no specific bounds dtype.
            subtype = None
        else:
            if isinstance(subtype, str):
                # Parse "interval[<subtype>, <closed>]" into its components.
                m = cls._match.search(subtype)
                if m is not None:
                    gd = m.groupdict()
                    subtype = gd["subtype"]
                    if gd.get("closed", None) is not None:
                        # A 'closed' embedded in the string must agree with an
                        # explicitly passed 'closed' keyword, if any.
                        if closed is not None:
                            if closed != gd["closed"]:
                                raise ValueError(
                                    "'closed' keyword does not match value "
                                    "specified in dtype string"
                                )
                        closed = gd["closed"]

            try:
                subtype = pandas_dtype(subtype)
            except TypeError as err:
                raise TypeError("could not construct IntervalDtype") from err

        if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype):
            # GH 19016
            msg = (
                "category, object, and string subtypes are not supported "
                "for IntervalDtype"
            )
            raise TypeError(msg)

        # Look up / populate the instance cache keyed by subtype+closed.
        key = str(subtype) + str(closed)
        try:
            return cls._cache_dtypes[key]
        except KeyError:
            u = object.__new__(cls)
            u._subtype = subtype
            u._closed = closed
            cls._cache_dtypes[key] = u
            return u

    @cache_readonly
    def _can_hold_na(self) -> bool:
        # Whether this dtype can store missing values; integer bound dtypes
        # cannot represent NaN/NaT.
        subtype = self._subtype
        if subtype is None:
            # partially-initialized
            raise NotImplementedError(
                "_can_hold_na is not defined for partially-initialized IntervalDtype"
            )
        if subtype.kind in ["i", "u"]:
            return False
        return True

    @property
    def closed(self):
        # Side(s) on which the intervals are closed: "right", "left", "both",
        # "neither", or None for a partially-initialized instance.
        return self._closed

    @property
    def subtype(self):
        """
        The dtype of the Interval bounds.
        """
        return self._subtype

    @classmethod
    def construct_array_type(cls) -> type[IntervalArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import IntervalArray

        return IntervalArray

    @classmethod
    def construct_from_string(cls, string: str_type) -> IntervalDtype:
        """
        attempt to construct this type from a string, raise a TypeError
        if its not possible
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        if string.lower() == "interval" or cls._match.search(string) is not None:
            return cls(string)

        msg = (
            f"Cannot construct a 'IntervalDtype' from '{string}'.\n\n"
            "Incorrectly formatted string passed to constructor. "
            "Valid formats include Interval or Interval[dtype] "
            "where dtype is numeric, datetime, or timedelta"
        )
        raise TypeError(msg)

    @property
    def type(self) -> type[Interval]:
        # The scalar type for elements of this dtype.
        return Interval

    def __str__(self) -> str_type:
        if self.subtype is None:
            return "interval"
        if self.closed is None:
            # Only partially initialized GH#38394
            return f"interval[{self.subtype}]"
        return f"interval[{self.subtype}, {self.closed}]"

    def __hash__(self) -> int:
        # make myself hashable
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            # Compare against string spellings, e.g. "interval[int64, right]".
            return other.lower() in (self.name.lower(), str(self).lower())
        elif not isinstance(other, IntervalDtype):
            return False
        elif self.subtype is None or other.subtype is None:
            # None should match any subtype
            return True
        elif self.closed != other.closed:
            return False
        else:
            from pandas.core.dtypes.common import is_dtype_equal

            return is_dtype_equal(self.subtype, other.subtype)

    def __setstate__(self, state) -> None:
        # for pickle compat. __get_state__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._subtype = state["subtype"]

        # backward-compat older pickles won't have "closed" key
        self._closed = state.pop("closed", None)

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Return a boolean if we if the passed type is an actual dtype that we
        can match (via string or type)
        """
        if isinstance(dtype, str):
            if dtype.lower().startswith("interval"):
                try:
                    # Delegate string validation to construct_from_string.
                    if cls.construct_from_string(dtype) is not None:
                        return True
                    else:
                        return False
                except (ValueError, TypeError):
                    return False
            else:
                return False
        return super().is_dtype(dtype)

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> IntervalArray:
        """
        Construct IntervalArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays import IntervalArray

        # Normalize to a list of chunks so both Array and ChunkedArray inputs
        # take the same code path.
        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            chunks = array.chunks

        results = []
        for arr in chunks:
            if isinstance(arr, pyarrow.ExtensionArray):
                # Unwrap to the underlying struct storage with left/right fields.
                arr = arr.storage
            left = np.asarray(arr.field("left"), dtype=self.subtype)
            right = np.asarray(arr.field("right"), dtype=self.subtype)
            iarr = IntervalArray.from_arrays(left, right, closed=self.closed)
            results.append(iarr)

        if not results:
            # No chunks: return an empty IntervalArray of this dtype.
            return IntervalArray.from_arrays(
                np.array([], dtype=self.subtype),
                np.array([], dtype=self.subtype),
                closed=self.closed,
            )
        return IntervalArray._concat_same_type(results)

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        # Common dtype for concatenation: all inputs must be IntervalDtype,
        # otherwise defer to other dtypes' implementations.
        if not all(isinstance(x, IntervalDtype) for x in dtypes):
            return None

        closed = cast("IntervalDtype", dtypes[0]).closed
        if not all(cast("IntervalDtype", x).closed == closed for x in dtypes):
            # Mixed 'closed' values cannot be represented by one IntervalDtype.
            return np.dtype(object)

        from pandas.core.dtypes.cast import find_common_type

        common = find_common_type([cast("IntervalDtype", x).subtype for x in dtypes])
        if common == object:
            return np.dtype(object)
        return IntervalDtype(common, closed=closed)
class PandasDtype(ExtensionDtype):
    """
    An ExtensionDtype that wraps a plain NumPy dtype.

    Exists mainly so NumPy-backed arrays can participate in the pandas
    extension-array machinery; it is rarely useful on its own.

    Parameters
    ----------
    dtype : object
        Anything ``np.dtype`` can interpret, or another ``PandasDtype``.

    See Also
    --------
    numpy.dtype
    """

    # Attribute used by the ExtensionDtype base class for equality/pickling.
    _metadata = ("_dtype",)

    def __init__(self, dtype: npt.DTypeLike | PandasDtype | None) -> None:
        # Unwrap an existing PandasDtype so the constructor is idempotent.
        unwrapped = dtype.numpy_dtype if isinstance(dtype, PandasDtype) else dtype
        self._dtype = np.dtype(unwrapped)

    def __repr__(self) -> str:
        return f"PandasDtype({self.name!r})"

    @property
    def numpy_dtype(self) -> np.dtype:
        """
        The NumPy dtype this PandasDtype wraps.
        """
        return self._dtype

    @property
    def name(self) -> str:
        """
        A bit-width name for this data-type.
        """
        return self._dtype.name

    @property
    def type(self) -> type[np.generic]:
        """
        The type object used to instantiate a scalar of this NumPy data-type.
        """
        return self._dtype.type

    @property
    def kind(self) -> str:
        """
        A character code (one of 'biufcmMOSUV') identifying the general kind of data.
        """
        return self._dtype.kind

    @property
    def itemsize(self) -> int:
        """
        The element size of this data-type object.
        """
        return self._dtype.itemsize

    @property
    def _is_numeric(self) -> bool:
        # bool, int, uint, float, complex — excludes object, str, unicode, void.
        return self.kind in frozenset("biufc")

    @property
    def _is_boolean(self) -> bool:
        return self.kind == "b"

    @classmethod
    def construct_from_string(cls, string: str) -> PandasDtype:
        """
        Build a PandasDtype from a dtype string.

        Raises
        ------
        TypeError
            If *string* is not a string, or is one numpy cannot interpret.
        """
        try:
            np_dtype = np.dtype(string)
        except TypeError as exc:
            # Distinguish "wrong type" from "unparseable dtype string".
            if isinstance(string, str):
                msg = f"Cannot construct a 'PandasDtype' from '{string}'"
            else:
                msg = f"'construct_from_string' expects a string, got {type(string)}"
            raise TypeError(msg) from exc
        return cls(np_dtype)

    @classmethod
    def construct_array_type(cls) -> type_t[PandasArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import PandasArray

        return PandasArray
class BaseMaskedDtype(ExtensionDtype):
    """
    Base class for dtypes for BaseMaskedArray subclasses.
    """

    # Subclasses set the dtype name and the scalar type they wrap.
    name: str
    base = None
    type: type

    @property
    def na_value(self) -> libmissing.NAType:
        # All masked dtypes share pd.NA as their missing-value sentinel.
        return libmissing.NA

    @cache_readonly
    def numpy_dtype(self) -> np.dtype:
        """Return an instance of our numpy dtype"""
        return np.dtype(self.type)

    @cache_readonly
    def kind(self) -> str:
        return self.numpy_dtype.kind

    @cache_readonly
    def itemsize(self) -> int:
        """Return the number of bytes in this dtype"""
        return self.numpy_dtype.itemsize

    @classmethod
    def construct_array_type(cls) -> type_t[BaseMaskedArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        # Abstract: each concrete masked dtype supplies its array class.
        raise NotImplementedError

    @classmethod
    def from_numpy_dtype(cls, dtype: np.dtype) -> BaseMaskedDtype:
        """
        Construct the MaskedDtype corresponding to the given numpy dtype.
        """
        kind = dtype.kind
        if kind == "b":
            from pandas.core.arrays.boolean import BooleanDtype

            return BooleanDtype()
        if kind in "iu":
            from pandas.core.arrays.integer import INT_STR_TO_DTYPE

            return INT_STR_TO_DTYPE[dtype.name]
        if kind == "f":
            from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

            return FLOAT_STR_TO_DTYPE[dtype.name]
        # Anything but bool/int/uint/float has no masked counterpart.
        raise NotImplementedError(dtype)

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        # Strip the masks, find the common plain numpy dtype, then re-mask it.
        from pandas.core.dtypes.cast import find_common_type

        unmasked = [
            dtype.numpy_dtype if isinstance(dtype, BaseMaskedDtype) else dtype
            for dtype in dtypes
        ]
        new_dtype = find_common_type(unmasked)
        if not isinstance(new_dtype, np.dtype):
            # If we ever support e.g. Masked[DatetimeArray] then this will change
            return None
        try:
            return type(self).from_numpy_dtype(new_dtype)
        except (KeyError, NotImplementedError):
            # No masked counterpart for the common dtype.
            return None