Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/dtypes/cast.py: 7%
812 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Routines for casting.
3"""
5from __future__ import annotations
7from datetime import (
8 date,
9 datetime,
10 timedelta,
11)
12import functools
13from typing import (
14 TYPE_CHECKING,
15 Any,
16 Sized,
17 TypeVar,
18 cast,
19 overload,
20)
21import warnings
23from dateutil.parser import ParserError
24import numpy as np
26from pandas._libs import lib
27from pandas._libs.tslibs import (
28 NaT,
29 OutOfBoundsDatetime,
30 OutOfBoundsTimedelta,
31 Timedelta,
32 Timestamp,
33 astype_overflowsafe,
34)
35from pandas._libs.tslibs.timedeltas import array_to_timedelta64
36from pandas._typing import (
37 ArrayLike,
38 Dtype,
39 DtypeObj,
40 Scalar,
41)
42from pandas.errors import IntCastingNaNError
43from pandas.util._exceptions import find_stack_level
44from pandas.util._validators import validate_bool_kwarg
46from pandas.core.dtypes.astype import astype_nansafe
47from pandas.core.dtypes.common import (
48 DT64NS_DTYPE,
49 TD64NS_DTYPE,
50 ensure_int8,
51 ensure_int16,
52 ensure_int32,
53 ensure_int64,
54 ensure_object,
55 ensure_str,
56 is_bool,
57 is_bool_dtype,
58 is_complex,
59 is_complex_dtype,
60 is_datetime64_dtype,
61 is_datetime64tz_dtype,
62 is_dtype_equal,
63 is_extension_array_dtype,
64 is_float,
65 is_float_dtype,
66 is_integer,
67 is_integer_dtype,
68 is_numeric_dtype,
69 is_object_dtype,
70 is_scalar,
71 is_string_dtype,
72 is_timedelta64_dtype,
73 is_unsigned_integer_dtype,
74 pandas_dtype,
75)
76from pandas.core.dtypes.dtypes import (
77 CategoricalDtype,
78 DatetimeTZDtype,
79 ExtensionDtype,
80 IntervalDtype,
81 PeriodDtype,
82)
83from pandas.core.dtypes.generic import (
84 ABCExtensionArray,
85 ABCIndex,
86 ABCSeries,
87)
88from pandas.core.dtypes.inference import is_list_like
89from pandas.core.dtypes.missing import (
90 array_equivalent,
91 is_valid_na_for_dtype,
92 isna,
93 na_value_for_dtype,
94 notna,
95)
97if TYPE_CHECKING: 97 ↛ 99line 97 didn't jump to line 99, because the condition on line 97 was never true
99 from pandas import Index
100 from pandas.core.arrays import (
101 Categorical,
102 DatetimeArray,
103 ExtensionArray,
104 IntervalArray,
105 PeriodArray,
106 TimedeltaArray,
107 )
# Cached max values for each signed-integer width; used by
# coerce_indexer_dtype to pick the smallest dtype that can index categories.
_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max
_int64_max = np.iinfo(np.int64).max

# Cached object dtype; compared against repeatedly throughout this module.
_dtype_obj = np.dtype(object)

# TypeVar bound to np.ndarray so functions can declare that they return the
# same ndarray type they were given.
NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray)
def maybe_convert_platform(
    values: list | tuple | range | np.ndarray | ExtensionArray,
) -> ArrayLike:
    """try to do platform conversion, allow ndarray or list here"""
    # Builtin sequences get wrapped in a 1-D object ndarray first; anything
    # else is expected to already be an ndarray or ExtensionArray.
    result: ArrayLike
    if not isinstance(values, (list, tuple, range)):
        result = values
    else:
        result = construct_1d_object_array_from_listlike(values)

    # Object dtype gets a soft-conversion pass so that e.g. a list of ints
    # comes out as an integer ndarray rather than object.
    if result.dtype == _dtype_obj:
        narrowed = cast(np.ndarray, result)
        result = lib.maybe_convert_objects(narrowed)

    return result
def is_nested_object(obj) -> bool:
    """
    Check whether ``obj`` is an object-dtype Series holding at least one
    Series element (i.e. a "nested" object).

    Notes
    -----
    This is not necessarily performant: it may scan every element.
    """
    if not isinstance(obj, ABCSeries):
        return False
    if not is_object_dtype(obj.dtype):
        return False
    return any(isinstance(elem, ABCSeries) for elem in obj._values)
def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar:
    """
    Wrap a datetime-like scalar in a pandas Timestamp/Timedelta, unless the
    target dtype is object, in which case the value is passed through as-is.

    Parameters
    ----------
    value : scalar
    dtype : Dtype, optional

    Returns
    -------
    scalar
    """
    if dtype != _dtype_obj:
        if isinstance(value, (np.datetime64, datetime)):
            value = Timestamp(value)
        elif isinstance(value, (np.timedelta64, timedelta)):
            value = Timedelta(value)

    return value
def maybe_box_native(value: Scalar) -> Scalar:
    """
    Convert a scalar to its python-native equivalent where one exists.

    Parameters
    ----------
    value : scalar or Series

    Returns
    -------
    scalar or Series
    """
    # Order matters: numpy bools are not floats/ints, but numpy ints would
    # also satisfy is_float if checked with plain isinstance, so use the
    # pandas inference helpers.
    if is_float(value):
        return float(value)  # type: ignore[arg-type]
    if is_integer(value):
        return int(value)  # type: ignore[arg-type]
    if is_bool(value):
        return bool(value)
    if isinstance(value, (np.datetime64, np.timedelta64)):
        return maybe_box_datetimelike(value)
    return value
def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar:
    """
    Convert a Timedelta or Timestamp to timedelta64 or datetime64 for setting
    into a numpy array. Failing to unbox would risk dropping nanoseconds.

    Notes
    -----
    Caller is responsible for checking dtype.kind in ["m", "M"]
    """
    if is_valid_na_for_dtype(value, dtype):
        # GH#36541: can't fill array directly with pd.NaT
        # > np.empty(10, dtype="datetime64[64]").fill(pd.NaT)
        # ValueError: cannot convert float NaN to integer
        value = dtype.type("NaT", "ns")
    elif isinstance(value, Timestamp):
        if value.tz is None:
            # tz-naive Timestamp unboxes losslessly to datetime64[ns]
            value = value.to_datetime64()
        elif not isinstance(dtype, DatetimeTZDtype):
            raise TypeError("Cannot unbox tzaware Timestamp to tznaive dtype")
    elif isinstance(value, Timedelta):
        value = value.to_timedelta64()

    # reject e.g. placing a datetime64 into a timedelta64 array
    _disallow_mismatched_datetimelike(value, dtype)
    return value
234def _disallow_mismatched_datetimelike(value, dtype: DtypeObj):
235 """
236 numpy allows np.array(dt64values, dtype="timedelta64[ns]") and
237 vice-versa, but we do not want to allow this, so we need to
238 check explicitly
239 """
240 vdtype = getattr(value, "dtype", None)
241 if vdtype is None:
242 return
243 elif (vdtype.kind == "m" and dtype.kind == "M") or (
244 vdtype.kind == "M" and dtype.kind == "m"
245 ):
246 raise TypeError(f"Cannot cast {repr(value)} to {dtype}")
@overload
def maybe_downcast_to_dtype(result: np.ndarray, dtype: str | np.dtype) -> np.ndarray:
    # overload: an ndarray input always comes back as an ndarray
    ...


@overload
def maybe_downcast_to_dtype(result: ExtensionArray, dtype: str | np.dtype) -> ArrayLike:
    # overload: an ExtensionArray input may come back as EA or ndarray
    ...
def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLike:
    """
    try to cast to the specified dtype (e.g. convert back to bool/int
    or could be an astype of float64->float32
    """
    do_round = False

    if isinstance(dtype, str):
        if dtype == "infer":
            # Map the inferred kind of `result` to a concrete numpy dtype
            # string; anything unrecognized falls back to object.
            inferred_type = lib.infer_dtype(result, skipna=False)
            if inferred_type == "boolean":
                dtype = "bool"
            elif inferred_type == "integer":
                dtype = "int64"
            elif inferred_type == "datetime64":
                dtype = "datetime64[ns]"
            elif inferred_type in ["timedelta", "timedelta64"]:
                dtype = "timedelta64[ns]"

            # try to upcast here
            elif inferred_type == "floating":
                dtype = "int64"
                if issubclass(result.dtype.type, np.number):
                    # floats that round-trip to int need rounding first
                    do_round = True

            else:
                # TODO: complex? what if result is already non-object?
                dtype = "object"

        dtype = np.dtype(dtype)

    if not isinstance(dtype, np.dtype):
        # enforce our signature annotation
        raise TypeError(dtype)  # pragma: no cover

    converted = maybe_downcast_numeric(result, dtype, do_round)
    if converted is not result:
        # the numeric path produced something new; we are done
        return converted

    # a datetimelike
    # GH12821, iNaT is cast to float
    if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
        result = result.astype(dtype)

    elif dtype.kind == "m" and result.dtype == _dtype_obj:
        # test_where_downcast_to_td64
        result = cast(np.ndarray, result)
        result = array_to_timedelta64(result)

    elif dtype == np.dtype("M8[ns]") and result.dtype == _dtype_obj:
        return np.asarray(maybe_cast_to_datetime(result, dtype=dtype))

    return result
@overload
def maybe_downcast_numeric(
    result: np.ndarray, dtype: np.dtype, do_round: bool = False
) -> np.ndarray:
    # overload: ndarray in -> ndarray out
    ...


@overload
def maybe_downcast_numeric(
    result: ExtensionArray, dtype: DtypeObj, do_round: bool = False
) -> ArrayLike:
    # overload: ExtensionArray in -> EA or ndarray out
    ...
def maybe_downcast_numeric(
    result: ArrayLike, dtype: DtypeObj, do_round: bool = False
) -> ArrayLike:
    """
    Subset of maybe_downcast_to_dtype restricted to numeric dtypes.

    Returns ``result`` unchanged whenever the downcast would lose
    information; otherwise returns ``result`` converted to ``dtype``.

    Parameters
    ----------
    result : ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    do_round : bool
        Round floats before comparing (used when inferring int from float).

    Returns
    -------
    ndarray or ExtensionArray
    """
    if not isinstance(dtype, np.dtype) or not isinstance(result.dtype, np.dtype):
        # e.g. SparseDtype has no itemsize attr
        return result

    def trans(x):
        # optional pre-rounding so e.g. 2.0000001 -> 2 comparisons succeed
        if do_round:
            return x.round()
        return x

    if dtype.kind == result.dtype.kind:
        # don't allow upcasts here (except if empty)
        if result.dtype.itemsize <= dtype.itemsize and result.size:
            return result

    if is_bool_dtype(dtype) or is_integer_dtype(dtype):

        if not result.size:
            # if we don't have any elements, just astype it
            return trans(result).astype(dtype)

        # do a test on the first element, if it fails then we are done
        r = result.ravel()
        arr = np.array([r[0]])

        if isna(arr).any():
            # if we have any nulls, then we are done
            return result

        elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)):
            # a comparable, e.g. a Decimal may slip in here
            return result

        if (
            issubclass(result.dtype.type, (np.object_, np.number))
            and notna(result).all()
        ):
            new_result = trans(result).astype(dtype)
            if new_result.dtype.kind == "O" or result.dtype.kind == "O":
                # np.allclose may raise TypeError on object-dtype
                if (new_result == result).all():
                    return new_result
            else:
                # exact equality only (rtol=0) — no loss allowed
                if np.allclose(new_result, result, rtol=0):
                    return new_result

    elif (
        issubclass(dtype.type, np.floating)
        and not is_bool_dtype(result.dtype)
        and not is_string_dtype(result.dtype)
    ):
        new_result = result.astype(dtype)

        # Adjust tolerances based on floating point size
        size_tols = {4: 5e-4, 8: 5e-8, 16: 5e-16}

        atol = size_tols.get(new_result.dtype.itemsize, 0.0)

        # Check downcast float values are still equal within 7 digits when
        # converting from float64 to float32
        if np.allclose(new_result, result, equal_nan=True, rtol=0.0, atol=atol):
            return new_result

    elif dtype.kind == result.dtype.kind == "c":
        new_result = result.astype(dtype)

        if array_equivalent(new_result, result):
            # TODO: use tolerance like we do for float?
            return new_result

    return result
def maybe_cast_pointwise_result(
    result: ArrayLike,
    dtype: DtypeObj,
    numeric_only: bool = False,
    same_dtype: bool = True,
) -> ArrayLike:
    """
    Try casting result of a pointwise operation back to the original dtype if
    appropriate.

    Parameters
    ----------
    result : array-like
        Result to cast.
    dtype : np.dtype or ExtensionDtype
        Dtype of the input from which result was calculated.
    numeric_only : bool, default False
        Whether to cast only numerics or datetimes as well.
    same_dtype : bool, default True
        Specify dtype when calling _from_sequence

    Returns
    -------
    result : array-like
        result maybe casted to the dtype.
    """
    assert not is_scalar(result)

    if isinstance(dtype, ExtensionDtype):
        if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
            # TODO: avoid this special-casing
            # We have to special case categorical so as not to upcast
            # things like counts back to categorical

            cls = dtype.construct_array_type()
            if same_dtype:
                # pin the exact target dtype when reconstructing the EA
                result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
            else:
                result = maybe_cast_to_extension_array(cls, result)

    elif (numeric_only and is_numeric_dtype(dtype)) or not numeric_only:
        result = maybe_downcast_to_dtype(result, dtype)

    return result
def maybe_cast_to_extension_array(
    cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None
) -> ArrayLike:
    """
    Attempt ``cls._from_sequence(obj, dtype=dtype)``; any failure hands the
    input back unchanged.

    Parameters
    ----------
    cls : class, subclass of ExtensionArray
    obj : arraylike
        Values to pass to cls._from_sequence
    dtype : ExtensionDtype, optional

    Returns
    -------
    ExtensionArray or obj
    """
    from pandas.core.arrays.string_ import BaseStringArray

    assert isinstance(cls, type), f"must pass a type: {cls}"
    assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
    assert issubclass(cls, ABCExtensionArray), assertion_msg

    # Everything can be converted to StringArrays, but we may not want to convert
    if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string":
        return obj

    try:
        return cls._from_sequence(obj, dtype=dtype)
    except Exception:
        # We can't predict what downstream EA constructors may raise
        return obj
@overload
def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype:
    # overload: a numpy dtype maps to a numpy dtype
    ...


@overload
def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype:
    # overload as declared upstream; NOTE(review): the implementation can
    # also return object dtype for an EA dtype that cannot hold NA
    ...
def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj:
    """
    Return ``dtype`` if it can already hold NA values, otherwise the closest
    dtype that can.
    """
    if isinstance(dtype, ExtensionDtype):
        if dtype._can_hold_na:
            return dtype
        if isinstance(dtype, IntervalDtype):
            # TODO(GH#45349): don't special-case IntervalDtype, allow
            # overriding instead of returning object below.
            return IntervalDtype(np.float64, closed=dtype.closed)
        return _dtype_obj

    kind = dtype.kind
    if kind == "b":
        # bool has no NA representation -> fall back to object
        return _dtype_obj
    if kind in ("i", "u"):
        # integers are promoted to float64 so NaN fits
        return np.dtype(np.float64)
    return dtype
def maybe_promote(dtype: np.dtype, fill_value=np.nan):
    """
    Find the minimal dtype that can hold both the given dtype and fill_value.

    Parameters
    ----------
    dtype : np.dtype
    fill_value : scalar, default np.nan

    Returns
    -------
    dtype
        Upcasted from dtype argument if necessary.
    fill_value
        Upcasted from fill_value argument if necessary.

    Raises
    ------
    ValueError
        If fill_value is a non-scalar and dtype is not object.
    """
    # TODO(2.0): need to directly use the non-cached version as long as we
    # possibly raise a deprecation warning for datetime dtype
    if dtype.kind == "M":
        return _maybe_promote(dtype, fill_value)
    # for performance, we are using a cached version of the actual implementation
    # of the function in _maybe_promote. However, this doesn't always work (in case
    # of non-hashable arguments), so we fallback to the actual implementation if needed
    try:
        # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type
        # "Type[Any]"; expected "Hashable"  [arg-type]
        # type(fill_value) is part of the cache key so that 1 and True
        # (equal, same hash) do not collide in the cache.
        return _maybe_promote_cached(
            dtype, fill_value, type(fill_value)  # type: ignore[arg-type]
        )
    except TypeError:
        # if fill_value is not hashable (required for caching)
        return _maybe_promote(dtype, fill_value)
@functools.lru_cache(maxsize=128)
def _maybe_promote_cached(dtype, fill_value, fill_value_type):
    # Cached wrapper around _maybe_promote below.
    # fill_value_type is an extra (otherwise unused) argument that is folded
    # into the lru_cache key so that equal-but-distinct values such as
    # 1 and True get separate cache entries.
    return _maybe_promote(dtype, fill_value)
def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
    # The actual implementation of the function, use `maybe_promote` above for
    # a cached version.
    if not is_scalar(fill_value):
        # with object dtype there is nothing to promote, and the user can
        # pass pretty much any weird fill_value they like; for any other
        # dtype a non-scalar fill_value is invalid
        if not is_object_dtype(dtype):
            raise ValueError("fill_value must be a scalar")
        dtype = _dtype_obj
        return dtype, fill_value

    # NA-like fill against a numeric/datetimelike dtype: promote the dtype
    # so it can hold NA, and use that dtype's native NA value.
    kinds = ["i", "u", "f", "c", "m", "M"]
    if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in kinds:
        dtype = ensure_dtype_can_hold_na(dtype)
        fv = na_value_for_dtype(dtype)
        return dtype, fv

    elif isinstance(dtype, CategoricalDtype):
        if fill_value in dtype.categories or isna(fill_value):
            # fill_value is representable in the existing categories
            return dtype, fill_value
        else:
            return object, ensure_object(fill_value)

    elif isna(fill_value):
        # NA-like fill against a dtype not covered above -> object
        dtype = _dtype_obj
        if fill_value is None:
            # but we retain e.g. pd.NA
            fill_value = np.nan
        return dtype, fill_value

    # returns tuple of (dtype, fill_value)
    if issubclass(dtype.type, np.datetime64):
        inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
        if inferred == dtype:
            return dtype, fv

        # TODO(2.0): once this deprecation is enforced, this whole case
        # becomes equivalent to:
        #  dta = DatetimeArray._from_sequence([], dtype="M8[ns]")
        #  try:
        #      fv = dta._validate_setitem_value(fill_value)
        #      return dta.dtype, fv
        #  except (ValueError, TypeError):
        #      return _dtype_obj, fill_value
        if isinstance(fill_value, date) and not isinstance(fill_value, datetime):
            # deprecate casting of date object to match infer_dtype_from_scalar
            #  and DatetimeArray._validate_setitem_value
            try:
                fv = Timestamp(fill_value).to_datetime64()
            except OutOfBoundsDatetime:
                pass
            else:
                warnings.warn(
                    "Using a `date` object for fill_value with `datetime64[ns]` "
                    "dtype is deprecated. In a future version, this will be cast "
                    "to object dtype. Pass `fill_value=Timestamp(date_obj)` instead.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
                return dtype, fv
        elif isinstance(fill_value, str):
            try:
                # explicitly wrap in str to convert np.str_
                fv = Timestamp(str(fill_value))
            except (ValueError, TypeError):
                pass
            else:
                # tz-aware string timestamps do not fit a naive M8[ns] dtype
                if isna(fv) or fv.tz is None:
                    return dtype, fv.asm8

        return np.dtype("object"), fill_value

    elif issubclass(dtype.type, np.timedelta64):
        inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
        if inferred == dtype:
            return dtype, fv

        return np.dtype("object"), fill_value

    elif is_float(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            dtype = np.dtype(np.float64)

        elif dtype.kind == "f":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.float64 and dtype is np.float32
                dtype = mst

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

    elif is_bool(fill_value):
        if not issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

    elif is_integer(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            if not np.can_cast(fill_value, dtype):
                # upcast to prevent overflow
                mst = np.min_scalar_type(fill_value)
                dtype = np.promote_types(dtype, mst)
                if dtype.kind == "f":
                    # Case where we disagree with numpy
                    dtype = np.dtype(np.object_)

    elif is_complex(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, (np.integer, np.floating)):
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.complex128 and dtype is np.complex64
                dtype = mst

    else:
        dtype = np.dtype(np.object_)

    # in case we have a string that looked like a number
    if issubclass(dtype.type, (bytes, str)):
        dtype = np.dtype(np.object_)

    fill_value = _ensure_dtype_type(fill_value, dtype)
    return dtype, fill_value
714def _ensure_dtype_type(value, dtype: np.dtype):
715 """
716 Ensure that the given value is an instance of the given dtype.
718 e.g. if out dtype is np.complex64_, we should have an instance of that
719 as opposed to a python complex object.
721 Parameters
722 ----------
723 value : object
724 dtype : np.dtype
726 Returns
727 -------
728 object
729 """
730 # Start with exceptions in which we do _not_ cast to numpy types
732 if dtype == _dtype_obj:
733 return value
735 # Note: before we get here we have already excluded isna(value)
736 return dtype.type(value)
def infer_dtype_from(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar or array by dispatching to the
    appropriate helper.

    Parameters
    ----------
    val : object
    pandas_dtype : bool, default False
        Whether to infer dtype including pandas extension types; when False,
        values of pandas extension types are inferred as object.
    """
    if is_list_like(val):
        return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
    return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar.

    Parameters
    ----------
    val : object
        The scalar (or 0-d ndarray) to infer from.
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, scalar belongs to pandas extension types is inferred as
        object

    Returns
    -------
    tuple of (dtype, possibly-unboxed value)
    """
    dtype: DtypeObj = _dtype_obj

    # a 1-element ndarray
    if isinstance(val, np.ndarray):
        if val.ndim != 0:
            msg = "invalid ndarray passed to infer_dtype_from_scalar"
            raise ValueError(msg)

        dtype = val.dtype
        val = lib.item_from_zerodim(val)

    elif isinstance(val, str):

        # If we create an empty array using a string to infer
        # the dtype, NumPy will only allocate one character per entry
        # so this is kind of bad. Alternately we could use np.repeat
        # instead of np.empty (but then you still don't want things
        # coming out as np.str_!

        dtype = _dtype_obj

    elif isinstance(val, (np.datetime64, datetime)):
        try:
            val = Timestamp(val)
        except OutOfBoundsDatetime:
            # too wide for datetime64[ns]: keep as object
            return _dtype_obj, val

        # error: Non-overlapping identity check (left operand type: "Timestamp",
        # right operand type: "NaTType")
        if val is NaT or val.tz is None:  # type: ignore[comparison-overlap]
            dtype = np.dtype("M8[ns]")
            val = val.to_datetime64()
        else:
            if pandas_dtype:
                dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
            else:
                # return datetimetz as object
                return _dtype_obj, val

    elif isinstance(val, (np.timedelta64, timedelta)):
        try:
            val = Timedelta(val)
        except (OutOfBoundsTimedelta, OverflowError):
            # too wide for timedelta64[ns]: keep as object
            dtype = _dtype_obj
        else:
            dtype = np.dtype("m8[ns]")
            val = np.timedelta64(val.value, "ns")

    elif is_bool(val):
        dtype = np.dtype(np.bool_)

    elif is_integer(val):
        if isinstance(val, np.integer):
            # preserve the width of a numpy integer scalar
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.int64)

        try:
            np.array(val, dtype=dtype)
        except OverflowError:
            # e.g. a python int wider than int64: let numpy pick
            dtype = np.array(val).dtype

    elif is_float(val):
        if isinstance(val, np.floating):
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.float64)

    elif is_complex(val):
        dtype = np.dtype(np.complex_)

    elif pandas_dtype:
        if lib.is_period(val):
            dtype = PeriodDtype(freq=val.freq)
        elif lib.is_interval(val):
            subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
            dtype = IntervalDtype(subtype=subtype, closed=val.closed)

    return dtype, val
def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
    """
    Build a copy of ``d`` whose datetime-like keys are boxed as Timestamps.

    Parameters
    ----------
    d: dict-like object

    Returns
    -------
    dict
    """
    result = {}
    for key, value in d.items():
        result[maybe_box_datetimelike(key)] = value
    return result
def infer_dtype_from_array(
    arr, pandas_dtype: bool = False
) -> tuple[DtypeObj, ArrayLike]:
    """
    Infer the dtype of an array-like without stringify-coercing mixed data.

    Parameters
    ----------
    arr : array
    pandas_dtype : bool, default False
        Whether to infer dtype including pandas extension types; when False,
        arrays of pandas extension types are inferred as object.

    Returns
    -------
    tuple (numpy-compat/pandas-compat dtype, array)

    Notes
    -----
    With pandas_dtype=False these infer to numpy dtypes exactly, except that
    mixed / object data is NOT coerced by stringifying. With
    pandas_dtype=True, datetime64tz-aware/categorical types keep their
    character.

    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (dtype('O'), [1, '1'])
    """
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    if not is_list_like(arr):
        raise TypeError("'arr' must be list-like")

    if pandas_dtype and is_extension_array_dtype(arr):
        return arr.dtype, arr

    if isinstance(arr, ABCSeries):
        return arr.dtype, np.asarray(arr)

    # classify without coercing so nan-mixed data stays object
    kind = lib.infer_dtype(arr, skipna=False)
    if kind in ("string", "bytes", "mixed", "mixed-integer"):
        return np.dtype(np.object_), arr

    coerced = np.asarray(arr)
    return coerced.dtype, coerced
919def _maybe_infer_dtype_type(element):
920 """
921 Try to infer an object's dtype, for use in arithmetic ops.
923 Uses `element.dtype` if that's available.
924 Objects implementing the iterator protocol are cast to a NumPy array,
925 and from there the array's type is used.
927 Parameters
928 ----------
929 element : object
930 Possibly has a `.dtype` attribute, and possibly the iterator
931 protocol.
933 Returns
934 -------
935 tipo : type
937 Examples
938 --------
939 >>> from collections import namedtuple
940 >>> Foo = namedtuple("Foo", "dtype")
941 >>> _maybe_infer_dtype_type(Foo(np.dtype("i8")))
942 dtype('int64')
943 """
944 tipo = None
945 if hasattr(element, "dtype"):
946 tipo = element.dtype
947 elif is_list_like(element):
948 element = np.asarray(element)
949 tipo = element.dtype
950 return tipo
def maybe_upcast(
    values: NumpyArrayT,
    fill_value: Scalar = np.nan,
    copy: bool = False,
) -> tuple[NumpyArrayT, Scalar]:
    """
    Provide explicit type promotion and coercion.

    Parameters
    ----------
    values : np.ndarray
        The array that we may want to upcast.
    fill_value : what we want to fill with
    copy : bool, default False
        If True always make a copy even if no upcast is required.

    Returns
    -------
    values: np.ndarray
        the original array, possibly upcast
    fill_value:
        the fill value, possibly upcast
    """
    new_dtype, new_fill = maybe_promote(values.dtype, fill_value)
    # astype copies in every case except (dtype unchanged and copy=False)
    promoted = values.astype(new_dtype, copy=copy)

    # error: Incompatible return value type (got "Tuple[ndarray[Any, dtype[Any]],
    # Union[Union[str, int, float, bool] Union[Period, Timestamp, Timedelta, Any]]]",
    # expected "Tuple[NumpyArrayT, Union[Union[str, int, float, bool], Union[Period,
    # Timestamp, Timedelta, Any]]]")
    return promoted, new_fill  # type: ignore[return-value]
def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
    """
    Raise if ``dtype_set`` contains a string-like dtype; used by
    ``DataFrame.select_dtypes()``.
    """
    # error: Argument 1 to <set> has incompatible type "Type[generic]"; expected
    # "Union[dtype[Any], ExtensionDtype, None]"
    # error: Argument 2 to <set> has incompatible type "Type[generic]"; expected
    # "Union[dtype[Any], ExtensionDtype, None]"
    string_types = {
        np.dtype("S").type,  # type: ignore[arg-type]
        np.dtype("<U").type,  # type: ignore[arg-type]
    }
    if dtype_set & string_types:
        raise TypeError("string dtypes are not allowed, use 'object' instead")
def coerce_indexer_dtype(indexer, categories) -> np.ndarray:
    """coerce the indexer input array to the smallest dtype possible"""
    n = len(categories)
    # walk the widths smallest-first and take the first that can index
    # all the categories
    for bound, coerce in (
        (_int8_max, ensure_int8),
        (_int16_max, ensure_int16),
        (_int32_max, ensure_int32),
    ):
        if n < bound:
            return coerce(indexer)
    return ensure_int64(indexer)
def soft_convert_objects(
    values: np.ndarray,
    datetime: bool = True,
    numeric: bool = True,
    timedelta: bool = True,
    period: bool = True,
    copy: bool = True,
) -> ArrayLike:
    """
    Try to coerce datetime, timedelta, and numeric object-dtype columns
    to inferred dtype.

    Parameters
    ----------
    values : np.ndarray[object]
    datetime : bool, default True
    numeric: bool, default True
    timedelta : bool, default True
    period : bool, default True
    copy : bool, default True

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    validate_bool_kwarg(datetime, "datetime")
    validate_bool_kwarg(numeric, "numeric")
    validate_bool_kwarg(timedelta, "timedelta")
    validate_bool_kwarg(copy, "copy")
    # NOTE(review): `period` is not validated here and does not count toward
    # conversion_count below — presumably intentional, but confirm.

    conversion_count = sum((datetime, numeric, timedelta))
    if conversion_count == 0:
        raise ValueError("At least one of datetime, numeric or timedelta must be True.")

    # Soft conversions
    if datetime or timedelta:
        # GH 20380, when datetime is beyond year 2262, hence outside
        # bound of nanosecond-resolution 64-bit integers.
        try:
            converted = lib.maybe_convert_objects(
                values,
                convert_datetime=datetime,
                convert_timedelta=timedelta,
                convert_period=period,
            )
        except (OutOfBoundsDatetime, ValueError):
            return values
        if converted is not values:
            # datetimelike conversion succeeded; done
            return converted

    if numeric and is_object_dtype(values.dtype):
        converted, _ = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

        # If all NaNs, then do not-alter
        values = converted if not isna(converted).all() else values
        values = values.copy() if copy else values

    return values
def convert_dtypes(
    input_array: ArrayLike,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
    convert_floating: bool = True,
) -> DtypeObj:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Parameters
    ----------
    input_array : ExtensionArray or np.ndarray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, default True
        Whether object dtypes should be converted to ``BooleanDtypes()``.
    convert_floating : bool, default True
        Whether, if possible, conversion can be done to floating extension types.
        If `convert_integer` is also True, preference will be given to integer
        dtypes if the floats can be faithfully cast to integers.

    Returns
    -------
    np.dtype, or ExtensionDtype
    """
    inferred_dtype: str | DtypeObj

    # Only ndarrays are candidates; ExtensionArrays keep their dtype, as does
    # everything when all conversion flags are off.
    if (
        convert_string or convert_integer or convert_boolean or convert_floating
    ) and isinstance(input_array, np.ndarray):

        if is_object_dtype(input_array.dtype):
            # classify the object contents (e.g. "string", "boolean", ...)
            inferred_dtype = lib.infer_dtype(input_array)
        else:
            inferred_dtype = input_array.dtype

        if is_string_dtype(inferred_dtype):
            if not convert_string or inferred_dtype == "bytes":
                return input_array.dtype
            else:
                return pandas_dtype("string")

        if convert_integer:
            target_int_dtype = pandas_dtype("Int64")

            if is_integer_dtype(input_array.dtype):
                from pandas.core.arrays.integer import INT_STR_TO_DTYPE

                # map e.g. int32 -> Int32, falling back to Int64
                inferred_dtype = INT_STR_TO_DTYPE.get(
                    input_array.dtype.name, target_int_dtype
                )
            elif is_numeric_dtype(input_array.dtype):
                # TODO: de-dup with maybe_cast_to_integer_array?
                arr = input_array[notna(input_array)]
                if (arr.astype(int) == arr).all():
                    # floats that are all whole numbers -> nullable integer
                    inferred_dtype = target_int_dtype
                else:
                    inferred_dtype = input_array.dtype

        if convert_floating:
            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
                input_array.dtype
            ):
                from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

                inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get(
                    input_array.dtype.name, pandas_dtype("Float64")
                )
                # if we could also convert to integer, check if all floats
                # are actually integers
                if convert_integer:
                    # TODO: de-dup with maybe_cast_to_integer_array?
                    arr = input_array[notna(input_array)]
                    if (arr.astype(int) == arr).all():
                        inferred_dtype = pandas_dtype("Int64")
                    else:
                        inferred_dtype = inferred_float_dtype
                else:
                    inferred_dtype = inferred_float_dtype

        if convert_boolean:
            if is_bool_dtype(input_array.dtype):
                inferred_dtype = pandas_dtype("boolean")
            elif isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
                inferred_dtype = pandas_dtype("boolean")

        if isinstance(inferred_dtype, str):
            # If we couldn't do anything else, then we retain the dtype
            inferred_dtype = input_array.dtype

    else:
        return input_array.dtype

    # error: Incompatible return value type (got "Union[str, Union[dtype[Any],
    # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
    return inferred_dtype  # type: ignore[return-value]
def maybe_infer_to_datetimelike(
    value: np.ndarray,
) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
    """
    Infer a datetimelike array type for an object-dtype ndarray when no
    dtype was passed.

    The input is returned unchanged unless a datetime/timedelta set is
    found; this is pretty strict in that a datetime/timedelta is REQUIRED
    in addition to possible nulls/string likes.

    Parameters
    ----------
    value : np.ndarray[object]

    Returns
    -------
    np.ndarray, DatetimeArray, TimedeltaArray, PeriodArray, or IntervalArray
    """
    if not isinstance(value, np.ndarray) or value.dtype != object:
        # Caller is responsible for passing only ndarray[object]
        raise TypeError(type(value))  # pragma: no cover

    v = np.array(value, copy=False)

    # Remember the original shape; the helpers below work on 1-D data and
    # reshape on the way out.
    shape = v.shape
    if v.ndim != 1:
        v = v.ravel()

    if not len(v):
        # empty -> nothing to infer
        return value

    def try_datetime(v: np.ndarray) -> ArrayLike:
        # Coerce to datetime64, datetime64tz, or in corner cases
        # object[datetimes]; on failure, hand back the object data unchanged.
        from pandas.core.arrays.datetimes import sequence_to_datetimes

        try:
            # GH#19671 we pass require_iso8601 to be relatively strict
            # when parsing strings.
            dta = sequence_to_datetimes(v, require_iso8601=True)
        except (ValueError, TypeError):
            # e.g. <class 'numpy.timedelta64'> is not convertible to datetime
            return v.reshape(shape)
        else:
            # GH#19761 we may have mixed timezones, in which case 'dta' is
            # an ndarray[object]. Only 1 test relies on this behavior,
            # see GH#40111
            return dta.reshape(shape)

    def try_timedelta(v: np.ndarray) -> np.ndarray:
        # Safe coerce to timedelta64; tries a string & object conversion
        # first, returning the data unchanged if that fails.
        try:
            # bc we know v.dtype == object, this is equivalent to
            # `np.asarray(to_timedelta(v))`, but using a lower-level API that
            # does not require a circular import.
            td_values = array_to_timedelta64(v).view("m8[ns]")
        except (ValueError, OverflowError):
            return v.reshape(shape)
        else:
            return td_values.reshape(shape)

    # seen_str tells us whether string parsing contributed to the inference,
    # which feeds the deprecation warning below.
    inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v))
    if inferred_type in ["period", "interval"]:
        # Incompatible return value type (got "Union[ExtensionArray, ndarray]",
        # expected "Union[ndarray, DatetimeArray, TimedeltaArray, PeriodArray,
        # IntervalArray]")
        return lib.maybe_convert_objects(  # type: ignore[return-value]
            v, convert_period=True, convert_interval=True
        )

    if inferred_type == "datetime":
        # error: Incompatible types in assignment (expression has type "ExtensionArray",
        # variable has type "Union[ndarray, List[Any]]")
        value = try_datetime(v)  # type: ignore[assignment]
    elif inferred_type == "timedelta":
        value = try_timedelta(v)
    elif inferred_type == "nat":

        # if all NaT, return as datetime
        if isna(v).all():
            # error: Incompatible types in assignment (expression has type
            # "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
            value = try_datetime(v)  # type: ignore[assignment]
        else:

            # We have at least a NaT and a string
            # try timedelta first to avoid spurious datetime conversions
            # e.g. '00:00:01' is a timedelta but technically is also a datetime
            value = try_timedelta(v)
            if lib.infer_dtype(value, skipna=False) in ["mixed"]:
                # cannot skip missing values, as NaT implies that the string
                # is actually a datetime

                # error: Incompatible types in assignment (expression has type
                # "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
                value = try_datetime(v)  # type: ignore[assignment]

    if value.dtype.kind in ["m", "M"] and seen_str:
        # TODO(2.0): enforcing this deprecation should close GH#40111
        warnings.warn(
            f"Inferring {value.dtype} from data containing strings is deprecated "
            "and will be removed in a future version. To retain the old behavior "
            f"explicitly pass Series(data, dtype={value.dtype})",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    return value
def maybe_cast_to_datetime(
    value: ExtensionArray | np.ndarray | list, dtype: DtypeObj | None
) -> ExtensionArray | np.ndarray:
    """
    Try to cast the array/value to a datetimelike dtype, converting float
    nan to iNaT.

    We allow a list *only* when dtype is not None.

    Raises
    ------
    TypeError
        If ``value`` is not list-like.
    ValueError
        If ``value`` is a list and ``dtype`` is None.
    """
    from pandas.core.arrays.datetimes import sequence_to_datetimes
    from pandas.core.arrays.timedeltas import TimedeltaArray

    if not is_list_like(value):
        raise TypeError("value must be listlike")

    if is_timedelta64_dtype(dtype):
        # TODO: _from_sequence would raise ValueError in cases where
        # _ensure_nanosecond_dtype raises TypeError
        dtype = cast(np.dtype, dtype)
        # normalize e.g. m8[s] -> m8[ns] (or raise for unsupported units)
        dtype = _ensure_nanosecond_dtype(dtype)
        res = TimedeltaArray._from_sequence(value, dtype=dtype)
        return res

    if dtype is not None:
        is_datetime64 = is_datetime64_dtype(dtype)
        is_datetime64tz = is_datetime64tz_dtype(dtype)

        # capture the *original* dtype before the np.array coercion below
        vdtype = getattr(value, "dtype", None)

        if is_datetime64 or is_datetime64tz:
            dtype = _ensure_nanosecond_dtype(dtype)

            value = np.array(value, copy=False)

            # we have an array of datetime or timedeltas & nulls
            if value.size or not is_dtype_equal(value.dtype, dtype):
                _disallow_mismatched_datetimelike(value, dtype)

                try:
                    if is_datetime64:
                        dta = sequence_to_datetimes(value)
                        # GH 25843: Remove tz information since the dtype
                        # didn't specify one

                        if dta.tz is not None:
                            warnings.warn(
                                "Data is timezone-aware. Converting "
                                "timezone-aware data to timezone-naive by "
                                "passing dtype='datetime64[ns]' to "
                                "DataFrame or Series is deprecated and will "
                                "raise in a future version. Use "
                                "`pd.Series(values).dt.tz_localize(None)` "
                                "instead.",
                                FutureWarning,
                                stacklevel=find_stack_level(),
                            )
                            # equiv: dta.view(dtype)
                            # Note: NOT equivalent to dta.astype(dtype)
                            dta = dta.tz_localize(None)

                        value = dta
                    elif is_datetime64tz:
                        dtype = cast(DatetimeTZDtype, dtype)
                        # The string check can be removed once issue #13712
                        # is solved. String data that is passed with a
                        # datetime64tz is assumed to be naive which should
                        # be localized to the timezone.
                        is_dt_string = is_string_dtype(value.dtype)
                        dta = sequence_to_datetimes(value)
                        if dta.tz is not None:
                            value = dta.astype(dtype, copy=False)
                        elif is_dt_string:
                            # Strings here are naive, so directly localize
                            # equiv: dta.astype(dtype)  # though deprecated

                            value = dta.tz_localize(dtype.tz)
                        else:
                            # Numeric values are UTC at this point,
                            # so localize and convert
                            # equiv: Series(dta).astype(dtype)  # though deprecated
                            if getattr(vdtype, "kind", None) == "M":
                                # GH#24559, GH#33401 deprecate behavior inconsistent
                                # with DatetimeArray/DatetimeIndex
                                warnings.warn(
                                    "In a future version, constructing a Series "
                                    "from datetime64[ns] data and a "
                                    "DatetimeTZDtype will interpret the data "
                                    "as wall-times instead of "
                                    "UTC times, matching the behavior of "
                                    "DatetimeIndex. To treat the data as UTC "
                                    "times, use pd.Series(data).dt"
                                    ".tz_localize('UTC').tz_convert(dtype.tz) "
                                    "or pd.Series(data.view('int64'), dtype=dtype)",
                                    FutureWarning,
                                    stacklevel=find_stack_level(),
                                )

                            value = dta.tz_localize("UTC").tz_convert(dtype.tz)
                except OutOfBoundsDatetime:
                    raise
                except ParserError:
                    # Note: this is dateutil's ParserError, not ours.
                    pass

        elif getattr(vdtype, "kind", None) in ["m", "M"]:
            # we are already datetimelike and want to coerce to non-datetimelike;
            # astype_nansafe will raise for anything other than object, then upcast.
            # see test_datetimelike_values_with_object_dtype
            # error: Argument 2 to "astype_nansafe" has incompatible type
            # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
            return astype_nansafe(value, dtype)  # type: ignore[arg-type]

    elif isinstance(value, np.ndarray):
        if value.dtype.kind in ["M", "m"]:
            # catch a datetime/timedelta that is not of ns variety
            # and no coercion specified
            value = sanitize_to_nanoseconds(value)

        elif value.dtype == _dtype_obj:
            # no dtype requested: infer datetimelike content if present
            value = maybe_infer_to_datetimelike(value)

    elif isinstance(value, list):
        # we only get here with dtype=None, which we do not allow
        raise ValueError(
            "maybe_cast_to_datetime allows a list *only* if dtype is not None"
        )

    # at this point we have converted or raised in all cases where we had a list
    return cast(ArrayLike, value)
def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarray:
    """
    Coerce non-nanosecond datetime64/timedelta64 data to nanosecond
    resolution, raising rather than overflowing silently.

    When no conversion is needed the input is returned as-is, or copied
    when ``copy=True``.
    """
    kind = values.dtype.kind
    if kind == "M" and values.dtype != DT64NS_DTYPE:
        return astype_overflowsafe(values, dtype=DT64NS_DTYPE)
    if kind == "m" and values.dtype != TD64NS_DTYPE:
        return astype_overflowsafe(values, dtype=TD64NS_DTYPE)
    if copy:
        return values.copy()
    return values
1439def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
1440 """
1441 Convert dtypes with granularity less than nanosecond to nanosecond
1443 >>> _ensure_nanosecond_dtype(np.dtype("M8[s]"))
1444 dtype('<M8[ns]')
1446 >>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
1447 Traceback (most recent call last):
1448 ...
1449 TypeError: cannot convert timedeltalike to dtype [timedelta64[ps]]
1450 """
1451 msg = (
1452 f"The '{dtype.name}' dtype has no unit. "
1453 f"Please pass in '{dtype.name}[ns]' instead."
1454 )
1456 # unpack e.g. SparseDtype
1457 dtype = getattr(dtype, "subtype", dtype)
1459 if not isinstance(dtype, np.dtype):
1460 # i.e. datetime64tz
1461 pass
1463 elif dtype.kind == "M" and dtype != DT64NS_DTYPE:
1464 # pandas supports dtype whose granularity is less than [ns]
1465 # e.g., [ps], [fs], [as]
1466 if dtype <= np.dtype("M8[ns]"):
1467 if dtype.name == "datetime64":
1468 raise ValueError(msg)
1469 dtype = DT64NS_DTYPE
1470 else:
1471 raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]")
1473 elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
1474 # pandas supports dtype whose granularity is less than [ns]
1475 # e.g., [ps], [fs], [as]
1476 if dtype <= np.dtype("m8[ns]"):
1477 if dtype.name == "timedelta64":
1478 raise ValueError(msg)
1479 dtype = TD64NS_DTYPE
1480 else:
1481 raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]")
1482 return dtype
# TODO: other value-dependent functions to standardize here include
#  dtypes.concat.cast_to_common_type and Index._find_common_type_compat
def find_result_type(left: ArrayLike, right: Any) -> DtypeObj:
    """
    Find the dtype that can hold the result of an operation between these objects.

    Similar to find_common_type, but inspects the objects themselves rather
    than only their dtypes, which matters when ``right`` has no dtype.

    Parameters
    ----------
    left : np.ndarray or ExtensionArray
    right : Any

    Returns
    -------
    np.dtype or ExtensionDtype

    See also
    --------
    find_common_type
    numpy.result_type
    """
    new_dtype: DtypeObj

    left_is_numeric_np = isinstance(left, np.ndarray) and left.dtype.kind in [
        "i",
        "u",
        "c",
    ]
    if left_is_numeric_np and (lib.is_integer(right) or lib.is_float(right)):
        # Let numpy's value-based logic pick the smallest sufficient dtype:
        # e.g. int8 array with right=512 should give int16, whereas
        # infer_dtype_from(512) would say int64 and over-upcast.
        if lib.is_float(right) and right.is_integer() and left.dtype.kind != "f":
            right = int(right)

        new_dtype = np.result_type(left, right)

    elif is_valid_na_for_dtype(right, left.dtype):
        # e.g. IntervalDtype[int] and None/np.nan
        new_dtype = ensure_dtype_can_hold_na(left.dtype)

    else:
        dtype, _ = infer_dtype_from(right, pandas_dtype=True)
        new_dtype = find_common_type([left.dtype, dtype])

    return new_dtype
def common_dtype_categorical_compat(
    objs: list[Index | ArrayLike], dtype: DtypeObj
) -> DtypeObj:
    """
    Widen the result of find_common_type when a Categorical in ``objs``
    contains missing values that an integer dtype cannot represent.

    Parameters
    ----------
    objs : list[np.ndarray | ExtensionArray | Index]
    dtype : np.dtype or ExtensionDtype

    Returns
    -------
    np.dtype or ExtensionDtype
    """
    # GH#38240
    # TODO: more generally, could do `not can_hold_na(dtype)`
    if not (isinstance(dtype, np.dtype) and dtype.kind in ["i", "u"]):
        # only NA-incapable integer results need widening
        return dtype

    for obj in objs:
        # Go through the dtype attribute so that e.g. the plain string
        # "categorical" is not accidentally treated as a Categorical.
        obj_dtype = getattr(obj, "dtype", None)
        if not isinstance(obj_dtype, CategoricalDtype):
            continue

        if isinstance(obj, ABCIndex):
            # This check may already be cached
            has_missing = obj.hasnans
        else:
            # Categorical
            has_missing = cast("Categorical", obj)._hasna

        if has_missing:
            # see test_union_int_categorical_with_nan
            return np.dtype(np.float64)

    return dtype
# mypy overloads: an all-np.dtype input yields an np.dtype, while any
# ExtensionDtype in the input widens the static result type to DtypeObj.
@overload
def find_common_type(types: list[np.dtype]) -> np.dtype:
    ...


@overload
def find_common_type(types: list[ExtensionDtype]) -> DtypeObj:
    ...


@overload
def find_common_type(types: list[DtypeObj]) -> DtypeObj:
    ...
def find_common_type(types):
    """
    Find a common data type among the given dtypes.

    Parameters
    ----------
    types : list of dtypes

    Returns
    -------
    pandas extension or numpy dtype

    See Also
    --------
    numpy.find_common_type
    """
    if not types:
        raise ValueError("no types given")

    first = types[0]

    # Fast path when every dtype is identical; this also sidesteps
    # np.find_common_type([M8[ns], M8[ns]]) collapsing to object.
    if lib.dtypes_all_equal(list(types)):
        return first

    # de-duplicate while preserving order (dict.fromkeys acts as an
    # ordered set)
    types = list(dict.fromkeys(types).keys())

    if any(isinstance(t, ExtensionDtype) for t in types):
        # Let each ExtensionDtype propose a common dtype, taking the first
        # non-None answer; fall back to object.
        proposals = (
            t._get_common_dtype(types) for t in types if isinstance(t, ExtensionDtype)
        )
        return next((res for res in proposals if res is not None), np.dtype("object"))

    # homogeneous datetimelike inputs collapse to the ns unit
    if all(is_datetime64_dtype(t) for t in types):
        return np.dtype("datetime64[ns]")
    if all(is_timedelta64_dtype(t) for t in types):
        return np.dtype("timedelta64[ns]")

    # Unlike numpy (which casts bool with float/int as int), refuse to mix
    # bool with numeric dtypes.
    if any(is_bool_dtype(t) for t in types) and any(
        is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t) for t in types
    ):
        return np.dtype("object")

    return np.find_common_type(types, [])
def construct_2d_arraylike_from_scalar(
    value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool
) -> np.ndarray:
    """
    Build a ``(length, width)`` ndarray of ``dtype`` filled with ``value``,
    raising TypeError when the scalar is incompatible with the dtype.
    """
    shape = (length, width)

    if dtype.kind in ["m", "M"]:
        # unbox e.g. Timestamp/Timedelta into dtype's raw representation
        value = _maybe_unbox_datetimelike_tz_deprecation(value, dtype)
    elif dtype == _dtype_obj and isinstance(value, (np.timedelta64, np.datetime64)):
        # calling np.array below would cast to pytimedelta/pydatetime,
        # so fill an object array directly
        out = np.empty(shape, dtype=object)
        out.fill(value)
        return out

    # Coerce to a 0-d array so the scalar/dtype combination is validated.
    try:
        arr = np.array(value, dtype=dtype, copy=copy)
    except (ValueError, TypeError) as err:
        raise TypeError(
            f"DataFrame constructor called with incompatible data and dtype: {err}"
        ) from err

    if arr.ndim != 0:
        raise ValueError("DataFrame constructor not properly called!")

    return np.full(shape, arr)
def construct_1d_arraylike_from_scalar(
    value: Scalar, length: int, dtype: DtypeObj | None
) -> ArrayLike:
    """
    Create a np.ndarray / pandas type of specified length, filled with
    ``value``.

    Parameters
    ----------
    value : scalar value
    length : int
    dtype : pandas_dtype or np.dtype

    Returns
    -------
    np.ndarray / pandas type of length, filled with value
    """
    if dtype is None:
        try:
            dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
        except OutOfBoundsDatetime:
            # e.g. a datetime outside the ns-representable range
            dtype = _dtype_obj

    if isinstance(dtype, ExtensionDtype):
        cls = dtype.construct_array_type()
        seq = [] if length == 0 else [value]
        return cls._from_sequence(seq, dtype=dtype).repeat(length)

    if length and is_integer_dtype(dtype) and isna(value):
        # integer dtypes cannot hold NaN; upcast
        dtype = np.dtype("float64")
    elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
        # coerce to object so numpy treats the string as a single scalar
        # instead of a fixed-width char array
        dtype = np.dtype("object")
        if not isna(value):
            value = ensure_str(value)
    elif dtype.kind in ["M", "m"]:
        value = _maybe_unbox_datetimelike_tz_deprecation(value, dtype)

    subarr = np.empty(length, dtype=dtype)
    if length:
        # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes
        subarr.fill(value)

    return subarr
def _maybe_unbox_datetimelike_tz_deprecation(value: Scalar, dtype: DtypeObj):
    """
    Wrap _maybe_unbox_datetimelike with a check for a timezone-aware Timestamp
    along with a timezone-naive datetime64 dtype, which is deprecated:
    instead of raising, it warns and localizes the value to naive before
    unboxing.
    """
    # Caller is responsible for checking dtype.kind in ["m", "M"]

    if isinstance(value, datetime):
        # we dont want to box dt64, in particular datetime64("NaT")
        value = maybe_box_datetimelike(value, dtype)

    try:
        value = _maybe_unbox_datetimelike(value, dtype)
    except TypeError:
        # Only the tz-aware-Timestamp-into-naive-dt64 case gets the
        # deprecation path; anything else re-raises unchanged.
        if (
            isinstance(value, Timestamp)
            and value.tzinfo is not None
            and isinstance(dtype, np.dtype)
            and dtype.kind == "M"
        ):
            warnings.warn(
                "Data is timezone-aware. Converting "
                "timezone-aware data to timezone-naive by "
                "passing dtype='datetime64[ns]' to "
                "DataFrame or Series is deprecated and will "
                "raise in a future version. Use "
                "`pd.Series(values).dt.tz_localize(None)` "
                "instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            # drop the tz, then retry the unboxing
            new_value = value.tz_localize(None)
            return _maybe_unbox_datetimelike(new_value, dtype)
        else:
            raise
    return value
def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
    """
    Wrap any sized list-like in a 1-dimensional numpy array of object dtype.

    Parameters
    ----------
    values : any iterable which has a len()

    Raises
    ------
    TypeError
        * If `values` does not have a len()

    Returns
    -------
    1-dimensional numpy array of dtype object
    """
    # Allocate first and assign via slice: np.array(values, dtype=object)
    # would recurse into nested list-likes and add extra dimensions.
    out = np.empty(len(values), dtype="object")
    out[:] = values
    return out
def maybe_cast_to_integer_array(
    arr: list | np.ndarray, dtype: np.dtype, copy: bool = False
) -> np.ndarray:
    """
    Cast data to the given (signed or unsigned) integer dtype, raising when
    the values cannot be represented losslessly.

    Parameters
    ----------
    arr : np.ndarray or list
        The array to cast.
    dtype : np.dtype
        The integer dtype to cast the array to.
    copy: bool, default False
        Whether to make a copy of the array before returning.

    Returns
    -------
    ndarray
        Array of integer or unsigned integer dtype.

    Raises
    ------
    OverflowError : the dtype is incompatible with the data
    ValueError : loss of precision has occurred during casting

    Examples
    --------
    If you try to coerce negative values to unsigned integers, it raises:

    >>> pd.Series([-1], dtype="uint64")
    Traceback (most recent call last):
    ...
    OverflowError: Trying to coerce negative values to unsigned integers

    Also, if you try to coerce float values to integers, it raises:

    >>> maybe_cast_to_integer_array([1, 2, 3.5], dtype=np.dtype("int64"))
    Traceback (most recent call last):
    ...
    ValueError: Trying to coerce float values to integers
    """
    assert is_integer_dtype(dtype)

    try:
        if isinstance(arr, np.ndarray):
            casted = arr.astype(dtype, copy=copy)
        else:
            casted = np.array(arr, dtype=dtype, copy=copy)
    except OverflowError as err:
        raise OverflowError(
            "The elements provided in the data cannot all be "
            f"casted to the dtype {dtype}"
        ) from err

    if np.array_equal(arr, casted):
        # the round-trip was lossless
        return casted

    # Cast to ndarray so the value/dtype checks below can run; we did not
    # do this up front because numpy doesn't handle `uint64` correctly.
    arr = np.asarray(arr)

    if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
        raise OverflowError("Trying to coerce negative values to unsigned integers")

    if is_float_dtype(arr.dtype):
        if not np.isfinite(arr).all():
            raise IntCastingNaNError(
                "Cannot convert non-finite values (NA or inf) to integer"
            )
        raise ValueError("Trying to coerce float values to integers")
    if is_object_dtype(arr.dtype):
        raise ValueError("Trying to coerce float values to integers")

    if casted.dtype < arr.dtype:
        # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows
        warnings.warn(
            f"Values are too large to be losslessly cast to {dtype}. "
            "In a future version this will raise OverflowError. To retain the "
            f"old behavior, use pd.Series(values).astype({dtype})",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return casted

    if arr.dtype.kind in ["m", "M"]:
        # test_constructor_maskedarray_nonfloat
        warnings.warn(
            f"Constructing Series or DataFrame from {arr.dtype} values and "
            f"dtype={dtype} is deprecated and will raise in a future version. "
            "Use values.view(dtype) instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return casted

    # No known cases that get here, but raising explicitly to cover our bases.
    raise ValueError(f"values cannot be losslessly cast to {dtype}")
def can_hold_element(arr: ArrayLike, element: Any) -> bool:
    """
    Can we do an inplace setitem with this element in an array with this dtype?

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray
    element : Any

    Returns
    -------
    bool
    """
    dtype = arr.dtype

    if isinstance(dtype, np.dtype) and dtype.kind not in ["m", "M"]:
        # Plain numpy array: delegate to the dtype-level check.
        try:
            np_can_hold_element(dtype, element)
            return True
        except (TypeError, LossySetitemError):
            return False

    if isinstance(dtype, (PeriodDtype, IntervalDtype, DatetimeTZDtype, np.dtype)):
        # np.dtype here catches datetime64ns and timedelta64ns; we assume
        # in this case that we have DatetimeArray/TimedeltaArray
        arr = cast(
            "PeriodArray | DatetimeArray | TimedeltaArray | IntervalArray", arr
        )
        try:
            arr._validate_setitem_value(element)
            return True
        except (ValueError, TypeError):
            # TODO(2.0): stop catching ValueError for tzaware, see
            # _catch_deprecated_value_error
            return False

    # This is technically incorrect, but maintains the behavior of
    # ExtensionBlock._can_hold_element
    return True
def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
    """
    Raise if we cannot losslessly set this element into an ndarray with this dtype.

    Specifically about places where we disagree with numpy. i.e. there are
    cases where numpy will raise in doing the setitem that we do not check
    for here, e.g. setting str "X" into a numeric ndarray.

    Returns
    -------
    Any
        The element, potentially cast to the dtype.

    Raises
    ------
    LossySetitemError : If we cannot losslessly store this element with this dtype.
    """
    if dtype == _dtype_obj:
        # object arrays can hold anything as-is
        return element

    # dtype of the element if it is a numpy scalar/array-like, else None
    tipo = _maybe_infer_dtype_type(element)

    if dtype.kind in ["i", "u"]:
        if isinstance(element, range):
            # a range only needs its two endpoints checked
            if _dtype_can_hold_range(element, dtype):
                return element
            raise LossySetitemError

        elif is_integer(element) or (is_float(element) and element.is_integer()):
            # e.g. test_setitem_series_int8 if we have a python int 1
            # tipo may be np.int32, despite the fact that it will fit
            # in smaller int dtypes.
            info = np.iinfo(dtype)
            if info.min <= element <= info.max:
                return dtype.type(element)
            raise LossySetitemError

        if tipo is not None:
            if tipo.kind not in ["i", "u"]:
                if isinstance(element, np.ndarray) and element.dtype.kind == "f":
                    # If all can be losslessly cast to integers, then we can hold them
                    with np.errstate(invalid="ignore"):
                        # We check afterwards if cast was losslessly, so no need to show
                        # the warning
                        casted = element.astype(dtype)
                    comp = casted == element
                    if comp.all():
                        # Return the casted values bc they can be passed to
                        # np.putmask, whereas the raw values cannot.
                        # see TestSetitemFloatNDarrayIntoIntegerSeries
                        return casted
                    raise LossySetitemError

                # Anything other than integer we cannot hold
                raise LossySetitemError
            elif (
                dtype.kind == "u"
                and isinstance(element, np.ndarray)
                and element.dtype.kind == "i"
            ):
                # see test_where_uint64
                casted = element.astype(dtype)
                if (casted == element).all():
                    # TODO: faster to check (element >=0).all()? potential
                    # itemsize issues there?
                    return casted
                raise LossySetitemError
            elif dtype.itemsize < tipo.itemsize:
                # a wider integer may not fit
                raise LossySetitemError
            elif not isinstance(tipo, np.dtype):
                # i.e. nullable IntegerDtype; we can put this into an ndarray
                # losslessly iff it has no NAs
                if element._hasna:
                    raise LossySetitemError
                return element

            return element

        raise LossySetitemError

    elif dtype.kind == "f":
        if lib.is_integer(element) or lib.is_float(element):
            casted = dtype.type(element)
            if np.isnan(casted) or casted == element:
                return casted
            # otherwise e.g. overflow see TestCoercionFloat32
            raise LossySetitemError

        if tipo is not None:
            # TODO: itemsize check?
            if tipo.kind not in ["f", "i", "u"]:
                # Anything other than float/integer we cannot hold
                raise LossySetitemError
            elif not isinstance(tipo, np.dtype):
                # i.e. nullable IntegerDtype or FloatingDtype;
                # we can put this into an ndarray losslessly iff it has no NAs
                if element._hasna:
                    raise LossySetitemError
                return element
            elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind:
                if isinstance(element, np.ndarray):
                    # e.g. TestDataFrameIndexingWhere::test_where_alignment
                    casted = element.astype(dtype)
                    # TODO(np>=1.20): we can just use np.array_equal with equal_nan
                    if array_equivalent(casted, element):
                        return casted
                    raise LossySetitemError

            return element

        raise LossySetitemError

    elif dtype.kind == "c":
        if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element):
            if np.isnan(element):
                # see test_where_complex GH#6345
                return dtype.type(element)

            casted = dtype.type(element)
            if casted == element:
                return casted
            # otherwise e.g. overflow see test_32878_complex_itemsize
            raise LossySetitemError

        if tipo is not None:
            if tipo.kind in ["c", "f", "i", "u"]:
                return element
            raise LossySetitemError
        raise LossySetitemError

    elif dtype.kind == "b":
        if tipo is not None:
            if tipo.kind == "b":
                if not isinstance(tipo, np.dtype):
                    # i.e. we have a BooleanArray
                    if element._hasna:
                        # i.e. there are pd.NA elements
                        raise LossySetitemError
                return element
            raise LossySetitemError
        if lib.is_bool(element):
            return element
        raise LossySetitemError

    elif dtype.kind == "S":
        # TODO: test tests.frame.methods.test_replace tests get here,
        # need more targeted tests. xref phofl has a PR about this
        if tipo is not None:
            if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize:
                return element
            raise LossySetitemError
        if isinstance(element, bytes) and len(element) <= dtype.itemsize:
            return element
        raise LossySetitemError

    raise NotImplementedError(dtype)
2089def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool:
2090 """
2091 _maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints),
2092 but in many cases a range can be held by a smaller integer dtype.
2093 Check if this is one of those cases.
2094 """
2095 if not len(rng):
2096 return True
2097 return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype)
class LossySetitemError(Exception):
    """
    Raised when a __setitem__ into an np.ndarray would not be lossless.
    """