Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/tools/datetimes.py: 14%
371 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from collections import abc
4from datetime import datetime
5from functools import partial
6from itertools import islice
7from typing import (
8 TYPE_CHECKING,
9 Callable,
10 Hashable,
11 List,
12 Tuple,
13 TypedDict,
14 Union,
15 cast,
16 overload,
17)
18import warnings
20import numpy as np
22from pandas._libs import tslib
23from pandas._libs.tslibs import (
24 OutOfBoundsDatetime,
25 Timedelta,
26 Timestamp,
27 iNaT,
28 nat_strings,
29 parsing,
30 timezones,
31)
32from pandas._libs.tslibs.parsing import (
33 DateParseError,
34 format_is_iso,
35 guess_datetime_format,
36)
37from pandas._libs.tslibs.strptime import array_strptime
38from pandas._typing import (
39 AnyArrayLike,
40 ArrayLike,
41 DateTimeErrorChoices,
42 Timezone,
43 npt,
44)
45from pandas.util._exceptions import find_stack_level
47from pandas.core.dtypes.common import (
48 ensure_object,
49 is_datetime64_dtype,
50 is_datetime64_ns_dtype,
51 is_datetime64tz_dtype,
52 is_float,
53 is_integer,
54 is_integer_dtype,
55 is_list_like,
56 is_numeric_dtype,
57 is_scalar,
58)
59from pandas.core.dtypes.generic import (
60 ABCDataFrame,
61 ABCSeries,
62)
63from pandas.core.dtypes.missing import notna
65from pandas.arrays import (
66 DatetimeArray,
67 IntegerArray,
68)
69from pandas.core import algorithms
70from pandas.core.algorithms import unique
71from pandas.core.arrays.base import ExtensionArray
72from pandas.core.arrays.datetimes import (
73 maybe_convert_dtype,
74 objects_to_datetime64ns,
75 tz_to_dtype,
76)
77from pandas.core.construction import extract_array
78from pandas.core.indexes.base import Index
79from pandas.core.indexes.datetimes import DatetimeIndex
if TYPE_CHECKING:
82 from pandas._libs.tslibs.nattype import NaTType
83 from pandas._libs.tslibs.timedeltas import UnitChoices
85 from pandas import (
86 DataFrame,
87 Series,
88 )
# ---------------------------------------------------------------------
# types used in annotations

# Anything to_datetime can treat as a 1-D collection of convertible values.
ArrayConvertible = Union[List, Tuple, AnyArrayLike]
# Scalar inputs accepted in addition to datetime objects.
Scalar = Union[float, str]
DatetimeScalar = Union[Scalar, datetime]

DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]

# Column values accepted when assembling datetimes from a dict/DataFrame.
DatetimeDictArg = Union[List[Scalar], Tuple[Scalar, ...], AnyArrayLike]
class YearMonthDayDict(TypedDict, total=True):
    """TypedDict with the minimal required keys for dict-based datetime assembly."""

    year: DatetimeDictArg
    month: DatetimeDictArg
    day: DatetimeDictArg
class FulldatetimeDict(YearMonthDayDict, total=False):
    """
    TypedDict extending YearMonthDayDict with the optional time-component
    keys accepted by dict-based datetime assembly (both singular spellings
    and sub-second abbreviations ms/us/ns).
    """

    hour: DatetimeDictArg
    hours: DatetimeDictArg
    minute: DatetimeDictArg
    minutes: DatetimeDictArg
    second: DatetimeDictArg
    seconds: DatetimeDictArg
    ms: DatetimeDictArg
    us: DatetimeDictArg
    ns: DatetimeDictArg
# Dict-like inputs accepted by to_datetime for column-wise assembly.
DictConvertible = Union[FulldatetimeDict, "DataFrame"]
# Minimum input length before should_cache() will even consider caching.
start_caching_at = 50


# ---------------------------------------------------------------------
127def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False):
128 # Try to guess the format based on the first non-NaN element
129 non_nan_elements = notna(arr).nonzero()[0]
130 if len(non_nan_elements):
131 return guess_datetime_format(arr[non_nan_elements[0]], dayfirst=dayfirst)
def should_cache(
    arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
) -> bool:
    """
    Decide whether conversion results should be cached.

    Caching pays off when a sample of the input contains enough duplicates:
    if the share of unique values among the first `check_count` elements is
    below `unique_share`, we cache.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
    unique_share : float, default=0.7, optional
        0 < unique_share < 1
    check_count : int, optional
        0 <= check_count <= len(arg)

    Returns
    -------
    do_caching : bool

    Notes
    -----
    Sequences shorter than 50 items are never cached; up to 5000 items we
    sample ten percent of the elements, beyond that only the first 500.
    All constants were chosen empirically.
    """
    n = len(arg)

    if check_count is None:
        # in this case, the gain from caching is negligible
        if n <= start_caching_at:
            return False
        check_count = n // 10 if n <= 5000 else 500
    else:
        assert (
            0 <= check_count <= n
        ), "check_count must be in next bounds: [0; len(arg)]"
        if check_count == 0:
            return False

    assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"

    try:
        # We can't cache if the items are not hashable.
        sampled_uniques = set(islice(arg, check_count))
    except TypeError:
        return False
    return len(sampled_uniques) <= check_count * unique_share
194def _maybe_cache(
195 arg: ArrayConvertible,
196 format: str | None,
197 cache: bool,
198 convert_listlike: Callable,
199) -> Series:
200 """
201 Create a cache of unique dates from an array of dates
203 Parameters
204 ----------
205 arg : listlike, tuple, 1-d array, Series
206 format : string
207 Strftime format to parse time
208 cache : bool
209 True attempts to create a cache of converted values
210 convert_listlike : function
211 Conversion function to apply on dates
213 Returns
214 -------
215 cache_array : Series
216 Cache of converted, unique dates. Can be empty
217 """
218 from pandas import Series
220 cache_array = Series(dtype=object)
222 if cache:
223 # Perform a quicker unique check
224 if not should_cache(arg):
225 return cache_array
227 unique_dates = unique(arg)
228 if len(unique_dates) < len(arg):
229 cache_dates = convert_listlike(unique_dates, format)
230 # GH#45319
231 try:
232 cache_array = Series(cache_dates, index=unique_dates)
233 except OutOfBoundsDatetime:
234 return cache_array
235 # GH#39882 and GH#35888 in case of None and NaT we get duplicates
236 if not cache_array.index.is_unique:
237 cache_array = cache_array[~cache_array.index.duplicated()]
238 return cache_array
241def _box_as_indexlike(
242 dt_array: ArrayLike, utc: bool | None = None, name: Hashable = None
243) -> Index:
244 """
245 Properly boxes the ndarray of datetimes to DatetimeIndex
246 if it is possible or to generic Index instead
248 Parameters
249 ----------
250 dt_array: 1-d array
251 Array of datetimes to be wrapped in an Index.
252 tz : object
253 None or 'utc'
254 name : string, default None
255 Name for a resulting index
257 Returns
258 -------
259 result : datetime of converted dates
260 - DatetimeIndex if convertible to sole datetime64 type
261 - general Index otherwise
262 """
264 if is_datetime64_dtype(dt_array):
265 tz = "utc" if utc else None
266 return DatetimeIndex(dt_array, tz=tz, name=name)
267 return Index(dt_array, name=name, dtype=dt_array.dtype)
270def _convert_and_box_cache(
271 arg: DatetimeScalarOrArrayConvertible,
272 cache_array: Series,
273 name: Hashable | None = None,
274) -> Index:
275 """
276 Convert array of dates with a cache and wrap the result in an Index.
278 Parameters
279 ----------
280 arg : integer, float, string, datetime, list, tuple, 1-d array, Series
281 cache_array : Series
282 Cache of converted, unique dates
283 name : string, default None
284 Name for a DatetimeIndex
286 Returns
287 -------
288 result : Index-like of converted dates
289 """
290 from pandas import Series
292 result = Series(arg).map(cache_array)
293 return _box_as_indexlike(result._values, utc=None, name=name)
296def _return_parsed_timezone_results(result: np.ndarray, timezones, tz, name) -> Index:
297 """
298 Return results from array_strptime if a %z or %Z directive was passed.
300 Parameters
301 ----------
302 result : ndarray[int64]
303 int64 date representations of the dates
304 timezones : ndarray
305 pytz timezone objects
306 tz : object
307 None or pytz timezone object
308 name : string, default None
309 Name for a DatetimeIndex
311 Returns
312 -------
313 tz_result : Index-like of parsed dates with timezone
314 """
315 tz_results = np.array(
316 [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)]
317 )
318 if tz is not None:
319 # Convert to the same tz
320 tz_results = np.array([tz_result.tz_convert(tz) for tz_result in tz_results])
322 return Index(tz_results, name=name)
def _convert_listlike_datetimes(
    arg,
    format: str | None,
    name: Hashable = None,
    tz: Timezone | None = None,
    unit: str | None = None,
    errors: str = "raise",
    infer_datetime_format: bool = False,
    dayfirst: bool | None = None,
    yearfirst: bool | None = None,
    exact: bool = True,
):
    """
    Helper function for to_datetime. Performs the conversions of 1D listlike
    of dates

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be parsed
    format : str or None
        strftime format to parse with; None means infer/fall back to parsing
    name : object
        None or string for the Index name
    tz : object
        None or 'utc'
    unit : str
        None or string of the frequency of the passed data
    errors : str
        error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    infer_datetime_format : bool, default False
        inferring format behavior from to_datetime
    dayfirst : bool
        dayfirst parsing behavior from to_datetime
    yearfirst : bool
        yearfirst parsing behavior from to_datetime
    exact : bool, default True
        exact format matching behavior from to_datetime

    Returns
    -------
    Index-like of parsed dates
    """
    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype="O")

    arg_dtype = getattr(arg, "dtype", None)
    # these are shortcutable: already-datetime64 inputs need no parsing,
    # only (possibly) tz adjustment and Index boxing
    if is_datetime64tz_dtype(arg_dtype):
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if tz == "utc":
            # strip the existing tz, then relabel the same wall times as UTC
            arg = arg.tz_convert(None).tz_localize(tz)
        return arg

    elif is_datetime64_ns_dtype(arg_dtype):
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            try:
                return DatetimeIndex(arg, tz=tz, name=name)
            except ValueError:
                # fall through to the generic parsing path below
                pass
        elif tz:
            # DatetimeArray, DatetimeIndex
            return arg.tz_localize(tz)

        return arg

    elif unit is not None:
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        return _to_datetime_with_unit(arg, unit, name, tz, errors)
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError(
            "arg must be a string, datetime, list, tuple, 1-d array, or Series"
        )

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    orig_arg = arg
    try:
        arg, _ = maybe_convert_dtype(arg, copy=False, tz=timezones.maybe_get_tz(tz))
    except TypeError:
        # unconvertible dtype: honor the 'errors' policy instead of raising
        if errors == "coerce":
            npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
            return DatetimeIndex(npvalues, name=name)
        elif errors == "ignore":
            idx = Index(arg, name=name)
            return idx
        raise

    arg = ensure_object(arg)
    require_iso8601 = False

    if infer_datetime_format and format is None:
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    if format is not None:
        # There is a special fast-path for iso8601 formatted
        # datetime strings, so in those cases don't use the inferred
        # format because this path makes process slower in this
        # special case
        format_is_iso8601 = format_is_iso(format)
        if format_is_iso8601:
            # an explicitly-passed ISO format becomes a strict requirement
            require_iso8601 = not infer_datetime_format
            format = None

    if format is not None:
        res = _to_datetime_with_format(
            arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format
        )
        # res is None signals "inferred format failed, fall back below"
        if res is not None:
            return res

    assert format is None or infer_datetime_format
    utc = tz == "utc"
    result, tz_parsed = objects_to_datetime64ns(
        arg,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        errors=errors,
        require_iso8601=require_iso8601,
        allow_object=True,
    )

    if tz_parsed is not None:
        # We can take a shortcut since the datetime64 numpy array
        # is in UTC
        dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
        return DatetimeIndex._simple_new(dta, name=name)

    utc = tz == "utc"
    return _box_as_indexlike(result, utc=utc, name=name)
458def _array_strptime_with_fallback(
459 arg,
460 name,
461 tz,
462 fmt: str,
463 exact: bool,
464 errors: str,
465 infer_datetime_format: bool,
466) -> Index | None:
467 """
468 Call array_strptime, with fallback behavior depending on 'errors'.
469 """
470 utc = tz == "utc"
472 try:
473 result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors)
474 except OutOfBoundsDatetime:
475 if errors == "raise":
476 raise
477 elif errors == "coerce":
478 result = np.empty(arg.shape, dtype="M8[ns]")
479 iresult = result.view("i8")
480 iresult.fill(iNaT)
481 else:
482 result = arg
483 except ValueError:
484 # if fmt was inferred, try falling back
485 # to array_to_datetime - terminate here
486 # for specified formats
487 if not infer_datetime_format:
488 if errors == "raise":
489 raise
490 elif errors == "coerce":
491 result = np.empty(arg.shape, dtype="M8[ns]")
492 iresult = result.view("i8")
493 iresult.fill(iNaT)
494 else:
495 result = arg
496 else:
497 # Indicates to the caller to fallback to objects_to_datetime64ns
498 return None
499 else:
500 if "%Z" in fmt or "%z" in fmt:
501 return _return_parsed_timezone_results(result, timezones, tz, name)
503 return _box_as_indexlike(result, utc=utc, name=name)
506def _to_datetime_with_format(
507 arg,
508 orig_arg,
509 name,
510 tz,
511 fmt: str,
512 exact: bool,
513 errors: str,
514 infer_datetime_format: bool,
515) -> Index | None:
516 """
517 Try parsing with the given format, returning None on failure.
518 """
519 result = None
521 # shortcut formatting here
522 if fmt == "%Y%m%d":
523 # pass orig_arg as float-dtype may have been converted to
524 # datetime64[ns]
525 orig_arg = ensure_object(orig_arg)
526 try:
527 # may return None without raising
528 result = _attempt_YYYYMMDD(orig_arg, errors=errors)
529 except (ValueError, TypeError, OutOfBoundsDatetime) as err:
530 raise ValueError(
531 "cannot convert the input to '%Y%m%d' date format"
532 ) from err
533 if result is not None:
534 utc = tz == "utc"
535 return _box_as_indexlike(result, utc=utc, name=name)
537 # fallback
538 res = _array_strptime_with_fallback(
539 arg, name, tz, fmt, exact, errors, infer_datetime_format
540 )
541 return res
544def _to_datetime_with_unit(arg, unit, name, tz, errors: str) -> Index:
545 """
546 to_datetime specalized to the case where a 'unit' is passed.
547 """
548 arg = extract_array(arg, extract_numpy=True)
550 # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
551 # because it expects an ndarray argument
552 if isinstance(arg, IntegerArray):
553 arr = arg.astype(f"datetime64[{unit}]")
554 tz_parsed = None
555 else:
556 arg = np.asarray(arg)
557 arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
559 if errors == "ignore":
560 # Index constructor _may_ infer to DatetimeIndex
561 result = Index._with_infer(arr, name=name)
562 else:
563 result = DatetimeIndex(arr, name=name)
565 if not isinstance(result, DatetimeIndex):
566 return result
568 # GH#23758: We may still need to localize the result with tz
569 # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
570 # result will be naive but in UTC
571 result = result.tz_localize("UTC").tz_convert(tz_parsed)
573 if tz is not None:
574 if result.tz is None:
575 result = result.tz_localize(tz)
576 else:
577 result = result.tz_convert(tz)
578 return result
581def _adjust_to_origin(arg, origin, unit):
582 """
583 Helper function for to_datetime.
584 Adjust input argument to the specified origin
586 Parameters
587 ----------
588 arg : list, tuple, ndarray, Series, Index
589 date to be adjusted
590 origin : 'julian' or Timestamp
591 origin offset for the arg
592 unit : str
593 passed unit from to_datetime, must be 'D'
595 Returns
596 -------
597 ndarray or scalar of adjusted date(s)
598 """
599 if origin == "julian":
600 original = arg
601 j0 = Timestamp(0).to_julian_date()
602 if unit != "D":
603 raise ValueError("unit must be 'D' for origin='julian'")
604 try:
605 arg = arg - j0
606 except TypeError as err:
607 raise ValueError(
608 "incompatible 'arg' type for given 'origin'='julian'"
609 ) from err
611 # preemptively check this for a nice range
612 j_max = Timestamp.max.to_julian_date() - j0
613 j_min = Timestamp.min.to_julian_date() - j0
614 if np.any(arg > j_max) or np.any(arg < j_min):
615 raise OutOfBoundsDatetime(
616 f"{original} is Out of Bounds for origin='julian'"
617 )
618 else:
619 # arg must be numeric
620 if not (
621 (is_scalar(arg) and (is_integer(arg) or is_float(arg)))
622 or is_numeric_dtype(np.asarray(arg))
623 ):
624 raise ValueError(
625 f"'{arg}' is not compatible with origin='{origin}'; "
626 "it must be numeric with a unit specified"
627 )
629 # we are going to offset back to unix / epoch time
630 try:
631 offset = Timestamp(origin)
632 except OutOfBoundsDatetime as err:
633 raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err
634 except ValueError as err:
635 raise ValueError(
636 f"origin {origin} cannot be converted to a Timestamp"
637 ) from err
639 if offset.tz is not None:
640 raise ValueError(f"origin offset {offset} must be tz-naive")
641 td_offset = offset - Timestamp(0)
643 # convert the offset to the unit of the arg
644 # this should be lossless in terms of precision
645 ioffset = td_offset // Timedelta(1, unit=unit)
647 # scalars & ndarray-like can handle the addition
648 if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)):
649 arg = np.asarray(arg)
650 arg = arg + ioffset
651 return arg
# to_datetime overload stubs: the return type follows the input —
# scalar -> Timestamp, Series/dict-like -> Series, listlike -> DatetimeIndex.


@overload
def to_datetime(
    arg: DatetimeScalar,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool | None = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Timestamp:
    ...


@overload
def to_datetime(
    arg: Series | DictConvertible,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool | None = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Series:
    ...


@overload
def to_datetime(
    arg: list | tuple | Index | ArrayLike,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool | None = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> DatetimeIndex:
    ...
705def to_datetime(
706 arg: DatetimeScalarOrArrayConvertible | DictConvertible,
707 errors: DateTimeErrorChoices = "raise",
708 dayfirst: bool = False,
709 yearfirst: bool = False,
710 utc: bool | None = None,
711 format: str | None = None,
712 exact: bool = True,
713 unit: str | None = None,
714 infer_datetime_format: bool = False,
715 origin="unix",
716 cache: bool = True,
717) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
718 """
719 Convert argument to datetime.
721 This function converts a scalar, array-like, :class:`Series` or
722 :class:`DataFrame`/dict-like to a pandas datetime object.
724 Parameters
725 ----------
726 arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like
727 The object to convert to a datetime. If a :class:`DataFrame` is provided, the
728 method expects minimally the following columns: :const:`"year"`,
729 :const:`"month"`, :const:`"day"`.
730 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
731 - If :const:`'raise'`, then invalid parsing will raise an exception.
732 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
733 - If :const:`'ignore'`, then invalid parsing will return the input.
734 dayfirst : bool, default False
735 Specify a date parse order if `arg` is str or is list-like.
736 If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
737 is parsed as :const:`2012-11-10`.
739 .. warning::
741 ``dayfirst=True`` is not strict, but will prefer to parse
742 with day first. If a delimited date string cannot be parsed in
743 accordance with the given `dayfirst` option, e.g.
744 ``to_datetime(['31-12-2021'])``, then a warning will be shown.
746 yearfirst : bool, default False
747 Specify a date parse order if `arg` is str or is list-like.
749 - If :const:`True` parses dates with the year first, e.g.
750 :const:`"10/11/12"` is parsed as :const:`2010-11-12`.
751 - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is
752 preceded (same as :mod:`dateutil`).
754 .. warning::
756 ``yearfirst=True`` is not strict, but will prefer to parse
757 with year first.
759 utc : bool, default None
760 Control timezone-related parsing, localization and conversion.
762 - If :const:`True`, the function *always* returns a timezone-aware
763 UTC-localized :class:`Timestamp`, :class:`Series` or
764 :class:`DatetimeIndex`. To do this, timezone-naive inputs are
765 *localized* as UTC, while timezone-aware inputs are *converted* to UTC.
767 - If :const:`False` (default), inputs will not be coerced to UTC.
768 Timezone-naive inputs will remain naive, while timezone-aware ones
769 will keep their time offsets. Limitations exist for mixed
770 offsets (typically, daylight savings), see :ref:`Examples
771 <to_datetime_tz_examples>` section for details.
773 See also: pandas general documentation about `timezone conversion and
774 localization
775 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
776 #time-zone-handling>`_.
778 format : str, default None
779 The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. Note that
780 :const:`"%f"` will parse all the way up to nanoseconds. See
781 `strftime documentation
782 <https://docs.python.org/3/library/datetime.html
783 #strftime-and-strptime-behavior>`_ for more information on choices.
784 exact : bool, default True
785 Control how `format` is used:
787 - If :const:`True`, require an exact `format` match.
788 - If :const:`False`, allow the `format` to match anywhere in the target
789 string.
791 unit : str, default 'ns'
792 The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
793 integer or float number. This will be based off the origin.
794 Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate
795 the number of milliseconds to the unix epoch start.
796 infer_datetime_format : bool, default False
797 If :const:`True` and no `format` is given, attempt to infer the format
798 of the datetime strings based on the first non-NaN element,
799 and if it can be inferred, switch to a faster method of parsing them.
800 In some cases this can increase the parsing speed by ~5-10x.
801 origin : scalar, default 'unix'
802 Define the reference date. The numeric values would be parsed as number
803 of units (defined by `unit`) since this reference date.
805 - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01.
806 - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to
807 beginning of Julian Calendar. Julian day number :const:`0` is assigned
808 to the day starting at noon on January 1, 4713 BC.
809 - If Timestamp convertible, origin is set to Timestamp identified by
810 origin.
811 cache : bool, default True
812 If :const:`True`, use a cache of unique, converted dates to apply the
813 datetime conversion. May produce significant speed-up when parsing
814 duplicate date strings, especially ones with timezone offsets. The cache
815 is only used when there are at least 50 values. The presence of
816 out-of-bounds values will render the cache unusable and may slow down
817 parsing.
819 .. versionchanged:: 0.25.0
820 changed default value from :const:`False` to :const:`True`.
822 Returns
823 -------
824 datetime
825 If parsing succeeded.
826 Return type depends on input (types in parenthesis correspond to
827 fallback in case of unsuccessful timezone or out-of-range timestamp
828 parsing):
830 - scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
831 - array-like: :class:`DatetimeIndex` (or :class:`Series` with
832 :class:`object` dtype containing :class:`datetime.datetime`)
833 - Series: :class:`Series` of :class:`datetime64` dtype (or
834 :class:`Series` of :class:`object` dtype containing
835 :class:`datetime.datetime`)
836 - DataFrame: :class:`Series` of :class:`datetime64` dtype (or
837 :class:`Series` of :class:`object` dtype containing
838 :class:`datetime.datetime`)
840 Raises
841 ------
842 ParserError
843 When parsing a date from string fails.
844 ValueError
845 When another datetime conversion error happens. For example when one
846 of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or
847 when a Timezone-aware :class:`datetime.datetime` is found in an array-like
848 of mixed time offsets, and ``utc=False``.
850 See Also
851 --------
852 DataFrame.astype : Cast argument to a specified dtype.
853 to_timedelta : Convert argument to timedelta.
854 convert_dtypes : Convert dtypes.
856 Notes
857 -----
859 Many input types are supported, and lead to different output types:
861 - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime`
862 module or :mod:`numpy`). They are converted to :class:`Timestamp` when
863 possible, otherwise they are converted to :class:`datetime.datetime`.
864 None/NaN/null scalars are converted to :const:`NaT`.
866 - **array-like** can contain int, float, str, datetime objects. They are
867 converted to :class:`DatetimeIndex` when possible, otherwise they are
868 converted to :class:`Index` with :class:`object` dtype, containing
869 :class:`datetime.datetime`. None/NaN/null entries are converted to
870 :const:`NaT` in both cases.
872 - **Series** are converted to :class:`Series` with :class:`datetime64`
873 dtype when possible, otherwise they are converted to :class:`Series` with
874 :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null
875 entries are converted to :const:`NaT` in both cases.
877 - **DataFrame/dict-like** are converted to :class:`Series` with
878 :class:`datetime64` dtype. For each row a datetime is created from assembling
879 the various dataframe columns. Column keys can be common abbreviations
880 like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or
881 plurals of the same.
883 The following causes are responsible for :class:`datetime.datetime` objects
884 being returned (possibly inside an :class:`Index` or a :class:`Series` with
885 :class:`object` dtype) instead of a proper pandas designated type
886 (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series`
887 with :class:`datetime64` dtype):
889 - when any input element is before :const:`Timestamp.min` or after
890 :const:`Timestamp.max`, see `timestamp limitations
891 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
892 #timeseries-timestamp-limits>`_.
894 - when ``utc=False`` (default) and the input is an array-like or
895 :class:`Series` containing mixed naive/aware datetime, or aware with mixed
896 time offsets. Note that this happens in the (quite frequent) situation when
897 the timezone has a daylight savings policy. In that case you may wish to
898 use ``utc=True``.
900 Examples
901 --------
903 **Handling various input formats**
905 Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys
906 can be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
907 'ms', 'us', 'ns']) or plurals of the same
909 >>> df = pd.DataFrame({'year': [2015, 2016],
910 ... 'month': [2, 3],
911 ... 'day': [4, 5]})
912 >>> pd.to_datetime(df)
913 0 2015-02-04
914 1 2016-03-05
915 dtype: datetime64[ns]
917 Passing ``infer_datetime_format=True`` can often-times speedup a parsing
918 if its not an ISO8601 format exactly, but in a regular format.
920 >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000)
921 >>> s.head()
922 0 3/11/2000
923 1 3/12/2000
924 2 3/13/2000
925 3 3/11/2000
926 4 3/12/2000
927 dtype: object
929 >>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP
930 100 loops, best of 3: 10.4 ms per loop
932 >>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP
933 1 loop, best of 3: 471 ms per loop
935 Using a unix epoch time
937 >>> pd.to_datetime(1490195805, unit='s')
938 Timestamp('2017-03-22 15:16:45')
939 >>> pd.to_datetime(1490195805433502912, unit='ns')
940 Timestamp('2017-03-22 15:16:45.433502912')
942 .. warning:: For float arg, precision rounding might happen. To prevent
943 unexpected behavior use a fixed-width exact type.
945 Using a non-unix epoch origin
947 >>> pd.to_datetime([1, 2, 3], unit='D',
948 ... origin=pd.Timestamp('1960-01-01'))
949 DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
950 dtype='datetime64[ns]', freq=None)
952 **Non-convertible date/times**
954 If a date does not meet the `timestamp limitations
955 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
956 #timeseries-timestamp-limits>`_, passing ``errors='ignore'``
957 will return the original input instead of raising any exception.
959 Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`,
960 in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
962 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
963 datetime.datetime(1300, 1, 1, 0, 0)
964 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
965 NaT
967 .. _to_datetime_tz_examples:
969 **Timezones and time offsets**
971 The default behaviour (``utc=False``) is as follows:
973 - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`:
975 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
976 DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'],
977 dtype='datetime64[ns]', freq=None)
979 - Timezone-aware inputs *with constant time offset* are converted to
980 timezone-aware :class:`DatetimeIndex`:
982 >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500'])
983 DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'],
984 dtype='datetime64[ns, pytz.FixedOffset(-300)]', freq=None)
986 - However, timezone-aware inputs *with mixed time offsets* (for example
987 issued from a timezone with daylight savings, such as Europe/Paris)
988 are **not successfully converted** to a :class:`DatetimeIndex`. Instead a
989 simple :class:`Index` containing :class:`datetime.datetime` objects is
990 returned:
992 >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100'])
993 Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
994 dtype='object')
996 - A mix of timezone-aware and timezone-naive inputs is converted to
997 a timezone-aware :class:`DatetimeIndex` if the offsets of the timezone-aware
998 are constant:
1000 >>> from datetime import datetime
1001 >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)])
1002 DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'],
1003 dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None)
1005 |
1007 Setting ``utc=True`` solves most of the above issues:
1009 - Timezone-naive inputs are *localized* as UTC
1011 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
1012 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'],
1013 dtype='datetime64[ns, UTC]', freq=None)
1015 - Timezone-aware inputs are *converted* to UTC (the output represents the
1016 exact same datetime, but viewed from the UTC time offset `+00:00`).
1018 >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
1019 ... utc=True)
1020 DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'],
1021 dtype='datetime64[ns, UTC]', freq=None)
1023 - Inputs can contain both naive and aware, string or datetime, the above
1024 rules still apply
1026 >>> from datetime import timezone, timedelta
1027 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 12:00 -0530',
1028 ... datetime(2020, 1, 1, 18),
1029 ... datetime(2020, 1, 1, 18,
1030 ... tzinfo=timezone(-timedelta(hours=1)))],
1031 ... utc=True)
1032 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 17:30:00+00:00',
1033 '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'],
1034 dtype='datetime64[ns, UTC]', freq=None)
1035 """
1036 if arg is None:
1037 return None
1039 if origin != "unix":
1040 arg = _adjust_to_origin(arg, origin, unit)
1042 tz = "utc" if utc else None
1043 convert_listlike = partial(
1044 _convert_listlike_datetimes,
1045 tz=tz,
1046 unit=unit,
1047 dayfirst=dayfirst,
1048 yearfirst=yearfirst,
1049 errors=errors,
1050 exact=exact,
1051 infer_datetime_format=infer_datetime_format,
1052 )
1054 result: Timestamp | NaTType | Series | Index
1056 if isinstance(arg, Timestamp):
1057 result = arg
1058 if tz is not None:
1059 if arg.tz is not None:
1060 result = arg.tz_convert(tz)
1061 else:
1062 result = arg.tz_localize(tz)
1063 elif isinstance(arg, ABCSeries):
1064 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1065 if not cache_array.empty:
1066 result = arg.map(cache_array)
1067 else:
1068 values = convert_listlike(arg._values, format)
1069 result = arg._constructor(values, index=arg.index, name=arg.name)
1070 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):
1071 result = _assemble_from_unit_mappings(arg, errors, tz)
1072 elif isinstance(arg, Index):
1073 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1074 if not cache_array.empty:
1075 result = _convert_and_box_cache(arg, cache_array, name=arg.name)
1076 else:
1077 result = convert_listlike(arg, format, name=arg.name)
1078 elif is_list_like(arg):
1079 try:
1080 # error: Argument 1 to "_maybe_cache" has incompatible type
1081 # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray,
1082 # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...],
1083 # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]"
1084 argc = cast(
1085 Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg
1086 )
1087 cache_array = _maybe_cache(argc, format, cache, convert_listlike)
1088 except OutOfBoundsDatetime:
1089 # caching attempts to create a DatetimeIndex, which may raise
1090 # an OOB. If that's the desired behavior, then just reraise...
1091 if errors == "raise":
1092 raise
1093 # ... otherwise, continue without the cache.
1094 from pandas import Series
1096 cache_array = Series([], dtype=object) # just an empty array
1097 if not cache_array.empty:
1098 result = _convert_and_box_cache(argc, cache_array)
1099 else:
1100 result = convert_listlike(argc, format)
1101 else:
1102 result = convert_listlike(np.array([arg]), format)[0]
1103 if isinstance(arg, bool) and isinstance(result, np.bool_):
1104 result = bool(result) # TODO: avoid this kludge.
1106 # error: Incompatible return value type (got "Union[Timestamp, NaTType,
1107 # Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
1108 # NaTType, None]")
1109 return result # type: ignore[return-value]
1112# mappings for assembling units
1113_unit_map = {
1114 "year": "year",
1115 "years": "year",
1116 "month": "month",
1117 "months": "month",
1118 "day": "day",
1119 "days": "day",
1120 "hour": "h",
1121 "hours": "h",
1122 "minute": "m",
1123 "minutes": "m",
1124 "second": "s",
1125 "seconds": "s",
1126 "ms": "ms",
1127 "millisecond": "ms",
1128 "milliseconds": "ms",
1129 "us": "us",
1130 "microsecond": "us",
1131 "microseconds": "us",
1132 "ns": "ns",
1133 "nanosecond": "ns",
1134 "nanoseconds": "ns",
1135}
def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, tz):
    """
    Assemble a datetime Series from unit-named columns of a DataFrame.

    Parameters
    ----------
    arg : DataFrame
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'

        - If :const:`'raise'`, then invalid parsing will raise an exception
        - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`
        - If :const:`'ignore'`, then invalid parsing will return the input
    tz : None or 'utc'

    Returns
    -------
    Series
    """
    from pandas import (
        DataFrame,
        to_numeric,
        to_timedelta,
    )

    frame = DataFrame(arg)
    if not frame.columns.is_unique:
        raise ValueError("cannot assemble with duplicate keys")

    def _normalize(label):
        # Map a column label onto its canonical unit via _unit_map; try the
        # exact spelling first ("m" is case significant), then lowercase,
        # and fall through to the label itself when unrecognized.
        if label in _unit_map:
            return _unit_map[label]
        if label.lower() in _unit_map:
            return _unit_map[label.lower()]
        return label

    label_to_unit = {label: _normalize(label) for label in frame.keys()}
    unit_to_label = {u: label for label, u in label_to_unit.items()}

    # year/month/day are the minimum required fields
    missing = sorted({"year", "month", "day"} - set(unit_to_label.keys()))
    if len(missing):
        _required = ",".join(missing)
        raise ValueError(
            "to assemble mappings requires at least that "
            f"[year, month, day] be specified: [{_required}] is missing"
        )

    # reject columns that didn't normalize to a known unit
    unknown = sorted(set(unit_to_label.keys()) - set(_unit_map.values()))
    if len(unknown):
        _excess = ",".join(unknown)
        raise ValueError(
            f"extra keys have been passed to the datetime assemblage: [{_excess}]"
        )

    def _as_number(col):
        # coerce to numeric per the caller's `errors` policy
        out = to_numeric(col, errors=errors)

        # widen small integer dtypes (int8/int16) so the *10000 math
        # below cannot overflow
        if is_integer_dtype(out):
            out = out.astype("int64", copy=False)
        return out

    # pack Y/M/D into a single YYYYMMDD integer and parse it in one shot
    ymd = (
        _as_number(frame[unit_to_label["year"]]) * 10000
        + _as_number(frame[unit_to_label["month"]]) * 100
        + _as_number(frame[unit_to_label["day"]])
    )
    try:
        result = to_datetime(ymd, format="%Y%m%d", errors=errors, utc=tz)
    except (TypeError, ValueError) as err:
        raise ValueError(f"cannot assemble the datetimes: {err}") from err

    # add any sub-daily components as timedeltas
    sub_units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
    for u in sub_units:
        label = unit_to_label.get(u)
        if label is None or label not in frame:
            continue
        try:
            result += to_timedelta(_as_number(frame[label]), unit=u, errors=errors)
        except (TypeError, ValueError) as err:
            raise ValueError(
                f"cannot assemble the datetimes [{label}]: {err}"
            ) from err
    return result
1231def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
1232 """
1233 try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
1234 arg is a passed in as an object dtype, but could really be ints/strings
1235 with nan-like/or floats (e.g. with nan)
1237 Parameters
1238 ----------
1239 arg : np.ndarray[object]
1240 errors : {'raise','ignore','coerce'}
1241 """
1243 def calc(carg):
1244 # calculate the actual result
1245 carg = carg.astype(object, copy=False)
1246 parsed = parsing.try_parse_year_month_day(
1247 carg / 10000, carg / 100 % 100, carg % 100
1248 )
1249 return tslib.array_to_datetime(parsed, errors=errors)[0]
1251 def calc_with_mask(carg, mask):
1252 result = np.empty(carg.shape, dtype="M8[ns]")
1253 iresult = result.view("i8")
1254 iresult[~mask] = iNaT
1256 masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
1257 result[mask] = masked_result.astype("M8[ns]")
1258 return result
1260 # try intlike / strings that are ints
1261 try:
1262 return calc(arg.astype(np.int64))
1263 except (ValueError, OverflowError, TypeError):
1264 pass
1266 # a float with actual np.nan
1267 try:
1268 carg = arg.astype(np.float64)
1269 return calc_with_mask(carg, notna(carg))
1270 except (ValueError, OverflowError, TypeError):
1271 pass
1273 # string with NaN-like
1274 try:
1275 # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
1276 # "Union[Union[ExtensionArray, ndarray], Index, Series]"
1277 mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type]
1278 return calc_with_mask(arg, mask)
1279 except (ValueError, OverflowError, TypeError):
1280 pass
1282 return None
def to_time(arg, format=None, infer_time_format=False, errors="raise"):
    # GH#34145: deprecated alias — the real implementation now lives in
    # pandas.core.tools.times; warn, then delegate unchanged.
    warnings.warn(
        "`to_time` has been moved, should be imported from pandas.core.tools.times. "
        "This alias will be removed in a future version.",
        FutureWarning,
        stacklevel=find_stack_level(),
    )
    from pandas.core.tools.times import to_time as _to_time

    return _to_time(arg, format, infer_time_format, errors)
# Public API of this module; note `should_cache` is exported even though it
# is defined earlier in the file, outside this excerpt.
__all__ = [
    "DateParseError",
    "should_cache",
    "to_datetime",
    "to_time",
]