Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/generic.py: 19%
2323 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1# pyright: reportPropertyTypeMismatch=false
2from __future__ import annotations
4import collections
5from datetime import timedelta
6import functools
7import gc
8import json
9import operator
10import pickle
11import re
12from typing import (
13 TYPE_CHECKING,
14 Any,
15 Callable,
16 ClassVar,
17 Hashable,
18 Literal,
19 Mapping,
20 NoReturn,
21 Sequence,
22 Type,
23 cast,
24 final,
25 overload,
26)
27import warnings
28import weakref
30import numpy as np
32from pandas._config import config
34from pandas._libs import lib
35from pandas._libs.tslibs import (
36 Period,
37 Tick,
38 Timestamp,
39 to_offset,
40)
41from pandas._typing import (
42 AnyArrayLike,
43 ArrayLike,
44 Axis,
45 ColspaceArgType,
46 CompressionOptions,
47 Dtype,
48 DtypeArg,
49 DtypeObj,
50 FilePath,
51 FillnaOptions,
52 FloatFormatType,
53 FormattersType,
54 Frequency,
55 IgnoreRaise,
56 IndexKeyFunc,
57 IndexLabel,
58 IntervalClosedType,
59 JSONSerializable,
60 Level,
61 Manager,
62 NaPosition,
63 NDFrameT,
64 RandomState,
65 Renamer,
66 SortKind,
67 StorageOptions,
68 Suffixes,
69 T,
70 TimedeltaConvertibleTypes,
71 TimestampConvertibleTypes,
72 ValueKeyFunc,
73 WriteBuffer,
74 npt,
75)
76from pandas.compat._optional import import_optional_dependency
77from pandas.compat.numpy import function as nv
78from pandas.errors import (
79 AbstractMethodError,
80 InvalidIndexError,
81 SettingWithCopyError,
82 SettingWithCopyWarning,
83)
84from pandas.util._decorators import (
85 deprecate_kwarg,
86 deprecate_nonkeyword_arguments,
87 doc,
88 rewrite_axis_style_signature,
89)
90from pandas.util._exceptions import find_stack_level
91from pandas.util._validators import (
92 validate_ascending,
93 validate_bool_kwarg,
94 validate_fillna_kwargs,
95 validate_inclusive,
96)
98from pandas.core.dtypes.common import (
99 ensure_object,
100 ensure_platform_int,
101 ensure_str,
102 is_bool,
103 is_bool_dtype,
104 is_datetime64_any_dtype,
105 is_datetime64tz_dtype,
106 is_dict_like,
107 is_dtype_equal,
108 is_extension_array_dtype,
109 is_float,
110 is_list_like,
111 is_number,
112 is_numeric_dtype,
113 is_re_compilable,
114 is_scalar,
115 is_timedelta64_dtype,
116 pandas_dtype,
117)
118from pandas.core.dtypes.generic import (
119 ABCDataFrame,
120 ABCSeries,
121)
122from pandas.core.dtypes.inference import (
123 is_hashable,
124 is_nested_list_like,
125)
126from pandas.core.dtypes.missing import (
127 isna,
128 notna,
129)
131from pandas.core import (
132 algorithms as algos,
133 arraylike,
134 common as com,
135 indexing,
136 missing,
137 nanops,
138 sample,
139)
140from pandas.core.array_algos.replace import should_use_regex
141from pandas.core.arrays import ExtensionArray
142from pandas.core.base import PandasObject
143from pandas.core.construction import (
144 create_series_with_explicit_dtype,
145 extract_array,
146)
147from pandas.core.describe import describe_ndframe
148from pandas.core.flags import Flags
149from pandas.core.indexes.api import (
150 DatetimeIndex,
151 Index,
152 MultiIndex,
153 PeriodIndex,
154 RangeIndex,
155 default_index,
156 ensure_index,
157)
158from pandas.core.internals import (
159 ArrayManager,
160 BlockManager,
161 SingleArrayManager,
162)
163from pandas.core.internals.construction import mgr_to_mgr
164from pandas.core.missing import find_valid_index
165from pandas.core.ops import align_method_FRAME
166from pandas.core.reshape.concat import concat
167from pandas.core.shared_docs import _shared_docs
168from pandas.core.sorting import get_indexer_indexer
169from pandas.core.window import (
170 Expanding,
171 ExponentialMovingWindow,
172 Rolling,
173 Window,
174)
176from pandas.io.formats import format as fmt
177from pandas.io.formats.format import (
178 DataFrameFormatter,
179 DataFrameRenderer,
180)
181from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
185 from pandas._libs.tslibs import BaseOffset
187 from pandas.core.frame import DataFrame
188 from pandas.core.indexers.objects import BaseIndexer
189 from pandas.core.resample import Resampler
190 from pandas.core.series import Series
192 from pandas.io.pytables import HDFStore
# goal is to be able to define the docs close to function, while still being
# able to share
# NOTE: rebinding to a shallow copy decouples this module's additions from
# the shared dict imported from pandas.core.shared_docs.
_shared_docs = {**_shared_docs}
# Substitution values interpolated into docstrings via the @doc decorator.
_shared_doc_kwargs = {
    "axes": "keywords for axes",
    "klass": "Series/DataFrame",
    "axes_single_arg": "int or labels for object",
    "args_transpose": "axes to permute (int or label for object)",
    "inplace": """
    inplace : bool, default False
        If True, performs operation inplace and returns None.""",
    "optional_by": """
        by : str or list of str
            Name or list of names to sort by""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}

bool_t = bool  # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
    """
    N-dimensional analogue of DataFrame. Store multi-dimensional in a
    size-mutable, labeled data structure

    Parameters
    ----------
    data : BlockManager
    axes : list
    copy : bool, default False
    """

    # Attribute names stored directly on the instance; __getattr__/__setattr__
    # treat anything else as a potential column/index label.
    _internal_names: list[str] = [
        "_mgr",
        "_cacher",
        "_item_cache",
        "_cache",
        "_is_copy",
        "_subtyp",
        "_name",
        "_default_kind",
        "_default_fill_value",
        "_metadata",
        "__array_struct__",
        "__array_interface__",
        "_flags",
    ]
    # Set form of the above for O(1) membership tests on the attribute path.
    _internal_names_set: set[str] = set(_internal_names)
    _accessors: set[str] = set()
    # Attributes hidden from dir()/tab-completion (deprecated or internal).
    _hidden_attrs: frozenset[str] = frozenset(
        ["_AXIS_NAMES", "_AXIS_NUMBERS", "get_values", "tshift"]
    )
    _metadata: list[str] = []
    # Weakref back to the object this one was (chained-assignment) copied from.
    _is_copy: weakref.ReferenceType[NDFrame] | None = None
    _mgr: Manager
    _attrs: dict[Hashable, Any]
    _typ: str
256 # ----------------------------------------------------------------------
257 # Constructors
259 def __init__(
260 self,
261 data: Manager,
262 copy: bool_t = False,
263 attrs: Mapping[Hashable, Any] | None = None,
264 ) -> None:
265 # copy kwarg is retained for mypy compat, is not used
267 object.__setattr__(self, "_is_copy", None)
268 object.__setattr__(self, "_mgr", data)
269 object.__setattr__(self, "_item_cache", {})
270 if attrs is None:
271 attrs = {}
272 else:
273 attrs = dict(attrs)
274 object.__setattr__(self, "_attrs", attrs)
275 object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
    @classmethod
    def _init_mgr(
        cls,
        mgr: Manager,
        axes,
        dtype: Dtype | None = None,
        copy: bool_t = False,
    ) -> Manager:
        """passed a manager and a axes dict"""
        # Reindex the manager along every axis for which labels were supplied.
        for a, axe in axes.items():
            if axe is not None:
                axe = ensure_index(axe)
                bm_axis = cls._get_block_manager_axis(a)
                mgr = mgr.reindex_axis(axe, axis=bm_axis)

        # make a copy if explicitly requested
        if copy:
            mgr = mgr.copy()
        if dtype is not None:
            # avoid further copies if we can
            if (
                isinstance(mgr, BlockManager)
                and len(mgr.blocks) == 1
                and is_dtype_equal(mgr.blocks[0].values.dtype, dtype)
            ):
                # Single block already of the requested dtype: astype would
                # only produce a needless copy, so skip it.
                pass
            else:
                mgr = mgr.astype(dtype=dtype)
        return mgr
    def _as_manager(self: NDFrameT, typ: str, copy: bool_t = True) -> NDFrameT:
        """
        Private helper function to create a DataFrame with specific manager.

        Parameters
        ----------
        typ : {"block", "array"}
        copy : bool, default True
            Only controls whether the conversion from Block->ArrayManager
            copies the 1D arrays (to ensure proper/contiguous memory layout).

        Returns
        -------
        DataFrame
            New DataFrame using specified manager type. Is not guaranteed
            to be a copy or not.
        """
        new_mgr: Manager
        new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
        # fastpath of passing a manager doesn't check the option/manager class
        return self._constructor(new_mgr).__finalize__(self)
    # ----------------------------------------------------------------------
    # attrs and flags

    @property
    def attrs(self) -> dict[Hashable, Any]:
        """
        Dictionary of global attributes of this dataset.

        .. warning::

           attrs is experimental and may change without warning.

        See Also
        --------
        DataFrame.flags : Global flags applying to this object.
        """
        # Lazily materialize so objects constructed without attrs stay cheap.
        if self._attrs is None:
            self._attrs = {}
        return self._attrs
    @attrs.setter
    def attrs(self, value: Mapping[Hashable, Any]) -> None:
        # Copy into a plain dict so later mutation of ``value`` by the caller
        # cannot affect this object.
        self._attrs = dict(value)
    @final
    @property
    def flags(self) -> Flags:
        """
        Get the properties associated with this pandas object.

        The available flags are

        * :attr:`Flags.allows_duplicate_labels`

        See Also
        --------
        Flags : Flags that apply to pandas objects.
        DataFrame.attrs : Global metadata applying to this dataset.

        Notes
        -----
        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags
        <Flags(allows_duplicate_labels=True)>

        Flags can be get or set using ``.``

        >>> df.flags.allows_duplicate_labels
        True
        >>> df.flags.allows_duplicate_labels = False

        Or by slicing with a key

        >>> df.flags["allows_duplicate_labels"]
        False
        >>> df.flags["allows_duplicate_labels"] = True
        """
        # _flags is assigned in __init__ via object.__setattr__.
        return self._flags
    @final
    def set_flags(
        self: NDFrameT,
        *,
        copy: bool_t = False,
        allows_duplicate_labels: bool_t | None = None,
    ) -> NDFrameT:
        """
        Return a new object with updated flags.

        Parameters
        ----------
        allows_duplicate_labels : bool, optional
            Whether the returned object allows duplicate labels.

        Returns
        -------
        Series or DataFrame
            The same type as the caller.

        See Also
        --------
        DataFrame.attrs : Global metadata applying to this dataset.
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        This method returns a new object that's a view on the same data
        as the input. Mutating the input or the output values will be reflected
        in the other.

        This method is intended to be used in method chains.

        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags.allows_duplicate_labels
        True
        >>> df2 = df.set_flags(allows_duplicate_labels=False)
        >>> df2.flags.allows_duplicate_labels
        False
        """
        # copy(deep=False) returns a new object viewing the same data.
        df = self.copy(deep=copy)
        if allows_duplicate_labels is not None:
            df.flags["allows_duplicate_labels"] = allows_duplicate_labels
        return df
445 @final
446 @classmethod
447 def _validate_dtype(cls, dtype) -> DtypeObj | None:
448 """validate the passed dtype"""
449 if dtype is not None:
450 dtype = pandas_dtype(dtype)
452 # a compound dtype
453 if dtype.kind == "V":
454 raise NotImplementedError(
455 "compound dtypes are not implemented "
456 f"in the {cls.__name__} constructor"
457 )
459 return dtype
    # ----------------------------------------------------------------------
    # Construction

    @property
    def _constructor(self: NDFrameT) -> Callable[..., NDFrameT]:
        """
        Used when a manipulation result has the same dimensions as the
        original.
        """
        # Subclasses (Series, DataFrame) must override this.
        raise AbstractMethodError(self)
    # ----------------------------------------------------------------------
    # Internals

    @final
    @property
    def _data(self):
        # GH#33054 retained because some downstream packages uses this,
        # e.g. fastparquet
        return self._mgr
    # ----------------------------------------------------------------------
    # Axis
    # Axis 0 ("index") is the statistics axis for reductions.
    _stat_axis_number = 0
    _stat_axis_name = "index"
    _AXIS_ORDERS: list[str]
    # Maps every accepted axis spelling (int or name) to its axis number;
    # DataFrame extends this with the column aliases.
    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {0: 0, "index": 0, "rows": 0}
    _info_axis_number: int
    _info_axis_name: str
    _AXIS_LEN: int
    @property
    def _AXIS_NUMBERS(self) -> dict[str, int]:
        """.. deprecated:: 1.1.0"""
        warnings.warn(
            "_AXIS_NUMBERS has been deprecated.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return {"index": 0}
502 @property
503 def _AXIS_NAMES(self) -> dict[int, str]:
504 """.. deprecated:: 1.1.0"""
505 level = self.ndim + 1
506 warnings.warn(
507 "_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=level
508 )
509 return {0: "index"}
511 @final
512 def _construct_axes_dict(self, axes=None, **kwargs):
513 """Return an axes dictionary for myself."""
514 d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
515 d.update(kwargs)
516 return d
    @final
    @classmethod
    def _construct_axes_from_arguments(
        cls, args, kwargs, require_all: bool_t = False, sentinel=None
    ):
        """
        Construct and returns axes if supplied in args/kwargs.

        If require_all, raise if all axis arguments are not supplied
        return a tuple of (axes, kwargs).

        sentinel specifies the default parameter when an axis is not
        supplied; useful to distinguish when a user explicitly passes None
        in scenarios where None has special meaning.
        """
        # construct the args
        args = list(args)
        for a in cls._AXIS_ORDERS:

            # look for a argument by position
            if a not in kwargs:
                try:
                    # Consume positionals in axis order until exhausted.
                    kwargs[a] = args.pop(0)
                except IndexError as err:
                    if require_all:
                        raise TypeError(
                            "not enough/duplicate arguments specified!"
                        ) from err

        # Split axis entries out of kwargs, filling missing ones with sentinel.
        axes = {a: kwargs.pop(a, sentinel) for a in cls._AXIS_ORDERS}
        return axes, kwargs
    @final
    @classmethod
    def _get_axis_number(cls, axis: Axis) -> int:
        """Translate any accepted axis spec (int or name) to its axis number."""
        try:
            return cls._AXIS_TO_AXIS_NUMBER[axis]
        except KeyError:
            # Re-raise with a user-facing message naming the class.
            raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
558 @final
559 @classmethod
560 def _get_axis_name(cls, axis: Axis) -> str:
561 axis_number = cls._get_axis_number(axis)
562 return cls._AXIS_ORDERS[axis_number]
564 @final
565 def _get_axis(self, axis: Axis) -> Index:
566 axis_number = self._get_axis_number(axis)
567 assert axis_number in {0, 1}
568 return self.index if axis_number == 0 else self.columns
570 @final
571 @classmethod
572 def _get_block_manager_axis(cls, axis: Axis) -> int:
573 """Map the axis to the block_manager axis."""
574 axis = cls._get_axis_number(axis)
575 ndim = cls._AXIS_LEN
576 if ndim == 2:
577 # i.e. DataFrame
578 return 1 - axis
579 return axis
    @final
    def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
        """Build name -> values resolvers for one axis, for query()/eval()."""
        # index or columns
        axis_index = getattr(self, axis)
        d = {}
        prefix = axis[0]

        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # multiiindex
                key = f"{prefix}level_{i}"
                level = i

            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            # Re-anchor the level values on the full axis so they align.
            s.index = axis_index
            d[key] = s

        # put the index/columns itself in the dict
        if isinstance(axis_index, MultiIndex):
            dindex = axis_index
        else:
            dindex = axis_index.to_series()

        d[axis] = dindex
        return d
    @final
    def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
        """Collect resolvers for all axes, with backtick-safe cleaned names."""
        from pandas.core.computation.parsing import clean_column_name

        d: dict[str, Series | MultiIndex] = {}
        for axis_name in self._AXIS_ORDERS:
            d.update(self._get_axis_resolvers(axis_name))

        # int keys cannot be referenced by name in query()/eval(), drop them.
        return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
    @final
    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
        """
        Return the special character free column resolvers of a dataframe.

        Column names with special characters are 'cleaned up' so that they can
        be referred to by backtick quoting.
        Used in :meth:`DataFrame.eval`.
        """
        from pandas.core.computation.parsing import clean_column_name

        if isinstance(self, ABCSeries):
            # A Series resolves only its own name.
            return {clean_column_name(self.name): self}

        return {
            clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
        }
    @property
    def _info_axis(self) -> Index:
        # The axis holding the "items" (columns for DataFrame, index for Series).
        return getattr(self, self._info_axis_name)
    @property
    def _stat_axis(self) -> Index:
        # The axis reductions/statistics are computed along (always "index").
        return getattr(self, self._stat_axis_name)
    @property
    def shape(self) -> tuple[int, ...]:
        """
        Return a tuple of axis dimensions
        """
        return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
    @property
    def axes(self) -> list[Index]:
        """
        Return index label(s) of the internal NDFrame
        """
        # we do it this way because if we have reversed axes, then
        # the block manager shows then reversed
        return [self._get_axis(a) for a in self._AXIS_ORDERS]
    @property
    def ndim(self) -> int:
        """
        Return an int representing the number of axes / array dimensions.

        Return 1 if Series. Otherwise return 2 if DataFrame.

        See Also
        --------
        ndarray.ndim : Number of array dimensions.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.ndim
        1

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.ndim
        2
        """
        # Delegated to the manager, which knows its own dimensionality.
        return self._mgr.ndim
687 @property
688 def size(self) -> int:
689 """
690 Return an int representing the number of elements in this object.
692 Return the number of rows if Series. Otherwise return the number of
693 rows times number of columns if DataFrame.
695 See Also
696 --------
697 ndarray.size : Number of elements in the array.
699 Examples
700 --------
701 >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
702 >>> s.size
703 3
705 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
706 >>> df.size
707 4
708 """
709 # error: Incompatible return value type (got "signedinteger[_64Bit]",
710 # expected "int") [return-value]
711 return np.prod(self.shape) # type: ignore[return-value]
    # Typing overloads: inplace=False (or defaulted) returns a new object,
    # inplace=True returns None, unknown-at-type-check-time returns either.
    @overload
    def set_axis(
        self: NDFrameT,
        labels,
        *,
        axis: Axis = ...,
        inplace: Literal[False] | lib.NoDefault = ...,
        copy: bool_t | lib.NoDefault = ...,
    ) -> NDFrameT:
        ...

    @overload
    def set_axis(
        self,
        labels,
        *,
        axis: Axis = ...,
        inplace: Literal[True],
        copy: bool_t | lib.NoDefault = ...,
    ) -> None:
        ...

    @overload
    def set_axis(
        self: NDFrameT,
        labels,
        *,
        axis: Axis = ...,
        inplace: bool_t | lib.NoDefault = ...,
        copy: bool_t | lib.NoDefault = ...,
    ) -> NDFrameT | None:
        ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
    def set_axis(
        self: NDFrameT,
        labels,
        axis: Axis = 0,
        inplace: bool_t | lib.NoDefault = lib.no_default,
        *,
        copy: bool_t | lib.NoDefault = lib.no_default,
    ) -> NDFrameT | None:
        """
        Assign desired index to given axis.

        Indexes for%(extended_summary_sub)s row labels can be changed by assigning
        a list-like or Index.

        Parameters
        ----------
        labels : list-like, Index
            The values for the new index.

        axis : %(axes_single_arg)s, default 0
            The axis to update. The value 0 identifies the rows. For `Series`
            this parameter is unused and defaults to 0.

        inplace : bool, default False
            Whether to return a new %(klass)s instance.

            .. deprecated:: 1.5.0

        copy : bool, default True
            Whether to make a copy of the underlying data.

            .. versionadded:: 1.5.0

        Returns
        -------
        renamed : %(klass)s or None
            An object of type %(klass)s or None if ``inplace=True``.

        See Also
        --------
        %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
        """
        # Any explicit ``inplace`` (True or False) triggers the deprecation.
        if inplace is not lib.no_default:
            warnings.warn(
                f"{type(self).__name__}.set_axis 'inplace' keyword is deprecated "
                "and will be removed in a future version. Use "
                "`obj = obj.set_axis(..., copy=False)` instead",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        else:
            inplace = False

        if inplace:
            if copy is True:
                raise ValueError("Cannot specify both inplace=True and copy=True")
            # In-place assignment never copies the underlying data.
            copy = False
        elif copy is lib.no_default:
            # Default for the non-inplace path is to copy.
            copy = True

        self._check_inplace_and_allows_duplicate_labels(inplace)
        return self._set_axis_nocheck(labels, axis, inplace, copy=copy)
    @final
    def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool_t, copy: bool_t):
        """Assign ``labels`` to ``axis`` without duplicate-label validation."""
        if inplace:
            # setattr routes through the index/columns property setter.
            setattr(self, self._get_axis_name(axis), labels)
        else:
            # With copy=False, we create a new object but don't copy the
            # underlying data.
            obj = self.copy(deep=copy)
            setattr(obj, obj._get_axis_name(axis), labels)
            return obj
    def _set_axis(self, axis: int, labels: AnyArrayLike | list) -> None:
        """Replace the labels of ``axis`` on the manager, invalidating caches."""
        labels = ensure_index(labels)
        self._mgr.set_axis(axis, labels)
        # Cached column Series reference the old axis; drop them.
        self._clear_item_cache()
    @final
    def swapaxes(
        self: NDFrameT, axis1: Axis, axis2: Axis, copy: bool_t = True
    ) -> NDFrameT:
        """
        Interchange axes and swap values axes appropriately.

        Returns
        -------
        y : same as input
        """
        i = self._get_axis_number(axis1)
        j = self._get_axis_number(axis2)

        if i == j:
            # Swapping an axis with itself is a no-op; honor ``copy`` only.
            if copy:
                return self.copy()
            return self

        mapping = {i: j, j: i}

        # Axes in swapped order; self.values materializes a 2-D ndarray.
        new_axes = (self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN))
        new_values = self.values.swapaxes(i, j)
        if copy:
            new_values = new_values.copy()

        return self._constructor(
            new_values,
            *new_axes,
        ).__finalize__(self, method="swapaxes")
    @final
    @doc(klass=_shared_doc_kwargs["klass"])
    def droplevel(self: NDFrameT, level: IndexLabel, axis: Axis = 0) -> NDFrameT:
        """
        Return {klass} with requested index / column level(s) removed.

        Parameters
        ----------
        level : int, str, or list-like
            If a string is given, must be the name of a level
            If list-like, elements must be names or positional indexes
            of levels.

        axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Axis along which the level(s) is removed:

            * 0 or 'index': remove level(s) in column.
            * 1 or 'columns': remove level(s) in row.

            For `Series` this parameter is unused and defaults to 0.

        Returns
        -------
        {klass}
            {klass} with requested index / column level(s) removed.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     [1, 2, 3, 4],
        ...     [5, 6, 7, 8],
        ...     [9, 10, 11, 12]
        ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

        >>> df.columns = pd.MultiIndex.from_tuples([
        ...     ('c', 'e'), ('d', 'f')
        ... ], names=['level_1', 'level_2'])

        >>> df
        level_1   c   d
        level_2   e   f
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12

        >>> df.droplevel('a')
        level_1   c   d
        level_2   e   f
        b
        2        3   4
        6        7   8
        10      11  12

        >>> df.droplevel('level_2', axis=1)
        level_1   c   d
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12
        """
        # Index.droplevel does the level validation and removal.
        labels = self._get_axis(axis)
        new_labels = labels.droplevel(level)
        return self.set_axis(new_labels, axis=axis)
922 def pop(self, item: Hashable) -> Series | Any:
923 result = self[item]
924 del self[item]
926 return result
    @final
    def squeeze(self, axis=None):
        """
        Squeeze 1 dimensional axis objects into scalars.

        Series or DataFrames with a single element are squeezed to a scalar.
        DataFrames with a single column or a single row are squeezed to a
        Series. Otherwise the object is unchanged.

        This method is most useful when you don't know if your
        object is a Series or DataFrame, but you do know it has just a single
        column. In that case you can safely call `squeeze` to ensure you have a
        Series.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default None
            A specific axis to squeeze. By default, all length-1 axes are
            squeezed. For `Series` this parameter is unused and defaults to `None`.

        Returns
        -------
        DataFrame, Series, or scalar
            The projection after squeezing `axis` or all the axes.

        See Also
        --------
        Series.iloc : Integer-location based indexing for selecting scalars.
        DataFrame.iloc : Integer-location based indexing for selecting Series.
        Series.to_frame : Inverse of DataFrame.squeeze for a
            single-column DataFrame.

        Examples
        --------
        >>> primes = pd.Series([2, 3, 5, 7])

        Slicing might produce a Series with a single value:

        >>> even_primes = primes[primes % 2 == 0]
        >>> even_primes
        0    2
        dtype: int64

        >>> even_primes.squeeze()
        2

        Squeezing objects with more than one value in every axis does nothing:

        >>> odd_primes = primes[primes % 2 == 1]
        >>> odd_primes
        1    3
        2    5
        3    7
        dtype: int64

        >>> odd_primes.squeeze()
        1    3
        2    5
        3    7
        dtype: int64

        Squeezing is even more effective when used with DataFrames.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> df
           a  b
        0  1  2
        1  3  4

        Slicing a single column will produce a DataFrame with the columns
        having only one value:

        >>> df_a = df[['a']]
        >>> df_a
           a
        0  1
        1  3

        So the columns can be squeezed down, resulting in a Series:

        >>> df_a.squeeze('columns')
        0    1
        1    3
        Name: a, dtype: int64

        Slicing a single row from a single column will produce a single
        scalar DataFrame:

        >>> df_0a = df.loc[df.index < 1, ['a']]
        >>> df_0a
           a
        0  1

        Squeezing the rows produces a single scalar Series:

        >>> df_0a.squeeze('rows')
        a    1
        Name: 0, dtype: int64

        Squeezing all axes will project directly into a scalar:

        >>> df_0a.squeeze()
        1
        """
        # Normalize to a collection of axis numbers to consider.
        axis = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
        # For each candidate length-1 axis select position 0 (dropping that
        # dimension); every other axis keeps a full slice.
        return self.iloc[
            tuple(
                0 if i in axis and len(a) == 1 else slice(None)
                for i, a in enumerate(self.axes)
            )
        ]
    # ----------------------------------------------------------------------
    # Rename

    def _rename(
        self: NDFrameT,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool_t | None = None,
        inplace: bool_t = False,
        level: Level | None = None,
        errors: str = "ignore",
    ) -> NDFrameT | None:
        """Shared implementation backing Series.rename and DataFrame.rename."""
        # called by Series.rename and DataFrame.rename

        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")

        # mapper+axis and index/columns are mutually exclusive spellings.
        if index is not None or columns is not None:
            if axis is not None:
                raise TypeError(
                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
                )
            elif mapper is not None:
                raise TypeError(
                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
                )
        else:
            # use the mapper argument
            if axis and self._get_axis_number(axis) == 1:
                columns = mapper
            else:
                index = mapper

        self._check_inplace_and_allows_duplicate_labels(inplace)
        result = self if inplace else self.copy(deep=copy)

        # axis_no 0 -> index replacements, axis_no 1 -> column replacements.
        for axis_no, replacements in enumerate((index, columns)):
            if replacements is None:
                continue

            ax = self._get_axis(axis_no)
            f = com.get_rename_function(replacements)

            if level is not None:
                # Translate a level name to its number once per axis.
                level = ax._get_level_number(level)

            # GH 13473
            if not callable(replacements):
                if ax._is_multi and level is not None:
                    indexer = ax.get_level_values(level).get_indexer_for(replacements)
                else:
                    indexer = ax.get_indexer_for(replacements)

                # -1 in the indexer marks labels not present on the axis.
                if errors == "raise" and len(indexer[indexer == -1]):
                    missing_labels = [
                        label
                        for index, label in enumerate(replacements)
                        if indexer[index] == -1
                    ]
                    raise KeyError(f"{missing_labels} not found in axis")

            new_index = ax._transform_index(f, level=level)
            result._set_axis_nocheck(new_index, axis=axis_no, inplace=True, copy=False)
            result._clear_item_cache()

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result.__finalize__(self, method="rename")
    # Typing overloads: inplace=False returns a new object, inplace=True
    # returns None, statically-unknown inplace returns either.
    @overload
    def rename_axis(
        self: NDFrameT,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        inplace: Literal[False] = ...,
        **kwargs,
    ) -> NDFrameT:
        ...

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        inplace: Literal[True],
        **kwargs,
    ) -> None:
        ...

    @overload
    def rename_axis(
        self: NDFrameT,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        inplace: bool_t = ...,
        **kwargs,
    ) -> NDFrameT | None:
        ...
1144 @rewrite_axis_style_signature("mapper", [("copy", True)])
1145 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "mapper"])
1146 def rename_axis(
1147 self: NDFrameT,
1148 mapper: IndexLabel | lib.NoDefault = lib.no_default,
1149 inplace: bool_t = False,
1150 **kwargs,
1151 ) -> NDFrameT | None:
1152 """
1153 Set the name of the axis for the index or columns.
1155 Parameters
1156 ----------
1157 mapper : scalar, list-like, optional
1158 Value to set the axis name attribute.
1159 index, columns : scalar, list-like, dict-like or function, optional
1160 A scalar, list-like, dict-like or functions transformations to
1161 apply to that axis' values.
1162 Note that the ``columns`` parameter is not allowed if the
1163 object is a Series. This parameter only apply for DataFrame
1164 type objects.
1166 Use either ``mapper`` and ``axis`` to
1167 specify the axis to target with ``mapper``, or ``index``
1168 and/or ``columns``.
1169 axis : {0 or 'index', 1 or 'columns'}, default 0
1170 The axis to rename. For `Series` this parameter is unused and defaults to 0.
1171 copy : bool, default True
1172 Also copy underlying data.
1173 inplace : bool, default False
1174 Modifies the object directly, instead of creating a new Series
1175 or DataFrame.
1177 Returns
1178 -------
1179 Series, DataFrame, or None
1180 The same type as the caller or None if ``inplace=True``.
1182 See Also
1183 --------
1184 Series.rename : Alter Series index labels or name.
1185 DataFrame.rename : Alter DataFrame index labels or name.
1186 Index.rename : Set new names on index.
1188 Notes
1189 -----
1190 ``DataFrame.rename_axis`` supports two calling conventions
1192 * ``(index=index_mapper, columns=columns_mapper, ...)``
1193 * ``(mapper, axis={'index', 'columns'}, ...)``
1195 The first calling convention will only modify the names of
1196 the index and/or the names of the Index object that is the columns.
1197 In this case, the parameter ``copy`` is ignored.
1199 The second calling convention will modify the names of the
1200 corresponding index if mapper is a list or a scalar.
1201 However, if mapper is dict-like or a function, it will use the
1202 deprecated behavior of modifying the axis *labels*.
1204 We *highly* recommend using keyword arguments to clarify your
1205 intent.
1207 Examples
1208 --------
1209 **Series**
1211 >>> s = pd.Series(["dog", "cat", "monkey"])
1212 >>> s
1213 0 dog
1214 1 cat
1215 2 monkey
1216 dtype: object
1217 >>> s.rename_axis("animal")
1218 animal
1219 0 dog
1220 1 cat
1221 2 monkey
1222 dtype: object
1224 **DataFrame**
1226 >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
1227 ... "num_arms": [0, 0, 2]},
1228 ... ["dog", "cat", "monkey"])
1229 >>> df
1230 num_legs num_arms
1231 dog 4 0
1232 cat 4 0
1233 monkey 2 2
1234 >>> df = df.rename_axis("animal")
1235 >>> df
1236 num_legs num_arms
1237 animal
1238 dog 4 0
1239 cat 4 0
1240 monkey 2 2
1241 >>> df = df.rename_axis("limbs", axis="columns")
1242 >>> df
1243 limbs num_legs num_arms
1244 animal
1245 dog 4 0
1246 cat 4 0
1247 monkey 2 2
1249 **MultiIndex**
1251 >>> df.index = pd.MultiIndex.from_product([['mammal'],
1252 ... ['dog', 'cat', 'monkey']],
1253 ... names=['type', 'name'])
1254 >>> df
1255 limbs num_legs num_arms
1256 type name
1257 mammal dog 4 0
1258 cat 4 0
1259 monkey 2 2
1261 >>> df.rename_axis(index={'type': 'class'})
1262 limbs num_legs num_arms
1263 class name
1264 mammal dog 4 0
1265 cat 4 0
1266 monkey 2 2
1268 >>> df.rename_axis(columns=str.upper)
1269 LIMBS num_legs num_arms
1270 type name
1271 mammal dog 4 0
1272 cat 4 0
1273 monkey 2 2
1274 """
1275 kwargs["inplace"] = inplace
1276 axes, kwargs = self._construct_axes_from_arguments(
1277 (), kwargs, sentinel=lib.no_default
1278 )
1279 copy = kwargs.pop("copy", True)
1280 inplace = kwargs.pop("inplace", False)
1281 axis = kwargs.pop("axis", 0)
1282 if axis is not None:
1283 axis = self._get_axis_number(axis)
1285 if kwargs:
1286 raise TypeError(
1287 "rename_axis() got an unexpected keyword "
1288 f'argument "{list(kwargs.keys())[0]}"'
1289 )
1291 inplace = validate_bool_kwarg(inplace, "inplace")
1293 if mapper is not lib.no_default:
1294 # Use v0.23 behavior if a scalar or list
1295 non_mapper = is_scalar(mapper) or (
1296 is_list_like(mapper) and not is_dict_like(mapper)
1297 )
1298 if non_mapper:
1299 return self._set_axis_name(mapper, axis=axis, inplace=inplace)
1300 else:
1301 raise ValueError("Use `.rename` to alter labels with a mapper.")
1302 else:
1303 # Use new behavior. Means that index and/or columns
1304 # is specified
1305 result = self if inplace else self.copy(deep=copy)
1307 for axis in range(self._AXIS_LEN):
1308 v = axes.get(self._get_axis_name(axis))
1309 if v is lib.no_default:
1310 continue
1311 non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
1312 if non_mapper:
1313 newnames = v
1314 else:
1315 f = com.get_rename_function(v)
1316 curnames = self._get_axis(axis).names
1317 newnames = [f(name) for name in curnames]
1318 result._set_axis_name(newnames, axis=axis, inplace=True)
1319 if not inplace:
1320 return result
1321 return None
1323 @final
1324 def _set_axis_name(self, name, axis=0, inplace=False):
1325 """
1326 Set the name(s) of the axis.
1328 Parameters
1329 ----------
1330 name : str or list of str
1331 Name(s) to set.
1332 axis : {0 or 'index', 1 or 'columns'}, default 0
1333 The axis to set the label. The value 0 or 'index' specifies index,
1334 and the value 1 or 'columns' specifies columns.
1335 inplace : bool, default False
1336 If `True`, do operation inplace and return None.
1338 Returns
1339 -------
1340 Series, DataFrame, or None
1341 The same type as the caller or `None` if `inplace` is `True`.
1343 See Also
1344 --------
1345 DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
1346 Series.rename : Alter the index labels or set the index name
1347 of :class:`Series`.
1348 Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.
1350 Examples
1351 --------
1352 >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
1353 ... ["dog", "cat", "monkey"])
1354 >>> df
1355 num_legs
1356 dog 4
1357 cat 4
1358 monkey 2
1359 >>> df._set_axis_name("animal")
1360 num_legs
1361 animal
1362 dog 4
1363 cat 4
1364 monkey 2
1365 >>> df.index = pd.MultiIndex.from_product(
1366 ... [["mammal"], ['dog', 'cat', 'monkey']])
1367 >>> df._set_axis_name(["type", "name"])
1368 num_legs
1369 type name
1370 mammal dog 4
1371 cat 4
1372 monkey 2
1373 """
1374 axis = self._get_axis_number(axis)
1375 idx = self._get_axis(axis).set_names(name)
1377 inplace = validate_bool_kwarg(inplace, "inplace")
1378 renamed = self if inplace else self.copy()
1379 if axis == 0:
1380 renamed.index = idx
1381 else:
1382 renamed.columns = idx
1384 if not inplace:
1385 return renamed
1387 # ----------------------------------------------------------------------
1388 # Comparison Methods
1390 @final
1391 def _indexed_same(self, other) -> bool_t:
1392 return all(
1393 self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
1394 )
1396 @final
1397 def equals(self, other: object) -> bool_t:
1398 """
1399 Test whether two objects contain the same elements.
1401 This function allows two Series or DataFrames to be compared against
1402 each other to see if they have the same shape and elements. NaNs in
1403 the same location are considered equal.
1405 The row/column index do not need to have the same type, as long
1406 as the values are considered equal. Corresponding columns must be of
1407 the same dtype.
1409 Parameters
1410 ----------
1411 other : Series or DataFrame
1412 The other Series or DataFrame to be compared with the first.
1414 Returns
1415 -------
1416 bool
1417 True if all elements are the same in both objects, False
1418 otherwise.
1420 See Also
1421 --------
1422 Series.eq : Compare two Series objects of the same length
1423 and return a Series where each element is True if the element
1424 in each Series is equal, False otherwise.
1425 DataFrame.eq : Compare two DataFrame objects of the same shape and
1426 return a DataFrame where each element is True if the respective
1427 element in each DataFrame is equal, False otherwise.
1428 testing.assert_series_equal : Raises an AssertionError if left and
1429 right are not equal. Provides an easy interface to ignore
1430 inequality in dtypes, indexes and precision among others.
1431 testing.assert_frame_equal : Like assert_series_equal, but targets
1432 DataFrames.
1433 numpy.array_equal : Return True if two arrays have the same shape
1434 and elements, False otherwise.
1436 Examples
1437 --------
1438 >>> df = pd.DataFrame({1: [10], 2: [20]})
1439 >>> df
1440 1 2
1441 0 10 20
1443 DataFrames df and exactly_equal have the same types and values for
1444 their elements and column labels, which will return True.
1446 >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
1447 >>> exactly_equal
1448 1 2
1449 0 10 20
1450 >>> df.equals(exactly_equal)
1451 True
1453 DataFrames df and different_column_type have the same element
1454 types and values, but have different types for the column labels,
1455 which will still return True.
1457 >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
1458 >>> different_column_type
1459 1.0 2.0
1460 0 10 20
1461 >>> df.equals(different_column_type)
1462 True
1464 DataFrames df and different_data_type have different types for the
1465 same values for their elements, and will return False even though
1466 their column labels are the same values and types.
1468 >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
1469 >>> different_data_type
1470 1 2
1471 0 10.0 20.0
1472 >>> df.equals(different_data_type)
1473 False
1474 """
1475 if not (isinstance(other, type(self)) or isinstance(self, type(other))):
1476 return False
1477 other = cast(NDFrame, other)
1478 return self._mgr.equals(other._mgr)
1480 # -------------------------------------------------------------------------
1481 # Unary Methods
1483 @final
1484 def __neg__(self: NDFrameT) -> NDFrameT:
1485 def blk_func(values: ArrayLike):
1486 if is_bool_dtype(values.dtype):
1487 # error: Argument 1 to "inv" has incompatible type "Union
1488 # [ExtensionArray, ndarray[Any, Any]]"; expected
1489 # "_SupportsInversion[ndarray[Any, dtype[bool_]]]"
1490 return operator.inv(values) # type: ignore[arg-type]
1491 else:
1492 # error: Argument 1 to "neg" has incompatible type "Union
1493 # [ExtensionArray, ndarray[Any, Any]]"; expected
1494 # "_SupportsNeg[ndarray[Any, dtype[Any]]]"
1495 return operator.neg(values) # type: ignore[arg-type]
1497 new_data = self._mgr.apply(blk_func)
1498 res = self._constructor(new_data)
1499 return res.__finalize__(self, method="__neg__")
1501 @final
1502 def __pos__(self: NDFrameT) -> NDFrameT:
1503 def blk_func(values: ArrayLike):
1504 if is_bool_dtype(values.dtype):
1505 return values.copy()
1506 else:
1507 # error: Argument 1 to "pos" has incompatible type "Union
1508 # [ExtensionArray, ndarray[Any, Any]]"; expected
1509 # "_SupportsPos[ndarray[Any, dtype[Any]]]"
1510 return operator.pos(values) # type: ignore[arg-type]
1512 new_data = self._mgr.apply(blk_func)
1513 res = self._constructor(new_data)
1514 return res.__finalize__(self, method="__pos__")
1516 @final
1517 def __invert__(self: NDFrameT) -> NDFrameT:
1518 if not self.size:
1519 # inv fails with 0 len
1520 return self
1522 new_data = self._mgr.apply(operator.invert)
1523 return self._constructor(new_data).__finalize__(self, method="__invert__")
1525 @final
1526 def __nonzero__(self) -> NoReturn:
1527 raise ValueError(
1528 f"The truth value of a {type(self).__name__} is ambiguous. "
1529 "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
1530 )
1532 __bool__ = __nonzero__
1534 @final
1535 def bool(self) -> bool_t:
1536 """
1537 Return the bool of a single element Series or DataFrame.
1539 This must be a boolean scalar value, either True or False. It will raise a
1540 ValueError if the Series or DataFrame does not have exactly 1 element, or that
1541 element is not boolean (integer values 0 and 1 will also raise an exception).
1543 Returns
1544 -------
1545 bool
1546 The value in the Series or DataFrame.
1548 See Also
1549 --------
1550 Series.astype : Change the data type of a Series, including to boolean.
1551 DataFrame.astype : Change the data type of a DataFrame, including to boolean.
1552 numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.
1554 Examples
1555 --------
1556 The method will only work for single element objects with a boolean value:
1558 >>> pd.Series([True]).bool()
1559 True
1560 >>> pd.Series([False]).bool()
1561 False
1563 >>> pd.DataFrame({'col': [True]}).bool()
1564 True
1565 >>> pd.DataFrame({'col': [False]}).bool()
1566 False
1567 """
1568 v = self.squeeze()
1569 if isinstance(v, (bool, np.bool_)):
1570 return bool(v)
1571 elif is_scalar(v):
1572 raise ValueError(
1573 "bool cannot act on a non-boolean single element "
1574 f"{type(self).__name__}"
1575 )
1577 self.__nonzero__()
1578 # for mypy (__nonzero__ raises)
1579 return True
1581 @final
1582 def abs(self: NDFrameT) -> NDFrameT:
1583 """
1584 Return a Series/DataFrame with absolute numeric value of each element.
1586 This function only applies to elements that are all numeric.
1588 Returns
1589 -------
1590 abs
1591 Series/DataFrame containing the absolute value of each element.
1593 See Also
1594 --------
1595 numpy.absolute : Calculate the absolute value element-wise.
1597 Notes
1598 -----
1599 For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
1600 :math:`\\sqrt{ a^2 + b^2 }`.
1602 Examples
1603 --------
1604 Absolute numeric values in a Series.
1606 >>> s = pd.Series([-1.10, 2, -3.33, 4])
1607 >>> s.abs()
1608 0 1.10
1609 1 2.00
1610 2 3.33
1611 3 4.00
1612 dtype: float64
1614 Absolute numeric values in a Series with complex numbers.
1616 >>> s = pd.Series([1.2 + 1j])
1617 >>> s.abs()
1618 0 1.56205
1619 dtype: float64
1621 Absolute numeric values in a Series with a Timedelta element.
1623 >>> s = pd.Series([pd.Timedelta('1 days')])
1624 >>> s.abs()
1625 0 1 days
1626 dtype: timedelta64[ns]
1628 Select rows with data closest to certain value using argsort (from
1629 `StackOverflow <https://stackoverflow.com/a/17758115>`__).
1631 >>> df = pd.DataFrame({
1632 ... 'a': [4, 5, 6, 7],
1633 ... 'b': [10, 20, 30, 40],
1634 ... 'c': [100, 50, -30, -50]
1635 ... })
1636 >>> df
1637 a b c
1638 0 4 10 100
1639 1 5 20 50
1640 2 6 30 -30
1641 3 7 40 -50
1642 >>> df.loc[(df.c - 43).abs().argsort()]
1643 a b c
1644 1 5 20 50
1645 0 4 10 100
1646 2 6 30 -30
1647 3 7 40 -50
1648 """
1649 res_mgr = self._mgr.apply(np.abs)
1650 return self._constructor(res_mgr).__finalize__(self, name="abs")
1652 @final
1653 def __abs__(self: NDFrameT) -> NDFrameT:
1654 return self.abs()
1656 @final
1657 def __round__(self: NDFrameT, decimals: int = 0) -> NDFrameT:
1658 return self.round(decimals).__finalize__(self, method="__round__")
1660 # -------------------------------------------------------------------------
1661 # Label or Level Combination Helpers
1662 #
1663 # A collection of helper methods for DataFrame/Series operations that
1664 # accept a combination of column/index labels and levels. All such
1665 # operations should utilize/extend these methods when possible so that we
1666 # have consistent precedence and validation logic throughout the library.
1668 @final
1669 def _is_level_reference(self, key: Level, axis=0) -> bool_t:
1670 """
1671 Test whether a key is a level reference for a given axis.
1673 To be considered a level reference, `key` must be a string that:
1674 - (axis=0): Matches the name of an index level and does NOT match
1675 a column label.
1676 - (axis=1): Matches the name of a column level and does NOT match
1677 an index label.
1679 Parameters
1680 ----------
1681 key : Hashable
1682 Potential level name for the given axis
1683 axis : int, default 0
1684 Axis that levels are associated with (0 for index, 1 for columns)
1686 Returns
1687 -------
1688 is_level : bool
1689 """
1690 axis = self._get_axis_number(axis)
1692 return (
1693 key is not None
1694 and is_hashable(key)
1695 and key in self.axes[axis].names
1696 and not self._is_label_reference(key, axis=axis)
1697 )
1699 @final
1700 def _is_label_reference(self, key: Level, axis=0) -> bool_t:
1701 """
1702 Test whether a key is a label reference for a given axis.
1704 To be considered a label reference, `key` must be a string that:
1705 - (axis=0): Matches a column label
1706 - (axis=1): Matches an index label
1708 Parameters
1709 ----------
1710 key : Hashable
1711 Potential label name, i.e. Index entry.
1712 axis : int, default 0
1713 Axis perpendicular to the axis that labels are associated with
1714 (0 means search for column labels, 1 means search for index labels)
1716 Returns
1717 -------
1718 is_label: bool
1719 """
1720 axis = self._get_axis_number(axis)
1721 other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)
1723 return (
1724 key is not None
1725 and is_hashable(key)
1726 and any(key in self.axes[ax] for ax in other_axes)
1727 )
1729 @final
1730 def _is_label_or_level_reference(self, key: Level, axis: int = 0) -> bool_t:
1731 """
1732 Test whether a key is a label or level reference for a given axis.
1734 To be considered either a label or a level reference, `key` must be a
1735 string that:
1736 - (axis=0): Matches a column label or an index level
1737 - (axis=1): Matches an index label or a column level
1739 Parameters
1740 ----------
1741 key : Hashable
1742 Potential label or level name
1743 axis : int, default 0
1744 Axis that levels are associated with (0 for index, 1 for columns)
1746 Returns
1747 -------
1748 bool
1749 """
1750 return self._is_level_reference(key, axis=axis) or self._is_label_reference(
1751 key, axis=axis
1752 )
1754 @final
1755 def _check_label_or_level_ambiguity(self, key: Level, axis: int = 0) -> None:
1756 """
1757 Check whether `key` is ambiguous.
1759 By ambiguous, we mean that it matches both a level of the input
1760 `axis` and a label of the other axis.
1762 Parameters
1763 ----------
1764 key : Hashable
1765 Label or level name.
1766 axis : int, default 0
1767 Axis that levels are associated with (0 for index, 1 for columns).
1769 Raises
1770 ------
1771 ValueError: `key` is ambiguous
1772 """
1774 axis = self._get_axis_number(axis)
1775 other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)
1777 if (
1778 key is not None
1779 and is_hashable(key)
1780 and key in self.axes[axis].names
1781 and any(key in self.axes[ax] for ax in other_axes)
1782 ):
1784 # Build an informative and grammatical warning
1785 level_article, level_type = (
1786 ("an", "index") if axis == 0 else ("a", "column")
1787 )
1789 label_article, label_type = (
1790 ("a", "column") if axis == 0 else ("an", "index")
1791 )
1793 msg = (
1794 f"'{key}' is both {level_article} {level_type} level and "
1795 f"{label_article} {label_type} label, which is ambiguous."
1796 )
1797 raise ValueError(msg)
1799 @final
1800 def _get_label_or_level_values(self, key: Level, axis: int = 0) -> ArrayLike:
1801 """
1802 Return a 1-D array of values associated with `key`, a label or level
1803 from the given `axis`.
1805 Retrieval logic:
1806 - (axis=0): Return column values if `key` matches a column label.
1807 Otherwise return index level values if `key` matches an index
1808 level.
1809 - (axis=1): Return row values if `key` matches an index label.
1810 Otherwise return column level values if 'key' matches a column
1811 level
1813 Parameters
1814 ----------
1815 key : Hashable
1816 Label or level name.
1817 axis : int, default 0
1818 Axis that levels are associated with (0 for index, 1 for columns)
1820 Returns
1821 -------
1822 np.ndarray or ExtensionArray
1824 Raises
1825 ------
1826 KeyError
1827 if `key` matches neither a label nor a level
1828 ValueError
1829 if `key` matches multiple labels
1830 FutureWarning
1831 if `key` is ambiguous. This will become an ambiguity error in a
1832 future version
1833 """
1834 axis = self._get_axis_number(axis)
1835 other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
1837 if self._is_label_reference(key, axis=axis):
1838 self._check_label_or_level_ambiguity(key, axis=axis)
1839 values = self.xs(key, axis=other_axes[0])._values
1840 elif self._is_level_reference(key, axis=axis):
1841 # error: Incompatible types in assignment (expression has type "Union[
1842 # ExtensionArray, ndarray[Any, Any]]", variable has type "ndarray[Any,
1843 # Any]")
1844 values = (
1845 self.axes[axis]
1846 .get_level_values(key) # type: ignore[assignment]
1847 ._values
1848 )
1849 else:
1850 raise KeyError(key)
1852 # Check for duplicates
1853 if values.ndim > 1:
1855 if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
1856 multi_message = (
1857 "\n"
1858 "For a multi-index, the label must be a "
1859 "tuple with elements corresponding to each level."
1860 )
1861 else:
1862 multi_message = ""
1864 label_axis_name = "column" if axis == 0 else "index"
1865 raise ValueError(
1866 f"The {label_axis_name} label '{key}' is not unique.{multi_message}"
1867 )
1869 return values
    @final
    def _drop_labels_or_levels(self, keys, axis: int = 0):
        """
        Drop labels and/or levels for the given `axis`.

        For each key in `keys`:
          - (axis=0): If key matches a column label then drop the column.
            Otherwise if key matches an index level then drop the level.
          - (axis=1): If key matches an index label then drop the row.
            Otherwise if key matches a column level then drop the level.

        Parameters
        ----------
        keys : str or list of str
            labels or levels to drop
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        dropped: DataFrame

        Raises
        ------
        ValueError
            if any `keys` match neither a label nor a level
        """
        axis = self._get_axis_number(axis)

        # Validate keys: each one must resolve to a label or a level, and
        # all offenders are reported together in a single error.
        keys = com.maybe_make_list(keys)
        invalid_keys = [
            k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
        ]

        if invalid_keys:
            raise ValueError(
                "The following keys are not valid labels or "
                f"levels for axis {axis}: {invalid_keys}"
            )

        # Partition keys: level references are dropped as levels; everything
        # else (already validated above) is dropped as a label.
        levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]

        labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]

        # Perform copy upfront and then use inplace operations below.
        # This ensures that we always perform exactly one copy.
        # ``copy`` and/or ``inplace`` options could be added in the future.
        dropped = self.copy()

        if axis == 0:
            # Handle dropping index levels
            if levels_to_drop:
                dropped.reset_index(levels_to_drop, drop=True, inplace=True)

            # Handle dropping columns labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=1, inplace=True)
        else:
            # Handle dropping column levels
            if levels_to_drop:
                if isinstance(dropped.columns, MultiIndex):
                    # Drop the specified levels from the MultiIndex
                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
                else:
                    # Drop the last level of Index by replacing with
                    # a RangeIndex
                    dropped.columns = RangeIndex(dropped.columns.size)

            # Handle dropping index labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=0, inplace=True)

        return dropped
    # ----------------------------------------------------------------------
    # Iteration

    # Setting __hash__ to None marks NDFrame as unhashable: ``hash(obj)``
    # raises TypeError. The ignore is for typeshed's base declaration:
    # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
    # Incompatible types in assignment (expression has type "None", base class
    # "object" defined the type as "Callable[[object], int]")
    __hash__: ClassVar[None]  # type: ignore[assignment]
1955 def __iter__(self):
1956 """
1957 Iterate over info axis.
1959 Returns
1960 -------
1961 iterator
1962 Info axis as iterator.
1963 """
1964 return iter(self._info_axis)
1966 # can we get a better explanation of this?
1967 def keys(self) -> Index:
1968 """
1969 Get the 'info axis' (see Indexing for more).
1971 This is index for Series, columns for DataFrame.
1973 Returns
1974 -------
1975 Index
1976 Info axis.
1977 """
1978 return self._info_axis
1980 def items(self):
1981 """
1982 Iterate over (label, values) on info axis
1984 This is index for Series and columns for DataFrame.
1986 Returns
1987 -------
1988 Generator
1989 """
1990 for h in self._info_axis:
1991 yield h, self[h]
1993 def __len__(self) -> int:
1994 """Returns length of info axis"""
1995 return len(self._info_axis)
1997 @final
1998 def __contains__(self, key) -> bool_t:
1999 """True if the key is in the info axis"""
2000 return key in self._info_axis
2002 @property
2003 def empty(self) -> bool_t:
2004 """
2005 Indicator whether Series/DataFrame is empty.
2007 True if Series/DataFrame is entirely empty (no items), meaning any of the
2008 axes are of length 0.
2010 Returns
2011 -------
2012 bool
2013 If Series/DataFrame is empty, return True, if not return False.
2015 See Also
2016 --------
2017 Series.dropna : Return series without null values.
2018 DataFrame.dropna : Return DataFrame with labels on given axis omitted
2019 where (all or any) data are missing.
2021 Notes
2022 -----
2023 If Series/DataFrame contains only NaNs, it is still not considered empty. See
2024 the example below.
2026 Examples
2027 --------
2028 An example of an actual empty DataFrame. Notice the index is empty:
2030 >>> df_empty = pd.DataFrame({'A' : []})
2031 >>> df_empty
2032 Empty DataFrame
2033 Columns: [A]
2034 Index: []
2035 >>> df_empty.empty
2036 True
2038 If we only have NaNs in our DataFrame, it is not considered empty! We
2039 will need to drop the NaNs to make the DataFrame empty:
2041 >>> df = pd.DataFrame({'A' : [np.nan]})
2042 >>> df
2043 A
2044 0 NaN
2045 >>> df.empty
2046 False
2047 >>> df.dropna().empty
2048 True
2050 >>> ser_empty = pd.Series({'A' : []})
2051 >>> ser_empty
2052 A []
2053 dtype: object
2054 >>> ser_empty.empty
2055 False
2056 >>> ser_empty = pd.Series()
2057 >>> ser_empty.empty
2058 True
2059 """
2060 return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)
    # ----------------------------------------------------------------------
    # Array Interface

    # This is also set in IndexOpsMixin
    # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
    # (a high priority makes numpy defer binary ops to pandas).
    __array_priority__: int = 1000
2069 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
2070 return np.asarray(self._values, dtype=dtype)
    def __array_wrap__(
        self,
        result: np.ndarray,
        context: tuple[Callable, tuple[Any, ...], int] | None = None,
    ):
        """
        Gets called after a ufunc and other functions.

        Parameters
        ----------
        result: np.ndarray
            The result of the ufunc or other function called on the NumPy array
            returned by __array__
        context: tuple of (func, tuple, int)
            This parameter is returned by ufuncs as a 3-element tuple: (name of the
            ufunc, arguments of the ufunc, domain of the ufunc), but is not set by
            other numpy functions.

        Notes
        -----
        Series implements __array_ufunc__ so this is not called for ufunc on Series.
        """
        # Note: at time of dask 2022.01.0, this is still used by dask
        warnings.warn(
            "The __array_wrap__ method of DataFrame and Series will be removed in "
            "a future version",
            DeprecationWarning,
            stacklevel=find_stack_level(),
        )
        res = lib.item_from_zerodim(result)
        if is_scalar(res):
            # e.g. we get here with np.ptp(series)
            # ptp also requires the item_from_zerodim
            return res
        # Rewrap the raw array with this object's axes and propagate metadata.
        d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
        return self._constructor(res, **d).__finalize__(self, method="__array_wrap__")
    @final
    def __array_ufunc__(
        self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
    ):
        # NumPy ufunc protocol hook; all dispatch logic is shared in
        # pandas.core.arraylike.array_ufunc.
        return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
2115 # ----------------------------------------------------------------------
2116 # Picklability
2118 @final
2119 def __getstate__(self) -> dict[str, Any]:
2120 meta = {k: getattr(self, k, None) for k in self._metadata}
2121 return {
2122 "_mgr": self._mgr,
2123 "_typ": self._typ,
2124 "_metadata": self._metadata,
2125 "attrs": self.attrs,
2126 "_flags": {k: self.flags[k] for k in self.flags._keys},
2127 **meta,
2128 }
    @final
    def __setstate__(self, state) -> None:
        """
        Restore pickled state.

        Accepts several historical pickle layouts: a raw BlockManager, a
        dict keyed by "_mgr" (or the legacy "_data" key), or legacy
        2-element states, which are rejected as pre-0.12.
        """
        if isinstance(state, BlockManager):
            self._mgr = state
        elif isinstance(state, dict):
            if "_data" in state and "_mgr" not in state:
                # compat for older pickles
                state["_mgr"] = state.pop("_data")
            typ = state.get("_typ")
            if typ is not None:
                attrs = state.get("_attrs", {})
                object.__setattr__(self, "_attrs", attrs)
                flags = state.get("_flags", {"allows_duplicate_labels": True})
                object.__setattr__(self, "_flags", Flags(self, **flags))

                # set in the order of internal names
                # to avoid definitional recursion
                # e.g. say fill_value needing _mgr to be
                # defined
                meta = set(self._internal_names + self._metadata)
                for k in list(meta):
                    if k in state and k != "_flags":
                        v = state[k]
                        object.__setattr__(self, k, v)

                # Any remaining (non-internal) entries are restored verbatim.
                for k, v in state.items():
                    if k not in meta:
                        object.__setattr__(self, k, v)

            else:
                raise NotImplementedError("Pre-0.12 pickles are no longer supported")
        elif len(state) == 2:
            raise NotImplementedError("Pre-0.12 pickles are no longer supported")

        # The per-item cache is never pickled; start from an empty one.
        self._item_cache: dict[Hashable, Series] = {}
2166 # ----------------------------------------------------------------------
2167 # Rendering Methods
2169 def __repr__(self) -> str:
2170 # string representation based upon iterating over self
2171 # (since, by definition, `PandasContainers` are iterable)
2172 prepr = f"[{','.join(map(pprint_thing, self))}]"
2173 return f"{type(self).__name__}({prepr})"
    @final
    def _repr_latex_(self):
        """
        Returns a LaTeX representation for a particular object.
        Mainly for use with nbconvert (jupyter notebook conversion to pdf).
        """
        # Only emit LaTeX when the user opted in via the
        # ``display.latex.repr`` option; returning None lets the frontend
        # fall back to the other repr methods.
        if config.get_option("display.latex.repr"):
            return self.to_latex()
        else:
            return None
2186 @final
2187 def _repr_data_resource_(self):
2188 """
2189 Not a real Jupyter special repr method, but we use the same
2190 naming convention.
2191 """
2192 if config.get_option("display.html.table_schema"):
2193 data = self.head(config.get_option("display.max_rows"))
2195 as_json = data.to_json(orient="table")
2196 as_json = cast(str, as_json)
2197 return json.loads(as_json, object_pairs_hook=collections.OrderedDict)
2199 # ----------------------------------------------------------------------
2200 # I/O Methods
2202 @final
2203 @deprecate_kwarg(old_arg_name="verbose", new_arg_name=None)
2204 @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None)
2205 @doc(
2206 klass="object",
2207 storage_options=_shared_docs["storage_options"],
2208 storage_options_versionadded="1.2.0",
2209 )
2210 def to_excel(
2211 self,
2212 excel_writer,
2213 sheet_name: str = "Sheet1",
2214 na_rep: str = "",
2215 float_format: str | None = None,
2216 columns: Sequence[Hashable] | None = None,
2217 header: Sequence[Hashable] | bool_t = True,
2218 index: bool_t = True,
2219 index_label: IndexLabel = None,
2220 startrow: int = 0,
2221 startcol: int = 0,
2222 engine: str | None = None,
2223 merge_cells: bool_t = True,
2224 encoding: lib.NoDefault = lib.no_default,
2225 inf_rep: str = "inf",
2226 verbose: lib.NoDefault = lib.no_default,
2227 freeze_panes: tuple[int, int] | None = None,
2228 storage_options: StorageOptions = None,
2229 ) -> None:
2230 """
2231 Write {klass} to an Excel sheet.
2233 To write a single {klass} to an Excel .xlsx file it is only necessary to
2234 specify a target file name. To write to multiple sheets it is necessary to
2235 create an `ExcelWriter` object with a target file name, and specify a sheet
2236 in the file to write to.
2238 Multiple sheets may be written to by specifying unique `sheet_name`.
2239 With all data written to the file it is necessary to save the changes.
2240 Note that creating an `ExcelWriter` object with a file name that already
2241 exists will result in the contents of the existing file being erased.
2243 Parameters
2244 ----------
2245 excel_writer : path-like, file-like, or ExcelWriter object
2246 File path or existing ExcelWriter.
2247 sheet_name : str, default 'Sheet1'
2248 Name of sheet which will contain DataFrame.
2249 na_rep : str, default ''
2250 Missing data representation.
2251 float_format : str, optional
2252 Format string for floating point numbers. For example
2253 ``float_format="%.2f"`` will format 0.1234 to 0.12.
2254 columns : sequence or list of str, optional
2255 Columns to write.
2256 header : bool or list of str, default True
2257 Write out the column names. If a list of string is given it is
2258 assumed to be aliases for the column names.
2259 index : bool, default True
2260 Write row names (index).
2261 index_label : str or sequence, optional
2262 Column label for index column(s) if desired. If not specified, and
2263 `header` and `index` are True, then the index names are used. A
2264 sequence should be given if the DataFrame uses MultiIndex.
2265 startrow : int, default 0
2266 Upper left cell row to dump data frame.
2267 startcol : int, default 0
2268 Upper left cell column to dump data frame.
2269 engine : str, optional
2270 Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
2271 via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
2272 ``io.excel.xlsm.writer``.
2274 .. deprecated:: 1.2.0
2276 As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer
2277 maintained, the ``xlwt`` engine will be removed in a future version
2278 of pandas.
2280 merge_cells : bool, default True
2281 Write MultiIndex and Hierarchical Rows as merged cells.
2282 encoding : str, optional
2283 Encoding of the resulting excel file. Only necessary for xlwt,
2284 other writers support unicode natively.
2286 .. deprecated:: 1.5.0
2288 This keyword was not used.
2290 inf_rep : str, default 'inf'
2291 Representation for infinity (there is no native representation for
2292 infinity in Excel).
2293 verbose : bool, default True
2294 Display more information in the error logs.
2296 .. deprecated:: 1.5.0
2298 This keyword was not used.
2300 freeze_panes : tuple of int (length 2), optional
2301 Specifies the one-based bottommost row and rightmost column that
2302 is to be frozen.
2303 {storage_options}
2305 .. versionadded:: {storage_options_versionadded}
2307 See Also
2308 --------
2309 to_csv : Write DataFrame to a comma-separated values (csv) file.
2310 ExcelWriter : Class for writing DataFrame objects into excel sheets.
2311 read_excel : Read an Excel file into a pandas DataFrame.
2312 read_csv : Read a comma-separated values (csv) file into DataFrame.
2313 io.formats.style.Styler.to_excel : Add styles to Excel sheet.
2315 Notes
2316 -----
2317 For compatibility with :meth:`~DataFrame.to_csv`,
2318 to_excel serializes lists and dicts to strings before writing.
2320 Once a workbook has been saved it is not possible to write further
2321 data without rewriting the whole workbook.
2323 Examples
2324 --------
2326 Create, write to and save a workbook:
2328 >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
2329 ... index=['row 1', 'row 2'],
2330 ... columns=['col 1', 'col 2'])
2331 >>> df1.to_excel("output.xlsx") # doctest: +SKIP
2333 To specify the sheet name:
2335 >>> df1.to_excel("output.xlsx",
2336 ... sheet_name='Sheet_name_1') # doctest: +SKIP
2338 If you wish to write to more than one sheet in the workbook, it is
2339 necessary to specify an ExcelWriter object:
2341 >>> df2 = df1.copy()
2342 >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
2343 ... df1.to_excel(writer, sheet_name='Sheet_name_1')
2344 ... df2.to_excel(writer, sheet_name='Sheet_name_2')
2346 ExcelWriter can also be used to append to an existing Excel file:
2348 >>> with pd.ExcelWriter('output.xlsx',
2349 ... mode='a') as writer: # doctest: +SKIP
2350 ... df.to_excel(writer, sheet_name='Sheet_name_3')
2352 To set the library that is used to write the Excel file,
2353 you can pass the `engine` keyword (the default engine is
2354 automatically chosen depending on the file extension):
2356 >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
2357 """
2359 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
2361 from pandas.io.formats.excel import ExcelFormatter
2363 formatter = ExcelFormatter(
2364 df,
2365 na_rep=na_rep,
2366 cols=columns,
2367 header=header,
2368 float_format=float_format,
2369 index=index,
2370 index_label=index_label,
2371 merge_cells=merge_cells,
2372 inf_rep=inf_rep,
2373 )
2374 formatter.write(
2375 excel_writer,
2376 sheet_name=sheet_name,
2377 startrow=startrow,
2378 startcol=startcol,
2379 freeze_panes=freeze_panes,
2380 engine=engine,
2381 storage_options=storage_options,
2382 )
2384 @final
2385 @doc(
2386 storage_options=_shared_docs["storage_options"],
2387 compression_options=_shared_docs["compression_options"] % "path_or_buf",
2388 )
2389 def to_json(
2390 self,
2391 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
2392 orient: str | None = None,
2393 date_format: str | None = None,
2394 double_precision: int = 10,
2395 force_ascii: bool_t = True,
2396 date_unit: str = "ms",
2397 default_handler: Callable[[Any], JSONSerializable] | None = None,
2398 lines: bool_t = False,
2399 compression: CompressionOptions = "infer",
2400 index: bool_t = True,
2401 indent: int | None = None,
2402 storage_options: StorageOptions = None,
2403 ) -> str | None:
2404 """
2405 Convert the object to a JSON string.
2407 Note NaN's and None will be converted to null and datetime objects
2408 will be converted to UNIX timestamps.
2410 Parameters
2411 ----------
2412 path_or_buf : str, path object, file-like object, or None, default None
2413 String, path object (implementing os.PathLike[str]), or file-like
2414 object implementing a write() function. If None, the result is
2415 returned as a string.
2416 orient : str
2417 Indication of expected JSON string format.
2419 * Series:
2421 - default is 'index'
2422 - allowed values are: {{'split', 'records', 'index', 'table'}}.
2424 * DataFrame:
2426 - default is 'columns'
2427 - allowed values are: {{'split', 'records', 'index', 'columns',
2428 'values', 'table'}}.
2430 * The format of the JSON string:
2432 - 'split' : dict like {{'index' -> [index], 'columns' -> [columns],
2433 'data' -> [values]}}
2434 - 'records' : list like [{{column -> value}}, ... , {{column -> value}}]
2435 - 'index' : dict like {{index -> {{column -> value}}}}
2436 - 'columns' : dict like {{column -> {{index -> value}}}}
2437 - 'values' : just the values array
2438 - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}}
2440 Describing the data, where data component is like ``orient='records'``.
2442 date_format : {{None, 'epoch', 'iso'}}
2443 Type of date conversion. 'epoch' = epoch milliseconds,
2444 'iso' = ISO8601. The default depends on the `orient`. For
2445 ``orient='table'``, the default is 'iso'. For all other orients,
2446 the default is 'epoch'.
2447 double_precision : int, default 10
2448 The number of decimal places to use when encoding
2449 floating point values.
2450 force_ascii : bool, default True
2451 Force encoded string to be ASCII.
2452 date_unit : str, default 'ms' (milliseconds)
2453 The time unit to encode to, governs timestamp and ISO8601
2454 precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
2455 microsecond, and nanosecond respectively.
2456 default_handler : callable, default None
2457 Handler to call if object cannot otherwise be converted to a
2458 suitable format for JSON. Should receive a single argument which is
2459 the object to convert and return a serialisable object.
2460 lines : bool, default False
2461 If 'orient' is 'records' write out line-delimited json format. Will
2462 throw ValueError if incorrect 'orient' since others are not
2463 list-like.
2464 {compression_options}
2466 .. versionchanged:: 1.4.0 Zstandard support.
2468 index : bool, default True
2469 Whether to include the index values in the JSON string. Not
2470 including the index (``index=False``) is only supported when
2471 orient is 'split' or 'table'.
2472 indent : int, optional
2473 Length of whitespace used to indent each record.
2475 .. versionadded:: 1.0.0
2477 {storage_options}
2479 .. versionadded:: 1.2.0
2481 Returns
2482 -------
2483 None or str
2484 If path_or_buf is None, returns the resulting json format as a
2485 string. Otherwise returns None.
2487 See Also
2488 --------
2489 read_json : Convert a JSON string to pandas object.
2491 Notes
2492 -----
2493 The behavior of ``indent=0`` varies from the stdlib, which does not
2494 indent the output but does insert newlines. Currently, ``indent=0``
2495 and the default ``indent=None`` are equivalent in pandas, though this
2496 may change in a future release.
2498 ``orient='table'`` contains a 'pandas_version' field under 'schema'.
2499 This stores the version of `pandas` used in the latest revision of the
2500 schema.
2502 Examples
2503 --------
2504 >>> import json
2505 >>> df = pd.DataFrame(
2506 ... [["a", "b"], ["c", "d"]],
2507 ... index=["row 1", "row 2"],
2508 ... columns=["col 1", "col 2"],
2509 ... )
2511 >>> result = df.to_json(orient="split")
2512 >>> parsed = json.loads(result)
2513 >>> json.dumps(parsed, indent=4) # doctest: +SKIP
2514 {{
2515 "columns": [
2516 "col 1",
2517 "col 2"
2518 ],
2519 "index": [
2520 "row 1",
2521 "row 2"
2522 ],
2523 "data": [
2524 [
2525 "a",
2526 "b"
2527 ],
2528 [
2529 "c",
2530 "d"
2531 ]
2532 ]
2533 }}
2535 Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
2536 Note that index labels are not preserved with this encoding.
2538 >>> result = df.to_json(orient="records")
2539 >>> parsed = json.loads(result)
2540 >>> json.dumps(parsed, indent=4) # doctest: +SKIP
2541 [
2542 {{
2543 "col 1": "a",
2544 "col 2": "b"
2545 }},
2546 {{
2547 "col 1": "c",
2548 "col 2": "d"
2549 }}
2550 ]
2552 Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
2554 >>> result = df.to_json(orient="index")
2555 >>> parsed = json.loads(result)
2556 >>> json.dumps(parsed, indent=4) # doctest: +SKIP
2557 {{
2558 "row 1": {{
2559 "col 1": "a",
2560 "col 2": "b"
2561 }},
2562 "row 2": {{
2563 "col 1": "c",
2564 "col 2": "d"
2565 }}
2566 }}
2568 Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:
2570 >>> result = df.to_json(orient="columns")
2571 >>> parsed = json.loads(result)
2572 >>> json.dumps(parsed, indent=4) # doctest: +SKIP
2573 {{
2574 "col 1": {{
2575 "row 1": "a",
2576 "row 2": "c"
2577 }},
2578 "col 2": {{
2579 "row 1": "b",
2580 "row 2": "d"
2581 }}
2582 }}
2584 Encoding/decoding a Dataframe using ``'values'`` formatted JSON:
2586 >>> result = df.to_json(orient="values")
2587 >>> parsed = json.loads(result)
2588 >>> json.dumps(parsed, indent=4) # doctest: +SKIP
2589 [
2590 [
2591 "a",
2592 "b"
2593 ],
2594 [
2595 "c",
2596 "d"
2597 ]
2598 ]
2600 Encoding with Table Schema:
2602 >>> result = df.to_json(orient="table")
2603 >>> parsed = json.loads(result)
2604 >>> json.dumps(parsed, indent=4) # doctest: +SKIP
2605 {{
2606 "schema": {{
2607 "fields": [
2608 {{
2609 "name": "index",
2610 "type": "string"
2611 }},
2612 {{
2613 "name": "col 1",
2614 "type": "string"
2615 }},
2616 {{
2617 "name": "col 2",
2618 "type": "string"
2619 }}
2620 ],
2621 "primaryKey": [
2622 "index"
2623 ],
2624 "pandas_version": "1.4.0"
2625 }},
2626 "data": [
2627 {{
2628 "index": "row 1",
2629 "col 1": "a",
2630 "col 2": "b"
2631 }},
2632 {{
2633 "index": "row 2",
2634 "col 1": "c",
2635 "col 2": "d"
2636 }}
2637 ]
2638 }}
2639 """
2640 from pandas.io import json
2642 if date_format is None and orient == "table":
2643 date_format = "iso"
2644 elif date_format is None:
2645 date_format = "epoch"
2647 config.is_nonnegative_int(indent)
2648 indent = indent or 0
2650 return json.to_json(
2651 path_or_buf=path_or_buf,
2652 obj=self,
2653 orient=orient,
2654 date_format=date_format,
2655 double_precision=double_precision,
2656 force_ascii=force_ascii,
2657 date_unit=date_unit,
2658 default_handler=default_handler,
2659 lines=lines,
2660 compression=compression,
2661 index=index,
2662 indent=indent,
2663 storage_options=storage_options,
2664 )
2666 @final
2667 def to_hdf(
2668 self,
2669 path_or_buf: FilePath | HDFStore,
2670 key: str,
2671 mode: str = "a",
2672 complevel: int | None = None,
2673 complib: str | None = None,
2674 append: bool_t = False,
2675 format: str | None = None,
2676 index: bool_t = True,
2677 min_itemsize: int | dict[str, int] | None = None,
2678 nan_rep=None,
2679 dropna: bool_t | None = None,
2680 data_columns: Literal[True] | list[str] | None = None,
2681 errors: str = "strict",
2682 encoding: str = "UTF-8",
2683 ) -> None:
2684 """
2685 Write the contained data to an HDF5 file using HDFStore.
2687 Hierarchical Data Format (HDF) is self-describing, allowing an
2688 application to interpret the structure and contents of a file with
2689 no outside information. One HDF file can hold a mix of related objects
2690 which can be accessed as a group or as individual objects.
2692 In order to add another DataFrame or Series to an existing HDF file
2693 please use append mode and a different a key.
2695 .. warning::
2697 One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
2698 but the type of the subclass is lost upon storing.
2700 For more information see the :ref:`user guide <io.hdf5>`.
2702 Parameters
2703 ----------
2704 path_or_buf : str or pandas.HDFStore
2705 File path or HDFStore object.
2706 key : str
2707 Identifier for the group in the store.
2708 mode : {'a', 'w', 'r+'}, default 'a'
2709 Mode to open file:
2711 - 'w': write, a new file is created (an existing file with
2712 the same name would be deleted).
2713 - 'a': append, an existing file is opened for reading and
2714 writing, and if the file does not exist it is created.
2715 - 'r+': similar to 'a', but the file must already exist.
2716 complevel : {0-9}, default None
2717 Specifies a compression level for data.
2718 A value of 0 or None disables compression.
2719 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
2720 Specifies the compression library to be used.
2721 As of v0.20.2 these additional compressors for Blosc are supported
2722 (default if no compressor specified: 'blosc:blosclz'):
2723 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
2724 'blosc:zlib', 'blosc:zstd'}.
2725 Specifying a compression library which is not available issues
2726 a ValueError.
2727 append : bool, default False
2728 For Table formats, append the input data to the existing.
2729 format : {'fixed', 'table', None}, default 'fixed'
2730 Possible values:
2732 - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
2733 nor searchable.
2734 - 'table': Table format. Write as a PyTables Table structure
2735 which may perform worse but allow more flexible operations
2736 like searching / selecting subsets of the data.
2737 - If None, pd.get_option('io.hdf.default_format') is checked,
2738 followed by fallback to "fixed".
2739 index : bool, default True
2740 Write DataFrame index as a column.
2741 min_itemsize : dict or int, optional
2742 Map column names to minimum string sizes for columns.
2743 nan_rep : Any, optional
2744 How to represent null values as str.
2745 Not allowed with append=True.
2746 dropna : bool, default False, optional
2747 Remove missing values.
2748 data_columns : list of columns or True, optional
2749 List of columns to create as indexed data columns for on-disk
2750 queries, or True to use all columns. By default only the axes
2751 of the object are indexed. See
2752 :ref:`Query via data columns<io.hdf5-query-data-columns>`. for
2753 more information.
2754 Applicable only to format='table'.
2755 errors : str, default 'strict'
2756 Specifies how encoding and decoding errors are to be handled.
2757 See the errors argument for :func:`open` for a full list
2758 of options.
2759 encoding : str, default "UTF-8"
2761 See Also
2762 --------
2763 read_hdf : Read from HDF file.
2764 DataFrame.to_orc : Write a DataFrame to the binary orc format.
2765 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2766 DataFrame.to_sql : Write to a SQL table.
2767 DataFrame.to_feather : Write out feather-format for DataFrames.
2768 DataFrame.to_csv : Write out to a csv file.
2770 Examples
2771 --------
2772 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
2773 ... index=['a', 'b', 'c']) # doctest: +SKIP
2774 >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP
2776 We can add another object to the same file:
2778 >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP
2779 >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP
2781 Reading from HDF file:
2783 >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP
2784 A B
2785 a 1 4
2786 b 2 5
2787 c 3 6
2788 >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP
2789 0 1
2790 1 2
2791 2 3
2792 3 4
2793 dtype: int64
2794 """
2795 from pandas.io import pytables
2797 # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
2798 # "Union[DataFrame, Series]" [arg-type]
2799 pytables.to_hdf(
2800 path_or_buf,
2801 key,
2802 self, # type: ignore[arg-type]
2803 mode=mode,
2804 complevel=complevel,
2805 complib=complib,
2806 append=append,
2807 format=format,
2808 index=index,
2809 min_itemsize=min_itemsize,
2810 nan_rep=nan_rep,
2811 dropna=dropna,
2812 data_columns=data_columns,
2813 errors=errors,
2814 encoding=encoding,
2815 )
2817 @final
2818 def to_sql(
2819 self,
2820 name: str,
2821 con,
2822 schema: str | None = None,
2823 if_exists: str = "fail",
2824 index: bool_t = True,
2825 index_label: IndexLabel = None,
2826 chunksize: int | None = None,
2827 dtype: DtypeArg | None = None,
2828 method: str | None = None,
2829 ) -> int | None:
2830 """
2831 Write records stored in a DataFrame to a SQL database.
2833 Databases supported by SQLAlchemy [1]_ are supported. Tables can be
2834 newly created, appended to, or overwritten.
2836 Parameters
2837 ----------
2838 name : str
2839 Name of SQL table.
2840 con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
2841 Using SQLAlchemy makes it possible to use any DB supported by that
2842 library. Legacy support is provided for sqlite3.Connection objects. The user
2843 is responsible for engine disposal and connection closure for the SQLAlchemy
2844 connectable See `here \
2845 <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
2847 schema : str, optional
2848 Specify the schema (if database flavor supports this). If None, use
2849 default schema.
2850 if_exists : {'fail', 'replace', 'append'}, default 'fail'
2851 How to behave if the table already exists.
2853 * fail: Raise a ValueError.
2854 * replace: Drop the table before inserting new values.
2855 * append: Insert new values to the existing table.
2857 index : bool, default True
2858 Write DataFrame index as a column. Uses `index_label` as the column
2859 name in the table.
2860 index_label : str or sequence, default None
2861 Column label for index column(s). If None is given (default) and
2862 `index` is True, then the index names are used.
2863 A sequence should be given if the DataFrame uses MultiIndex.
2864 chunksize : int, optional
2865 Specify the number of rows in each batch to be written at a time.
2866 By default, all rows will be written at once.
2867 dtype : dict or scalar, optional
2868 Specifying the datatype for columns. If a dictionary is used, the
2869 keys should be the column names and the values should be the
2870 SQLAlchemy types or strings for the sqlite3 legacy mode. If a
2871 scalar is provided, it will be applied to all columns.
2872 method : {None, 'multi', callable}, optional
2873 Controls the SQL insertion clause used:
2875 * None : Uses standard SQL ``INSERT`` clause (one per row).
2876 * 'multi': Pass multiple values in a single ``INSERT`` clause.
2877 * callable with signature ``(pd_table, conn, keys, data_iter)``.
2879 Details and a sample callable implementation can be found in the
2880 section :ref:`insert method <io.sql.method>`.
2882 Returns
2883 -------
2884 None or int
2885 Number of rows affected by to_sql. None is returned if the callable
2886 passed into ``method`` does not return an integer number of rows.
2888 The number of returned rows affected is the sum of the ``rowcount``
2889 attribute of ``sqlite3.Cursor`` or SQLAlchemy connectable which may not
2890 reflect the exact number of written rows as stipulated in the
2891 `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
2892 `SQLAlchemy <https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.BaseCursorResult.rowcount>`__.
2894 .. versionadded:: 1.4.0
2896 Raises
2897 ------
2898 ValueError
2899 When the table already exists and `if_exists` is 'fail' (the
2900 default).
2902 See Also
2903 --------
2904 read_sql : Read a DataFrame from a table.
2906 Notes
2907 -----
2908 Timezone aware datetime columns will be written as
2909 ``Timestamp with timezone`` type with SQLAlchemy if supported by the
2910 database. Otherwise, the datetimes will be stored as timezone unaware
2911 timestamps local to the original timezone.
2913 References
2914 ----------
2915 .. [1] https://docs.sqlalchemy.org
2916 .. [2] https://www.python.org/dev/peps/pep-0249/
2918 Examples
2919 --------
2920 Create an in-memory SQLite database.
2922 >>> from sqlalchemy import create_engine
2923 >>> engine = create_engine('sqlite://', echo=False)
2925 Create a table from scratch with 3 rows.
2927 >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
2928 >>> df
2929 name
2930 0 User 1
2931 1 User 2
2932 2 User 3
2934 >>> df.to_sql('users', con=engine)
2935 3
2936 >>> engine.execute("SELECT * FROM users").fetchall()
2937 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
2939 An `sqlalchemy.engine.Connection` can also be passed to `con`:
2941 >>> with engine.begin() as connection:
2942 ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
2943 ... df1.to_sql('users', con=connection, if_exists='append')
2944 2
2946 This is allowed to support operations that require that the same
2947 DBAPI connection is used for the entire operation.
2949 >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
2950 >>> df2.to_sql('users', con=engine, if_exists='append')
2951 2
2952 >>> engine.execute("SELECT * FROM users").fetchall()
2953 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
2954 (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
2955 (1, 'User 7')]
2957 Overwrite the table with just ``df2``.
2959 >>> df2.to_sql('users', con=engine, if_exists='replace',
2960 ... index_label='id')
2961 2
2962 >>> engine.execute("SELECT * FROM users").fetchall()
2963 [(0, 'User 6'), (1, 'User 7')]
2965 Specify the dtype (especially useful for integers with missing values).
2966 Notice that while pandas is forced to store the data as floating point,
2967 the database supports nullable integers. When fetching the data with
2968 Python, we get back integer scalars.
2970 >>> df = pd.DataFrame({"A": [1, None, 2]})
2971 >>> df
2972 A
2973 0 1.0
2974 1 NaN
2975 2 2.0
2977 >>> from sqlalchemy.types import Integer
2978 >>> df.to_sql('integers', con=engine, index=False,
2979 ... dtype={"A": Integer()})
2980 3
2982 >>> engine.execute("SELECT * FROM integers").fetchall()
2983 [(1,), (None,), (2,)]
2984 """ # noqa:E501
2985 from pandas.io import sql
2987 return sql.to_sql(
2988 self,
2989 name,
2990 con,
2991 schema=schema,
2992 if_exists=if_exists,
2993 index=index,
2994 index_label=index_label,
2995 chunksize=chunksize,
2996 dtype=dtype,
2997 method=method,
2998 )
3000 @final
3001 @doc(
3002 storage_options=_shared_docs["storage_options"],
3003 compression_options=_shared_docs["compression_options"] % "path",
3004 )
3005 def to_pickle(
3006 self,
3007 path: FilePath | WriteBuffer[bytes],
3008 compression: CompressionOptions = "infer",
3009 protocol: int = pickle.HIGHEST_PROTOCOL,
3010 storage_options: StorageOptions = None,
3011 ) -> None:
3012 """
3013 Pickle (serialize) object to file.
3015 Parameters
3016 ----------
3017 path : str, path object, or file-like object
3018 String, path object (implementing ``os.PathLike[str]``), or file-like
3019 object implementing a binary ``write()`` function. File path where
3020 the pickled object will be stored.
3021 {compression_options}
3022 protocol : int
3023 Int which indicates which protocol should be used by the pickler,
3024 default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
3025 values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
3026 parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
3028 .. [1] https://docs.python.org/3/library/pickle.html.
3030 {storage_options}
3032 .. versionadded:: 1.2.0
3034 See Also
3035 --------
3036 read_pickle : Load pickled pandas object (or any object) from file.
3037 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
3038 DataFrame.to_sql : Write DataFrame to a SQL database.
3039 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
3041 Examples
3042 --------
3043 >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP
3044 >>> original_df # doctest: +SKIP
3045 foo bar
3046 0 0 5
3047 1 1 6
3048 2 2 7
3049 3 3 8
3050 4 4 9
3051 >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP
3053 >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
3054 >>> unpickled_df # doctest: +SKIP
3055 foo bar
3056 0 0 5
3057 1 1 6
3058 2 2 7
3059 3 3 8
3060 4 4 9
3061 """ # noqa: E501
3062 from pandas.io.pickle import to_pickle
3064 to_pickle(
3065 self,
3066 path,
3067 compression=compression,
3068 protocol=protocol,
3069 storage_options=storage_options,
3070 )
3072 @final
3073 def to_clipboard(
3074 self, excel: bool_t = True, sep: str | None = None, **kwargs
3075 ) -> None:
3076 r"""
3077 Copy object to the system clipboard.
3079 Write a text representation of object to the system clipboard.
3080 This can be pasted into Excel, for example.
3082 Parameters
3083 ----------
3084 excel : bool, default True
3085 Produce output in a csv format for easy pasting into excel.
3087 - True, use the provided separator for csv pasting.
3088 - False, write a string representation of the object to the clipboard.
3090 sep : str, default ``'\t'``
3091 Field delimiter.
3092 **kwargs
3093 These parameters will be passed to DataFrame.to_csv.
3095 See Also
3096 --------
3097 DataFrame.to_csv : Write a DataFrame to a comma-separated values
3098 (csv) file.
3099 read_clipboard : Read text from clipboard and pass to read_csv.
3101 Notes
3102 -----
3103 Requirements for your platform.
3105 - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
3106 - Windows : none
3107 - macOS : none
3109 This method uses the processes developed for the package `pyperclip`. A
3110 solution to render any output string format is given in the examples.
3112 Examples
3113 --------
3114 Copy the contents of a DataFrame to the clipboard.
3116 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
3118 >>> df.to_clipboard(sep=',') # doctest: +SKIP
3119 ... # Wrote the following to the system clipboard:
3120 ... # ,A,B,C
3121 ... # 0,1,2,3
3122 ... # 1,4,5,6
3124 We can omit the index by passing the keyword `index` and setting
3125 it to false.
3127 >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP
3128 ... # Wrote the following to the system clipboard:
3129 ... # A,B,C
3130 ... # 1,2,3
3131 ... # 4,5,6
3133 Using the original `pyperclip` package for any string output format.
3135 .. code-block:: python
3137 import pyperclip
3138 html = df.style.to_html()
3139 pyperclip.copy(html)
3140 """
3141 from pandas.io import clipboards
3143 clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
3145 @final
3146 def to_xarray(self):
3147 """
3148 Return an xarray object from the pandas object.
3150 Returns
3151 -------
3152 xarray.DataArray or xarray.Dataset
3153 Data in the pandas structure converted to Dataset if the object is
3154 a DataFrame, or a DataArray if the object is a Series.
3156 See Also
3157 --------
3158 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
3159 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
3161 Notes
3162 -----
3163 See the `xarray docs <https://xarray.pydata.org/en/stable/>`__
3165 Examples
3166 --------
3167 >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
3168 ... ('parrot', 'bird', 24.0, 2),
3169 ... ('lion', 'mammal', 80.5, 4),
3170 ... ('monkey', 'mammal', np.nan, 4)],
3171 ... columns=['name', 'class', 'max_speed',
3172 ... 'num_legs'])
3173 >>> df
3174 name class max_speed num_legs
3175 0 falcon bird 389.0 2
3176 1 parrot bird 24.0 2
3177 2 lion mammal 80.5 4
3178 3 monkey mammal NaN 4
3180 >>> df.to_xarray()
3181 <xarray.Dataset>
3182 Dimensions: (index: 4)
3183 Coordinates:
3184 * index (index) int64 0 1 2 3
3185 Data variables:
3186 name (index) object 'falcon' 'parrot' 'lion' 'monkey'
3187 class (index) object 'bird' 'bird' 'mammal' 'mammal'
3188 max_speed (index) float64 389.0 24.0 80.5 nan
3189 num_legs (index) int64 2 2 4 4
3191 >>> df['max_speed'].to_xarray()
3192 <xarray.DataArray 'max_speed' (index: 4)>
3193 array([389. , 24. , 80.5, nan])
3194 Coordinates:
3195 * index (index) int64 0 1 2 3
3197 >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
3198 ... '2018-01-02', '2018-01-02'])
3199 >>> df_multiindex = pd.DataFrame({'date': dates,
3200 ... 'animal': ['falcon', 'parrot',
3201 ... 'falcon', 'parrot'],
3202 ... 'speed': [350, 18, 361, 15]})
3203 >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
3205 >>> df_multiindex
3206 speed
3207 date animal
3208 2018-01-01 falcon 350
3209 parrot 18
3210 2018-01-02 falcon 361
3211 parrot 15
3213 >>> df_multiindex.to_xarray()
3214 <xarray.Dataset>
3215 Dimensions: (date: 2, animal: 2)
3216 Coordinates:
3217 * date (date) datetime64[ns] 2018-01-01 2018-01-02
3218 * animal (animal) object 'falcon' 'parrot'
3219 Data variables:
3220 speed (date, animal) int64 350 18 361 15
3221 """
3222 xarray = import_optional_dependency("xarray")
3224 if self.ndim == 1:
3225 return xarray.DataArray.from_series(self)
3226 else:
3227 return xarray.Dataset.from_dataframe(self)
    # Typing overload: with ``buf=None`` (the default) the rendered LaTeX is
    # returned to the caller as a ``str``.
    @overload
    def to_latex(
        self,
        buf: None = ...,
        columns: Sequence[Hashable] | None = ...,
        col_space: ColspaceArgType | None = ...,
        header: bool_t | Sequence[str] = ...,
        index: bool_t = ...,
        na_rep: str = ...,
        formatters: FormattersType | None = ...,
        float_format: FloatFormatType | None = ...,
        sparsify: bool_t | None = ...,
        index_names: bool_t = ...,
        bold_rows: bool_t = ...,
        column_format: str | None = ...,
        longtable: bool_t | None = ...,
        escape: bool_t | None = ...,
        encoding: str | None = ...,
        decimal: str = ...,
        multicolumn: bool_t | None = ...,
        multicolumn_format: str | None = ...,
        multirow: bool_t | None = ...,
        caption: str | tuple[str, str] | None = ...,
        label: str | None = ...,
        position: str | None = ...,
    ) -> str:
        ...
    # Typing overload: with a path or writable buffer in ``buf`` the output is
    # written there and the method returns ``None``.
    @overload
    def to_latex(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[Hashable] | None = ...,
        col_space: ColspaceArgType | None = ...,
        header: bool_t | Sequence[str] = ...,
        index: bool_t = ...,
        na_rep: str = ...,
        formatters: FormattersType | None = ...,
        float_format: FloatFormatType | None = ...,
        sparsify: bool_t | None = ...,
        index_names: bool_t = ...,
        bold_rows: bool_t = ...,
        column_format: str | None = ...,
        longtable: bool_t | None = ...,
        escape: bool_t | None = ...,
        encoding: str | None = ...,
        decimal: str = ...,
        multicolumn: bool_t | None = ...,
        multicolumn_format: str | None = ...,
        multirow: bool_t | None = ...,
        caption: str | tuple[str, str] | None = ...,
        label: str | None = ...,
        position: str | None = ...,
    ) -> None:
        ...
3285 @final
3286 @doc(returns=fmt.return_docstring)
3287 def to_latex(
3288 self,
3289 buf: FilePath | WriteBuffer[str] | None = None,
3290 columns: Sequence[Hashable] | None = None,
3291 col_space: ColspaceArgType | None = None,
3292 header: bool_t | Sequence[str] = True,
3293 index: bool_t = True,
3294 na_rep: str = "NaN",
3295 formatters: FormattersType | None = None,
3296 float_format: FloatFormatType | None = None,
3297 sparsify: bool_t | None = None,
3298 index_names: bool_t = True,
3299 bold_rows: bool_t = False,
3300 column_format: str | None = None,
3301 longtable: bool_t | None = None,
3302 escape: bool_t | None = None,
3303 encoding: str | None = None,
3304 decimal: str = ".",
3305 multicolumn: bool_t | None = None,
3306 multicolumn_format: str | None = None,
3307 multirow: bool_t | None = None,
3308 caption: str | tuple[str, str] | None = None,
3309 label: str | None = None,
3310 position: str | None = None,
3311 ) -> str | None:
3312 r"""
3313 Render object to a LaTeX tabular, longtable, or nested table.
3315 Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted
3316 into a main LaTeX document or read from an external file
3317 with ``\input{{table.tex}}``.
3319 .. versionchanged:: 1.0.0
3320 Added caption and label arguments.
3322 .. versionchanged:: 1.2.0
3323 Added position argument, changed meaning of caption argument.
3325 Parameters
3326 ----------
3327 buf : str, Path or StringIO-like, optional, default None
3328 Buffer to write to. If None, the output is returned as a string.
3329 columns : list of label, optional
3330 The subset of columns to write. Writes all columns by default.
3331 col_space : int, optional
3332 The minimum width of each column.
3333 header : bool or list of str, default True
3334 Write out the column names. If a list of strings is given,
3335 it is assumed to be aliases for the column names.
3336 index : bool, default True
3337 Write row names (index).
3338 na_rep : str, default 'NaN'
3339 Missing data representation.
3340 formatters : list of functions or dict of {{str: function}}, optional
3341 Formatter functions to apply to columns' elements by position or
3342 name. The result of each function must be a unicode string.
3343 List must be of length equal to the number of columns.
3344 float_format : one-parameter function or str, optional, default None
3345 Formatter for floating point numbers. For example
3346 ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will
3347 both result in 0.1234 being formatted as 0.12.
3348 sparsify : bool, optional
3349 Set to False for a DataFrame with a hierarchical index to print
3350 every multiindex key at each row. By default, the value will be
3351 read from the config module.
3352 index_names : bool, default True
3353 Prints the names of the indexes.
3354 bold_rows : bool, default False
3355 Make the row labels bold in the output.
3356 column_format : str, optional
3357 The columns format as specified in `LaTeX table format
3358 <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
3359 columns. By default, 'l' will be used for all columns except
3360 columns of numbers, which default to 'r'.
3361 longtable : bool, optional
3362 By default, the value will be read from the pandas config
3363 module. Use a longtable environment instead of tabular. Requires
3364 adding a \usepackage{{longtable}} to your LaTeX preamble.
3365 escape : bool, optional
3366 By default, the value will be read from the pandas config
3367 module. When set to False prevents from escaping latex special
3368 characters in column names.
3369 encoding : str, optional
3370 A string representing the encoding to use in the output file,
3371 defaults to 'utf-8'.
3372 decimal : str, default '.'
3373 Character recognized as decimal separator, e.g. ',' in Europe.
3374 multicolumn : bool, default True
3375 Use \multicolumn to enhance MultiIndex columns.
3376 The default will be read from the config module.
3377 multicolumn_format : str, default 'l'
3378 The alignment for multicolumns, similar to `column_format`
3379 The default will be read from the config module.
3380 multirow : bool, default False
3381 Use \multirow to enhance MultiIndex rows. Requires adding a
3382 \usepackage{{multirow}} to your LaTeX preamble. Will print
3383 centered labels (instead of top-aligned) across the contained
3384 rows, separating groups via clines. The default will be read
3385 from the pandas config module.
3386 caption : str or tuple, optional
3387 Tuple (full_caption, short_caption),
3388 which results in ``\caption[short_caption]{{full_caption}}``;
3389 if a single string is passed, no short caption will be set.
3391 .. versionadded:: 1.0.0
3393 .. versionchanged:: 1.2.0
3394 Optionally allow caption to be a tuple ``(full_caption, short_caption)``.
3396 label : str, optional
3397 The LaTeX label to be placed inside ``\label{{}}`` in the output.
3398 This is used with ``\ref{{}}`` in the main ``.tex`` file.
3400 .. versionadded:: 1.0.0
3401 position : str, optional
3402 The LaTeX positional argument for tables, to be placed after
3403 ``\begin{{}}`` in the output.
3405 .. versionadded:: 1.2.0
3406 {returns}
3407 See Also
3408 --------
3409 io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX
3410 with conditional formatting.
3411 DataFrame.to_string : Render a DataFrame to a console-friendly
3412 tabular output.
3413 DataFrame.to_html : Render a DataFrame as an HTML table.
3415 Examples
3416 --------
3417 >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
3418 ... mask=['red', 'purple'],
3419 ... weapon=['sai', 'bo staff']))
3420 >>> print(df.to_latex(index=False)) # doctest: +SKIP
3421 \begin{{tabular}}{{lll}}
3422 \toprule
3423 name & mask & weapon \\
3424 \midrule
3425 Raphael & red & sai \\
3426 Donatello & purple & bo staff \\
3427 \bottomrule
3428 \end{{tabular}}
3429 """
3430 msg = (
3431 "In future versions `DataFrame.to_latex` is expected to utilise the base "
3432 "implementation of `Styler.to_latex` for formatting and rendering. "
3433 "The arguments signature may therefore change. It is recommended instead "
3434 "to use `DataFrame.style.to_latex` which also contains additional "
3435 "functionality."
3436 )
3437 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
3439 # Get defaults from the pandas config
3440 if self.ndim == 1:
3441 self = self.to_frame()
3442 if longtable is None:
3443 longtable = config.get_option("display.latex.longtable")
3444 if escape is None:
3445 escape = config.get_option("display.latex.escape")
3446 if multicolumn is None:
3447 multicolumn = config.get_option("display.latex.multicolumn")
3448 if multicolumn_format is None:
3449 multicolumn_format = config.get_option("display.latex.multicolumn_format")
3450 if multirow is None:
3451 multirow = config.get_option("display.latex.multirow")
3453 self = cast("DataFrame", self)
3454 formatter = DataFrameFormatter(
3455 self,
3456 columns=columns,
3457 col_space=col_space,
3458 na_rep=na_rep,
3459 header=header,
3460 index=index,
3461 formatters=formatters,
3462 float_format=float_format,
3463 bold_rows=bold_rows,
3464 sparsify=sparsify,
3465 index_names=index_names,
3466 escape=escape,
3467 decimal=decimal,
3468 )
3469 return DataFrameRenderer(formatter).to_latex(
3470 buf=buf,
3471 column_format=column_format,
3472 longtable=longtable,
3473 encoding=encoding,
3474 multicolumn=multicolumn,
3475 multicolumn_format=multicolumn_format,
3476 multirow=multirow,
3477 caption=caption,
3478 label=label,
3479 position=position,
3480 )
    @overload
    def to_csv(
        self,
        path_or_buf: None = ...,
        sep: str = ...,
        na_rep: str = ...,
        float_format: str | Callable | None = ...,
        columns: Sequence[Hashable] | None = ...,
        header: bool_t | list[str] = ...,
        index: bool_t = ...,
        index_label: IndexLabel | None = ...,
        mode: str = ...,
        encoding: str | None = ...,
        compression: CompressionOptions = ...,
        quoting: int | None = ...,
        quotechar: str = ...,
        lineterminator: str | None = ...,
        chunksize: int | None = ...,
        date_format: str | None = ...,
        doublequote: bool_t = ...,
        escapechar: str | None = ...,
        decimal: str = ...,
        errors: str = ...,
        storage_options: StorageOptions = ...,
    ) -> str:
        # Overload: when ``path_or_buf`` is None (the default), the rendered
        # CSV is returned to the caller as a string.
        ...
    @overload
    def to_csv(
        self,
        path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
        sep: str = ...,
        na_rep: str = ...,
        float_format: str | Callable | None = ...,
        columns: Sequence[Hashable] | None = ...,
        header: bool_t | list[str] = ...,
        index: bool_t = ...,
        index_label: IndexLabel | None = ...,
        mode: str = ...,
        encoding: str | None = ...,
        compression: CompressionOptions = ...,
        quoting: int | None = ...,
        quotechar: str = ...,
        lineterminator: str | None = ...,
        chunksize: int | None = ...,
        date_format: str | None = ...,
        doublequote: bool_t = ...,
        escapechar: str | None = ...,
        decimal: str = ...,
        errors: str = ...,
        storage_options: StorageOptions = ...,
    ) -> None:
        # Overload: when a path or write buffer is supplied, output is
        # written there and the method returns None.
        ...
    @final
    @doc(
        storage_options=_shared_docs["storage_options"],
        compression_options=_shared_docs["compression_options"] % "path_or_buf",
    )
    @deprecate_kwarg(old_arg_name="line_terminator", new_arg_name="lineterminator")
    def to_csv(
        self,
        path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
        sep: str = ",",
        na_rep: str = "",
        float_format: str | Callable | None = None,
        columns: Sequence[Hashable] | None = None,
        header: bool_t | list[str] = True,
        index: bool_t = True,
        index_label: IndexLabel | None = None,
        mode: str = "w",
        encoding: str | None = None,
        compression: CompressionOptions = "infer",
        quoting: int | None = None,
        quotechar: str = '"',
        lineterminator: str | None = None,
        chunksize: int | None = None,
        date_format: str | None = None,
        doublequote: bool_t = True,
        escapechar: str | None = None,
        decimal: str = ".",
        errors: str = "strict",
        storage_options: StorageOptions = None,
    ) -> str | None:
        r"""
        Write object to a comma-separated values (csv) file.

        Parameters
        ----------
        path_or_buf : str, path object, file-like object, or None, default None
            String, path object (implementing os.PathLike[str]), or file-like
            object implementing a write() function. If None, the result is
            returned as a string. If a non-binary file object is passed, it should
            be opened with `newline=''`, disabling universal newlines. If a binary
            file object is passed, `mode` might need to contain a `'b'`.

            .. versionchanged:: 1.2.0

               Support for binary file objects was introduced.

        sep : str, default ','
            String of length 1. Field delimiter for the output file.
        na_rep : str, default ''
            Missing data representation.
        float_format : str, Callable, default None
            Format string for floating point numbers. If a Callable is given, it takes
            precedence over other numeric formatting parameters, like decimal.
        columns : sequence, optional
            Columns to write.
        header : bool or list of str, default True
            Write out the column names. If a list of strings is given it is
            assumed to be aliases for the column names.
        index : bool, default True
            Write row names (index).
        index_label : str or sequence, or False, default None
            Column label for index column(s) if desired. If None is given, and
            `header` and `index` are True, then the index names are used. A
            sequence should be given if the object uses MultiIndex. If
            False do not print fields for index names. Use index_label=False
            for easier importing in R.
        mode : str, default 'w'
            Python write mode. The available write modes are the same as
            :py:func:`open`.
        encoding : str, optional
            A string representing the encoding to use in the output file,
            defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
            is a non-binary file object.
        {compression_options}

            .. versionchanged:: 1.0.0

               May now be a dict with key 'method' as compression mode
               and other entries as additional compression options if
               compression mode is 'zip'.

            .. versionchanged:: 1.1.0

               Passing compression options as keys in dict is
               supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.

            .. versionchanged:: 1.2.0

                Compression is supported for binary file objects.

            .. versionchanged:: 1.2.0

                Previous versions forwarded dict entries for 'gzip' to
                `gzip.open` instead of `gzip.GzipFile` which prevented
                setting `mtime`.

        quoting : optional constant from csv module
            Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
            then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
            will treat them as non-numeric.
        quotechar : str, default '\"'
            String of length 1. Character used to quote fields.
        lineterminator : str, optional
            The newline character or character sequence to use in the output
            file. Defaults to `os.linesep`, which depends on the OS in which
            this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).

            .. versionchanged:: 1.5.0

                Previously was line_terminator, changed for consistency with
                read_csv and the standard library 'csv' module.

        chunksize : int or None
            Rows to write at a time.
        date_format : str, default None
            Format string for datetime objects.
        doublequote : bool, default True
            Control quoting of `quotechar` inside a field.
        escapechar : str, default None
            String of length 1. Character used to escape `sep` and `quotechar`
            when appropriate.
        decimal : str, default '.'
            Character recognized as decimal separator. E.g. use ',' for
            European data.
        errors : str, default 'strict'
            Specifies how encoding and decoding errors are to be handled.
            See the errors argument for :func:`open` for a full list
            of options.

            .. versionadded:: 1.1.0

        {storage_options}

            .. versionadded:: 1.2.0

        Returns
        -------
        None or str
            If path_or_buf is None, returns the resulting csv format as a
            string. Otherwise returns None.

        See Also
        --------
        read_csv : Load a CSV file into a DataFrame.
        to_excel : Write DataFrame to an Excel file.

        Examples
        --------
        >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
        ...                    'mask': ['red', 'purple'],
        ...                    'weapon': ['sai', 'bo staff']}})
        >>> df.to_csv(index=False)
        'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'

        Create 'out.zip' containing 'out.csv'

        >>> compression_opts = dict(method='zip',
        ...                         archive_name='out.csv')  # doctest: +SKIP
        >>> df.to_csv('out.zip', index=False,
        ...           compression=compression_opts)  # doctest: +SKIP

        To write a csv file to a new folder or nested folder you will first
        need to create it using either Pathlib or os:

        >>> from pathlib import Path  # doctest: +SKIP
        >>> filepath = Path('folder/subfolder/out.csv')  # doctest: +SKIP
        >>> filepath.parent.mkdir(parents=True, exist_ok=True)  # doctest: +SKIP
        >>> df.to_csv(filepath)  # doctest: +SKIP

        >>> import os  # doctest: +SKIP
        >>> os.makedirs('folder/subfolder', exist_ok=True)  # doctest: +SKIP
        >>> df.to_csv('folder/subfolder/out.csv')  # doctest: +SKIP
        """
        # A 1-D caller (Series) is rendered through a one-column DataFrame
        # so a single formatting path handles both cases.
        df = self if isinstance(self, ABCDataFrame) else self.to_frame()

        # The formatter carries the value-rendering options; all file/IO
        # options are passed to the renderer call below.
        formatter = DataFrameFormatter(
            frame=df,
            header=header,
            index=index,
            na_rep=na_rep,
            float_format=float_format,
            decimal=decimal,
        )

        return DataFrameRenderer(formatter).to_csv(
            path_or_buf,
            lineterminator=lineterminator,
            sep=sep,
            encoding=encoding,
            errors=errors,
            compression=compression,
            quoting=quoting,
            columns=columns,
            index_label=index_label,
            mode=mode,
            chunksize=chunksize,
            quotechar=quotechar,
            date_format=date_format,
            doublequote=doublequote,
            escapechar=escapechar,
            storage_options=storage_options,
        )
3739 # ----------------------------------------------------------------------
3740 # Lookup Caching
3742 def _reset_cacher(self) -> None:
3743 """
3744 Reset the cacher.
3745 """
3746 raise AbstractMethodError(self)
3748 def _maybe_update_cacher(
3749 self,
3750 clear: bool_t = False,
3751 verify_is_copy: bool_t = True,
3752 inplace: bool_t = False,
3753 ) -> None:
3754 """
3755 See if we need to update our parent cacher if clear, then clear our
3756 cache.
3758 Parameters
3759 ----------
3760 clear : bool, default False
3761 Clear the item cache.
3762 verify_is_copy : bool, default True
3763 Provide is_copy checks.
3764 """
3766 if verify_is_copy:
3767 self._check_setitem_copy(t="referent")
3769 if clear:
3770 self._clear_item_cache()
3772 def _clear_item_cache(self) -> None:
3773 raise AbstractMethodError(self)
3775 # ----------------------------------------------------------------------
3776 # Indexing Methods
3778 def take(
3779 self: NDFrameT, indices, axis=0, is_copy: bool_t | None = None, **kwargs
3780 ) -> NDFrameT:
3781 """
3782 Return the elements in the given *positional* indices along an axis.
3784 This means that we are not indexing according to actual values in
3785 the index attribute of the object. We are indexing according to the
3786 actual position of the element in the object.
3788 Parameters
3789 ----------
3790 indices : array-like
3791 An array of ints indicating which positions to take.
3792 axis : {0 or 'index', 1 or 'columns', None}, default 0
3793 The axis on which to select elements. ``0`` means that we are
3794 selecting rows, ``1`` means that we are selecting columns.
3795 For `Series` this parameter is unused and defaults to 0.
3796 is_copy : bool
3797 Before pandas 1.0, ``is_copy=False`` can be specified to ensure
3798 that the return value is an actual copy. Starting with pandas 1.0,
3799 ``take`` always returns a copy, and the keyword is therefore
3800 deprecated.
3802 .. deprecated:: 1.0.0
3803 **kwargs
3804 For compatibility with :meth:`numpy.take`. Has no effect on the
3805 output.
3807 Returns
3808 -------
3809 taken : same type as caller
3810 An array-like containing the elements taken from the object.
3812 See Also
3813 --------
3814 DataFrame.loc : Select a subset of a DataFrame by labels.
3815 DataFrame.iloc : Select a subset of a DataFrame by positions.
3816 numpy.take : Take elements from an array along an axis.
3818 Examples
3819 --------
3820 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
3821 ... ('parrot', 'bird', 24.0),
3822 ... ('lion', 'mammal', 80.5),
3823 ... ('monkey', 'mammal', np.nan)],
3824 ... columns=['name', 'class', 'max_speed'],
3825 ... index=[0, 2, 3, 1])
3826 >>> df
3827 name class max_speed
3828 0 falcon bird 389.0
3829 2 parrot bird 24.0
3830 3 lion mammal 80.5
3831 1 monkey mammal NaN
3833 Take elements at positions 0 and 3 along the axis 0 (default).
3835 Note how the actual indices selected (0 and 1) do not correspond to
3836 our selected indices 0 and 3. That's because we are selecting the 0th
3837 and 3rd rows, not rows whose indices equal 0 and 3.
3839 >>> df.take([0, 3])
3840 name class max_speed
3841 0 falcon bird 389.0
3842 1 monkey mammal NaN
3844 Take elements at indices 1 and 2 along the axis 1 (column selection).
3846 >>> df.take([1, 2], axis=1)
3847 class max_speed
3848 0 bird 389.0
3849 2 bird 24.0
3850 3 mammal 80.5
3851 1 mammal NaN
3853 We may take elements using negative integers for positive indices,
3854 starting from the end of the object, just like with Python lists.
3856 >>> df.take([-1, -2])
3857 name class max_speed
3858 1 monkey mammal NaN
3859 3 lion mammal 80.5
3860 """
3861 if is_copy is not None:
3862 warnings.warn(
3863 "is_copy is deprecated and will be removed in a future version. "
3864 "'take' always returns a copy, so there is no need to specify this.",
3865 FutureWarning,
3866 stacklevel=find_stack_level(),
3867 )
3869 nv.validate_take((), kwargs)
3871 return self._take(indices, axis)
3873 def _take(
3874 self: NDFrameT,
3875 indices,
3876 axis=0,
3877 convert_indices: bool_t = True,
3878 ) -> NDFrameT:
3879 """
3880 Internal version of the `take` allowing specification of additional args.
3882 See the docstring of `take` for full explanation of the parameters.
3883 """
3884 self._consolidate_inplace()
3886 new_data = self._mgr.take(
3887 indices,
3888 axis=self._get_block_manager_axis(axis),
3889 verify=True,
3890 convert_indices=convert_indices,
3891 )
3892 return self._constructor(new_data).__finalize__(self, method="take")
3894 def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT:
3895 """
3896 Internal version of the `take` method that sets the `_is_copy`
3897 attribute to keep track of the parent dataframe (using in indexing
3898 for the SettingWithCopyWarning).
3900 See the docstring of `take` for full explanation of the parameters.
3901 """
3902 result = self._take(indices=indices, axis=axis)
3903 # Maybe set copy if we didn't actually change the index.
3904 if not result._get_axis(axis).equals(self._get_axis(axis)):
3905 result._set_is_copy(self)
3906 return result
    @final
    def xs(
        self: NDFrameT,
        key: IndexLabel,
        axis: Axis = 0,
        level: IndexLabel = None,
        drop_level: bool_t = True,
    ) -> NDFrameT:
        """
        Return cross-section from the Series/DataFrame.

        This method takes a `key` argument to select data at a particular
        level of a MultiIndex.

        Parameters
        ----------
        key : label or tuple of label
            Label contained in the index, or partially in a MultiIndex.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis to retrieve cross-section on.
        level : object, defaults to first n levels (n=1 or len(key))
            In case of a key partially contained in a MultiIndex, indicate
            which levels are used. Levels can be referred by label or position.
        drop_level : bool, default True
            If False, returns object with same levels as self.

        Returns
        -------
        Series or DataFrame
            Cross-section from the original Series or DataFrame
            corresponding to the selected index levels.

        See Also
        --------
        DataFrame.loc : Access a group of rows and columns
            by label(s) or a boolean array.
        DataFrame.iloc : Purely integer-location based indexing
            for selection by position.

        Notes
        -----
        `xs` can not be used to set values.

        MultiIndex Slicers is a generic way to get/set values on
        any level or levels.
        It is a superset of `xs` functionality, see
        :ref:`MultiIndex Slicers <advanced.mi_slicers>`.

        Examples
        --------
        >>> d = {'num_legs': [4, 4, 2, 2],
        ...      'num_wings': [0, 0, 2, 2],
        ...      'class': ['mammal', 'mammal', 'mammal', 'bird'],
        ...      'animal': ['cat', 'dog', 'bat', 'penguin'],
        ...      'locomotion': ['walks', 'walks', 'flies', 'walks']}
        >>> df = pd.DataFrame(data=d)
        >>> df = df.set_index(['class', 'animal', 'locomotion'])
        >>> df
                                   num_legs  num_wings
        class  animal  locomotion
        mammal cat     walks              4          0
               dog     walks              4          0
               bat     flies              2          2
        bird   penguin walks              2          2

        Get values at specified index

        >>> df.xs('mammal')
                           num_legs  num_wings
        animal locomotion
        cat    walks              4          0
        dog    walks              4          0
        bat    flies              2          2

        Get values at several indexes

        >>> df.xs(('mammal', 'dog'))
                    num_legs  num_wings
        locomotion
        walks              4          0

        Get values at specified index and level

        >>> df.xs('cat', level=1)
                           num_legs  num_wings
        class  locomotion
        mammal walks              4          0

        Get values at several indexes and levels

        >>> df.xs(('bird', 'walks'),
        ...       level=[0, 'locomotion'])
                 num_legs  num_wings
        animal
        penguin         2          2

        Get values at specified column and axis

        >>> df.xs('num_wings', axis=1)
        class   animal   locomotion
        mammal  cat      walks         0
                dog      walks         0
                bat      flies         2
        bird    penguin  walks         2
        Name: num_wings, dtype: int64
        """
        axis = self._get_axis_number(axis)
        labels = self._get_axis(axis)

        # List keys are deprecated in favor of tuples (a list is ambiguous
        # between "several keys" and "one composite key").
        if isinstance(key, list):
            warnings.warn(
                "Passing lists as key for xs is deprecated and will be removed in a "
                "future version. Pass key as a tuple instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        if level is not None:
            # Explicit level(s): resolve through MultiIndex machinery and
            # select positionally along the requested axis.
            if not isinstance(labels, MultiIndex):
                raise TypeError("Index must be a MultiIndex")
            loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)

            # create the tuple of the indexer
            _indexer = [slice(None)] * self.ndim
            _indexer[axis] = loc
            indexer = tuple(_indexer)

            result = self.iloc[indexer]
            # Replace the selected axis with the (possibly level-dropped)
            # index returned by get_loc_level.
            setattr(result, result._get_axis_name(axis), new_ax)
            return result

        if axis == 1:
            if drop_level:
                return self[key]
            index = self.columns
        else:
            index = self.index

        self._consolidate_inplace()

        if isinstance(index, MultiIndex):
            loc, new_index = index._get_loc_level(key, level=0)
            if not drop_level:
                if lib.is_integer(loc):
                    # Slice (rather than scalar-index) so the level survives.
                    new_index = index[loc : loc + 1]
                else:
                    new_index = index[loc]
        else:
            loc = index.get_loc(key)

            if isinstance(loc, np.ndarray):
                # Non-unique index: get_loc returned a mask or an array of
                # positions; take with copy-tracking in either case.
                if loc.dtype == np.bool_:
                    (inds,) = loc.nonzero()
                    return self._take_with_is_copy(inds, axis=axis)
                else:
                    return self._take_with_is_copy(loc, axis=axis)

            if not is_scalar(loc):
                new_index = index[loc]

        if is_scalar(loc) and axis == 0:
            # In this case loc should be an integer
            if self.ndim == 1:
                # if we encounter an array-like and we only have 1 dim
                # that means that their are list/ndarrays inside the Series!
                # so just return them (GH 6394)
                return self._values[loc]

            # Single row of a DataFrame: produce a Series via fast_xs.
            new_mgr = self._mgr.fast_xs(loc)

            result = self._constructor_sliced(
                new_mgr, name=self.index[loc]
            ).__finalize__(self)
        elif is_scalar(loc):
            result = self.iloc[:, slice(loc, loc + 1)]
        elif axis == 1:
            result = self.iloc[:, loc]
        else:
            result = self.iloc[loc]
            result.index = new_index

        # this could be a view
        # but only in a single-dtyped view sliceable case
        result._set_is_copy(self, copy=not result._is_view)
        return result
4094 def __getitem__(self, item):
4095 raise AbstractMethodError(self)
4097 def _slice(self: NDFrameT, slobj: slice, axis=0) -> NDFrameT:
4098 """
4099 Construct a slice of this container.
4101 Slicing with this method is *always* positional.
4102 """
4103 assert isinstance(slobj, slice), type(slobj)
4104 axis = self._get_block_manager_axis(axis)
4105 result = self._constructor(self._mgr.get_slice(slobj, axis=axis))
4106 result = result.__finalize__(self)
4108 # this could be a view
4109 # but only in a single-dtyped view sliceable case
4110 is_copy = axis != 0 or result._is_view
4111 result._set_is_copy(self, copy=is_copy)
4112 return result
4114 @final
4115 def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
4116 if not copy:
4117 self._is_copy = None
4118 else:
4119 assert ref is not None
4120 self._is_copy = weakref.ref(ref)
4122 def _check_is_chained_assignment_possible(self) -> bool_t:
4123 """
4124 Check if we are a view, have a cacher, and are of mixed type.
4125 If so, then force a setitem_copy check.
4127 Should be called just near setting a value
4129 Will return a boolean if it we are a view and are cached, but a
4130 single-dtype meaning that the cacher should be updated following
4131 setting.
4132 """
4133 if self._is_copy:
4134 self._check_setitem_copy(t="referent")
4135 return False
    @final
    def _check_setitem_copy(self, t="setting", force=False):
        """
        Validate if we are doing a setitem on a chained copy.

        Parameters
        ----------
        t : str, the type of setting error
        force : bool, default False
           If True, then force showing an error.

        Notes
        -----
        It is technically possible to figure out that we are setting on
        a copy even WITH a multi-dtyped pandas object. In other words, some
        blocks may be views while other are not. Currently _is_view will ALWAYS
        return False for multi-blocks to avoid having to handle this case.

        df = DataFrame(np.arange(0,9), columns=['count'])
        df['group'] = 'b'

        # This technically need not raise SettingWithCopy if both are view
        # (which is not generally guaranteed but is usually True. However,
        # this is in general not a good practice and we recommend using .loc.
        df.iloc[0:5]['group'] = 'a'
        """
        # Under copy-on-write (block manager only) chained assignment can
        # never silently modify a parent, so the check is unnecessary.
        if (
            config.get_option("mode.copy_on_write")
            and config.get_option("mode.data_manager") == "block"
        ):
            return

        # return early if the check is not needed
        if not (force or self._is_copy):
            return

        value = config.get_option("mode.chained_assignment")
        if value is None:
            # checks disabled via config
            return

        # see if the copy is not actually referred; if so, then dissolve
        # the copy weakref
        if self._is_copy is not None and not isinstance(self._is_copy, str):
            r = self._is_copy()
            if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
                self._is_copy = None
                return

        # a custom message
        if isinstance(self._is_copy, str):
            # _is_copy may hold a pre-built message string instead of a weakref
            t = self._is_copy

        elif t == "referent":
            t = (
                "\n"
                "A value is trying to be set on a copy of a slice from a "
                "DataFrame\n\n"
                "See the caveats in the documentation: "
                "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
                "indexing.html#returning-a-view-versus-a-copy"
            )

        else:
            t = (
                "\n"
                "A value is trying to be set on a copy of a slice from a "
                "DataFrame.\n"
                "Try using .loc[row_indexer,col_indexer] = value "
                "instead\n\nSee the caveats in the documentation: "
                "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
                "indexing.html#returning-a-view-versus-a-copy"
            )

        # Escalate according to the configured chained-assignment mode.
        if value == "raise":
            raise SettingWithCopyError(t)
        elif value == "warn":
            warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
    def __delitem__(self, key) -> None:
        """
        Delete item
        """
        deleted = False

        maybe_shortcut = False
        if self.ndim == 2 and isinstance(self.columns, MultiIndex):
            try:
                # By using engine's __contains__ we effectively
                # restrict to same-length tuples
                maybe_shortcut = key not in self.columns._engine
            except TypeError:
                # e.g. an unhashable key; fall through to the plain path below
                pass

        if maybe_shortcut:
            # Allow shorthand to delete all columns whose first len(key)
            # elements match key:
            if not isinstance(key, tuple):
                key = (key,)
            for col in self.columns:
                if isinstance(col, tuple) and col[: len(key)] == key:
                    # recursive delete; each full tuple takes the plain path
                    del self[col]
                    deleted = True
        if not deleted:
            # If the above loop ran and didn't delete anything because
            # there was no match, this call should raise the appropriate
            # exception:
            loc = self.axes[-1].get_loc(key)
            self._mgr = self._mgr.idelete(loc)

        # delete from the caches
        try:
            del self._item_cache[key]
        except KeyError:
            pass
4252 # ----------------------------------------------------------------------
4253 # Unsorted
4255 @final
4256 def _check_inplace_and_allows_duplicate_labels(self, inplace):
4257 if inplace and not self.flags.allows_duplicate_labels:
4258 raise ValueError(
4259 "Cannot specify 'inplace=True' when "
4260 "'self.flags.allows_duplicate_labels' is False."
4261 )
4263 @final
4264 def get(self, key, default=None):
4265 """
4266 Get item from object for given key (ex: DataFrame column).
4268 Returns default value if not found.
4270 Parameters
4271 ----------
4272 key : object
4274 Returns
4275 -------
4276 value : same type as items contained in object
4278 Examples
4279 --------
4280 >>> df = pd.DataFrame(
4281 ... [
4282 ... [24.3, 75.7, "high"],
4283 ... [31, 87.8, "high"],
4284 ... [22, 71.6, "medium"],
4285 ... [35, 95, "medium"],
4286 ... ],
4287 ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
4288 ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
4289 ... )
4291 >>> df
4292 temp_celsius temp_fahrenheit windspeed
4293 2014-02-12 24.3 75.7 high
4294 2014-02-13 31.0 87.8 high
4295 2014-02-14 22.0 71.6 medium
4296 2014-02-15 35.0 95.0 medium
4298 >>> df.get(["temp_celsius", "windspeed"])
4299 temp_celsius windspeed
4300 2014-02-12 24.3 high
4301 2014-02-13 31.0 high
4302 2014-02-14 22.0 medium
4303 2014-02-15 35.0 medium
4305 If the key isn't found, the default value will be used.
4307 >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
4308 'default_value'
4309 """
4310 try:
4311 return self[key]
4312 except (KeyError, ValueError, IndexError):
4313 return default
    @final
    @property
    def _is_view(self) -> bool_t:
        """Return boolean indicating if self is view of another array"""
        # Delegates to the manager, which knows whether its blocks share
        # memory with another object.
        return self._mgr.is_view
4321 @final
4322 def reindex_like(
4323 self: NDFrameT,
4324 other,
4325 method: str | None = None,
4326 copy: bool_t = True,
4327 limit=None,
4328 tolerance=None,
4329 ) -> NDFrameT:
4330 """
4331 Return an object with matching indices as other object.
4333 Conform the object to the same index on all axes. Optional
4334 filling logic, placing NaN in locations having no value
4335 in the previous index. A new object is produced unless the
4336 new index is equivalent to the current one and copy=False.
4338 Parameters
4339 ----------
4340 other : Object of the same data type
4341 Its row and column indices are used to define the new indices
4342 of this object.
4343 method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
4344 Method to use for filling holes in reindexed DataFrame.
4345 Please note: this is only applicable to DataFrames/Series with a
4346 monotonically increasing/decreasing index.
4348 * None (default): don't fill gaps
4349 * pad / ffill: propagate last valid observation forward to next
4350 valid
4351 * backfill / bfill: use next valid observation to fill gap
4352 * nearest: use nearest valid observations to fill gap.
4354 copy : bool, default True
4355 Return a new object, even if the passed indexes are the same.
4356 limit : int, default None
4357 Maximum number of consecutive labels to fill for inexact matches.
4358 tolerance : optional
4359 Maximum distance between original and new labels for inexact
4360 matches. The values of the index at the matching locations must
4361 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
4363 Tolerance may be a scalar value, which applies the same tolerance
4364 to all values, or list-like, which applies variable tolerance per
4365 element. List-like includes list, tuple, array, Series, and must be
4366 the same size as the index and its dtype must exactly match the
4367 index's type.
4369 Returns
4370 -------
4371 Series or DataFrame
4372 Same type as caller, but with changed indices on each axis.
4374 See Also
4375 --------
4376 DataFrame.set_index : Set row labels.
4377 DataFrame.reset_index : Remove row labels or move them to new columns.
4378 DataFrame.reindex : Change to new indices or expand indices.
4380 Notes
4381 -----
4382 Same as calling
4383 ``.reindex(index=other.index, columns=other.columns,...)``.
4385 Examples
4386 --------
4387 >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
4388 ... [31, 87.8, 'high'],
4389 ... [22, 71.6, 'medium'],
4390 ... [35, 95, 'medium']],
4391 ... columns=['temp_celsius', 'temp_fahrenheit',
4392 ... 'windspeed'],
4393 ... index=pd.date_range(start='2014-02-12',
4394 ... end='2014-02-15', freq='D'))
4396 >>> df1
4397 temp_celsius temp_fahrenheit windspeed
4398 2014-02-12 24.3 75.7 high
4399 2014-02-13 31.0 87.8 high
4400 2014-02-14 22.0 71.6 medium
4401 2014-02-15 35.0 95.0 medium
4403 >>> df2 = pd.DataFrame([[28, 'low'],
4404 ... [30, 'low'],
4405 ... [35.1, 'medium']],
4406 ... columns=['temp_celsius', 'windspeed'],
4407 ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
4408 ... '2014-02-15']))
4410 >>> df2
4411 temp_celsius windspeed
4412 2014-02-12 28.0 low
4413 2014-02-13 30.0 low
4414 2014-02-15 35.1 medium
4416 >>> df2.reindex_like(df1)
4417 temp_celsius temp_fahrenheit windspeed
4418 2014-02-12 28.0 NaN low
4419 2014-02-13 30.0 NaN low
4420 2014-02-14 NaN NaN NaN
4421 2014-02-15 35.1 NaN medium
4422 """
4423 d = other._construct_axes_dict(
4424 axes=self._AXIS_ORDERS,
4425 method=method,
4426 copy=copy,
4427 limit=limit,
4428 tolerance=tolerance,
4429 )
4431 return self.reindex(**d)
4433 @overload
4434 def drop(
4435 self,
4436 labels: IndexLabel = ...,
4437 *,
4438 axis: Axis = ...,
4439 index: IndexLabel = ...,
4440 columns: IndexLabel = ...,
4441 level: Level | None = ...,
4442 inplace: Literal[True],
4443 errors: IgnoreRaise = ...,
4444 ) -> None:
4445 ...
4447 @overload
4448 def drop(
4449 self: NDFrameT,
4450 labels: IndexLabel = ...,
4451 *,
4452 axis: Axis = ...,
4453 index: IndexLabel = ...,
4454 columns: IndexLabel = ...,
4455 level: Level | None = ...,
4456 inplace: Literal[False] = ...,
4457 errors: IgnoreRaise = ...,
4458 ) -> NDFrameT:
4459 ...
4461 @overload
4462 def drop(
4463 self: NDFrameT,
4464 labels: IndexLabel = ...,
4465 *,
4466 axis: Axis = ...,
4467 index: IndexLabel = ...,
4468 columns: IndexLabel = ...,
4469 level: Level | None = ...,
4470 inplace: bool_t = ...,
4471 errors: IgnoreRaise = ...,
4472 ) -> NDFrameT | None:
4473 ...
4475 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
4476 def drop(
4477 self: NDFrameT,
4478 labels: IndexLabel = None,
4479 axis: Axis = 0,
4480 index: IndexLabel = None,
4481 columns: IndexLabel = None,
4482 level: Level | None = None,
4483 inplace: bool_t = False,
4484 errors: IgnoreRaise = "raise",
4485 ) -> NDFrameT | None:
4487 inplace = validate_bool_kwarg(inplace, "inplace")
4489 if labels is not None:
4490 if index is not None or columns is not None:
4491 raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
4492 axis_name = self._get_axis_name(axis)
4493 axes = {axis_name: labels}
4494 elif index is not None or columns is not None:
4495 axes, _ = self._construct_axes_from_arguments((index, columns), {})
4496 else:
4497 raise ValueError(
4498 "Need to specify at least one of 'labels', 'index' or 'columns'"
4499 )
4501 obj = self
4503 for axis, labels in axes.items():
4504 if labels is not None:
4505 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4507 if inplace:
4508 self._update_inplace(obj)
4509 else:
4510 return obj
    @final
    def _drop_axis(
        self: NDFrameT,
        labels,
        axis,
        level=None,
        errors: IgnoreRaise = "raise",
        only_slice: bool_t = False,
    ) -> NDFrameT:
        """
        Drop labels from specified axis. Used in the ``drop`` method
        internally.

        Parameters
        ----------
        labels : single label or list-like
        axis : int or axis name
        level : int or level name, default None
            For MultiIndex
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and existing labels are dropped.
        only_slice : bool, default False
            Whether indexing along columns should be view-only.

        Returns
        -------
        Same type as self, with ``labels`` removed from ``axis``.
        """
        axis_num = self._get_axis_number(axis)
        axis = self._get_axis(axis)

        if axis.is_unique:
            # Fast path: unique labels can be dropped directly via Index.drop,
            # then located with a plain get_indexer.
            if level is not None:
                if not isinstance(axis, MultiIndex):
                    raise AssertionError("axis must be a MultiIndex")
                new_axis = axis.drop(labels, level=level, errors=errors)
            else:
                new_axis = axis.drop(labels, errors=errors)
            indexer = axis.get_indexer(new_axis)

        # Case for non-unique axis
        else:
            # Labels can occur multiple times, so build a boolean keep-mask
            # over positions instead of dropping by label.
            is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
            labels = ensure_object(com.index_labels_to_array(labels))
            if level is not None:
                if not isinstance(axis, MultiIndex):
                    raise AssertionError("axis must be a MultiIndex")
                mask = ~axis.get_level_values(level).isin(labels)

                # GH 18561 MultiIndex.drop should raise if label is absent
                # (mask.all() means nothing matched, i.e. nothing would drop).
                if errors == "raise" and mask.all():
                    raise KeyError(f"{labels} not found in axis")
            elif (
                isinstance(axis, MultiIndex)
                and labels.dtype == "object"
                and not is_tuple_labels
            ):
                # Set level to zero in case of MultiIndex and label is string,
                # because isin can't handle strings for MultiIndexes GH#36293
                # In case of tuples we get dtype object but have to use isin GH#42771
                mask = ~axis.get_level_values(0).isin(labels)
            else:
                mask = ~axis.isin(labels)
                # Check if label doesn't exist along axis
                labels_missing = (axis.get_indexer_for(labels) == -1).any()
                if errors == "raise" and labels_missing:
                    raise KeyError(f"{labels} not found in axis")

            if is_extension_array_dtype(mask.dtype):
                # GH#45860: masked/EA boolean masks must become plain ndarray
                # before nonzero().
                mask = mask.to_numpy(dtype=bool)

            indexer = mask.nonzero()[0]
            new_axis = axis.take(indexer)

        # Block-manager axes are ordered opposite to user-facing axes.
        bm_axis = self.ndim - axis_num - 1
        new_mgr = self._mgr.reindex_indexer(
            new_axis,
            indexer,
            axis=bm_axis,
            allow_dups=True,
            only_slice=only_slice,
        )
        result = self._constructor(new_mgr)
        if self.ndim == 1:
            # Series: the constructor from a manager does not carry the name.
            result.name = self.name

        return result.__finalize__(self)
    @final
    def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None:
        """
        Replace self internals with result.

        Parameters
        ----------
        result : same type as self
            Object whose block manager replaces ``self._mgr``.
        verify_is_copy : bool, default True
            Provide is_copy checks.
        """
        # NOTE: This does *not* call __finalize__ and that's an explicit
        # decision that we may revisit in the future.
        # Invalidate cached properties and cached items before swapping in
        # the new manager, so stale references cannot leak.
        self._reset_cache()
        self._clear_item_cache()
        self._mgr = result._mgr
        # Notify any parent object that caches this one about the mutation.
        self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True)
4616 @final
4617 def add_prefix(self: NDFrameT, prefix: str) -> NDFrameT:
4618 """
4619 Prefix labels with string `prefix`.
4621 For Series, the row labels are prefixed.
4622 For DataFrame, the column labels are prefixed.
4624 Parameters
4625 ----------
4626 prefix : str
4627 The string to add before each label.
4629 Returns
4630 -------
4631 Series or DataFrame
4632 New Series or DataFrame with updated labels.
4634 See Also
4635 --------
4636 Series.add_suffix: Suffix row labels with string `suffix`.
4637 DataFrame.add_suffix: Suffix column labels with string `suffix`.
4639 Examples
4640 --------
4641 >>> s = pd.Series([1, 2, 3, 4])
4642 >>> s
4643 0 1
4644 1 2
4645 2 3
4646 3 4
4647 dtype: int64
4649 >>> s.add_prefix('item_')
4650 item_0 1
4651 item_1 2
4652 item_2 3
4653 item_3 4
4654 dtype: int64
4656 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4657 >>> df
4658 A B
4659 0 1 3
4660 1 2 4
4661 2 3 5
4662 3 4 6
4664 >>> df.add_prefix('col_')
4665 col_A col_B
4666 0 1 3
4667 1 2 4
4668 2 3 5
4669 3 4 6
4670 """
4671 f = functools.partial("{prefix}{}".format, prefix=prefix)
4673 mapper = {self._info_axis_name: f}
4674 # error: Incompatible return value type (got "Optional[NDFrameT]",
4675 # expected "NDFrameT")
4676 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
4677 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
4678 return self._rename(**mapper) # type: ignore[return-value, arg-type]
4680 @final
4681 def add_suffix(self: NDFrameT, suffix: str) -> NDFrameT:
4682 """
4683 Suffix labels with string `suffix`.
4685 For Series, the row labels are suffixed.
4686 For DataFrame, the column labels are suffixed.
4688 Parameters
4689 ----------
4690 suffix : str
4691 The string to add after each label.
4693 Returns
4694 -------
4695 Series or DataFrame
4696 New Series or DataFrame with updated labels.
4698 See Also
4699 --------
4700 Series.add_prefix: Prefix row labels with string `prefix`.
4701 DataFrame.add_prefix: Prefix column labels with string `prefix`.
4703 Examples
4704 --------
4705 >>> s = pd.Series([1, 2, 3, 4])
4706 >>> s
4707 0 1
4708 1 2
4709 2 3
4710 3 4
4711 dtype: int64
4713 >>> s.add_suffix('_item')
4714 0_item 1
4715 1_item 2
4716 2_item 3
4717 3_item 4
4718 dtype: int64
4720 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4721 >>> df
4722 A B
4723 0 1 3
4724 1 2 4
4725 2 3 5
4726 3 4 6
4728 >>> df.add_suffix('_col')
4729 A_col B_col
4730 0 1 3
4731 1 2 4
4732 2 3 5
4733 3 4 6
4734 """
4735 f = functools.partial("{}{suffix}".format, suffix=suffix)
4737 mapper = {self._info_axis_name: f}
4738 # error: Incompatible return value type (got "Optional[NDFrameT]",
4739 # expected "NDFrameT")
4740 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
4741 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
4742 return self._rename(**mapper) # type: ignore[return-value, arg-type]
    # Overloads: inplace=True returns None, inplace=False returns the same
    # NDFrame subtype; the third covers a runtime-determined bool.
    @overload
    def sort_values(
        self: NDFrameT,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: Literal[False] = ...,
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool_t = ...,
        key: ValueKeyFunc = ...,
    ) -> NDFrameT:
        ...

    @overload
    def sort_values(
        self,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: Literal[True],
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool_t = ...,
        key: ValueKeyFunc = ...,
    ) -> None:
        ...

    @overload
    def sort_values(
        self: NDFrameT,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: bool_t = ...,
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool_t = ...,
        key: ValueKeyFunc = ...,
    ) -> NDFrameT | None:
        ...

    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
    def sort_values(
        self: NDFrameT,
        axis: Axis = 0,
        ascending=True,
        inplace: bool_t = False,
        kind: str = "quicksort",
        na_position: str = "last",
        ignore_index: bool_t = False,
        key: ValueKeyFunc = None,
    ) -> NDFrameT | None:
        """
        Sort by the values along either axis.

        Parameters
        ----------%(optional_by)s
        axis : %(axes_single_arg)s, default 0
             Axis to be sorted.
        ascending : bool or list of bool, default True
             Sort ascending vs. descending. Specify list for multiple sort
             orders.  If this is a list of bools, must match the length of
             the by.
        inplace : bool, default False
             If True, perform operation in-place.
        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
             Choice of sorting algorithm. See also :func:`numpy.sort` for more
             information. `mergesort` and `stable` are the only stable algorithms. For
             DataFrames, this option is only applied when sorting on a single
             column or label.
        na_position : {'first', 'last'}, default 'last'
             Puts NaNs at the beginning if `first`; `last` puts NaNs at the
             end.
        ignore_index : bool, default False
             If True, the resulting axis will be labeled 0, 1, …, n - 1.

             .. versionadded:: 1.0.0

        key : callable, optional
            Apply the key function to the values
            before sorting. This is similar to the `key` argument in the
            builtin :meth:`sorted` function, with the notable difference that
            this `key` function should be *vectorized*. It should expect a
            ``Series`` and return a Series with the same shape as the input.
            It will be applied to each column in `by` independently.

            .. versionadded:: 1.1.0

        Returns
        -------
        DataFrame or None
            DataFrame with sorted values or None if ``inplace=True``.

        See Also
        --------
        DataFrame.sort_index : Sort a DataFrame by the index.
        Series.sort_values : Similar method for a Series.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
        ...     'col2': [2, 1, 9, 8, 7, 4],
        ...     'col3': [0, 1, 9, 4, 2, 3],
        ...     'col4': ['a', 'B', 'c', 'D', 'e', 'F']
        ... })
        >>> df
          col1  col2  col3 col4
        0    A     2     0    a
        1    A     1     1    B
        2    B     9     9    c
        3  NaN     8     4    D
        4    D     7     2    e
        5    C     4     3    F

        Sort by col1

        >>> df.sort_values(by=['col1'])
          col1  col2  col3 col4
        0    A     2     0    a
        1    A     1     1    B
        2    B     9     9    c
        5    C     4     3    F
        4    D     7     2    e
        3  NaN     8     4    D

        Sort by multiple columns

        >>> df.sort_values(by=['col1', 'col2'])
          col1  col2  col3 col4
        1    A     1     1    B
        0    A     2     0    a
        2    B     9     9    c
        5    C     4     3    F
        4    D     7     2    e
        3  NaN     8     4    D

        Sort Descending

        >>> df.sort_values(by='col1', ascending=False)
          col1  col2  col3 col4
        4    D     7     2    e
        5    C     4     3    F
        2    B     9     9    c
        0    A     2     0    a
        1    A     1     1    B
        3  NaN     8     4    D

        Putting NAs first

        >>> df.sort_values(by='col1', ascending=False, na_position='first')
          col1  col2  col3 col4
        3  NaN     8     4    D
        4    D     7     2    e
        5    C     4     3    F
        2    B     9     9    c
        0    A     2     0    a
        1    A     1     1    B

        Sorting with a key function

        >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
           col1  col2  col3 col4
        0    A     2     0    a
        1    A     1     1    B
        2    B     9     9    c
        3  NaN     8     4    D
        4    D     7     2    e
        5    C     4     3    F

        Natural sort with the key argument,
        using the `natsort <https://github.com/SethMMorton/natsort>` package.

        >>> df = pd.DataFrame({
        ...    "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
        ...    "value": [10, 20, 30, 40, 50]
        ... })
        >>> df
            time  value
        0    0hr     10
        1  128hr     20
        2   72hr     30
        3   48hr     40
        4   96hr     50
        >>> from natsort import index_natsorted
        >>> df.sort_values(
        ...    by="time",
        ...    key=lambda x: np.argsort(index_natsorted(df["time"]))
        ... )
            time  value
        0    0hr     10
        3   48hr     40
        2   72hr     30
        4   96hr     50
        1  128hr     20
        """
        # NDFrame only hosts the shared docstring template (the %(...)s
        # placeholders are filled in by the Series/DataFrame wrappers);
        # the actual sorting is implemented in the subclasses.
        raise AbstractMethodError(self)
    # Overloads: inplace=True returns None, inplace=False returns the same
    # NDFrame subtype; the third covers a runtime-determined bool.
    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool_t | Sequence[bool_t] = ...,
        inplace: Literal[True],
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool_t = ...,
        ignore_index: bool_t = ...,
        key: IndexKeyFunc = ...,
    ) -> None:
        ...

    @overload
    def sort_index(
        self: NDFrameT,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool_t | Sequence[bool_t] = ...,
        inplace: Literal[False] = ...,
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool_t = ...,
        ignore_index: bool_t = ...,
        key: IndexKeyFunc = ...,
    ) -> NDFrameT:
        ...

    @overload
    def sort_index(
        self: NDFrameT,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool_t | Sequence[bool_t] = ...,
        inplace: bool_t = ...,
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool_t = ...,
        ignore_index: bool_t = ...,
        key: IndexKeyFunc = ...,
    ) -> NDFrameT | None:
        ...

    def sort_index(
        self: NDFrameT,
        axis: Axis = 0,
        level: IndexLabel = None,
        ascending: bool_t | Sequence[bool_t] = True,
        inplace: bool_t = False,
        kind: SortKind = "quicksort",
        na_position: NaPosition = "last",
        sort_remaining: bool_t = True,
        ignore_index: bool_t = False,
        key: IndexKeyFunc = None,
    ) -> NDFrameT | None:
        # Shared Series/DataFrame implementation; the subclasses provide the
        # public docstrings.
        inplace = validate_bool_kwarg(inplace, "inplace")
        axis = self._get_axis_number(axis)
        ascending = validate_ascending(ascending)

        target = self._get_axis(axis)

        # ``indexer`` is None when the axis is already in the requested
        # order, so no take() is needed.
        indexer = get_indexer_indexer(
            target, level, ascending, kind, na_position, sort_remaining, key
        )

        if indexer is None:
            # Already sorted: return self (inplace) or a copy.
            if inplace:
                result = self
            else:
                result = self.copy()

            if ignore_index:
                result.index = default_index(len(self))
            if inplace:
                return None
            else:
                return result

        # Translate the user-facing axis to the (reversed) block-manager axis
        # and reorder the underlying blocks.
        baxis = self._get_block_manager_axis(axis)
        new_data = self._mgr.take(indexer, axis=baxis, verify=False)

        # reconstruct axis if needed
        new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic())

        if ignore_index:
            # Replace the sorted labels with a fresh RangeIndex. Only the
            # row axis is ever relabeled (manager axis 1 for DataFrame).
            axis = 1 if isinstance(self, ABCDataFrame) else 0
            new_data.set_axis(axis, default_index(len(indexer)))

        result = self._constructor(new_data)

        if inplace:
            # _update_inplace returns None, matching the inplace contract.
            return self._update_inplace(result)
        else:
            return result.__finalize__(self, method="sort_index")
    @doc(
        klass=_shared_doc_kwargs["klass"],
        axes=_shared_doc_kwargs["axes"],
        optional_labels="",
        optional_axis="",
    )
    def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT:
        """
        Conform {klass} to new index with optional filling logic.

        Places NA/NaN in locations having no value in the previous index. A new object
        is produced unless the new index is equivalent to the current one and
        ``copy=False``.

        Parameters
        ----------
        {optional_labels}
        {axes} : array-like, optional
            New labels / index to conform to, should be specified using
            keywords. Preferably an Index object to avoid duplicating data.
        {optional_axis}
        method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
            Method to use for filling holes in reindexed DataFrame.
            Please note: this is only applicable to DataFrames/Series with a
            monotonically increasing/decreasing index.

            * None (default): don't fill gaps
            * pad / ffill: Propagate last valid observation forward to next
              valid.
            * backfill / bfill: Use next valid observation to fill gap.
            * nearest: Use nearest valid observations to fill gap.

        copy : bool, default True
            Return a new object, even if the passed indexes are the same.
        level : int or name
            Broadcast across a level, matching Index values on the
            passed MultiIndex level.
        fill_value : scalar, default np.NaN
            Value to use for missing values. Defaults to NaN, but can be any
            "compatible" value.
        limit : int, default None
            Maximum number of consecutive elements to forward or backward fill.
        tolerance : optional
            Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations must
            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

            Tolerance may be a scalar value, which applies the same tolerance
            to all values, or list-like, which applies variable tolerance per
            element. List-like includes list, tuple, array, Series, and must be
            the same size as the index and its dtype must exactly match the
            index's type.

        Returns
        -------
        {klass} with changed index.

        See Also
        --------
        DataFrame.set_index : Set row labels.
        DataFrame.reset_index : Remove row labels or move them to new columns.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        ``DataFrame.reindex`` supports two calling conventions

        * ``(index=index_labels, columns=column_labels, ...)``
        * ``(labels, axis={{'index', 'columns'}}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Create a dataframe with some fictional data.

        >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
        >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301],
        ...                   'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}},
        ...                   index=index)
        >>> df
                   http_status  response_time
        Firefox            200           0.04
        Chrome             200           0.02
        Safari             404           0.07
        IE10               404           0.08
        Konqueror          301           1.00

        Create a new index and reindex the dataframe. By default
        values in the new index that do not have corresponding
        records in the dataframe are assigned ``NaN``.

        >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
        ...              'Chrome']
        >>> df.reindex(new_index)
                       http_status  response_time
        Safari               404.0           0.07
        Iceweasel              NaN            NaN
        Comodo Dragon          NaN            NaN
        IE10                 404.0           0.08
        Chrome               200.0           0.02

        We can fill in the missing values by passing a value to
        the keyword ``fill_value``. Because the index is not monotonically
        increasing or decreasing, we cannot use arguments to the keyword
        ``method`` to fill the ``NaN`` values.

        >>> df.reindex(new_index, fill_value=0)
                       http_status  response_time
        Safari                 404           0.07
        Iceweasel                0           0.00
        Comodo Dragon            0           0.00
        IE10                   404           0.08
        Chrome                 200           0.02

        >>> df.reindex(new_index, fill_value='missing')
                      http_status response_time
        Safari                404          0.07
        Iceweasel         missing       missing
        Comodo Dragon     missing       missing
        IE10                  404          0.08
        Chrome                200          0.02

        We can also reindex the columns.

        >>> df.reindex(columns=['http_status', 'user_agent'])
                   http_status  user_agent
        Firefox            200         NaN
        Chrome             200         NaN
        Safari             404         NaN
        IE10               404         NaN
        Konqueror          301         NaN

        Or we can use "axis-style" keyword arguments

        >>> df.reindex(['http_status', 'user_agent'], axis="columns")
                   http_status  user_agent
        Firefox            200         NaN
        Chrome             200         NaN
        Safari             404         NaN
        IE10               404         NaN
        Konqueror          301         NaN

        To further illustrate the filling functionality in
        ``reindex``, we will create a dataframe with a
        monotonically increasing index (for example, a sequence
        of dates).

        >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
        >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}},
        ...                    index=date_index)
        >>> df2
                    prices
        2010-01-01   100.0
        2010-01-02   101.0
        2010-01-03     NaN
        2010-01-04   100.0
        2010-01-05    89.0
        2010-01-06    88.0

        Suppose we decide to expand the dataframe to cover a wider
        date range.

        >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
        >>> df2.reindex(date_index2)
                    prices
        2009-12-29     NaN
        2009-12-30     NaN
        2009-12-31     NaN
        2010-01-01   100.0
        2010-01-02   101.0
        2010-01-03     NaN
        2010-01-04   100.0
        2010-01-05    89.0
        2010-01-06    88.0
        2010-01-07     NaN

        The index entries that did not have a value in the original data frame
        (for example, '2009-12-29') are by default filled with ``NaN``.
        If desired, we can fill in the missing values using one of several
        options.

        For example, to back-propagate the last valid value to fill the ``NaN``
        values, pass ``bfill`` as an argument to the ``method`` keyword.

        >>> df2.reindex(date_index2, method='bfill')
                    prices
        2009-12-29   100.0
        2009-12-30   100.0
        2009-12-31   100.0
        2010-01-01   100.0
        2010-01-02   101.0
        2010-01-03     NaN
        2010-01-04   100.0
        2010-01-05    89.0
        2010-01-06    88.0
        2010-01-07     NaN

        Please note that the ``NaN`` value present in the original dataframe
        (at index value 2010-01-03) will not be filled by any of the
        value propagation schemes. This is because filling while reindexing
        does not look at dataframe values, but only compares the original and
        desired indexes. If you do want to fill in the ``NaN`` values present
        in the original dataframe, use the ``fillna()`` method.

        See the :ref:`user guide <basics.reindexing>` for more.
        """
        # TODO: Decide if we care about having different examples for different
        # kinds

        # construct the args
        axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
        method = missing.clean_reindex_fill_method(kwargs.pop("method", None))
        level = kwargs.pop("level", None)
        copy = kwargs.pop("copy", None)
        limit = kwargs.pop("limit", None)
        tolerance = kwargs.pop("tolerance", None)
        fill_value = kwargs.pop("fill_value", None)

        # Series.reindex doesn't use / need the axis kwarg
        # We pop and ignore it here, to make writing Series/Frame generic code
        # easier
        kwargs.pop("axis", None)

        # Anything left in kwargs at this point is an unknown keyword.
        if kwargs:
            raise TypeError(
                "reindex() got an unexpected keyword "
                f'argument "{list(kwargs.keys())[0]}"'
            )

        self._consolidate_inplace()

        # if all axes that are requested to reindex are equal, then only copy
        # if indicated must have index names equal here as well as values
        if all(
            self._get_axis(axis).identical(ax)
            for axis, ax in axes.items()
            if ax is not None
        ):
            return self.copy(deep=copy)

        # check if we are a multi reindex
        if self._needs_reindex_multi(axes, method, level):
            return self._reindex_multi(axes, copy, fill_value)

        # perform the reindex on the axes
        return self._reindex_axes(
            axes, level, limit, tolerance, method, fill_value, copy
        ).__finalize__(self, method="reindex")
5293 def _reindex_axes(
5294 self: NDFrameT, axes, level, limit, tolerance, method, fill_value, copy
5295 ) -> NDFrameT:
5296 """Perform the reindex for all the axes."""
5297 obj = self
5298 for a in self._AXIS_ORDERS:
5299 labels = axes[a]
5300 if labels is None:
5301 continue
5303 ax = self._get_axis(a)
5304 new_index, indexer = ax.reindex(
5305 labels, level=level, limit=limit, tolerance=tolerance, method=method
5306 )
5308 axis = self._get_axis_number(a)
5309 obj = obj._reindex_with_indexers(
5310 {axis: [new_index, indexer]},
5311 fill_value=fill_value,
5312 copy=copy,
5313 allow_dups=False,
5314 )
5315 # If we've made a copy once, no need to make another one
5316 copy = False
5318 return obj
5320 def _needs_reindex_multi(self, axes, method, level) -> bool_t:
5321 """Check if we do need a multi reindex."""
5322 return (
5323 (com.count_not_none(*axes.values()) == self._AXIS_LEN)
5324 and method is None
5325 and level is None
5326 and not self._is_mixed_type
5327 )
5329 def _reindex_multi(self, axes, copy, fill_value):
5330 raise AbstractMethodError(self)
    @final
    def _reindex_with_indexers(
        self: NDFrameT,
        reindexers,
        fill_value=None,
        copy: bool_t = False,
        allow_dups: bool_t = False,
    ) -> NDFrameT:
        """
        Apply precomputed (new_index, indexer) pairs to the block manager.

        Parameters
        ----------
        reindexers : dict of {axis number: [new_index, indexer]}
        fill_value : scalar, optional
            Value used for locations introduced by the reindex.
        copy : bool, default False
            Force a copy even when no axis actually changes.
        allow_dups : bool, default False
            allow_dups indicates an internal call here
        """
        # reindex doing multiple operations on different axes if indicated
        new_data = self._mgr
        for axis in sorted(reindexers.keys()):
            index, indexer = reindexers[axis]
            # Translate user-facing axis to the block manager's axis numbering.
            baxis = self._get_block_manager_axis(axis)

            if index is None:
                continue

            index = ensure_index(index)
            if indexer is not None:
                indexer = ensure_platform_int(indexer)

            # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
            new_data = new_data.reindex_indexer(
                index,
                indexer,
                axis=baxis,
                fill_value=fill_value,
                allow_dups=allow_dups,
                copy=copy,
            )
            # If we've made a copy once, no need to make another one
            copy = False

        # Honor copy=True even when every reindexer was a no-op.
        if copy and new_data is self._mgr:
            new_data = new_data.copy()

        return self._constructor(new_data).__finalize__(self)
5371 def filter(
5372 self: NDFrameT,
5373 items=None,
5374 like: str | None = None,
5375 regex: str | None = None,
5376 axis=None,
5377 ) -> NDFrameT:
5378 """
5379 Subset the dataframe rows or columns according to the specified index labels.
5381 Note that this routine does not filter a dataframe on its
5382 contents. The filter is applied to the labels of the index.
5384 Parameters
5385 ----------
5386 items : list-like
5387 Keep labels from axis which are in items.
5388 like : str
5389 Keep labels from axis for which "like in label == True".
5390 regex : str (regular expression)
5391 Keep labels from axis for which re.search(regex, label) == True.
5392 axis : {0 or ‘index’, 1 or ‘columns’, None}, default None
5393 The axis to filter on, expressed either as an index (int)
5394 or axis name (str). By default this is the info axis, 'columns' for
5395 DataFrame. For `Series` this parameter is unused and defaults to `None`.
5397 Returns
5398 -------
5399 same type as input object
5401 See Also
5402 --------
5403 DataFrame.loc : Access a group of rows and columns
5404 by label(s) or a boolean array.
5406 Notes
5407 -----
5408 The ``items``, ``like``, and ``regex`` parameters are
5409 enforced to be mutually exclusive.
5411 ``axis`` defaults to the info axis that is used when indexing
5412 with ``[]``.
5414 Examples
5415 --------
5416 >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
5417 ... index=['mouse', 'rabbit'],
5418 ... columns=['one', 'two', 'three'])
5419 >>> df
5420 one two three
5421 mouse 1 2 3
5422 rabbit 4 5 6
5424 >>> # select columns by name
5425 >>> df.filter(items=['one', 'three'])
5426 one three
5427 mouse 1 3
5428 rabbit 4 6
5430 >>> # select columns by regular expression
5431 >>> df.filter(regex='e$', axis=1)
5432 one three
5433 mouse 1 3
5434 rabbit 4 6
5436 >>> # select rows containing 'bbi'
5437 >>> df.filter(like='bbi', axis=0)
5438 one two three
5439 rabbit 4 5 6
5440 """
5441 nkw = com.count_not_none(items, like, regex)
5442 if nkw > 1:
5443 raise TypeError(
5444 "Keyword arguments `items`, `like`, or `regex` "
5445 "are mutually exclusive"
5446 )
5448 if axis is None:
5449 axis = self._info_axis_name
5450 labels = self._get_axis(axis)
5452 if items is not None:
5453 name = self._get_axis_name(axis)
5454 return self.reindex(**{name: [r for r in items if r in labels]})
5455 elif like:
5457 def f(x) -> bool_t:
5458 assert like is not None # needed for mypy
5459 return like in ensure_str(x)
5461 values = labels.map(f)
5462 return self.loc(axis=axis)[values]
5463 elif regex:
5465 def f(x) -> bool_t:
5466 return matcher.search(ensure_str(x)) is not None
5468 matcher = re.compile(regex)
5469 values = labels.map(f)
5470 return self.loc(axis=axis)[values]
5471 else:
5472 raise TypeError("Must pass either `items`, `like`, or `regex`")
5474 @final
5475 def head(self: NDFrameT, n: int = 5) -> NDFrameT:
5476 """
5477 Return the first `n` rows.
5479 This function returns the first `n` rows for the object based
5480 on position. It is useful for quickly testing if your object
5481 has the right type of data in it.
5483 For negative values of `n`, this function returns all rows except
5484 the last `|n|` rows, equivalent to ``df[:n]``.
5486 If n is larger than the number of rows, this function returns all rows.
5488 Parameters
5489 ----------
5490 n : int, default 5
5491 Number of rows to select.
5493 Returns
5494 -------
5495 same type as caller
5496 The first `n` rows of the caller object.
5498 See Also
5499 --------
5500 DataFrame.tail: Returns the last `n` rows.
5502 Examples
5503 --------
5504 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5505 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5506 >>> df
5507 animal
5508 0 alligator
5509 1 bee
5510 2 falcon
5511 3 lion
5512 4 monkey
5513 5 parrot
5514 6 shark
5515 7 whale
5516 8 zebra
5518 Viewing the first 5 lines
5520 >>> df.head()
5521 animal
5522 0 alligator
5523 1 bee
5524 2 falcon
5525 3 lion
5526 4 monkey
5528 Viewing the first `n` lines (three in this case)
5530 >>> df.head(3)
5531 animal
5532 0 alligator
5533 1 bee
5534 2 falcon
5536 For negative values of `n`
5538 >>> df.head(-3)
5539 animal
5540 0 alligator
5541 1 bee
5542 2 falcon
5543 3 lion
5544 4 monkey
5545 5 parrot
5546 """
5547 return self.iloc[:n]
5549 @final
5550 def tail(self: NDFrameT, n: int = 5) -> NDFrameT:
5551 """
5552 Return the last `n` rows.
5554 This function returns last `n` rows from the object based on
5555 position. It is useful for quickly verifying data, for example,
5556 after sorting or appending rows.
5558 For negative values of `n`, this function returns all rows except
5559 the first `|n|` rows, equivalent to ``df[|n|:]``.
5561 If n is larger than the number of rows, this function returns all rows.
5563 Parameters
5564 ----------
5565 n : int, default 5
5566 Number of rows to select.
5568 Returns
5569 -------
5570 type of caller
5571 The last `n` rows of the caller object.
5573 See Also
5574 --------
5575 DataFrame.head : The first `n` rows of the caller object.
5577 Examples
5578 --------
5579 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5580 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5581 >>> df
5582 animal
5583 0 alligator
5584 1 bee
5585 2 falcon
5586 3 lion
5587 4 monkey
5588 5 parrot
5589 6 shark
5590 7 whale
5591 8 zebra
5593 Viewing the last 5 lines
5595 >>> df.tail()
5596 animal
5597 4 monkey
5598 5 parrot
5599 6 shark
5600 7 whale
5601 8 zebra
5603 Viewing the last `n` lines (three in this case)
5605 >>> df.tail(3)
5606 animal
5607 6 shark
5608 7 whale
5609 8 zebra
5611 For negative values of `n`
5613 >>> df.tail(-3)
5614 animal
5615 3 lion
5616 4 monkey
5617 5 parrot
5618 6 shark
5619 7 whale
5620 8 zebra
5621 """
5622 if n == 0:
5623 return self.iloc[0:0]
5624 return self.iloc[-n:]
5626 @final
5627 def sample(
5628 self: NDFrameT,
5629 n: int | None = None,
5630 frac: float | None = None,
5631 replace: bool_t = False,
5632 weights=None,
5633 random_state: RandomState | None = None,
5634 axis: Axis | None = None,
5635 ignore_index: bool_t = False,
5636 ) -> NDFrameT:
5637 """
5638 Return a random sample of items from an axis of object.
5640 You can use `random_state` for reproducibility.
5642 Parameters
5643 ----------
5644 n : int, optional
5645 Number of items from axis to return. Cannot be used with `frac`.
5646 Default = 1 if `frac` = None.
5647 frac : float, optional
5648 Fraction of axis items to return. Cannot be used with `n`.
5649 replace : bool, default False
5650 Allow or disallow sampling of the same row more than once.
5651 weights : str or ndarray-like, optional
5652 Default 'None' results in equal probability weighting.
5653 If passed a Series, will align with target object on index. Index
5654 values in weights not found in sampled object will be ignored and
5655 index values in sampled object not in weights will be assigned
5656 weights of zero.
5657 If called on a DataFrame, will accept the name of a column
5658 when axis = 0.
5659 Unless weights are a Series, weights must be same length as axis
5660 being sampled.
5661 If weights do not sum to 1, they will be normalized to sum to 1.
5662 Missing values in the weights column will be treated as zero.
5663 Infinite values not allowed.
5664 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
5665 If int, array-like, or BitGenerator, seed for random number generator.
5666 If np.random.RandomState or np.random.Generator, use as given.
5668 .. versionchanged:: 1.1.0
5670 array-like and BitGenerator object now passed to np.random.RandomState()
5671 as seed
5673 .. versionchanged:: 1.4.0
5675 np.random.Generator objects now accepted
5677 axis : {0 or ‘index’, 1 or ‘columns’, None}, default None
5678 Axis to sample. Accepts axis number or name. Default is stat axis
5679 for given data type. For `Series` this parameter is unused and defaults to `None`.
5680 ignore_index : bool, default False
5681 If True, the resulting index will be labeled 0, 1, …, n - 1.
5683 .. versionadded:: 1.3.0
5685 Returns
5686 -------
5687 Series or DataFrame
5688 A new object of same type as caller containing `n` items randomly
5689 sampled from the caller object.
5691 See Also
5692 --------
5693 DataFrameGroupBy.sample: Generates random samples from each group of a
5694 DataFrame object.
5695 SeriesGroupBy.sample: Generates random samples from each group of a
5696 Series object.
5697 numpy.random.choice: Generates a random sample from a given 1-D numpy
5698 array.
5700 Notes
5701 -----
5702 If `frac` > 1, `replacement` should be set to `True`.
5704 Examples
5705 --------
5706 >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
5707 ... 'num_wings': [2, 0, 0, 0],
5708 ... 'num_specimen_seen': [10, 2, 1, 8]},
5709 ... index=['falcon', 'dog', 'spider', 'fish'])
5710 >>> df
5711 num_legs num_wings num_specimen_seen
5712 falcon 2 2 10
5713 dog 4 0 2
5714 spider 8 0 1
5715 fish 0 0 8
5717 Extract 3 random elements from the ``Series`` ``df['num_legs']``:
5718 Note that we use `random_state` to ensure the reproducibility of
5719 the examples.
5721 >>> df['num_legs'].sample(n=3, random_state=1)
5722 fish 0
5723 spider 8
5724 falcon 2
5725 Name: num_legs, dtype: int64
5727 A random 50% sample of the ``DataFrame`` with replacement:
5729 >>> df.sample(frac=0.5, replace=True, random_state=1)
5730 num_legs num_wings num_specimen_seen
5731 dog 4 0 2
5732 fish 0 0 8
5734 An upsample sample of the ``DataFrame`` with replacement:
5735 Note that `replace` parameter has to be `True` for `frac` parameter > 1.
5737 >>> df.sample(frac=2, replace=True, random_state=1)
5738 num_legs num_wings num_specimen_seen
5739 dog 4 0 2
5740 fish 0 0 8
5741 falcon 2 2 10
5742 falcon 2 2 10
5743 fish 0 0 8
5744 dog 4 0 2
5745 fish 0 0 8
5746 dog 4 0 2
5748 Using a DataFrame column as weights. Rows with larger value in the
5749 `num_specimen_seen` column are more likely to be sampled.
5751 >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
5752 num_legs num_wings num_specimen_seen
5753 falcon 2 2 10
5754 fish 0 0 8
5755 """ # noqa:E501
5756 if axis is None:
5757 axis = self._stat_axis_number
5759 axis = self._get_axis_number(axis)
5760 obj_len = self.shape[axis]
5762 # Process random_state argument
5763 rs = com.random_state(random_state)
5765 size = sample.process_sampling_size(n, frac, replace)
5766 if size is None:
5767 assert frac is not None
5768 size = round(frac * obj_len)
5770 if weights is not None:
5771 weights = sample.preprocess_weights(self, weights, axis)
5773 sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
5774 result = self.take(sampled_indices, axis=axis)
5776 if ignore_index:
5777 result.index = default_index(len(result))
5779 return result
    @final
    @doc(klass=_shared_doc_kwargs["klass"])
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        r"""
        Apply chainable functions that expect Series or DataFrames.

        Parameters
        ----------
        func : function
            Function to apply to the {klass}; ``args`` and ``kwargs`` are
            passed into it. Alternatively a ``(callable, data_keyword)``
            tuple where ``data_keyword`` is a string indicating the keyword
            of ``callable`` that expects the {klass}.
        args : iterable, optional
            Positional arguments passed into ``func``.
        kwargs : mapping, optional
            A dictionary of keyword arguments passed into ``func``.

        Returns
        -------
        object : the return type of ``func``.

        See Also
        --------
        DataFrame.apply : Apply a function along input axis of DataFrame.
        DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
        Series.map : Apply a mapping correspondence on a
            :class:`~pandas.Series`.

        Notes
        -----
        ``.pipe`` keeps method chains readable when mixing in plain
        functions that take the data as an argument. Instead of

        >>> func(g(h(df), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

        you can write

        >>> (df.pipe(h)
        ...    .pipe(g, arg1=a)
        ...    .pipe((func, 'arg2'), arg1=a, arg3=c)
        ...  )  # doctest: +SKIP
        """
        # The (callable, data_keyword) tuple form is unpacked by the shared
        # helper.
        return com.pipe(self, func, *args, **kwargs)
5841 # ----------------------------------------------------------------------
5842 # Attribute access
5844 @final
5845 def __finalize__(
5846 self: NDFrameT, other, method: str | None = None, **kwargs
5847 ) -> NDFrameT:
5848 """
5849 Propagate metadata from other to self.
5851 Parameters
5852 ----------
5853 other : the object from which to get the attributes that we are going
5854 to propagate
5855 method : str, optional
5856 A passed method name providing context on where ``__finalize__``
5857 was called.
5859 .. warning::
5861 The value passed as `method` are not currently considered
5862 stable across pandas releases.
5863 """
5864 if isinstance(other, NDFrame):
5865 for name in other.attrs:
5866 self.attrs[name] = other.attrs[name]
5868 self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
5869 # For subclasses using _metadata.
5870 for name in set(self._metadata) & set(other._metadata):
5871 assert isinstance(name, str)
5872 object.__setattr__(self, name, getattr(other, name, None))
5874 if method == "concat":
5875 attrs = other.objs[0].attrs
5876 check_attrs = all(objs.attrs == attrs for objs in other.objs[1:])
5877 if check_attrs:
5878 for name in attrs:
5879 self.attrs[name] = attrs[name]
5881 allows_duplicate_labels = all(
5882 x.flags.allows_duplicate_labels for x in other.objs
5883 )
5884 self.flags.allows_duplicate_labels = allows_duplicate_labels
5886 return self
5888 def __getattr__(self, name: str):
5889 """
5890 After regular attribute access, try looking up the name
5891 This allows simpler access to columns for interactive use.
5892 """
5893 # Note: obj.x will always call obj.__getattribute__('x') prior to
5894 # calling obj.__getattr__('x').
5895 if (
5896 name not in self._internal_names_set
5897 and name not in self._metadata
5898 and name not in self._accessors
5899 and self._info_axis._can_hold_identifiers_and_holds_name(name)
5900 ):
5901 return self[name]
5902 return object.__getattribute__(self, name)
    def __setattr__(self, name: str, value) -> None:
        """
        After regular attribute access, try setting the name as a column.

        This mirrors ``__getattr__`` so that ``obj.x`` and ``obj.x = 4``
        reference/modify the same thing for interactive use.
        """
        # first try regular attribute access via __getattribute__, so that
        # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
        # the same attribute.

        try:
            object.__getattribute__(self, name)
            return object.__setattr__(self, name, value)
        except AttributeError:
            pass

        # if this fails, go on to more involved attribute setting
        # (note that this matches __getattr__, above).
        if name in self._internal_names_set:
            # Internal bookkeeping attribute: plain attribute set.
            object.__setattr__(self, name, value)
        elif name in self._metadata:
            # Subclass-registered metadata attribute: plain attribute set.
            object.__setattr__(self, name, value)
        else:
            try:
                existing = getattr(self, name)
                if isinstance(existing, Index):
                    # Axis accessors (e.g. ``index``/``columns``) go through
                    # the normal property setter.
                    object.__setattr__(self, name, value)
                elif name in self._info_axis:
                    # Name matches an existing label: set the column/item.
                    self[name] = value
                else:
                    object.__setattr__(self, name, value)
            except (AttributeError, TypeError):
                # New attribute that looks like column data: warn that this
                # does NOT create a column, then set the attribute anyway.
                if isinstance(self, ABCDataFrame) and (is_list_like(value)):
                    warnings.warn(
                        "Pandas doesn't allow columns to be "
                        "created via a new attribute name - see "
                        "https://pandas.pydata.org/pandas-docs/"
                        "stable/indexing.html#attribute-access",
                        stacklevel=find_stack_level(),
                    )
                object.__setattr__(self, name, value)
5945 @final
5946 def _dir_additions(self) -> set[str]:
5947 """
5948 add the string-like attributes from the info_axis.
5949 If info_axis is a MultiIndex, its first level values are used.
5950 """
5951 additions = super()._dir_additions()
5952 if self._info_axis._can_hold_strings:
5953 additions.update(self._info_axis._dir_additions_for_owner)
5954 return additions
5956 # ----------------------------------------------------------------------
5957 # Consolidation of internals
5959 @final
5960 def _protect_consolidate(self, f):
5961 """
5962 Consolidate _mgr -- if the blocks have changed, then clear the
5963 cache
5964 """
5965 if isinstance(self._mgr, (ArrayManager, SingleArrayManager)):
5966 return f()
5967 blocks_before = len(self._mgr.blocks)
5968 result = f()
5969 if len(self._mgr.blocks) != blocks_before:
5970 self._clear_item_cache()
5971 return result
5973 @final
5974 def _consolidate_inplace(self) -> None:
5975 """Consolidate data in place and return None"""
5977 def f():
5978 self._mgr = self._mgr.consolidate()
5980 self._protect_consolidate(f)
5982 @final
5983 def _consolidate(self):
5984 """
5985 Compute NDFrame with "consolidated" internals (data of each dtype
5986 grouped together in a single ndarray).
5988 Returns
5989 -------
5990 consolidated : same type as caller
5991 """
5992 f = lambda: self._mgr.consolidate()
5993 cons_data = self._protect_consolidate(f)
5994 return self._constructor(cons_data).__finalize__(self)
5996 @final
5997 @property
5998 def _is_mixed_type(self) -> bool_t:
5999 if self._mgr.is_single_block:
6000 return False
6002 if self._mgr.any_extension_types:
6003 # Even if they have the same dtype, we can't consolidate them,
6004 # so we pretend this is "mixed'"
6005 return True
6007 return self.dtypes.nunique() > 1
6009 @final
6010 def _check_inplace_setting(self, value) -> bool_t:
6011 """check whether we allow in-place setting with this type of value"""
6012 if self._is_mixed_type and not self._mgr.is_numeric_mixed_type:
6014 # allow an actual np.nan thru
6015 if is_float(value) and np.isnan(value):
6016 return True
6018 raise TypeError(
6019 "Cannot do inplace boolean setting on "
6020 "mixed-types with a non np.nan value"
6021 )
6023 return True
6025 @final
6026 def _get_numeric_data(self: NDFrameT) -> NDFrameT:
6027 return self._constructor(self._mgr.get_numeric_data()).__finalize__(self)
6029 @final
6030 def _get_bool_data(self):
6031 return self._constructor(self._mgr.get_bool_data()).__finalize__(self)
6033 # ----------------------------------------------------------------------
6034 # Internal Interface Methods
    @property
    def values(self):
        """
        Return the underlying data; abstract here, implemented by the
        Series/DataFrame subclasses.
        """
        raise AbstractMethodError(self)
    @property
    def _values(self) -> np.ndarray:
        """Internal counterpart of ``.values``; abstract at this level."""
        raise AbstractMethodError(self)
6045 @property
6046 def dtypes(self):
6047 """
6048 Return the dtypes in the DataFrame.
6050 This returns a Series with the data type of each column.
6051 The result's index is the original DataFrame's columns. Columns
6052 with mixed types are stored with the ``object`` dtype. See
6053 :ref:`the User Guide <basics.dtypes>` for more.
6055 Returns
6056 -------
6057 pandas.Series
6058 The data type of each column.
6060 Examples
6061 --------
6062 >>> df = pd.DataFrame({'float': [1.0],
6063 ... 'int': [1],
6064 ... 'datetime': [pd.Timestamp('20180310')],
6065 ... 'string': ['foo']})
6066 >>> df.dtypes
6067 float float64
6068 int int64
6069 datetime datetime64[ns]
6070 string object
6071 dtype: object
6072 """
6073 data = self._mgr.get_dtypes()
6074 return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
    def astype(
        self: NDFrameT, dtype, copy: bool_t = True, errors: IgnoreRaise = "raise"
    ) -> NDFrameT:
        """
        Cast a pandas object to a specified dtype ``dtype``.

        Parameters
        ----------
        dtype : data type, or dict of column name -> data type
            Use a numpy.dtype or Python type to cast the entire pandas
            object to one type. Alternatively, use {col: dtype, ...} to
            cast one or more of a DataFrame's columns to column-specific
            types (columns absent from the mapping are left unchanged).
        copy : bool, default True
            Return a copy when ``copy=True`` (be very careful setting
            ``copy=False`` as changes to values then may propagate to
            other pandas objects).
        errors : {'raise', 'ignore'}, default 'raise'
            Control raising of exceptions on invalid data for provided
            dtype.

            - ``raise`` : allow exceptions to be raised
            - ``ignore`` : suppress exceptions. On error return original
              object.

        Returns
        -------
        casted : same type as caller

        Raises
        ------
        KeyError
            If ``dtype`` is a mapping and a key is not a column label (or,
            for a Series, is not the Series name).

        See Also
        --------
        to_datetime : Convert argument to datetime.
        to_timedelta : Convert argument to timedelta.
        to_numeric : Convert argument to a numeric type.
        numpy.ndarray.astype : Cast a numpy array to a specified type.

        Notes
        -----
        .. deprecated:: 1.3.0

            Using ``astype`` to convert from timezone-naive dtype to
            timezone-aware dtype is deprecated and will raise in a
            future version. Use :meth:`Series.dt.tz_localize` instead.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.astype({'col1': 'int32'}).dtypes
        col1    int32
        col2    int64
        dtype: object
        """
        if is_dict_like(dtype):
            if self.ndim == 1:  # i.e. Series
                # A Series mapping may only key on the Series' own name.
                if len(dtype) > 1 or self.name not in dtype:
                    raise KeyError(
                        "Only the Series name can be used for "
                        "the key in Series dtype mappings."
                    )
                new_type = dtype[self.name]
                return self.astype(new_type, copy, errors)

            # GH#44417 cast the mapping to a Series so we can use .iat
            # below, which is robust against duplicate column labels.
            from pandas import Series

            dtype_ser = Series(dtype, dtype=object)

            # Validate every key before converting anything.
            for col_name in dtype_ser.index:
                if col_name not in self:
                    raise KeyError(
                        "Only a column name can be used for the "
                        "key in a dtype mappings argument. "
                        f"'{col_name}' not found in columns."
                    )

            # Align to our columns; unmapped columns get a None "dtype"
            # and are copied through unchanged below.
            dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False)

            results = []
            for i, (col_name, col) in enumerate(self.items()):
                cdt = dtype_ser.iat[i]
                if isna(cdt):
                    res_col = col.copy() if copy else col
                else:
                    res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
                results.append(res_col)

        elif is_extension_array_dtype(dtype) and self.ndim > 1:
            # GH 18099/22869: columnwise conversion to extension dtype
            # GH 24704: use iloc to handle duplicate column names
            # TODO(EA2D): special case not needed with 2D EAs
            results = [
                self.iloc[:, i].astype(dtype, copy=copy)
                for i in range(len(self.columns))
            ]

        else:
            # else, only a single dtype is given: delegate to the manager
            # and return directly.
            new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
            return self._constructor(new_data).__finalize__(self, method="astype")

        # GH 33113: handle empty frame or series
        if not results:
            return self.copy()

        # GH 19920: retain column metadata after concat
        result = concat(results, axis=1, copy=False)
        # GH#40810 retain subclass
        # error: Incompatible types in assignment
        # (expression has type "NDFrameT", variable has type "DataFrame")
        result = self._constructor(result)  # type: ignore[assignment]
        result.columns = self.columns
        result = result.__finalize__(self, method="astype")
        # https://github.com/python/mypy/issues/8354
        return cast(NDFrameT, result)
6258 @final
6259 def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT:
6260 """
6261 Make a copy of this object's indices and data.
6263 When ``deep=True`` (default), a new object will be created with a
6264 copy of the calling object's data and indices. Modifications to
6265 the data or indices of the copy will not be reflected in the
6266 original object (see notes below).
6268 When ``deep=False``, a new object will be created without copying
6269 the calling object's data or index (only references to the data
6270 and index are copied). Any changes to the data of the original
6271 will be reflected in the shallow copy (and vice versa).
6273 Parameters
6274 ----------
6275 deep : bool, default True
6276 Make a deep copy, including a copy of the data and the indices.
6277 With ``deep=False`` neither the indices nor the data are copied.
6279 Returns
6280 -------
6281 copy : Series or DataFrame
6282 Object type matches caller.
6284 Notes
6285 -----
6286 When ``deep=True``, data is copied but actual Python objects
6287 will not be copied recursively, only the reference to the object.
6288 This is in contrast to `copy.deepcopy` in the Standard Library,
6289 which recursively copies object data (see examples below).
6291 While ``Index`` objects are copied when ``deep=True``, the underlying
6292 numpy array is not copied for performance reasons. Since ``Index`` is
6293 immutable, the underlying data can be safely shared and a copy
6294 is not needed.
6296 Since pandas is not thread safe, see the
6297 :ref:`gotchas <gotchas.thread-safety>` when copying in a threading
6298 environment.
6300 Examples
6301 --------
6302 >>> s = pd.Series([1, 2], index=["a", "b"])
6303 >>> s
6304 a 1
6305 b 2
6306 dtype: int64
6308 >>> s_copy = s.copy()
6309 >>> s_copy
6310 a 1
6311 b 2
6312 dtype: int64
6314 **Shallow copy versus default (deep) copy:**
6316 >>> s = pd.Series([1, 2], index=["a", "b"])
6317 >>> deep = s.copy()
6318 >>> shallow = s.copy(deep=False)
6320 Shallow copy shares data and index with original.
6322 >>> s is shallow
6323 False
6324 >>> s.values is shallow.values and s.index is shallow.index
6325 True
6327 Deep copy has own copy of data and index.
6329 >>> s is deep
6330 False
6331 >>> s.values is deep.values or s.index is deep.index
6332 False
6334 Updates to the data shared by shallow copy and original is reflected
6335 in both; deep copy remains unchanged.
6337 >>> s[0] = 3
6338 >>> shallow[1] = 4
6339 >>> s
6340 a 3
6341 b 4
6342 dtype: int64
6343 >>> shallow
6344 a 3
6345 b 4
6346 dtype: int64
6347 >>> deep
6348 a 1
6349 b 2
6350 dtype: int64
6352 Note that when copying an object containing Python objects, a deep copy
6353 will copy the data, but will not do so recursively. Updating a nested
6354 data object will be reflected in the deep copy.
6356 >>> s = pd.Series([[1, 2], [3, 4]])
6357 >>> deep = s.copy()
6358 >>> s[0][0] = 10
6359 >>> s
6360 0 [10, 2]
6361 1 [3, 4]
6362 dtype: object
6363 >>> deep
6364 0 [10, 2]
6365 1 [3, 4]
6366 dtype: object
6367 """
6368 data = self._mgr.copy(deep=deep)
6369 self._clear_item_cache()
6370 return self._constructor(data).__finalize__(self, method="copy")
    @final
    def __copy__(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
        """Support ``copy.copy`` by delegating to :meth:`copy`."""
        return self.copy(deep=deep)
    @final
    def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
        """
        Support ``copy.deepcopy``; always performs a deep :meth:`copy`.

        Parameters
        ----------
        memo, default None
            Standard signature. Unused
        """
        return self.copy(deep=True)
6386 @final
6387 def _convert(
6388 self: NDFrameT,
6389 datetime: bool_t = False,
6390 numeric: bool_t = False,
6391 timedelta: bool_t = False,
6392 ) -> NDFrameT:
6393 """
6394 Attempt to infer better dtype for object columns.
6396 Parameters
6397 ----------
6398 datetime : bool, default False
6399 If True, convert to date where possible.
6400 numeric : bool, default False
6401 If True, attempt to convert to numbers (including strings), with
6402 unconvertible values becoming NaN.
6403 timedelta : bool, default False
6404 If True, convert to timedelta where possible.
6406 Returns
6407 -------
6408 converted : same as input object
6409 """
6410 validate_bool_kwarg(datetime, "datetime")
6411 validate_bool_kwarg(numeric, "numeric")
6412 validate_bool_kwarg(timedelta, "timedelta")
6413 return self._constructor(
6414 self._mgr.convert(
6415 datetime=datetime,
6416 numeric=numeric,
6417 timedelta=timedelta,
6418 copy=True,
6419 )
6420 ).__finalize__(self)
6422 @final
6423 def infer_objects(self: NDFrameT) -> NDFrameT:
6424 """
6425 Attempt to infer better dtypes for object columns.
6427 Attempts soft conversion of object-dtyped
6428 columns, leaving non-object and unconvertible
6429 columns unchanged. The inference rules are the
6430 same as during normal Series/DataFrame construction.
6432 Returns
6433 -------
6434 converted : same type as input object
6436 See Also
6437 --------
6438 to_datetime : Convert argument to datetime.
6439 to_timedelta : Convert argument to timedelta.
6440 to_numeric : Convert argument to numeric type.
6441 convert_dtypes : Convert argument to best possible dtype.
6443 Examples
6444 --------
6445 >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
6446 >>> df = df.iloc[1:]
6447 >>> df
6448 A
6449 1 1
6450 2 2
6451 3 3
6453 >>> df.dtypes
6454 A object
6455 dtype: object
6457 >>> df.infer_objects().dtypes
6458 A int64
6459 dtype: object
6460 """
6461 # numeric=False necessary to only soft convert;
6462 # python objects will still be converted to
6463 # native numpy numeric types
6464 return self._constructor(
6465 self._mgr.convert(datetime=True, numeric=False, timedelta=True, copy=True)
6466 ).__finalize__(self, method="infer_objects")
6468 @final
6469 def convert_dtypes(
6470 self: NDFrameT,
6471 infer_objects: bool_t = True,
6472 convert_string: bool_t = True,
6473 convert_integer: bool_t = True,
6474 convert_boolean: bool_t = True,
6475 convert_floating: bool_t = True,
6476 ) -> NDFrameT:
6477 """
6478 Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
6480 .. versionadded:: 1.0.0
6482 Parameters
6483 ----------
6484 infer_objects : bool, default True
6485 Whether object dtypes should be converted to the best possible types.
6486 convert_string : bool, default True
6487 Whether object dtypes should be converted to ``StringDtype()``.
6488 convert_integer : bool, default True
6489 Whether, if possible, conversion can be done to integer extension types.
6490 convert_boolean : bool, defaults True
6491 Whether object dtypes should be converted to ``BooleanDtypes()``.
6492 convert_floating : bool, defaults True
6493 Whether, if possible, conversion can be done to floating extension types.
6494 If `convert_integer` is also True, preference will be give to integer
6495 dtypes if the floats can be faithfully casted to integers.
6497 .. versionadded:: 1.2.0
6499 Returns
6500 -------
6501 Series or DataFrame
6502 Copy of input object with new dtype.
6504 See Also
6505 --------
6506 infer_objects : Infer dtypes of objects.
6507 to_datetime : Convert argument to datetime.
6508 to_timedelta : Convert argument to timedelta.
6509 to_numeric : Convert argument to a numeric type.
6511 Notes
6512 -----
6513 By default, ``convert_dtypes`` will attempt to convert a Series (or each
6514 Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
6515 ``convert_string``, ``convert_integer``, ``convert_boolean`` and
6516 ``convert_boolean``, it is possible to turn off individual conversions
6517 to ``StringDtype``, the integer extension types, ``BooleanDtype``
6518 or floating extension types, respectively.
6520 For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
6521 rules as during normal Series/DataFrame construction. Then, if possible,
6522 convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
6523 or floating extension type, otherwise leave as ``object``.
6525 If the dtype is integer, convert to an appropriate integer extension type.
6527 If the dtype is numeric, and consists of all integers, convert to an
6528 appropriate integer extension type. Otherwise, convert to an
6529 appropriate floating extension type.
6531 .. versionchanged:: 1.2
6532 Starting with pandas 1.2, this method also converts float columns
6533 to the nullable floating extension type.
6535 In the future, as new dtypes are added that support ``pd.NA``, the results
6536 of this method will change to support those new dtypes.
6538 Examples
6539 --------
6540 >>> df = pd.DataFrame(
6541 ... {
6542 ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
6543 ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
6544 ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
6545 ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
6546 ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
6547 ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
6548 ... }
6549 ... )
6551 Start with a DataFrame with default dtypes.
6553 >>> df
6554 a b c d e f
6555 0 1 x True h 10.0 NaN
6556 1 2 y False i NaN 100.5
6557 2 3 z NaN NaN 20.0 200.0
6559 >>> df.dtypes
6560 a int32
6561 b object
6562 c object
6563 d object
6564 e float64
6565 f float64
6566 dtype: object
6568 Convert the DataFrame to use best possible dtypes.
6570 >>> dfn = df.convert_dtypes()
6571 >>> dfn
6572 a b c d e f
6573 0 1 x True h 10 <NA>
6574 1 2 y False i <NA> 100.5
6575 2 3 z <NA> <NA> 20 200.0
6577 >>> dfn.dtypes
6578 a Int32
6579 b string
6580 c boolean
6581 d string
6582 e Int64
6583 f Float64
6584 dtype: object
6586 Start with a Series of strings and missing data represented by ``np.nan``.
6588 >>> s = pd.Series(["a", "b", np.nan])
6589 >>> s
6590 0 a
6591 1 b
6592 2 NaN
6593 dtype: object
6595 Obtain a Series with dtype ``StringDtype``.
6597 >>> s.convert_dtypes()
6598 0 a
6599 1 b
6600 2 <NA>
6601 dtype: string
6602 """
6603 if self.ndim == 1:
6604 return self._convert_dtypes(
6605 infer_objects,
6606 convert_string,
6607 convert_integer,
6608 convert_boolean,
6609 convert_floating,
6610 )
6611 else:
6612 results = [
6613 col._convert_dtypes(
6614 infer_objects,
6615 convert_string,
6616 convert_integer,
6617 convert_boolean,
6618 convert_floating,
6619 )
6620 for col_name, col in self.items()
6621 ]
6622 if len(results) > 0:
6623 result = concat(results, axis=1, copy=False, keys=self.columns)
6624 cons = cast(Type["DataFrame"], self._constructor)
6625 result = cons(result)
6626 result = result.__finalize__(self, method="convert_dtypes")
6627 # https://github.com/python/mypy/issues/8354
6628 return cast(NDFrameT, result)
6629 else:
6630 return self.copy()
6632 # ----------------------------------------------------------------------
6633 # Filling NA's
    # ``fillna`` overloads: with ``inplace=False`` (the default) a new
    # object of the caller's type is returned; with ``inplace=True`` the
    # fill happens in place and None is returned. The last overload covers
    # a runtime-valued ``inplace`` flag.
    @overload
    def fillna(
        self: NDFrameT,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: Literal[False] = ...,
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> NDFrameT:
        ...

    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: Literal[True],
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def fillna(
        self: NDFrameT,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: bool_t = ...,
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> NDFrameT | None:
        ...
    @doc(**_shared_doc_kwargs)
    def fillna(
        self: NDFrameT,
        value: Hashable | Mapping | Series | DataFrame = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool_t = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> NDFrameT | None:
        """
        Fill NA/NaN values using the specified method.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list.
        method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
            Method to use for filling holes in reindexed Series
            pad / ffill: propagate last valid observation forward to next valid
            backfill / bfill: use next valid observation to fill gap.
        axis : {axes_single_arg}
            Axis along which to fill missing values. For `Series`
            this parameter is unused and defaults to 0.
        inplace : bool, default False
            If True, fill in-place. Note: this will modify any
            other views on this object (e.g., a no-copy slice for a column in a
            DataFrame).
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill. In other words, if there is
            a gap with more than this number of consecutive NaNs, it will only
            be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        {klass} or None
            Object with missing values filled or None if ``inplace=True``.

        See Also
        --------
        interpolate : Fill NaN values using interpolation.
        reindex : Conform object to new index.
        asfreq : Convert TimeSeries to specified frequency.

        Examples
        --------
        >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
        ...                    [3, 4, np.nan, 1],
        ...                    [np.nan, np.nan, np.nan, np.nan],
        ...                    [np.nan, 3, np.nan, 4]],
        ...                   columns=list("ABCD"))
        >>> df
             A    B   C    D
        0  NaN  2.0 NaN  0.0
        1  3.0  4.0 NaN  1.0
        2  NaN  NaN NaN  NaN
        3  NaN  3.0 NaN  4.0

        Replace all NaN elements with 0s.

        >>> df.fillna(0)
             A    B    C    D
        0  0.0  2.0  0.0  0.0
        1  3.0  4.0  0.0  1.0
        2  0.0  0.0  0.0  0.0
        3  0.0  3.0  0.0  4.0

        We can also propagate non-null values forward or backward.

        >>> df.fillna(method="ffill")
             A    B   C    D
        0  NaN  2.0 NaN  0.0
        1  3.0  4.0 NaN  1.0
        2  3.0  4.0 NaN  1.0
        3  3.0  3.0 NaN  4.0

        Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
        2, and 3 respectively.

        >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}}
        >>> df.fillna(value=values)
             A    B    C    D
        0  0.0  2.0  2.0  0.0
        1  3.0  4.0  2.0  1.0
        2  0.0  1.0  2.0  3.0
        3  0.0  3.0  2.0  4.0

        Only replace the first NaN element.

        >>> df.fillna(value=values, limit=1)
             A    B    C    D
        0  0.0  2.0  2.0  0.0
        1  3.0  4.0  NaN  1.0
        2  NaN  1.0  NaN  3.0
        3  NaN  3.0  NaN  4.0

        When filling using a DataFrame, replacement happens along
        the same column names and same indices

        >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
        >>> df.fillna(df2)
             A    B    C    D
        0  0.0  2.0  0.0  0.0
        1  3.0  4.0  0.0  1.0
        2  0.0  0.0  0.0  NaN
        3  0.0  3.0  0.0  4.0

        Note that column D is not affected since it is not present in df2.
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        # Rejects passing both `value` and `method`; maps method aliases
        # ("pad"/"backfill") to their canonical forms.
        value, method = validate_fillna_kwargs(value, method)

        self._consolidate_inplace()

        # set the default here, so functions examining the signature
        # can detect if something was set (e.g. in groupby) (GH9221)
        if axis is None:
            axis = 0
        axis = self._get_axis_number(axis)

        if value is None:
            # Method-based fill (ffill/bfill); delegated to the block
            # manager's `interpolate`, which handles pad/backfill.
            if not self._mgr.is_single_block and axis == 1:
                # Row-wise fill on a multi-block frame: fill column-wise on
                # the transpose, then transpose back.
                if inplace:
                    raise NotImplementedError()
                result = self.T.fillna(method=method, limit=limit).T

                return result

            new_data = self._mgr.interpolate(
                method=method,
                axis=axis,
                limit=limit,
                inplace=inplace,
                downcast=downcast,
            )
        else:
            if self.ndim == 1:
                # Series path: a dict/Series `value` is aligned to our index
                # and applied element-wise; scalars pass straight through.
                if isinstance(value, (dict, ABCSeries)):
                    if not len(value):
                        # Empty mapping fills nothing.
                        # test_fillna_nonscalar
                        if inplace:
                            return None
                        return self.copy()
                    value = create_series_with_explicit_dtype(
                        value, dtype_if_empty=object
                    )
                    value = value.reindex(self.index, copy=False)
                    value = value._values
                elif not is_list_like(value):
                    pass
                else:
                    raise TypeError(
                        '"value" parameter must be a scalar, dict '
                        "or Series, but you passed a "
                        f'"{type(value).__name__}"'
                    )

                new_data = self._mgr.fillna(
                    value=value, limit=limit, inplace=inplace, downcast=downcast
                )

            elif isinstance(value, (dict, ABCSeries)):
                # DataFrame filled from a column->value mapping: recurse into
                # each named column's own fillna.
                if axis == 1:
                    raise NotImplementedError(
                        "Currently only can fill "
                        "with dict/Series column "
                        "by column"
                    )

                result = self if inplace else self.copy()
                is_dict = isinstance(downcast, dict)
                for k, v in value.items():
                    if k not in result:
                        # Keys absent from the frame are silently skipped.
                        continue

                    # A dict `downcast` is looked up per column; any other
                    # downcast spec applies to every column.
                    # error: Item "None" of "Optional[Dict[Any, Any]]" has no
                    # attribute "get"
                    downcast_k = (
                        downcast
                        if not is_dict
                        else downcast.get(k)  # type: ignore[union-attr]
                    )

                    res_k = result[k].fillna(v, limit=limit, downcast=downcast_k)

                    if not inplace:
                        result[k] = res_k
                    else:
                        # We can write into our existing column(s) iff dtype
                        # was preserved.
                        if isinstance(res_k, ABCSeries):
                            # i.e. 'k' only shows up once in self.columns
                            if res_k.dtype == result[k].dtype:
                                result.loc[:, k] = res_k
                            else:
                                # Different dtype -> no way to do inplace.
                                result[k] = res_k
                        else:
                            # 'k' labels several columns; fill each positional
                            # slot individually.
                            # see test_fillna_dict_inplace_nonunique_columns
                            locs = result.columns.get_loc(k)
                            if isinstance(locs, slice):
                                locs = np.arange(self.shape[1])[locs]
                            elif (
                                isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
                            ):
                                locs = locs.nonzero()[0]
                            elif not (
                                isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
                            ):
                                # Should never be reached, but let's cover our bases
                                raise NotImplementedError(
                                    "Unexpected get_loc result, please report a bug at "
                                    "https://github.com/pandas-dev/pandas"
                                )

                            for i, loc in enumerate(locs):
                                res_loc = res_k.iloc[:, i]
                                target = self.iloc[:, loc]

                                if res_loc.dtype == target.dtype:
                                    result.iloc[:, loc] = res_loc
                                else:
                                    result.isetitem(loc, res_loc)

                # Early return: the dict path manages `result` itself instead
                # of rebuilding from a manager below.
                return result if not inplace else None

            elif not is_list_like(value):
                if axis == 1:
                    # Scalar fill along rows: transpose, fill, transpose back.
                    result = self.T.fillna(value=value, limit=limit).T

                    # error: Incompatible types in assignment (expression has type
                    # "NDFrameT", variable has type "Union[ArrayManager,
                    # SingleArrayManager, BlockManager, SingleBlockManager]")
                    new_data = result  # type: ignore[assignment]
                else:

                    new_data = self._mgr.fillna(
                        value=value, limit=limit, inplace=inplace, downcast=downcast
                    )
            elif isinstance(value, ABCDataFrame) and self.ndim == 2:
                # DataFrame fill value: keep our non-NA cells, take aligned
                # cells from `value` elsewhere.
                new_data = self.where(self.notna(), value)._mgr
            else:
                raise ValueError(f"invalid fill value with a {type(value)}")

        result = self._constructor(new_data)
        if inplace:
            return self._update_inplace(result)
        else:
            return result.__finalize__(self, method="fillna")
    # ffill typing overloads: return type follows ``inplace`` exactly as in
    # ``fillna`` (False -> new object, True -> None, bool -> union).
    @overload
    def ffill(
        self: NDFrameT,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> NDFrameT:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def ffill(
        self: NDFrameT,
        *,
        axis: None | Axis = ...,
        inplace: bool_t = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> NDFrameT | None:
        ...
6970 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
6971 @doc(klass=_shared_doc_kwargs["klass"])
6972 def ffill(
6973 self: NDFrameT,
6974 axis: None | Axis = None,
6975 inplace: bool_t = False,
6976 limit: None | int = None,
6977 downcast: dict | None = None,
6978 ) -> NDFrameT | None:
6979 """
6980 Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
6982 Returns
6983 -------
6984 {klass} or None
6985 Object with missing values filled or None if ``inplace=True``.
6986 """
6987 return self.fillna(
6988 method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
6989 )
6991 pad = ffill
    # bfill typing overloads: return type follows ``inplace`` exactly as in
    # ``fillna`` (False -> new object, True -> None, bool -> union).
    @overload
    def bfill(
        self: NDFrameT,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> NDFrameT:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def bfill(
        self: NDFrameT,
        *,
        axis: None | Axis = ...,
        inplace: bool_t = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> NDFrameT | None:
        ...
7026 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
7027 @doc(klass=_shared_doc_kwargs["klass"])
7028 def bfill(
7029 self: NDFrameT,
7030 axis: None | Axis = None,
7031 inplace: bool_t = False,
7032 limit: None | int = None,
7033 downcast: dict | None = None,
7034 ) -> NDFrameT | None:
7035 """
7036 Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
7038 Returns
7039 -------
7040 {klass} or None
7041 Object with missing values filled or None if ``inplace=True``.
7042 """
7043 return self.fillna(
7044 method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
7045 )
7047 backfill = bfill
    # replace typing overloads: return type follows ``inplace``
    # (False -> new object, True -> None, bool -> union).
    @overload
    def replace(
        self: NDFrameT,
        to_replace=...,
        value=...,
        *,
        inplace: Literal[False] = ...,
        limit: int | None = ...,
        regex: bool_t = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> NDFrameT:
        ...

    @overload
    def replace(
        self,
        to_replace=...,
        value=...,
        *,
        inplace: Literal[True],
        limit: int | None = ...,
        regex: bool_t = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> None:
        ...

    @overload
    def replace(
        self: NDFrameT,
        to_replace=...,
        value=...,
        *,
        inplace: bool_t = ...,
        limit: int | None = ...,
        regex: bool_t = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> NDFrameT | None:
        ...
    @deprecate_nonkeyword_arguments(
        version=None, allowed_args=["self", "to_replace", "value"]
    )
    @doc(
        _shared_docs["replace"],
        klass=_shared_doc_kwargs["klass"],
        inplace=_shared_doc_kwargs["inplace"],
        replace_iloc=_shared_doc_kwargs["replace_iloc"],
    )
    def replace(
        self: NDFrameT,
        to_replace=None,
        value=lib.no_default,
        inplace: bool_t = False,
        limit: int | None = None,
        regex: bool_t = False,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
    ) -> NDFrameT | None:
        # Docstring comes from _shared_docs["replace"] via the @doc decorator.
        # This method dispatches on the shapes of `to_replace`/`value`/`regex`:
        # scalars, lists, flat dicts, nested dicts, and regex variants each
        # take a different path below.
        if not (
            is_scalar(to_replace)
            or is_re_compilable(to_replace)
            or is_list_like(to_replace)
        ):
            raise TypeError(
                "Expecting 'to_replace' to be either a scalar, array-like, "
                "dict or None, got invalid type "
                f"{repr(type(to_replace).__name__)}"
            )

        inplace = validate_bool_kwarg(inplace, "inplace")
        if not is_bool(regex) and to_replace is not None:
            # A non-bool `regex` is itself treated as the pattern(s), which is
            # only legal when `to_replace` was not given.
            raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")

        self._consolidate_inplace()

        if value is lib.no_default or method is not lib.no_default:
            # GH#36984 if the user explicitly passes value=None we want to
            # respect that. We have the corner case where the user explicitly
            # passes value=None *and* a method, which we interpret as meaning
            # they want the (documented) default behavior.
            if method is lib.no_default:
                # TODO: get this to show up as the default in the docs?
                method = "pad"

            # passing a single value that is scalar like
            # when value is None (GH5319), for compat
            if not is_dict_like(to_replace) and not is_dict_like(regex):
                to_replace = [to_replace]

            if isinstance(to_replace, (tuple, list)):
                # Method-based replacement (fill forward/backward over the
                # matched positions), applied column-by-column for frames.
                if isinstance(self, ABCDataFrame):
                    from pandas import Series

                    result = self.apply(
                        Series._replace_single,
                        args=(to_replace, method, inplace, limit),
                    )
                    if inplace:
                        return None
                    return result
                return self._replace_single(to_replace, method, inplace, limit)

            if not is_dict_like(to_replace):
                if not is_dict_like(regex):
                    raise TypeError(
                        'If "to_replace" and "value" are both None '
                        'and "to_replace" is not a list, then '
                        "regex must be a mapping"
                    )
                # A mapping passed via `regex` stands in for `to_replace`.
                to_replace = regex
                regex = True

            items = list(to_replace.items())
            if items:
                keys, values = zip(*items)
            else:
                keys, values = ([], [])

            are_mappings = [is_dict_like(v) for v in values]

            if any(are_mappings):
                if not all(are_mappings):
                    raise TypeError(
                        "If a nested mapping is passed, all values "
                        "of the top level mapping must be mappings"
                    )
                # passed a nested dict/Series: split {col: {old: new}} into
                # parallel {col: [old...]} and {col: [new...]} dicts.
                to_rep_dict = {}
                value_dict = {}

                for k, v in items:
                    keys, values = list(zip(*v.items())) or ([], [])

                    to_rep_dict[k] = list(keys)
                    value_dict[k] = list(values)

                to_replace, value = to_rep_dict, value_dict
            else:
                to_replace, value = keys, values

            # Re-enter with the normalized (to_replace, value) pair.
            return self.replace(
                to_replace, value, inplace=inplace, limit=limit, regex=regex
            )
        else:

            # need a non-zero len on all axes
            if not self.size:
                if inplace:
                    return None
                return self.copy()

            if is_dict_like(to_replace):
                if is_dict_like(value):  # {'A' : NA} -> {'A' : 0}
                    # Note: Checking below for `in foo.keys()` instead of
                    # `in foo` is needed for when we have a Series and not dict
                    mapping = {
                        col: (to_replace[col], value[col])
                        for col in to_replace.keys()
                        if col in value.keys() and col in self
                    }
                    return self._replace_columnwise(mapping, inplace, regex)

                # {'A': NA} -> 0
                elif not is_list_like(value):
                    # Operate column-wise
                    if self.ndim == 1:
                        raise ValueError(
                            "Series.replace cannot use dict-like to_replace "
                            "and non-None value"
                        )
                    mapping = {
                        col: (to_rep, value) for col, to_rep in to_replace.items()
                    }
                    return self._replace_columnwise(mapping, inplace, regex)
                else:
                    raise TypeError("value argument must be scalar, dict, or Series")

            elif is_list_like(to_replace):
                if not is_list_like(value):
                    # e.g. to_replace = [NA, ''] and value is 0,
                    # so we replace NA with 0 and then replace '' with 0
                    value = [value] * len(to_replace)

                # e.g. we have to_replace = [NA, ''] and value = [0, 'missing']
                if len(to_replace) != len(value):
                    raise ValueError(
                        f"Replacement lists must match in length. "
                        f"Expecting {len(to_replace)} got {len(value)} "
                    )
                new_data = self._mgr.replace_list(
                    src_list=to_replace,
                    dest_list=value,
                    inplace=inplace,
                    regex=regex,
                )

            elif to_replace is None:
                # Pure-regex call form: patterns arrived via `regex`.
                if not (
                    is_re_compilable(regex)
                    or is_list_like(regex)
                    or is_dict_like(regex)
                ):
                    raise TypeError(
                        f"'regex' must be a string or a compiled regular expression "
                        f"or a list or dict of strings or regular expressions, "
                        f"you passed a {repr(type(regex).__name__)}"
                    )
                return self.replace(
                    regex, value, inplace=inplace, limit=limit, regex=True
                )
            else:

                # dest iterable dict-like
                if is_dict_like(value):  # NA -> {'A' : 0, 'B' : -1}
                    # Operate column-wise
                    if self.ndim == 1:
                        raise ValueError(
                            "Series.replace cannot use dict-value and "
                            "non-None to_replace"
                        )
                    mapping = {col: (to_replace, val) for col, val in value.items()}
                    return self._replace_columnwise(mapping, inplace, regex)

                elif not is_list_like(value):  # NA -> 0
                    regex = should_use_regex(regex, to_replace)
                    if regex:
                        new_data = self._mgr.replace_regex(
                            to_replace=to_replace,
                            value=value,
                            inplace=inplace,
                        )
                    else:
                        new_data = self._mgr.replace(
                            to_replace=to_replace, value=value, inplace=inplace
                        )
                else:
                    raise TypeError(
                        f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}'
                    )

        result = self._constructor(new_data)
        if inplace:
            return self._update_inplace(result)
        else:
            return result.__finalize__(self, method="replace")
    def interpolate(
        self: NDFrameT,
        method: str = "linear",
        axis: Axis = 0,
        limit: int | None = None,
        inplace: bool_t = False,
        limit_direction: str | None = None,
        limit_area: str | None = None,
        downcast: str | None = None,
        **kwargs,
    ) -> NDFrameT | None:
        """
        Fill NaN values using an interpolation method.

        Please note that only ``method='linear'`` is supported for
        DataFrame/Series with a MultiIndex.

        Parameters
        ----------
        method : str, default 'linear'
            Interpolation technique to use. One of:

            * 'linear': Ignore the index and treat the values as equally
              spaced. This is the only method supported on MultiIndexes.
            * 'time': Works on daily and higher resolution data to interpolate
              given length of interval.
            * 'index', 'values': use the actual numerical values of the index.
            * 'pad': Fill in NaNs using existing values.
            * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline',
              'barycentric', 'polynomial': Passed to
              `scipy.interpolate.interp1d`. These methods use the numerical
              values of the index. Both 'polynomial' and 'spline' require that
              you also specify an `order` (int), e.g.
              ``df.interpolate(method='polynomial', order=5)``.
            * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
              'cubicspline': Wrappers around the SciPy interpolation methods of
              similar names. See `Notes`.
            * 'from_derivatives': Refers to
              `scipy.interpolate.BPoly.from_derivatives` which
              replaces 'piecewise_polynomial' interpolation method in
              scipy 0.18.

        axis : {{0 or 'index', 1 or 'columns', None}}, default None
            Axis to interpolate along. For `Series` this parameter is unused
            and defaults to 0.
        limit : int, optional
            Maximum number of consecutive NaNs to fill. Must be greater than
            0.
        inplace : bool, default False
            Update the data in place if possible.
        limit_direction : {{'forward', 'backward', 'both'}}, Optional
            Consecutive NaNs will be filled in this direction.

            If limit is specified:
                * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'.
                * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be
                  'backwards'.

            If 'limit' is not specified:
                * If 'method' is 'backfill' or 'bfill', the default is 'backward'
                * else the default is 'forward'

            .. versionchanged:: 1.1.0
                raises ValueError if `limit_direction` is 'forward' or 'both' and
                method is 'backfill' or 'bfill'.
                raises ValueError if `limit_direction` is 'backward' or 'both' and
                method is 'pad' or 'ffill'.

        limit_area : {{`None`, 'inside', 'outside'}}, default None
            If limit is specified, consecutive NaNs will be filled with this
            restriction.

            * ``None``: No fill restriction.
            * 'inside': Only fill NaNs surrounded by valid values
              (interpolate).
            * 'outside': Only fill NaNs outside valid values (extrapolate).

        downcast : optional, 'infer' or None, defaults to None
            Downcast dtypes if possible.
        ``**kwargs`` : optional
            Keyword arguments to pass on to the interpolating function.

        Returns
        -------
        Series or DataFrame or None
            Returns the same object type as the caller, interpolated at
            some or all ``NaN`` values or None if ``inplace=True``.

        See Also
        --------
        fillna : Fill missing values using different methods.
        scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
            (Akima interpolator).
        scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
            Bernstein basis.
        scipy.interpolate.interp1d : Interpolate a 1-D function.
        scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
            interpolator).
        scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
            interpolation.
        scipy.interpolate.CubicSpline : Cubic spline data interpolator.

        Notes
        -----
        The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
        methods are wrappers around the respective SciPy implementations of
        similar names. These use the actual numerical values of the index.
        For more information on their behavior, see the
        `SciPy documentation
        <https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__.

        Examples
        --------
        Filling in ``NaN`` in a :class:`~pandas.Series` via linear
        interpolation.

        >>> s = pd.Series([0, 1, np.nan, 3])
        >>> s
        0    0.0
        1    1.0
        2    NaN
        3    3.0
        dtype: float64
        >>> s.interpolate()
        0    0.0
        1    1.0
        2    2.0
        3    3.0
        dtype: float64

        Filling in ``NaN`` in a Series by padding, but filling at most two
        consecutive ``NaN`` at a time.

        >>> s = pd.Series([np.nan, "single_one", np.nan,
        ...                "fill_two_more", np.nan, np.nan, np.nan,
        ...                4.71, np.nan])
        >>> s
        0              NaN
        1       single_one
        2              NaN
        3    fill_two_more
        4              NaN
        5              NaN
        6              NaN
        7             4.71
        8              NaN
        dtype: object
        >>> s.interpolate(method='pad', limit=2)
        0              NaN
        1       single_one
        2       single_one
        3    fill_two_more
        4    fill_two_more
        5    fill_two_more
        6              NaN
        7             4.71
        8             4.71
        dtype: object

        Filling in ``NaN`` in a Series via polynomial interpolation or splines:
        Both 'polynomial' and 'spline' methods require that you also specify
        an ``order`` (int).

        >>> s = pd.Series([0, 2, np.nan, 8])
        >>> s.interpolate(method='polynomial', order=2)
        0    0.000000
        1    2.000000
        2    4.666667
        3    8.000000
        dtype: float64

        Fill the DataFrame forward (that is, going down) along each column
        using linear interpolation.

        Note how the last entry in column 'a' is interpolated differently,
        because there is no entry after it to use for interpolation.
        Note how the first entry in column 'b' remains ``NaN``, because there
        is no entry before it to use for interpolation.

        >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
        ...                    (np.nan, 2.0, np.nan, np.nan),
        ...                    (2.0, 3.0, np.nan, 9.0),
        ...                    (np.nan, 4.0, -4.0, 16.0)],
        ...                   columns=list('abcd'))
        >>> df
             a    b    c     d
        0  0.0  NaN -1.0   1.0
        1  NaN  2.0  NaN   NaN
        2  2.0  3.0  NaN   9.0
        3  NaN  4.0 -4.0  16.0
        >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
             a    b    c     d
        0  0.0  NaN -1.0   1.0
        1  1.0  2.0 -2.0   5.0
        2  2.0  3.0 -3.0   9.0
        3  2.0  4.0 -4.0  16.0

        Using polynomial interpolation.

        >>> df['d'].interpolate(method='polynomial', order=2)
        0     1.0
        1     4.0
        2     9.0
        3    16.0
        Name: d, dtype: float64
        """
        inplace = validate_bool_kwarg(inplace, "inplace")

        axis = self._get_axis_number(axis)

        # For genuine interpolation methods, axis=1 is implemented by
        # transposing, interpolating down columns, and transposing back;
        # pad/backfill methods are handled natively along the axis.
        fillna_methods = ["ffill", "bfill", "pad", "backfill"]
        should_transpose = axis == 1 and method not in fillna_methods

        obj = self.T if should_transpose else self

        if obj.empty:
            # Nothing to fill; return a copy of self unchanged.
            return self.copy()

        if method not in fillna_methods:
            axis = self._info_axis_number

        if isinstance(obj.index, MultiIndex) and method != "linear":
            raise ValueError(
                "Only `method=linear` interpolation is supported on MultiIndexes."
            )

        # Set `limit_direction` depending on `method`
        if limit_direction is None:
            limit_direction = (
                "backward" if method in ("backfill", "bfill") else "forward"
            )
        else:
            # Directional fill methods only make sense with a matching
            # limit_direction; reject contradictory combinations.
            if method in ("pad", "ffill") and limit_direction != "forward":
                raise ValueError(
                    f"`limit_direction` must be 'forward' for method `{method}`"
                )
            if method in ("backfill", "bfill") and limit_direction != "backward":
                raise ValueError(
                    f"`limit_direction` must be 'backward' for method `{method}`"
                )

        if obj.ndim == 2 and np.all(obj.dtypes == np.dtype("object")):
            raise TypeError(
                "Cannot interpolate with all object-dtype columns "
                "in the DataFrame. Try setting at least one "
                "column to a numeric dtype."
            )

        # create/use the index
        if method == "linear":
            # prior default: equally-spaced positions, ignoring the real index
            index = Index(np.arange(len(obj.index)))
        else:
            index = obj.index
            # Methods in `methods` define their own index handling; anything
            # else needs a numeric/datetime-like index to interpolate against.
            methods = {"index", "values", "nearest", "time"}
            is_numeric_or_datetime = (
                is_numeric_dtype(index.dtype)
                or is_datetime64_any_dtype(index.dtype)
                or is_timedelta64_dtype(index.dtype)
            )
            if method not in methods and not is_numeric_or_datetime:
                raise ValueError(
                    "Index column must be numeric or datetime type when "
                    f"using {method} method other than linear. "
                    "Try setting a numeric or datetime index column before "
                    "interpolating."
                )

        if isna(index).any():
            raise NotImplementedError(
                "Interpolation with NaNs in the index "
                "has not been implemented. Try filling "
                "those NaNs before interpolating."
            )
        # Delegate the actual value computation to the block manager.
        new_data = obj._mgr.interpolate(
            method=method,
            axis=axis,
            index=index,
            limit=limit,
            limit_direction=limit_direction,
            limit_area=limit_area,
            inplace=inplace,
            downcast=downcast,
            **kwargs,
        )

        result = self._constructor(new_data)
        if should_transpose:
            # Undo the transpose applied above.
            result = result.T
        if inplace:
            return self._update_inplace(result)
        else:
            return result.__finalize__(self, method="interpolate")
7588 # ----------------------------------------------------------------------
7589 # Timeseries methods Methods
    @final
    def asof(self, where, subset=None):
        """
        Return the last row(s) without any NaNs before `where`.

        The last row (for each element in `where`, if list) without any
        NaN is taken.
        In case of a :class:`~pandas.DataFrame`, the last row without NaN
        considering only the subset of columns (if not `None`)

        If there is no good value, NaN is returned for a Series or
        a Series of NaN values for a DataFrame

        Parameters
        ----------
        where : date or array-like of dates
            Date(s) before which the last row(s) are returned.
        subset : str or array-like of str, default `None`
            For DataFrame, if not `None`, only use these columns to
            check for NaNs.

        Returns
        -------
        scalar, Series, or DataFrame

            The return can be:

            * scalar : when `self` is a Series and `where` is a scalar
            * Series: when `self` is a Series and `where` is an array-like,
              or when `self` is a DataFrame and `where` is a scalar
            * DataFrame : when `self` is a DataFrame and `where` is an
              array-like

            Return scalar, Series, or DataFrame.

        See Also
        --------
        merge_asof : Perform an asof merge. Similar to left join.

        Notes
        -----
        Dates are assumed to be sorted. Raises if this is not the case.

        Examples
        --------
        A Series and a scalar `where`.

        >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
        >>> s
        10    1.0
        20    2.0
        30    NaN
        40    4.0
        dtype: float64

        >>> s.asof(20)
        2.0

        For a sequence `where`, a Series is returned. The first value is
        NaN, because the first element of `where` is before the first
        index value.

        >>> s.asof([5, 20])
        5     NaN
        20    2.0
        dtype: float64

        Missing values are not considered. The following is ``2.0``, not
        NaN, even though NaN is at the index location for ``30``.

        >>> s.asof(30)
        2.0

        Take all columns into consideration

        >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50],
        ...                    'b': [None, None, None, None, 500]},
        ...                   index=pd.DatetimeIndex(['2018-02-27 09:01:00',
        ...                                           '2018-02-27 09:02:00',
        ...                                           '2018-02-27 09:03:00',
        ...                                           '2018-02-27 09:04:00',
        ...                                           '2018-02-27 09:05:00']))
        >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
        ...                           '2018-02-27 09:04:30']))
                              a   b
        2018-02-27 09:03:30 NaN NaN
        2018-02-27 09:04:30 NaN NaN

        Take a single column into consideration

        >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
        ...                           '2018-02-27 09:04:30']),
        ...         subset=['a'])
                              a   b
        2018-02-27 09:03:30  30 NaN
        2018-02-27 09:04:30  40 NaN
        """
        if isinstance(where, str):
            where = Timestamp(where)

        if not self.index.is_monotonic_increasing:
            raise ValueError("asof requires a sorted index")

        is_series = isinstance(self, ABCSeries)
        if is_series:
            if subset is not None:
                raise ValueError("subset is not valid for Series")
        else:
            # DataFrame: default the NaN-check columns to all columns, and
            # normalize a single label to a one-element list.
            if subset is None:
                subset = self.columns
            if not is_list_like(subset):
                subset = [subset]

        is_list = is_list_like(where)
        if not is_list:
            # Scalar `where` fast path.
            start = self.index[0]
            if isinstance(self.index, PeriodIndex):
                where = Period(where, freq=self.index.freq)

            if where < start:
                # Nothing precedes `where`: NaN scalar (Series) or all-NaN
                # row (DataFrame).
                if not is_series:
                    return self._constructor_sliced(
                        index=self.columns, name=where, dtype=np.float64
                    )
                return np.nan

            # It's always much faster to use a *while* loop here for
            # Series than pre-computing all the NAs. However a
            # *while* loop is extremely expensive for DataFrame
            # so we later pre-compute all the NAs and use the same
            # code path whether *where* is a scalar or list.
            # See PR: https://github.com/pandas-dev/pandas/pull/14476
            if is_series:
                # Walk back from the insertion point past any NaNs.
                loc = self.index.searchsorted(where, side="right")
                if loc > 0:
                    loc -= 1

                values = self._values
                while loc > 0 and isna(values[loc]):
                    loc -= 1
                return values[loc]

        if not isinstance(where, Index):
            where = Index(where) if is_list else Index([where])

        # Boolean mask of rows that do not qualify (any NaN in `subset`).
        nulls = self.isna() if is_series else self[subset].isna().any(axis=1)
        if nulls.all():
            # No valid row at all: return an all-NaN result of the
            # appropriate shape.
            if is_series:
                self = cast("Series", self)
                return self._constructor(np.nan, index=where, name=self.name)
            elif is_list:
                self = cast("DataFrame", self)
                return self._constructor(np.nan, index=where, columns=self.columns)
            else:
                self = cast("DataFrame", self)
                return self._constructor_sliced(
                    np.nan, index=self.columns, name=where[0]
                )

        locs = self.index.asof_locs(where, ~(nulls._values))

        # mask the missing
        missing = locs == -1
        data = self.take(locs)
        data.index = where
        if missing.any():
            # GH#16063 only do this setting when necessary, otherwise
            # we'd cast e.g. bools to floats
            data.loc[missing] = np.nan
        return data if is_list else data.iloc[-1]
7762 # ----------------------------------------------------------------------
7763 # Action Methods
@doc(klass=_shared_doc_kwargs["klass"])
def isna(self: NDFrameT) -> NDFrameT:
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or :attr:`numpy.NaN`, get mapped to True
    values.
    Everything else gets mapped to False values. Characters such as empty
    strings ``''`` or :attr:`numpy.inf` are not considered NA values
    (unless you set ``pandas.options.mode.use_inf_as_na = True``).

    Returns
    -------
    {klass}
        Mask of bool values for each element in {klass} that
        indicates whether an element is an NA value.

    See Also
    --------
    {klass}.isnull : Alias of isna.
    {klass}.notna : Boolean inverse of isna.
    {klass}.dropna : Omit axes labels with missing values.
    isna : Top-level isna.

    Examples
    --------
    Show which entries in a DataFrame are NA.

    >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
    ...                        born=[pd.NaT, pd.Timestamp('1939-05-27'),
    ...                              pd.Timestamp('1940-04-25')],
    ...                        name=['Alfred', 'Batman', ''],
    ...                        toy=[None, 'Batmobile', 'Joker']))
    >>> df
       age       born    name        toy
    0  5.0        NaT  Alfred       None
    1  6.0 1939-05-27  Batman  Batmobile
    2  NaN 1940-04-25              Joker

    >>> df.isna()
         age   born   name    toy
    0  False   True  False   True
    1  False  False  False  False
    2   True  False  False  False

    Show which entries in a Series are NA.

    >>> ser = pd.Series([5, 6, np.NaN])
    >>> ser
    0    5.0
    1    6.0
    2    NaN
    dtype: float64

    >>> ser.isna()
    0    False
    1    False
    2     True
    dtype: bool
    """
    # The module-level ``isna`` builds the boolean mask; ``__finalize__``
    # then propagates metadata from this object onto the result.
    mask = isna(self)
    return mask.__finalize__(self, method="isna")
@doc(isna, klass=_shared_doc_kwargs["klass"])
def isnull(self: NDFrameT) -> NDFrameT:
    # Alias of ``isna``; only the ``method`` tag recorded by
    # ``__finalize__`` differs.
    mask = isna(self)
    return mask.__finalize__(self, method="isnull")
@doc(klass=_shared_doc_kwargs["klass"])
def notna(self: NDFrameT) -> NDFrameT:
    """
    Detect existing (non-missing) values.

    Return a boolean same-sized object indicating if the values are not NA.
    Non-missing values get mapped to True. Characters such as empty
    strings ``''`` or :attr:`numpy.inf` are not considered NA values
    (unless you set ``pandas.options.mode.use_inf_as_na = True``).
    NA values, such as None or :attr:`numpy.NaN`, get mapped to False
    values.

    Returns
    -------
    {klass}
        Mask of bool values for each element in {klass} that
        indicates whether an element is not an NA value.

    See Also
    --------
    {klass}.notnull : Alias of notna.
    {klass}.isna : Boolean inverse of notna.
    {klass}.dropna : Omit axes labels with missing values.
    notna : Top-level notna.

    Examples
    --------
    Show which entries in a DataFrame are not NA.

    >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
    ...                        born=[pd.NaT, pd.Timestamp('1939-05-27'),
    ...                              pd.Timestamp('1940-04-25')],
    ...                        name=['Alfred', 'Batman', ''],
    ...                        toy=[None, 'Batmobile', 'Joker']))
    >>> df
       age       born    name        toy
    0  5.0        NaT  Alfred       None
    1  6.0 1939-05-27  Batman  Batmobile
    2  NaN 1940-04-25              Joker

    >>> df.notna()
         age   born  name    toy
    0   True  False  True  False
    1   True   True  True   True
    2  False   True  True   True

    Show which entries in a Series are not NA.

    >>> ser = pd.Series([5, 6, np.NaN])
    >>> ser
    0    5.0
    1    6.0
    2    NaN
    dtype: float64

    >>> ser.notna()
    0     True
    1     True
    2    False
    dtype: bool
    """
    # The module-level ``notna`` builds the boolean mask; ``__finalize__``
    # then propagates metadata from this object onto the result.
    mask = notna(self)
    return mask.__finalize__(self, method="notna")
@doc(notna, klass=_shared_doc_kwargs["klass"])
def notnull(self: NDFrameT) -> NDFrameT:
    # Alias of ``notna``; only the ``method`` tag recorded by
    # ``__finalize__`` differs.
    mask = notna(self)
    return mask.__finalize__(self, method="notnull")
7899 @final
7900 def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
7901 if (lower is not None and np.any(isna(lower))) or (
7902 upper is not None and np.any(isna(upper))
7903 ):
7904 raise ValueError("Cannot use an NA value as a clip threshold")
7906 result = self
7907 mask = isna(self._values)
7909 with np.errstate(all="ignore"):
7910 if upper is not None:
7911 subset = self <= upper
7912 result = result.where(subset, upper, axis=None, inplace=False)
7913 if lower is not None:
7914 subset = self >= lower
7915 result = result.where(subset, lower, axis=None, inplace=False)
7917 if np.any(mask):
7918 result[mask] = np.nan
7920 if inplace:
7921 return self._update_inplace(result)
7922 else:
7923 return result
7925 @final
7926 def _clip_with_one_bound(self, threshold, method, axis, inplace):
7928 if axis is not None:
7929 axis = self._get_axis_number(axis)
7931 # method is self.le for upper bound and self.ge for lower bound
7932 if is_scalar(threshold) and is_number(threshold):
7933 if method.__name__ == "le":
7934 return self._clip_with_scalar(None, threshold, inplace=inplace)
7935 return self._clip_with_scalar(threshold, None, inplace=inplace)
7937 # GH #15390
7938 # In order for where method to work, the threshold must
7939 # be transformed to NDFrame from other array like structure.
7940 if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
7941 if isinstance(self, ABCSeries):
7942 threshold = self._constructor(threshold, index=self.index)
7943 else:
7944 threshold = align_method_FRAME(self, threshold, axis, flex=None)[1]
7946 # GH 40420
7947 # Treat missing thresholds as no bounds, not clipping the values
7948 if is_list_like(threshold):
7949 fill_value = np.inf if method.__name__ == "le" else -np.inf
7950 threshold_inf = threshold.fillna(fill_value)
7951 else:
7952 threshold_inf = threshold
7954 subset = method(threshold_inf, axis=axis) | isna(self)
7956 # GH 40420
7957 return self.where(subset, threshold, axis=axis, inplace=inplace)
def clip(
    self: NDFrameT,
    lower=None,
    upper=None,
    axis: Axis | None = None,
    inplace: bool_t = False,
    *args,
    **kwargs,
) -> NDFrameT | None:
    """
    Trim values at input threshold(s).

    Assigns values outside boundary to boundary values. Thresholds
    can be singular values or array like, and in the latter case
    the clipping is performed element-wise in the specified axis.

    Parameters
    ----------
    lower : float or array-like, default None
        Minimum threshold value. All values below this
        threshold will be set to it. A missing
        threshold (e.g `NA`) will not clip the value.
    upper : float or array-like, default None
        Maximum threshold value. All values above this
        threshold will be set to it. A missing
        threshold (e.g `NA`) will not clip the value.
    axis : {{0 or 'index', 1 or 'columns', None}}, default None
        Align object with lower and upper along the given axis.
        For `Series` this parameter is unused and defaults to `None`.
    inplace : bool, default False
        Whether to perform the operation in place on the data.
    *args, **kwargs
        Additional keywords have no effect but might be accepted
        for compatibility with numpy.

    Returns
    -------
    Series or DataFrame or None
        Same type as calling object with the values outside the
        clip boundaries replaced or None if ``inplace=True``.

    See Also
    --------
    Series.clip : Trim values at input threshold in series.
    DataFrame.clip : Trim values at input threshold in dataframe.
    numpy.clip : Clip (limit) the values in an array.

    Examples
    --------
    >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
    >>> df = pd.DataFrame(data)
    >>> df
       col_0  col_1
    0      9     -2
    1     -3     -7
    2      0      6
    3     -1      8
    4      5     -5

    Clips per column using lower and upper thresholds:

    >>> df.clip(-4, 6)
       col_0  col_1
    0      6     -2
    1     -3     -4
    2      0      6
    3     -1      6
    4      5     -4

    Clips using specific lower and upper thresholds per column element:

    >>> t = pd.Series([2, -4, -1, 6, 3])
    >>> t
    0    2
    1   -4
    2   -1
    3    6
    4    3
    dtype: int64

    >>> df.clip(t, t + 4, axis=0)
       col_0  col_1
    0      6      2
    1     -3     -4
    2      0      3
    3      6      8
    4      5      3

    Clips using specific lower threshold per column element, with missing values:

    >>> t = pd.Series([2, -4, np.NaN, 6, 3])
    >>> t
    0    2.0
    1   -4.0
    2    NaN
    3    6.0
    4    3.0
    dtype: float64

    >>> df.clip(t, axis=0)
       col_0  col_1
    0      9      2
    1     -3     -4
    2      0      6
    3      6      8
    4      5      3
    """
    inplace = validate_bool_kwarg(inplace, "inplace")

    axis = nv.validate_clip_with_axis(axis, args, kwargs)
    if axis is not None:
        axis = self._get_axis_number(axis)

    # GH 17276: numpy does not accept NaN as a clip value, so an NA
    # scalar bound is treated as "no bound".
    # GH 19992: an all-NA list-like bound is likewise dropped entirely.
    def _discard_na_bound(bound):
        na = isna(bound)
        if not is_list_like(bound):
            return None if np.any(na) else bound
        return None if np.all(na) else bound

    lower = _discard_na_bound(lower)
    upper = _discard_na_bound(upper)

    # GH 2747: tolerate reversed scalar bounds by swapping them.
    if (
        is_scalar(lower)
        and is_scalar(upper)
        and lower is not None
        and upper is not None
    ):
        lower, upper = min(lower, upper), max(lower, upper)

    # Fast path: both bounds absent or numeric scalars.
    lower_is_scalar = lower is None or (is_scalar(lower) and is_number(lower))
    upper_is_scalar = upper is None or (is_scalar(upper) and is_number(upper))
    if lower_is_scalar and upper_is_scalar:
        return self._clip_with_scalar(lower, upper, inplace=inplace)

    # Otherwise apply each bound separately via ``where``.
    result = self
    if lower is not None:
        result = result._clip_with_one_bound(
            lower, method=self.ge, axis=axis, inplace=inplace
        )
    if upper is not None:
        if inplace:
            # The lower clip already mutated ``self``; restart from it.
            result = self
        result = result._clip_with_one_bound(
            upper, method=self.le, axis=axis, inplace=inplace
        )

    return result
@doc(**_shared_doc_kwargs)
def asfreq(
    self: NDFrameT,
    freq: Frequency,
    method: FillnaOptions | None = None,
    how: str | None = None,
    normalize: bool_t = False,
    fill_value: Hashable = None,
) -> NDFrameT:
    """
    Convert time series to specified frequency.

    Returns the original data conformed to a new index with the specified
    frequency.

    If the index of this {klass} is a :class:`~pandas.PeriodIndex`, the new index
    is the result of transforming the original index with
    :meth:`PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>` (so the original index
    will map one-to-one to the new index).

    Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
    freq=freq)`` where ``start`` and ``end`` are, respectively, the first and
    last entries in the original index (see :func:`pandas.date_range`). The
    values corresponding to any timesteps in the new index which were not present
    in the original index will be null (``NaN``), unless a method for filling
    such unknowns is provided (see the ``method`` parameter below).

    The :meth:`resample` method is more appropriate if an operation on each group of
    timesteps (such as an aggregate) is necessary to represent the data at the new
    frequency.

    Parameters
    ----------
    freq : DateOffset or str
        Frequency DateOffset or string.
    method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None
        Method to use for filling holes in reindexed Series (note this
        does not fill NaNs that already were present):

        * 'pad' / 'ffill': propagate last valid observation forward to next
          valid
        * 'backfill' / 'bfill': use NEXT valid observation to fill.
    how : {{'start', 'end'}}, default end
        For PeriodIndex only (see PeriodIndex.asfreq).
    normalize : bool, default False
        Whether to reset output index to midnight.
    fill_value : scalar, optional
        Value to use for missing values, applied during upsampling (note
        this does not fill NaNs that already were present).

    Returns
    -------
    {klass}
        {klass} object reindexed to the specified frequency.

    See Also
    --------
    reindex : Conform DataFrame to new index with optional filling logic.

    Notes
    -----
    To learn more about the frequency strings, please see `this link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.

    Examples
    --------
    Start by creating a series with 4 one minute timestamps.

    >>> index = pd.date_range('1/1/2000', periods=4, freq='T')
    >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
    >>> df = pd.DataFrame({{'s': series}})
    >>> df
                           s
    2000-01-01 00:00:00    0.0
    2000-01-01 00:01:00    NaN
    2000-01-01 00:02:00    2.0
    2000-01-01 00:03:00    3.0

    Upsample the series into 30 second bins.

    >>> df.asfreq(freq='30S')
                           s
    2000-01-01 00:00:00    0.0
    2000-01-01 00:00:30    NaN
    2000-01-01 00:01:00    NaN
    2000-01-01 00:01:30    NaN
    2000-01-01 00:02:00    2.0
    2000-01-01 00:02:30    NaN
    2000-01-01 00:03:00    3.0

    Upsample again, providing a ``fill value``.

    >>> df.asfreq(freq='30S', fill_value=9.0)
                           s
    2000-01-01 00:00:00    0.0
    2000-01-01 00:00:30    9.0
    2000-01-01 00:01:00    NaN
    2000-01-01 00:01:30    9.0
    2000-01-01 00:02:00    2.0
    2000-01-01 00:02:30    9.0
    2000-01-01 00:03:00    3.0

    Upsample again, providing a ``method``.

    >>> df.asfreq(freq='30S', method='bfill')
                           s
    2000-01-01 00:00:00    0.0
    2000-01-01 00:00:30    NaN
    2000-01-01 00:01:00    NaN
    2000-01-01 00:01:30    2.0
    2000-01-01 00:02:00    2.0
    2000-01-01 00:02:30    3.0
    2000-01-01 00:03:00    3.0
    """
    # All the real work happens in pandas.core.resample; this method is a
    # thin, documented entry point (imported locally to avoid a cycle).
    from pandas.core.resample import asfreq as _asfreq

    return _asfreq(
        self,
        freq,
        method=method,
        how=how,
        normalize=normalize,
        fill_value=fill_value,
    )
@final
def at_time(self: NDFrameT, time, asof: bool_t = False, axis=None) -> NDFrameT:
    """
    Select values at particular time of day (e.g., 9:30AM).

    Parameters
    ----------
    time : datetime.time or str
    axis : {0 or 'index', 1 or 'columns'}, default 0
        For `Series` this parameter is unused and defaults to 0.

    Returns
    -------
    Series or DataFrame

    Raises
    ------
    TypeError
        If the index is not a :class:`DatetimeIndex`

    See Also
    --------
    between_time : Select values between particular times of the day.
    first : Select initial periods of time series based on a date offset.
    last : Select final periods of time series based on a date offset.
    DatetimeIndex.indexer_at_time : Get just the index locations for
        values at particular time of the day.

    Examples
    --------
    >>> i = pd.date_range('2018-04-09', periods=4, freq='12H')
    >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                         A
    2018-04-09 00:00:00  1
    2018-04-09 12:00:00  2
    2018-04-10 00:00:00  3
    2018-04-10 12:00:00  4

    >>> ts.at_time('12:00')
                         A
    2018-04-09 12:00:00  2
    2018-04-10 12:00:00  4
    """
    # Default to the stat axis (rows) when no axis is supplied.
    axis = self._stat_axis_number if axis is None else axis
    axis = self._get_axis_number(axis)

    index = self._get_axis(axis)

    if not isinstance(index, DatetimeIndex):
        raise TypeError("Index must be DatetimeIndex")

    locs = index.indexer_at_time(time, asof=asof)
    return self._take_with_is_copy(locs, axis=axis)
@final
def between_time(
    self: NDFrameT,
    start_time,
    end_time,
    include_start: bool_t | lib.NoDefault = lib.no_default,
    include_end: bool_t | lib.NoDefault = lib.no_default,
    inclusive: IntervalClosedType | None = None,
    axis=None,
) -> NDFrameT:
    """
    Select values between particular times of the day (e.g., 9:00-9:30 AM).

    By setting ``start_time`` to be later than ``end_time``,
    you can get the times that are *not* between the two times.

    Parameters
    ----------
    start_time : datetime.time or str
        Initial time as a time filter limit.
    end_time : datetime.time or str
        End time as a time filter limit.
    include_start : bool, default True
        Whether the start time needs to be included in the result.

        .. deprecated:: 1.4.0
           Arguments `include_start` and `include_end` have been deprecated
           to standardize boundary inputs. Use `inclusive` instead, to set
           each bound as closed or open.
    include_end : bool, default True
        Whether the end time needs to be included in the result.

        .. deprecated:: 1.4.0
           Arguments `include_start` and `include_end` have been deprecated
           to standardize boundary inputs. Use `inclusive` instead, to set
           each bound as closed or open.
    inclusive : {"both", "neither", "left", "right"}, default "both"
        Include boundaries; whether to set each bound as closed or open.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine range time on index or columns value.
        For `Series` this parameter is unused and defaults to 0.

    Returns
    -------
    Series or DataFrame
        Data from the original object filtered to the specified dates range.

    Raises
    ------
    TypeError
        If the index is not a :class:`DatetimeIndex`

    See Also
    --------
    at_time : Select values at a particular time of the day.
    first : Select initial periods of time series based on a date offset.
    last : Select final periods of time series based on a date offset.
    DatetimeIndex.indexer_between_time : Get just the index locations for
        values between particular times of the day.

    Examples
    --------
    >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
    >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                         A
    2018-04-09 00:00:00  1
    2018-04-10 00:20:00  2
    2018-04-11 00:40:00  3
    2018-04-12 01:00:00  4

    >>> ts.between_time('0:15', '0:45')
                         A
    2018-04-10 00:20:00  2
    2018-04-11 00:40:00  3

    You get the times that are *not* between two times by setting
    ``start_time`` later than ``end_time``:

    >>> ts.between_time('0:45', '0:15')
                         A
    2018-04-09 00:00:00  1
    2018-04-12 01:00:00  4
    """
    # Default to the stat axis (rows) when no axis is supplied.
    axis = self._stat_axis_number if axis is None else axis
    axis = self._get_axis_number(axis)

    index = self._get_axis(axis)
    if not isinstance(index, DatetimeIndex):
        raise TypeError("Index must be DatetimeIndex")

    legacy_args_used = (include_start != lib.no_default) or (
        include_end != lib.no_default
    )

    if legacy_args_used and inclusive is not None:
        raise ValueError(
            "Deprecated arguments `include_start` and `include_end` "
            "cannot be passed if `inclusive` has been given."
        )
    if legacy_args_used:
        # Translate the deprecated boolean pair into the `inclusive` form.
        warnings.warn(
            "`include_start` and `include_end` are deprecated in "
            "favour of `inclusive`.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        left = include_start if include_start is not lib.no_default else True
        right = include_end if include_end is not lib.no_default else True

        bounds_to_inclusive = {
            (True, True): "both",
            (True, False): "left",
            (False, True): "right",
            (False, False): "neither",
        }
        inclusive = bounds_to_inclusive[(left, right)]
    elif inclusive is None:
        # On arg removal inclusive can default to "both"
        inclusive = "both"

    left_inclusive, right_inclusive = validate_inclusive(inclusive)
    locs = index.indexer_between_time(
        start_time,
        end_time,
        include_start=left_inclusive,
        include_end=right_inclusive,
    )
    return self._take_with_is_copy(locs, axis=axis)
8432 @doc(**_shared_doc_kwargs)
8433 def resample(
8434 self,
8435 rule,
8436 axis: Axis = 0,
8437 closed: str | None = None,
8438 label: str | None = None,
8439 convention: str = "start",
8440 kind: str | None = None,
8441 loffset=None,
8442 base: int | None = None,
8443 on: Level = None,
8444 level: Level = None,
8445 origin: str | TimestampConvertibleTypes = "start_day",
8446 offset: TimedeltaConvertibleTypes | None = None,
8447 group_keys: bool_t | lib.NoDefault = lib.no_default,
8448 ) -> Resampler:
8449 """
8450 Resample time-series data.
8452 Convenience method for frequency conversion and resampling of time series.
8453 The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`,
8454 or `TimedeltaIndex`), or the caller must pass the label of a datetime-like
8455 series/index to the ``on``/``level`` keyword parameter.
8457 Parameters
8458 ----------
8459 rule : DateOffset, Timedelta or str
8460 The offset string or object representing target conversion.
8461 axis : {{0 or 'index', 1 or 'columns'}}, default 0
8462 Which axis to use for up- or down-sampling. For `Series` this parameter
8463 is unused and defaults to 0. Must be
8464 `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
8465 closed : {{'right', 'left'}}, default None
8466 Which side of bin interval is closed. The default is 'left'
8467 for all frequency offsets except for 'M', 'A', 'Q', 'BM',
8468 'BA', 'BQ', and 'W' which all have a default of 'right'.
8469 label : {{'right', 'left'}}, default None
8470 Which bin edge label to label bucket with. The default is 'left'
8471 for all frequency offsets except for 'M', 'A', 'Q', 'BM',
8472 'BA', 'BQ', and 'W' which all have a default of 'right'.
8473 convention : {{'start', 'end', 's', 'e'}}, default 'start'
8474 For `PeriodIndex` only, controls whether to use the start or
8475 end of `rule`.
8476 kind : {{'timestamp', 'period'}}, optional, default None
8477 Pass 'timestamp' to convert the resulting index to a
8478 `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`.
8479 By default the input representation is retained.
8480 loffset : timedelta, default None
8481 Adjust the resampled time labels.
8483 .. deprecated:: 1.1.0
8484 You should add the loffset to the `df.index` after the resample.
8485 See below.
8487 base : int, default 0
8488 For frequencies that evenly subdivide 1 day, the "origin" of the
8489 aggregated intervals. For example, for '5min' frequency, base could
8490 range from 0 through 4. Defaults to 0.
8492 .. deprecated:: 1.1.0
8493 The new arguments that you should use are 'offset' or 'origin'.
8495 on : str, optional
8496 For a DataFrame, column to use instead of index for resampling.
8497 Column must be datetime-like.
8498 level : str or int, optional
8499 For a MultiIndex, level (name or number) to use for
8500 resampling. `level` must be datetime-like.
8501 origin : Timestamp or str, default 'start_day'
8502 The timestamp on which to adjust the grouping. The timezone of origin
8503 must match the timezone of the index.
8504 If string, must be one of the following:
8506 - 'epoch': `origin` is 1970-01-01
8507 - 'start': `origin` is the first value of the timeseries
8508 - 'start_day': `origin` is the first day at midnight of the timeseries
8510 .. versionadded:: 1.1.0
8512 - 'end': `origin` is the last value of the timeseries
8513 - 'end_day': `origin` is the ceiling midnight of the last day
8515 .. versionadded:: 1.3.0
8517 offset : Timedelta or str, default is None
8518 An offset timedelta added to the origin.
8520 .. versionadded:: 1.1.0
8522 group_keys : bool, optional
8523 Whether to include the group keys in the result index when using
8524 ``.apply()`` on the resampled object. Not specifying ``group_keys``
8525 will retain values-dependent behavior from pandas 1.4
8526 and earlier (see :ref:`pandas 1.5.0 Release notes
8527 <whatsnew_150.enhancements.resample_group_keys>`
8528 for examples). In a future version of pandas, the behavior will
8529 default to the same as specifying ``group_keys=False``.
8531 .. versionadded:: 1.5.0
8533 Returns
8534 -------
8535 pandas.core.Resampler
8536 :class:`~pandas.core.Resampler` object.
8538 See Also
8539 --------
8540 Series.resample : Resample a Series.
8541 DataFrame.resample : Resample a DataFrame.
8542 groupby : Group {klass} by mapping, function, label, or list of labels.
8543 asfreq : Reindex a {klass} with the given frequency without grouping.
8545 Notes
8546 -----
8547 See the `user guide
8548 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
8549 for more.
8551 To learn more about the offset strings, please see `this link
8552 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
8554 Examples
8555 --------
8556 Start by creating a series with 9 one minute timestamps.
8558 >>> index = pd.date_range('1/1/2000', periods=9, freq='T')
8559 >>> series = pd.Series(range(9), index=index)
8560 >>> series
8561 2000-01-01 00:00:00 0
8562 2000-01-01 00:01:00 1
8563 2000-01-01 00:02:00 2
8564 2000-01-01 00:03:00 3
8565 2000-01-01 00:04:00 4
8566 2000-01-01 00:05:00 5
8567 2000-01-01 00:06:00 6
8568 2000-01-01 00:07:00 7
8569 2000-01-01 00:08:00 8
8570 Freq: T, dtype: int64
8572 Downsample the series into 3 minute bins and sum the values
8573 of the timestamps falling into a bin.
8575 >>> series.resample('3T').sum()
8576 2000-01-01 00:00:00 3
8577 2000-01-01 00:03:00 12
8578 2000-01-01 00:06:00 21
8579 Freq: 3T, dtype: int64
8581 Downsample the series into 3 minute bins as above, but label each
8582 bin using the right edge instead of the left. Please note that the
8583 value in the bucket used as the label is not included in the bucket,
8584 which it labels. For example, in the original series the
8585 bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed
8586 value in the resampled bucket with the label ``2000-01-01 00:03:00``
8587 does not include 3 (if it did, the summed value would be 6, not 3).
8588 To include this value close the right side of the bin interval as
8589 illustrated in the example below this one.
8591 >>> series.resample('3T', label='right').sum()
8592 2000-01-01 00:03:00 3
8593 2000-01-01 00:06:00 12
8594 2000-01-01 00:09:00 21
8595 Freq: 3T, dtype: int64
8597 Downsample the series into 3 minute bins as above, but close the right
8598 side of the bin interval.
8600 >>> series.resample('3T', label='right', closed='right').sum()
8601 2000-01-01 00:00:00 0
8602 2000-01-01 00:03:00 6
8603 2000-01-01 00:06:00 15
8604 2000-01-01 00:09:00 15
8605 Freq: 3T, dtype: int64
8607 Upsample the series into 30 second bins.
8609 >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows
8610 2000-01-01 00:00:00 0.0
8611 2000-01-01 00:00:30 NaN
8612 2000-01-01 00:01:00 1.0
8613 2000-01-01 00:01:30 NaN
8614 2000-01-01 00:02:00 2.0
8615 Freq: 30S, dtype: float64
8617 Upsample the series into 30 second bins and fill the ``NaN``
8618 values using the ``ffill`` method.
8620 >>> series.resample('30S').ffill()[0:5]
8621 2000-01-01 00:00:00 0
8622 2000-01-01 00:00:30 0
8623 2000-01-01 00:01:00 1
8624 2000-01-01 00:01:30 1
8625 2000-01-01 00:02:00 2
8626 Freq: 30S, dtype: int64
8628 Upsample the series into 30 second bins and fill the
8629 ``NaN`` values using the ``bfill`` method.
8631 >>> series.resample('30S').bfill()[0:5]
8632 2000-01-01 00:00:00 0
8633 2000-01-01 00:00:30 1
8634 2000-01-01 00:01:00 1
8635 2000-01-01 00:01:30 2
8636 2000-01-01 00:02:00 2
8637 Freq: 30S, dtype: int64
8639 Pass a custom function via ``apply``
8641 >>> def custom_resampler(arraylike):
8642 ... return np.sum(arraylike) + 5
8643 ...
8644 >>> series.resample('3T').apply(custom_resampler)
8645 2000-01-01 00:00:00 8
8646 2000-01-01 00:03:00 17
8647 2000-01-01 00:06:00 26
8648 Freq: 3T, dtype: int64
8650 For a Series with a PeriodIndex, the keyword `convention` can be
8651 used to control whether to use the start or end of `rule`.
8653 Resample a year by quarter using 'start' `convention`. Values are
8654 assigned to the first quarter of the period.
8656 >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01',
8657 ... freq='A',
8658 ... periods=2))
8659 >>> s
8660 2012 1
8661 2013 2
8662 Freq: A-DEC, dtype: int64
8663 >>> s.resample('Q', convention='start').asfreq()
8664 2012Q1 1.0
8665 2012Q2 NaN
8666 2012Q3 NaN
8667 2012Q4 NaN
8668 2013Q1 2.0
8669 2013Q2 NaN
8670 2013Q3 NaN
8671 2013Q4 NaN
8672 Freq: Q-DEC, dtype: float64
8674 Resample quarters by month using 'end' `convention`. Values are
8675 assigned to the last month of the period.
8677 >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01',
8678 ... freq='Q',
8679 ... periods=4))
8680 >>> q
8681 2018Q1 1
8682 2018Q2 2
8683 2018Q3 3
8684 2018Q4 4
8685 Freq: Q-DEC, dtype: int64
8686 >>> q.resample('M', convention='end').asfreq()
8687 2018-03 1.0
8688 2018-04 NaN
8689 2018-05 NaN
8690 2018-06 2.0
8691 2018-07 NaN
8692 2018-08 NaN
8693 2018-09 3.0
8694 2018-10 NaN
8695 2018-11 NaN
8696 2018-12 4.0
8697 Freq: M, dtype: float64
8699 For DataFrame objects, the keyword `on` can be used to specify the
8700 column instead of the index for resampling.
8702 >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
8703 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
8704 >>> df = pd.DataFrame(d)
8705 >>> df['week_starting'] = pd.date_range('01/01/2018',
8706 ... periods=8,
8707 ... freq='W')
8708 >>> df
8709 price volume week_starting
8710 0 10 50 2018-01-07
8711 1 11 60 2018-01-14
8712 2 9 40 2018-01-21
8713 3 13 100 2018-01-28
8714 4 14 50 2018-02-04
8715 5 18 100 2018-02-11
8716 6 17 40 2018-02-18
8717 7 19 50 2018-02-25
8718 >>> df.resample('M', on='week_starting').mean()
8719 price volume
8720 week_starting
8721 2018-01-31 10.75 62.5
8722 2018-02-28 17.00 60.0
8724 For a DataFrame with MultiIndex, the keyword `level` can be used to
8725 specify on which level the resampling needs to take place.
8727 >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
8728 >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
8729 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
8730 >>> df2 = pd.DataFrame(
8731 ... d2,
8732 ... index=pd.MultiIndex.from_product(
8733 ... [days, ['morning', 'afternoon']]
8734 ... )
8735 ... )
8736 >>> df2
8737 price volume
8738 2000-01-01 morning 10 50
8739 afternoon 11 60
8740 2000-01-02 morning 9 40
8741 afternoon 13 100
8742 2000-01-03 morning 14 50
8743 afternoon 18 100
8744 2000-01-04 morning 17 40
8745 afternoon 19 50
8746 >>> df2.resample('D', level=0).sum()
8747 price volume
8748 2000-01-01 21 110
8749 2000-01-02 22 140
8750 2000-01-03 32 150
8751 2000-01-04 36 90
8753 If you want to adjust the start of the bins based on a fixed timestamp:
8755 >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
8756 >>> rng = pd.date_range(start, end, freq='7min')
8757 >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
8758 >>> ts
8759 2000-10-01 23:30:00 0
8760 2000-10-01 23:37:00 3
8761 2000-10-01 23:44:00 6
8762 2000-10-01 23:51:00 9
8763 2000-10-01 23:58:00 12
8764 2000-10-02 00:05:00 15
8765 2000-10-02 00:12:00 18
8766 2000-10-02 00:19:00 21
8767 2000-10-02 00:26:00 24
8768 Freq: 7T, dtype: int64
8770 >>> ts.resample('17min').sum()
8771 2000-10-01 23:14:00 0
8772 2000-10-01 23:31:00 9
8773 2000-10-01 23:48:00 21
8774 2000-10-02 00:05:00 54
8775 2000-10-02 00:22:00 24
8776 Freq: 17T, dtype: int64
8778 >>> ts.resample('17min', origin='epoch').sum()
8779 2000-10-01 23:18:00 0
8780 2000-10-01 23:35:00 18
8781 2000-10-01 23:52:00 27
8782 2000-10-02 00:09:00 39
8783 2000-10-02 00:26:00 24
8784 Freq: 17T, dtype: int64
8786 >>> ts.resample('17min', origin='2000-01-01').sum()
8787 2000-10-01 23:24:00 3
8788 2000-10-01 23:41:00 15
8789 2000-10-01 23:58:00 45
8790 2000-10-02 00:15:00 45
8791 Freq: 17T, dtype: int64
8793 If you want to adjust the start of the bins with an `offset` Timedelta, the two
8794 following lines are equivalent:
8796 >>> ts.resample('17min', origin='start').sum()
8797 2000-10-01 23:30:00 9
8798 2000-10-01 23:47:00 21
8799 2000-10-02 00:04:00 54
8800 2000-10-02 00:21:00 24
8801 Freq: 17T, dtype: int64
8803 >>> ts.resample('17min', offset='23h30min').sum()
8804 2000-10-01 23:30:00 9
8805 2000-10-01 23:47:00 21
8806 2000-10-02 00:04:00 54
8807 2000-10-02 00:21:00 24
8808 Freq: 17T, dtype: int64
8810 If you want to take the largest Timestamp as the end of the bins:
8812 >>> ts.resample('17min', origin='end').sum()
8813 2000-10-01 23:35:00 0
8814 2000-10-01 23:52:00 18
8815 2000-10-02 00:09:00 27
8816 2000-10-02 00:26:00 63
8817 Freq: 17T, dtype: int64
8819 In contrast with the `start_day`, you can use `end_day` to take the ceiling
8820 midnight of the largest Timestamp as the end of the bins and drop the bins
8821 not containing data:
8823 >>> ts.resample('17min', origin='end_day').sum()
8824 2000-10-01 23:38:00 3
8825 2000-10-01 23:55:00 15
8826 2000-10-02 00:12:00 45
8827 2000-10-02 00:29:00 45
8828 Freq: 17T, dtype: int64
8830 To replace the use of the deprecated `base` argument, you can now use `offset`,
8831 in this example it is equivalent to have `base=2`:
8833 >>> ts.resample('17min', offset='2min').sum()
8834 2000-10-01 23:16:00 0
8835 2000-10-01 23:33:00 9
8836 2000-10-01 23:50:00 36
8837 2000-10-02 00:07:00 39
8838 2000-10-02 00:24:00 24
8839 Freq: 17T, dtype: int64
8841 To replace the use of the deprecated `loffset` argument:
8843 >>> from pandas.tseries.frequencies import to_offset
8844 >>> loffset = '19min'
8845 >>> ts_out = ts.resample('17min').sum()
8846 >>> ts_out.index = ts_out.index + to_offset(loffset)
8847 >>> ts_out
8848 2000-10-01 23:33:00 0
8849 2000-10-01 23:50:00 9
8850 2000-10-02 00:07:00 21
8851 2000-10-02 00:24:00 54
8852 2000-10-02 00:41:00 24
8853 Freq: 17T, dtype: int64
8854 """
8855 from pandas.core.resample import get_resampler
8857 axis = self._get_axis_number(axis)
8858 return get_resampler(
8859 self,
8860 freq=rule,
8861 label=label,
8862 closed=closed,
8863 axis=axis,
8864 kind=kind,
8865 loffset=loffset,
8866 convention=convention,
8867 base=base,
8868 key=on,
8869 level=level,
8870 origin=origin,
8871 offset=offset,
8872 group_keys=group_keys,
8873 )
8875 @final
8876 def first(self: NDFrameT, offset) -> NDFrameT:
8877 """
8878 Select initial periods of time series data based on a date offset.
8880 When having a DataFrame with dates as index, this function can
8881 select the first few rows based on a date offset.
8883 Parameters
8884 ----------
8885 offset : str, DateOffset or dateutil.relativedelta
8886 The offset length of the data that will be selected. For instance,
8887 '1M' will display all the rows having their index within the first month.
8889 Returns
8890 -------
8891 Series or DataFrame
8892 A subset of the caller.
8894 Raises
8895 ------
8896 TypeError
8897 If the index is not a :class:`DatetimeIndex`
8899 See Also
8900 --------
8901 last : Select final periods of time series based on a date offset.
8902 at_time : Select values at a particular time of the day.
8903 between_time : Select values between particular times of the day.
8905 Examples
8906 --------
8907 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
8908 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8909 >>> ts
8910 A
8911 2018-04-09 1
8912 2018-04-11 2
8913 2018-04-13 3
8914 2018-04-15 4
8916 Get the rows for the first 3 days:
8918 >>> ts.first('3D')
8919 A
8920 2018-04-09 1
8921 2018-04-11 2
8923 Notice the data for 3 first calendar days were returned, not the first
8924 3 days observed in the dataset, and therefore data for 2018-04-13 was
8925 not returned.
8926 """
8927 if not isinstance(self.index, DatetimeIndex):
8928 raise TypeError("'first' only supports a DatetimeIndex index")
8930 if len(self.index) == 0:
8931 return self
8933 offset = to_offset(offset)
8934 if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
8935 # GH#29623 if first value is end of period, remove offset with n = 1
8936 # before adding the real offset
8937 end_date = end = self.index[0] - offset.base + offset
8938 else:
8939 end_date = end = self.index[0] + offset
8941 # Tick-like, e.g. 3 weeks
8942 if isinstance(offset, Tick) and end_date in self.index:
8943 end = self.index.searchsorted(end_date, side="left")
8944 return self.iloc[:end]
8946 return self.loc[:end]
8948 @final
8949 def last(self: NDFrameT, offset) -> NDFrameT:
8950 """
8951 Select final periods of time series data based on a date offset.
8953 For a DataFrame with a sorted DatetimeIndex, this function
8954 selects the last few rows based on a date offset.
8956 Parameters
8957 ----------
8958 offset : str, DateOffset, dateutil.relativedelta
8959 The offset length of the data that will be selected. For instance,
8960 '3D' will display all the rows having their index within the last 3 days.
8962 Returns
8963 -------
8964 Series or DataFrame
8965 A subset of the caller.
8967 Raises
8968 ------
8969 TypeError
8970 If the index is not a :class:`DatetimeIndex`
8972 See Also
8973 --------
8974 first : Select initial periods of time series based on a date offset.
8975 at_time : Select values at a particular time of the day.
8976 between_time : Select values between particular times of the day.
8978 Examples
8979 --------
8980 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
8981 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8982 >>> ts
8983 A
8984 2018-04-09 1
8985 2018-04-11 2
8986 2018-04-13 3
8987 2018-04-15 4
8989 Get the rows for the last 3 days:
8991 >>> ts.last('3D')
8992 A
8993 2018-04-13 3
8994 2018-04-15 4
8996 Notice the data for 3 last calendar days were returned, not the last
8997 3 observed days in the dataset, and therefore data for 2018-04-11 was
8998 not returned.
8999 """
9000 if not isinstance(self.index, DatetimeIndex):
9001 raise TypeError("'last' only supports a DatetimeIndex index")
9003 if len(self.index) == 0:
9004 return self
9006 offset = to_offset(offset)
9008 start_date = self.index[-1] - offset
9009 start = self.index.searchsorted(start_date, side="right")
9010 return self.iloc[start:]
    @final
    def rank(
        self: NDFrameT,
        axis=0,
        method: str = "average",
        numeric_only: bool_t | None | lib.NoDefault = lib.no_default,
        na_option: str = "keep",
        ascending: bool_t = True,
        pct: bool_t = False,
    ) -> NDFrameT:
        """
        Compute numerical data ranks (1 through n) along axis.

        By default, equal values are assigned a rank that is the average of the
        ranks of those values.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Index to direct ranking.
            For `Series` this parameter is unused and defaults to 0.
        method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
            How to rank the group of records that have the same value (i.e. ties):

            * average: average rank of the group
            * min: lowest rank in the group
            * max: highest rank in the group
            * first: ranks assigned in order they appear in the array
            * dense: like 'min', but rank always increases by 1 between groups.

        numeric_only : bool, optional
            For DataFrame objects, rank only numeric columns if set to True.
        na_option : {'keep', 'top', 'bottom'}, default 'keep'
            How to rank NaN values:

            * keep: assign NaN rank to NaN values
            * top: assign lowest rank to NaN values
            * bottom: assign highest rank to NaN values

        ascending : bool, default True
            Whether or not the elements should be ranked in ascending order.
        pct : bool, default False
            Whether or not to display the returned rankings in percentile
            form.

        Returns
        -------
        same type as caller
            Return a Series or DataFrame with data ranks as values.

        See Also
        --------
        core.groupby.GroupBy.rank : Rank of values within each group.

        Examples
        --------
        >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
        ...                                    'spider', 'snake'],
        ...                         'Number_legs': [4, 2, 4, 8, np.nan]})
        >>> df
            Animal  Number_legs
        0      cat          4.0
        1  penguin          2.0
        2      dog          4.0
        3   spider          8.0
        4    snake          NaN

        Ties are assigned the mean of the ranks (by default) for the group.

        >>> s = pd.Series(range(5), index=list("abcde"))
        >>> s["d"] = s["b"]
        >>> s.rank()
        a    1.0
        b    2.5
        c    4.0
        d    2.5
        e    5.0
        dtype: float64

        The following example shows how the method behaves with the above
        parameters:

        * default_rank: this is the default behaviour obtained without using
          any parameter.
        * max_rank: setting ``method = 'max'`` the records that have the
          same values are ranked using the highest rank (e.g.: since 'cat'
          and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned.)
        * NA_bottom: choosing ``na_option = 'bottom'``, if there are records
          with NaN values they are placed at the bottom of the ranking.
        * pct_rank: when setting ``pct = True``, the ranking is expressed as
          percentile rank.

        >>> df['default_rank'] = df['Number_legs'].rank()
        >>> df['max_rank'] = df['Number_legs'].rank(method='max')
        >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
        >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
        >>> df
            Animal  Number_legs  default_rank  max_rank  NA_bottom  pct_rank
        0      cat          4.0           2.5       3.0        2.5     0.625
        1  penguin          2.0           1.0       1.0        1.0     0.250
        2      dog          4.0           2.5       3.0        2.5     0.625
        3   spider          8.0           4.0       4.0        4.0     1.000
        4    snake          NaN           NaN       NaN        5.0       NaN
        """
        # Track whether a deprecation warning has already been emitted so the
        # retry path below does not warn a second time.
        warned = False
        if numeric_only is None:
            # GH#45036: an explicit numeric_only=None is deprecated.
            warnings.warn(
                f"'numeric_only=None' in {type(self).__name__}.rank is deprecated "
                "and will raise in a future version. Pass either 'True' or "
                "'False'. 'False' will be the default.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            warned = True
        elif numeric_only is lib.no_default:
            # Sentinel: caller did not pass numeric_only at all. Fall back to
            # the legacy None behavior without emitting the warning above.
            numeric_only = None

        axis = self._get_axis_number(axis)

        if na_option not in {"keep", "top", "bottom"}:
            msg = "na_option must be one of 'keep', 'top', or 'bottom'"
            raise ValueError(msg)

        def ranker(data):
            # Rank the underlying values of `data` and re-wrap the result in
            # an object with the same axes as `data`.
            if data.ndim == 2:
                # i.e. DataFrame, we cast to ndarray
                values = data.values
            else:
                # i.e. Series, can dispatch to EA
                values = data._values

            if isinstance(values, ExtensionArray):
                ranks = values._rank(
                    axis=axis,
                    method=method,
                    ascending=ascending,
                    na_option=na_option,
                    pct=pct,
                )
            else:
                ranks = algos.rank(
                    values,
                    axis=axis,
                    method=method,
                    ascending=ascending,
                    na_option=na_option,
                    pct=pct,
                )

            ranks_obj = self._constructor(ranks, **data._construct_axes_dict())
            return ranks_obj.__finalize__(self, method="rank")

        # if numeric_only is None, and we can't get anything, we try with
        # numeric_only=True
        if numeric_only is None:
            try:
                return ranker(self)
            except TypeError:
                numeric_only = True
                if not warned:
                    # Only warn here if we didn't already issue a warning above
                    # GH#45036
                    warnings.warn(
                        f"Dropping of nuisance columns in {type(self).__name__}.rank "
                        "is deprecated; in a future version this will raise TypeError. "
                        "Select only valid columns before calling rank.",
                        FutureWarning,
                        stacklevel=find_stack_level(),
                    )

        if numeric_only:
            if self.ndim == 1 and not is_numeric_dtype(self.dtype):
                # GH#47500: ranking a non-numeric Series with numeric_only=True
                # is deprecated rather than an immediate error.
                warnings.warn(
                    f"Calling Series.rank with numeric_only={numeric_only} and dtype "
                    f"{self.dtype} is deprecated and will raise a TypeError in a "
                    "future version of pandas",
                    category=FutureWarning,
                    stacklevel=find_stack_level(),
                )
            data = self._get_numeric_data()
        else:
            data = self

        return ranker(data)
    @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"])
    def compare(
        self,
        other,
        align_axis: Axis = 1,
        keep_shape: bool_t = False,
        keep_equal: bool_t = False,
        result_names: Suffixes = ("self", "other"),
    ):
        from pandas.core.reshape.concat import concat

        # Only objects of the exact same type can be compared
        # (Series with Series, DataFrame with DataFrame).
        if type(self) is not type(other):
            cls_self, cls_other = type(self).__name__, type(other).__name__
            raise TypeError(
                f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
            )

        # True where the two objects differ; positions where BOTH sides are
        # NA count as equal.
        mask = ~((self == other) | (self.isna() & other.isna()))

        if not keep_equal:
            # Blank out the equal positions so only differences remain.
            self = self.where(mask)
            other = other.where(mask)

        if not keep_shape:
            # Drop rows/columns (elements, for Series) that are entirely equal.
            if isinstance(self, ABCDataFrame):
                cmask = mask.any()
                rmask = mask.any(axis=1)
                self = self.loc[rmask, cmask]
                other = other.loc[rmask, cmask]
            else:
                self = self[mask]
                other = other[mask]
        if not isinstance(result_names, tuple):
            raise TypeError(
                f"Passing 'result_names' as a {type(result_names)} is not "
                "supported. Provide 'result_names' as a tuple instead."
            )

        if align_axis in (1, "columns"):  # This is needed for Series
            axis = 1
        else:
            axis = self._get_axis_number(align_axis)

        # Stack self and other along the chosen axis, keyed by result_names,
        # producing a 'self'/'other' level on that axis.
        diff = concat([self, other], axis=axis, keys=result_names)

        if axis >= self.ndim:
            # No need to reorganize data if stacking on new axis
            # This currently applies for stacking two Series on columns
            return diff

        ax = diff._get_axis(axis)
        ax_names = np.array(ax.names)

        # set index names to positions to avoid confusion
        ax.names = np.arange(len(ax_names))

        # bring self-other to inner level
        order = list(range(1, ax.nlevels)) + [0]
        if isinstance(diff, ABCDataFrame):
            diff = diff.reorder_levels(order, axis=axis)
        else:
            diff = diff.reorder_levels(order)

        # restore the index names in order
        diff._get_axis(axis=axis).names = ax_names[order]

        # reorder axis to keep things organized: interleave the two halves so
        # each label's 'self' and 'other' entries sit next to each other.
        indices = (
            np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
        )
        diff = diff.take(indices, axis=axis)

        return diff
    @doc(**_shared_doc_kwargs)
    def align(
        self: NDFrameT,
        other: NDFrameT,
        join: Literal["outer", "inner", "left", "right"] = "outer",
        axis: Axis | None = None,
        level: Level = None,
        copy: bool_t = True,
        fill_value: Hashable = None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
        fill_axis: Axis = 0,
        broadcast_axis: Axis | None = None,
    ) -> NDFrameT:
        """
        Align two objects on their axes with the specified join method.

        Join method is specified for each axis Index.

        Parameters
        ----------
        other : DataFrame or Series
        join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
        axis : allowed axis of the other object, default None
            Align on index (0), columns (1), or both (None).
        level : int or level name, default None
            Broadcast across a level, matching Index values on the
            passed MultiIndex level.
        copy : bool, default True
            Always returns new objects. If copy=False and no reindexing is
            required then original objects are returned.
        fill_value : scalar, default np.NaN
            Value to use for missing values. Defaults to NaN, but can be any
            "compatible" value.
        method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
            Method to use for filling holes in reindexed Series:

            - pad / ffill: propagate last valid observation forward to next valid.
            - backfill / bfill: use NEXT valid observation to fill gap.

        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill. In other words, if there is
            a gap with more than this number of consecutive NaNs, it will only
            be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        fill_axis : {axes_single_arg}, default 0
            Filling axis, method and limit.
        broadcast_axis : {axes_single_arg}, default None
            Broadcast values along this axis, if aligning two objects of
            different dimensions.

        Returns
        -------
        (left, right) : ({klass}, type of other)
            Aligned objects.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
        ... )
        >>> other = pd.DataFrame(
        ...     [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
        ...     columns=["A", "B", "C", "D"],
        ...     index=[2, 3, 4],
        ... )
        >>> df
           D  B  E  A
        1  1  2  3  4
        2  6  7  8  9
        >>> other
             A    B    C    D
        2   10   20   30   40
        3   60   70   80   90
        4  600  700  800  900

        Align on columns:

        >>> left, right = df.align(other, join="outer", axis=1)
        >>> left
           A  B   C  D  E
        1  4  2 NaN  1  3
        2  9  7 NaN  6  8
        >>> right
             A    B    C    D   E
        2   10   20   30   40 NaN
        3   60   70   80   90 NaN
        4  600  700  800  900 NaN

        We can also align on the index:

        >>> left, right = df.align(other, join="outer", axis=0)
        >>> left
             D    B    E    A
        1  1.0  2.0  3.0  4.0
        2  6.0  7.0  8.0  9.0
        3  NaN  NaN  NaN  NaN
        4  NaN  NaN  NaN  NaN
        >>> right
               A      B      C      D
        1    NaN    NaN    NaN    NaN
        2   10.0   20.0   30.0   40.0
        3   60.0   70.0   80.0   90.0
        4  600.0  700.0  800.0  900.0

        Finally, the default `axis=None` will align on both index and columns:

        >>> left, right = df.align(other, join="outer", axis=None)
        >>> left
             A    B   C    D    E
        1  4.0  2.0 NaN  1.0  3.0
        2  9.0  7.0 NaN  6.0  8.0
        3  NaN  NaN NaN  NaN  NaN
        4  NaN  NaN NaN  NaN  NaN
        >>> right
               A      B      C      D   E
        1    NaN    NaN    NaN    NaN NaN
        2   10.0   20.0   30.0   40.0 NaN
        3   60.0   70.0   80.0   90.0 NaN
        4  600.0  700.0  800.0  900.0 NaN
        """

        method = missing.clean_fill_method(method)

        # When the two objects have different dimensionality and
        # broadcast_axis=1, expand the Series side into a DataFrame (one
        # copy per column of the frame), then align frame-to-frame.
        if broadcast_axis == 1 and self.ndim != other.ndim:
            if isinstance(self, ABCSeries):
                # this means other is a DataFrame, and we need to broadcast
                # self
                cons = self._constructor_expanddim
                df = cons(
                    {c: self for c in other.columns}, **other._construct_axes_dict()
                )
                return df._align_frame(
                    other,
                    join=join,
                    axis=axis,
                    level=level,
                    copy=copy,
                    fill_value=fill_value,
                    method=method,
                    limit=limit,
                    fill_axis=fill_axis,
                )
            elif isinstance(other, ABCSeries):
                # this means self is a DataFrame, and we need to broadcast
                # other
                cons = other._constructor_expanddim
                df = cons(
                    {c: other for c in self.columns}, **self._construct_axes_dict()
                )
                return self._align_frame(
                    df,
                    join=join,
                    axis=axis,
                    level=level,
                    copy=copy,
                    fill_value=fill_value,
                    method=method,
                    limit=limit,
                    fill_axis=fill_axis,
                )

        if axis is not None:
            axis = self._get_axis_number(axis)
        # Dispatch on the type of `other`: frame-frame vs frame/series-series.
        if isinstance(other, ABCDataFrame):
            return self._align_frame(
                other,
                join=join,
                axis=axis,
                level=level,
                copy=copy,
                fill_value=fill_value,
                method=method,
                limit=limit,
                fill_axis=fill_axis,
            )
        elif isinstance(other, ABCSeries):
            return self._align_series(
                other,
                join=join,
                axis=axis,
                level=level,
                copy=copy,
                fill_value=fill_value,
                method=method,
                limit=limit,
                fill_axis=fill_axis,
            )
        else:  # pragma: no cover
            raise TypeError(f"unsupported type: {type(other)}")
    @final
    def _align_frame(
        self,
        other,
        join="outer",
        axis=None,
        level=None,
        copy: bool_t = True,
        fill_value=None,
        method=None,
        limit=None,
        fill_axis=0,
    ):
        """
        Align self (Series or DataFrame) with a DataFrame ``other``.

        Joins the requested axis labels of the two objects, reindexes both
        sides onto the joined labels, optionally fills the holes introduced
        by the reindex, and returns a ``(left, right)`` pair.
        """
        # defaults: None indexers mean "axis already aligned, no reindex needed"
        join_index, join_columns = None, None
        ilidx, iridx = None, None
        clidx, cridx = None, None

        is_series = isinstance(self, ABCSeries)

        # Join row labels only when they actually differ.
        if (axis is None or axis == 0) and not self.index.equals(other.index):
            join_index, ilidx, iridx = self.index.join(
                other.index, how=join, level=level, return_indexers=True
            )

        # Join columns likewise; a Series has no columns to join.
        if (
            (axis is None or axis == 1)
            and not is_series
            and not self.columns.equals(other.columns)
        ):
            join_columns, clidx, cridx = self.columns.join(
                other.columns, how=join, level=level, return_indexers=True
            )

        if is_series:
            reindexers = {0: [join_index, ilidx]}
        else:
            reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}

        left = self._reindex_with_indexers(
            reindexers, copy=copy, fill_value=fill_value, allow_dups=True
        )
        # other must be always DataFrame
        right = other._reindex_with_indexers(
            {0: [join_index, iridx], 1: [join_columns, cridx]},
            copy=copy,
            fill_value=fill_value,
            allow_dups=True,
        )

        if method is not None:
            _left = left.fillna(method=method, axis=fill_axis, limit=limit)
            assert _left is not None  # needed for mypy
            left = _left
            right = right.fillna(method=method, axis=fill_axis, limit=limit)

        # if DatetimeIndex have different tz, convert to UTC
        left, right = _align_as_utc(left, right, join_index)

        return (
            left.__finalize__(self),
            right.__finalize__(other),
        )
    @final
    def _align_series(
        self,
        other,
        join="outer",
        axis=None,
        level=None,
        copy: bool_t = True,
        fill_value=None,
        method=None,
        limit=None,
        fill_axis=0,
    ):
        """
        Align self (Series or DataFrame) with a Series ``other``.

        For axis 0 (or Series/Series) the indexes are joined and both sides
        reindexed; for a DataFrame aligned on axis 1 the frame's columns are
        joined against the Series' index at the block-manager level.
        Returns a ``(left, right)`` pair.
        """

        is_series = isinstance(self, ABCSeries)

        # A DataFrame aligned against a Series must say which axis to use.
        if (not is_series and axis is None) or axis not in [None, 0, 1]:
            raise ValueError("Must specify axis=0 or 1")

        if is_series and axis == 1:
            raise ValueError("cannot align series to a series other than axis 0")

        # series/series compat, other must always be a Series
        if not axis:

            # equal
            if self.index.equals(other.index):
                join_index, lidx, ridx = None, None, None
            else:
                join_index, lidx, ridx = self.index.join(
                    other.index, how=join, level=level, return_indexers=True
                )

            if is_series:
                left = self._reindex_indexer(join_index, lidx, copy)
            elif lidx is None or join_index is None:
                # Already aligned; honor the copy flag without reindexing.
                left = self.copy() if copy else self
            else:
                left = self._constructor(
                    self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy)
                )

            right = other._reindex_indexer(join_index, ridx, copy)

        else:

            # one has > 1 ndim: join the frame's columns with the Series index
            fdata = self._mgr
            join_index = self.axes[1]
            lidx, ridx = None, None
            if not join_index.equals(other.index):
                join_index, lidx, ridx = join_index.join(
                    other.index, how=join, level=level, return_indexers=True
                )

            if lidx is not None:
                # Reindex at the manager level along the corresponding
                # block-manager axis.
                bm_axis = self._get_block_manager_axis(1)
                fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)

            if copy and fdata is self._mgr:
                fdata = fdata.copy()

            left = self._constructor(fdata)

            if ridx is None:
                right = other
            else:
                right = other.reindex(join_index, level=level)

        # fill
        fill_na = notna(fill_value) or (method is not None)
        if fill_na:
            left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis)
            right = right.fillna(fill_value, method=method, limit=limit)

        # if DatetimeIndex have different tz, convert to UTC
        if is_series or (not is_series and axis == 0):
            left, right = _align_as_utc(left, right, join_index)

        return (
            left.__finalize__(self),
            right.__finalize__(other),
        )
    @final
    def _where(
        self,
        cond,
        other=lib.no_default,
        inplace=False,
        axis=None,
        level=None,
    ):
        """
        Equivalent to public method `where`, except that `other` is not
        applied as a function even if callable. Used in __setitem__.
        """
        inplace = validate_bool_kwarg(inplace, "inplace")

        if axis is not None:
            axis = self._get_axis_number(axis)

        # align the cond to same shape as myself
        cond = com.apply_if_callable(cond, self)
        if isinstance(cond, NDFrame):
            cond, _ = cond.align(self, join="right", broadcast_axis=1, copy=False)
        else:
            if not hasattr(cond, "shape"):
                cond = np.asanyarray(cond)
            if cond.shape != self.shape:
                raise ValueError("Array conditional must be same shape as self")
            cond = self._constructor(cond, **self._construct_axes_dict())

        # make sure we are boolean; NA positions in the condition become
        # True for the inplace/putmask path and False otherwise.
        fill_value = bool(inplace)
        cond = cond.fillna(fill_value)

        msg = "Boolean array expected for the condition, not {dtype}"

        if not cond.empty:
            if not isinstance(cond, ABCDataFrame):
                # This is a single-dimensional object.
                if not is_bool_dtype(cond):
                    raise ValueError(msg.format(dtype=cond.dtype))
            else:
                for dt in cond.dtypes:
                    if not is_bool_dtype(dt):
                        raise ValueError(msg.format(dtype=dt))
        else:
            # GH#21947 we have an empty DataFrame/Series, could be object-dtype
            cond = cond.astype(bool)

        # putmask (inplace) writes where the mask is True, while `where`
        # keeps values where cond is True — hence the inversion for inplace.
        cond = -cond if inplace else cond
        cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False)

        # try to align with other
        if isinstance(other, NDFrame):

            # align with me
            if other.ndim <= self.ndim:

                _, other = self.align(
                    other,
                    join="left",
                    axis=axis,
                    level=level,
                    fill_value=None,
                    copy=False,
                )

                # if we are NOT aligned, raise as we cannot where index
                if axis is None and not other._indexed_same(self):
                    raise InvalidIndexError

                elif other.ndim < self.ndim:
                    # TODO(EA2D): avoid object-dtype cast in EA case GH#38729
                    other = other._values
                    if axis == 0:
                        other = np.reshape(other, (-1, 1))
                    elif axis == 1:
                        other = np.reshape(other, (1, -1))

                    other = np.broadcast_to(other, self.shape)

            # slice me out of the other
            else:
                raise NotImplementedError(
                    "cannot align with a higher dimensional NDFrame"
                )

        elif not isinstance(other, (MultiIndex, NDFrame)):
            # mainly just catching Index here
            other = extract_array(other, extract_numpy=True)

        if isinstance(other, (np.ndarray, ExtensionArray)):

            if other.shape != self.shape:
                if self.ndim != 1:
                    # In the ndim == 1 case we may have
                    # other length 1, which we treat as scalar (GH#2745, GH#4192)
                    # or len(other) == icond.sum(), which we treat like
                    # __setitem__ (GH#3235)
                    raise ValueError(
                        "other must be the same shape as self when an ndarray"
                    )

            # we are the same shape, so create an actual object for alignment
            else:
                other = self._constructor(other, **self._construct_axes_dict())

        if axis is None:
            axis = 0

        # Same-ndim operands are aligned by label; a lower-dim `other` only
        # needs alignment when broadcasting along the column axis.
        if self.ndim == getattr(other, "ndim", 0):
            align = True
        else:
            align = self._get_axis_number(axis) == 1

        if inplace:
            # we may have different type blocks come out of putmask, so
            # reconstruct the block manager

            self._check_inplace_setting(other)
            new_data = self._mgr.putmask(mask=cond, new=other, align=align)
            result = self._constructor(new_data)
            return self._update_inplace(result)

        else:
            new_data = self._mgr.where(
                other=other,
                cond=cond,
                align=align,
            )
            result = self._constructor(new_data)
            return result.__finalize__(self)
    @overload
    def where(
        self: NDFrameT,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool_t | lib.NoDefault = ...,
    ) -> NDFrameT:
        # Typing overload: inplace=False (the default) returns a new object
        # of the caller's type.
        ...
    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool_t | lib.NoDefault = ...,
    ) -> None:
        # Typing overload: inplace=True mutates self and returns None.
        ...
    @overload
    def where(
        self: NDFrameT,
        cond,
        other=...,
        *,
        inplace: bool_t = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool_t | lib.NoDefault = ...,
    ) -> NDFrameT | None:
        # Typing overload: inplace not statically known, so the return is
        # either a new object or None.
        ...
    @deprecate_kwarg(old_arg_name="errors", new_arg_name=None)
    @deprecate_nonkeyword_arguments(
        version=None, allowed_args=["self", "cond", "other"]
    )
    @doc(
        klass=_shared_doc_kwargs["klass"],
        cond="True",
        cond_rev="False",
        name="where",
        name_other="mask",
    )
    def where(
        self: NDFrameT,
        cond,
        other=np.nan,
        inplace: bool_t = False,
        axis: Axis | None = None,
        level: Level = None,
        errors: IgnoreRaise | lib.NoDefault = "raise",
        try_cast: bool_t | lib.NoDefault = lib.no_default,
    ) -> NDFrameT | None:
        """
        Replace values where the condition is {cond_rev}.

        Parameters
        ----------
        cond : bool {klass}, array-like, or callable
            Where `cond` is {cond}, keep the original value. Where
            {cond_rev}, replace with corresponding value from `other`.
            If `cond` is callable, it is computed on the {klass} and
            should return boolean {klass} or array. The callable must
            not change input {klass} (though pandas doesn't check it).
        other : scalar, {klass}, or callable
            Entries where `cond` is {cond_rev} are replaced with
            corresponding value from `other`.
            If other is callable, it is computed on the {klass} and
            should return scalar or {klass}. The callable must not
            change input {klass} (though pandas doesn't check it).
        inplace : bool, default False
            Whether to perform the operation in place on the data.
        axis : int, default None
            Alignment axis if needed. For `Series` this parameter is
            unused and defaults to 0.
        level : int, default None
            Alignment level if needed.
        errors : str, {{'raise', 'ignore'}}, default 'raise'
            Note that currently this parameter won't affect
            the results and will always coerce to a suitable dtype.

            - 'raise' : allow exceptions to be raised.
            - 'ignore' : suppress exceptions. On error return original object.

            .. deprecated:: 1.5.0
                This argument had no effect.

        try_cast : bool, default None
            Try to cast the result back to the input type (if possible).

            .. deprecated:: 1.3.0
                Manually cast back if necessary.

        Returns
        -------
        Same type as caller or None if ``inplace=True``.

        See Also
        --------
        :func:`DataFrame.{name_other}` : Return an object of same shape as
            self.

        Notes
        -----
        The {name} method is an application of the if-then idiom. For each
        element in the calling DataFrame, if ``cond`` is ``{cond}`` the
        element is used; otherwise the corresponding element from the DataFrame
        ``other`` is used. If the axis of ``other`` does not align with axis of
        ``cond`` {klass}, the misaligned index positions will be filled with
        {cond_rev}.

        The signature for :func:`DataFrame.where` differs from
        :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
        ``np.where(m, df1, df2)``.

        For further details and examples see the ``{name}`` documentation in
        :ref:`indexing <indexing.where_mask>`.

        The dtype of the object takes precedence. The fill value is casted to
        the object's dtype, if this can be done losslessly.

        Examples
        --------
        >>> s = pd.Series(range(5))
        >>> s.where(s > 0)
        0    NaN
        1    1.0
        2    2.0
        3    3.0
        4    4.0
        dtype: float64
        >>> s.mask(s > 0)
        0    0.0
        1    NaN
        2    NaN
        3    NaN
        4    NaN
        dtype: float64

        >>> s = pd.Series(range(5))
        >>> t = pd.Series([True, False])
        >>> s.where(t, 99)
        0     0
        1    99
        2    99
        3    99
        4    99
        dtype: int64
        >>> s.mask(t, 99)
        0    99
        1     1
        2    99
        3    99
        4    99
        dtype: int64

        >>> s.where(s > 1, 10)
        0    10
        1    10
        2     2
        3     3
        4     4
        dtype: int64
        >>> s.mask(s > 1, 10)
        0     0
        1     1
        2    10
        3    10
        4    10
        dtype: int64

        >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
        >>> df
           A  B
        0  0  1
        1  2  3
        2  4  5
        3  6  7
        4  8  9
        >>> m = df % 3 == 0
        >>> df.where(m, -df)
           A  B
        0  0 -1
        1 -2  3
        2 -4 -5
        3  6 -7
        4 -8  9
        >>> df.where(m, -df) == np.where(m, df, -df)
              A     B
        0  True  True
        1  True  True
        2  True  True
        3  True  True
        4  True  True
        >>> df.where(m, -df) == df.mask(~m, -df)
              A     B
        0  True  True
        1  True  True
        2  True  True
        3  True  True
        4  True  True
        """
        # Unlike _where, the public API evaluates a callable `other` on self.
        other = com.apply_if_callable(other, self)

        if try_cast is not lib.no_default:
            # try_cast is deprecated; emitting this warning is its only effect.
            warnings.warn(
                "try_cast keyword is deprecated and will be removed in a "
                "future version.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        # All real work happens in the shared _where helper.
        return self._where(cond, other, inplace, axis, level)
    # Typing overloads for ``mask``: the return type depends on ``inplace``.
    # ``inplace=False`` returns a new object of the caller's type,
    # ``inplace=True`` mutates in place and returns ``None``; the final
    # overload covers a runtime-determined ``inplace`` flag.
    @overload
    def mask(
        self: NDFrameT,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool_t | lib.NoDefault = ...,
    ) -> NDFrameT:
        ...

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool_t | lib.NoDefault = ...,
    ) -> None:
        ...

    @overload
    def mask(
        self: NDFrameT,
        cond,
        other=...,
        *,
        inplace: bool_t = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool_t | lib.NoDefault = ...,
    ) -> NDFrameT | None:
        ...
10012 @deprecate_kwarg(old_arg_name="errors", new_arg_name=None)
10013 @deprecate_nonkeyword_arguments(
10014 version=None, allowed_args=["self", "cond", "other"]
10015 )
10016 @doc(
10017 where,
10018 klass=_shared_doc_kwargs["klass"],
10019 cond="False",
10020 cond_rev="True",
10021 name="mask",
10022 name_other="where",
10023 )
10024 def mask(
10025 self: NDFrameT,
10026 cond,
10027 other=np.nan,
10028 inplace: bool_t = False,
10029 axis: Axis | None = None,
10030 level: Level = None,
10031 errors: IgnoreRaise | lib.NoDefault = "raise",
10032 try_cast: bool_t | lib.NoDefault = lib.no_default,
10033 ) -> NDFrameT | None:
10035 inplace = validate_bool_kwarg(inplace, "inplace")
10036 cond = com.apply_if_callable(cond, self)
10038 if try_cast is not lib.no_default:
10039 warnings.warn(
10040 "try_cast keyword is deprecated and will be removed in a "
10041 "future version.",
10042 FutureWarning,
10043 stacklevel=find_stack_level(),
10044 )
10046 # see gh-21891
10047 if not hasattr(cond, "__invert__"):
10048 cond = np.array(cond)
10050 return self.where(
10051 ~cond,
10052 other=other,
10053 inplace=inplace,
10054 axis=axis,
10055 level=level,
10056 )
10058 @doc(klass=_shared_doc_kwargs["klass"])
10059 def shift(
10060 self: NDFrameT,
10061 periods: int = 1,
10062 freq=None,
10063 axis: Axis = 0,
10064 fill_value: Hashable = None,
10065 ) -> NDFrameT:
10066 """
10067 Shift index by desired number of periods with an optional time `freq`.
10069 When `freq` is not passed, shift the index without realigning the data.
10070 If `freq` is passed (in this case, the index must be date or datetime,
10071 or it will raise a `NotImplementedError`), the index will be
10072 increased using the periods and the `freq`. `freq` can be inferred
10073 when specified as "infer" as long as either freq or inferred_freq
10074 attribute is set in the index.
10076 Parameters
10077 ----------
10078 periods : int
10079 Number of periods to shift. Can be positive or negative.
10080 freq : DateOffset, tseries.offsets, timedelta, or str, optional
10081 Offset to use from the tseries module or time rule (e.g. 'EOM').
10082 If `freq` is specified then the index values are shifted but the
10083 data is not realigned. That is, use `freq` if you would like to
10084 extend the index when shifting and preserve the original data.
10085 If `freq` is specified as "infer" then it will be inferred from
10086 the freq or inferred_freq attributes of the index. If neither of
10087 those attributes exist, a ValueError is thrown.
10088 axis : {{0 or 'index', 1 or 'columns', None}}, default None
10089 Shift direction. For `Series` this parameter is unused and defaults to 0.
10090 fill_value : object, optional
10091 The scalar value to use for newly introduced missing values.
10092 the default depends on the dtype of `self`.
10093 For numeric data, ``np.nan`` is used.
10094 For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
10095 For extension dtypes, ``self.dtype.na_value`` is used.
10097 .. versionchanged:: 1.1.0
10099 Returns
10100 -------
10101 {klass}
10102 Copy of input object, shifted.
10104 See Also
10105 --------
10106 Index.shift : Shift values of Index.
10107 DatetimeIndex.shift : Shift values of DatetimeIndex.
10108 PeriodIndex.shift : Shift values of PeriodIndex.
10109 tshift : Shift the time index, using the index's frequency if
10110 available.
10112 Examples
10113 --------
10114 >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45],
10115 ... "Col2": [13, 23, 18, 33, 48],
10116 ... "Col3": [17, 27, 22, 37, 52]}},
10117 ... index=pd.date_range("2020-01-01", "2020-01-05"))
10118 >>> df
10119 Col1 Col2 Col3
10120 2020-01-01 10 13 17
10121 2020-01-02 20 23 27
10122 2020-01-03 15 18 22
10123 2020-01-04 30 33 37
10124 2020-01-05 45 48 52
10126 >>> df.shift(periods=3)
10127 Col1 Col2 Col3
10128 2020-01-01 NaN NaN NaN
10129 2020-01-02 NaN NaN NaN
10130 2020-01-03 NaN NaN NaN
10131 2020-01-04 10.0 13.0 17.0
10132 2020-01-05 20.0 23.0 27.0
10134 >>> df.shift(periods=1, axis="columns")
10135 Col1 Col2 Col3
10136 2020-01-01 NaN 10 13
10137 2020-01-02 NaN 20 23
10138 2020-01-03 NaN 15 18
10139 2020-01-04 NaN 30 33
10140 2020-01-05 NaN 45 48
10142 >>> df.shift(periods=3, fill_value=0)
10143 Col1 Col2 Col3
10144 2020-01-01 0 0 0
10145 2020-01-02 0 0 0
10146 2020-01-03 0 0 0
10147 2020-01-04 10 13 17
10148 2020-01-05 20 23 27
10150 >>> df.shift(periods=3, freq="D")
10151 Col1 Col2 Col3
10152 2020-01-04 10 13 17
10153 2020-01-05 20 23 27
10154 2020-01-06 15 18 22
10155 2020-01-07 30 33 37
10156 2020-01-08 45 48 52
10158 >>> df.shift(periods=3, freq="infer")
10159 Col1 Col2 Col3
10160 2020-01-04 10 13 17
10161 2020-01-05 20 23 27
10162 2020-01-06 15 18 22
10163 2020-01-07 30 33 37
10164 2020-01-08 45 48 52
10165 """
10166 if periods == 0:
10167 return self.copy()
10169 if freq is None:
10170 # when freq is None, data is shifted, index is not
10171 axis = self._get_axis_number(axis)
10172 new_data = self._mgr.shift(
10173 periods=periods, axis=axis, fill_value=fill_value
10174 )
10175 return self._constructor(new_data).__finalize__(self, method="shift")
10177 # when freq is given, index is shifted, data is not
10178 index = self._get_axis(axis)
10180 if freq == "infer":
10181 freq = getattr(index, "freq", None)
10183 if freq is None:
10184 freq = getattr(index, "inferred_freq", None)
10186 if freq is None:
10187 msg = "Freq was not set in the index hence cannot be inferred"
10188 raise ValueError(msg)
10190 elif isinstance(freq, str):
10191 freq = to_offset(freq)
10193 if isinstance(index, PeriodIndex):
10194 orig_freq = to_offset(index.freq)
10195 if freq != orig_freq:
10196 assert orig_freq is not None # for mypy
10197 raise ValueError(
10198 f"Given freq {freq.rule_code} does not match "
10199 f"PeriodIndex freq {orig_freq.rule_code}"
10200 )
10201 new_ax = index.shift(periods)
10202 else:
10203 new_ax = index.shift(periods, freq)
10205 result = self.set_axis(new_ax, axis=axis)
10206 return result.__finalize__(self, method="shift")
10208 @final
10209 def slice_shift(self: NDFrameT, periods: int = 1, axis=0) -> NDFrameT:
10210 """
10211 Equivalent to `shift` without copying data.
10213 .. deprecated:: 1.2.0
10214 slice_shift is deprecated,
10215 use DataFrame/Series.shift instead.
10217 The shifted data will not include the dropped periods and the
10218 shifted axis will be smaller than the original.
10220 Parameters
10221 ----------
10222 periods : int
10223 Number of periods to move, can be positive or negative.
10224 axis : {0 or 'index', 1 or 'columns', None}, default 0
10225 For `Series` this parameter is unused and defaults to 0.
10227 Returns
10228 -------
10229 shifted : same type as caller
10231 Notes
10232 -----
10233 While the `slice_shift` is faster than `shift`, you may pay for it
10234 later during alignment.
10235 """
10237 msg = (
10238 "The 'slice_shift' method is deprecated "
10239 "and will be removed in a future version. "
10240 "You can use DataFrame/Series.shift instead."
10241 )
10242 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
10244 if periods == 0:
10245 return self
10247 if periods > 0:
10248 vslicer = slice(None, -periods)
10249 islicer = slice(periods, None)
10250 else:
10251 vslicer = slice(-periods, None)
10252 islicer = slice(None, periods)
10254 new_obj = self._slice(vslicer, axis=axis)
10255 shifted_axis = self._get_axis(axis)[islicer]
10256 new_obj = new_obj.set_axis(shifted_axis, axis=axis, copy=False)
10257 return new_obj.__finalize__(self, method="slice_shift")
10259 @final
10260 def tshift(self: NDFrameT, periods: int = 1, freq=None, axis: Axis = 0) -> NDFrameT:
10261 """
10262 Shift the time index, using the index's frequency if available.
10264 .. deprecated:: 1.1.0
10265 Use `shift` instead.
10267 Parameters
10268 ----------
10269 periods : int
10270 Number of periods to move, can be positive or negative.
10271 freq : DateOffset, timedelta, or str, default None
10272 Increment to use from the tseries module
10273 or time rule expressed as a string (e.g. 'EOM').
10274 axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0
10275 Corresponds to the axis that contains the Index.
10276 For `Series` this parameter is unused and defaults to 0.
10278 Returns
10279 -------
10280 shifted : Series/DataFrame
10282 Notes
10283 -----
10284 If freq is not specified then tries to use the freq or inferred_freq
10285 attributes of the index. If neither of those attributes exist, a
10286 ValueError is thrown
10287 """
10288 warnings.warn(
10289 (
10290 "tshift is deprecated and will be removed in a future version. "
10291 "Please use shift instead."
10292 ),
10293 FutureWarning,
10294 stacklevel=find_stack_level(),
10295 )
10297 if freq is None:
10298 freq = "infer"
10300 return self.shift(periods, freq, axis)
10302 def truncate(
10303 self: NDFrameT, before=None, after=None, axis=None, copy: bool_t = True
10304 ) -> NDFrameT:
10305 """
10306 Truncate a Series or DataFrame before and after some index value.
10308 This is a useful shorthand for boolean indexing based on index
10309 values above or below certain thresholds.
10311 Parameters
10312 ----------
10313 before : date, str, int
10314 Truncate all rows before this index value.
10315 after : date, str, int
10316 Truncate all rows after this index value.
10317 axis : {0 or 'index', 1 or 'columns'}, optional
10318 Axis to truncate. Truncates the index (rows) by default.
10319 For `Series` this parameter is unused and defaults to 0.
10320 copy : bool, default is True,
10321 Return a copy of the truncated section.
10323 Returns
10324 -------
10325 type of caller
10326 The truncated Series or DataFrame.
10328 See Also
10329 --------
10330 DataFrame.loc : Select a subset of a DataFrame by label.
10331 DataFrame.iloc : Select a subset of a DataFrame by position.
10333 Notes
10334 -----
10335 If the index being truncated contains only datetime values,
10336 `before` and `after` may be specified as strings instead of
10337 Timestamps.
10339 Examples
10340 --------
10341 >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
10342 ... 'B': ['f', 'g', 'h', 'i', 'j'],
10343 ... 'C': ['k', 'l', 'm', 'n', 'o']},
10344 ... index=[1, 2, 3, 4, 5])
10345 >>> df
10346 A B C
10347 1 a f k
10348 2 b g l
10349 3 c h m
10350 4 d i n
10351 5 e j o
10353 >>> df.truncate(before=2, after=4)
10354 A B C
10355 2 b g l
10356 3 c h m
10357 4 d i n
10359 The columns of a DataFrame can be truncated.
10361 >>> df.truncate(before="A", after="B", axis="columns")
10362 A B
10363 1 a f
10364 2 b g
10365 3 c h
10366 4 d i
10367 5 e j
10369 For Series, only rows can be truncated.
10371 >>> df['A'].truncate(before=2, after=4)
10372 2 b
10373 3 c
10374 4 d
10375 Name: A, dtype: object
10377 The index values in ``truncate`` can be datetimes or string
10378 dates.
10380 >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
10381 >>> df = pd.DataFrame(index=dates, data={'A': 1})
10382 >>> df.tail()
10383 A
10384 2016-01-31 23:59:56 1
10385 2016-01-31 23:59:57 1
10386 2016-01-31 23:59:58 1
10387 2016-01-31 23:59:59 1
10388 2016-02-01 00:00:00 1
10390 >>> df.truncate(before=pd.Timestamp('2016-01-05'),
10391 ... after=pd.Timestamp('2016-01-10')).tail()
10392 A
10393 2016-01-09 23:59:56 1
10394 2016-01-09 23:59:57 1
10395 2016-01-09 23:59:58 1
10396 2016-01-09 23:59:59 1
10397 2016-01-10 00:00:00 1
10399 Because the index is a DatetimeIndex containing only dates, we can
10400 specify `before` and `after` as strings. They will be coerced to
10401 Timestamps before truncation.
10403 >>> df.truncate('2016-01-05', '2016-01-10').tail()
10404 A
10405 2016-01-09 23:59:56 1
10406 2016-01-09 23:59:57 1
10407 2016-01-09 23:59:58 1
10408 2016-01-09 23:59:59 1
10409 2016-01-10 00:00:00 1
10411 Note that ``truncate`` assumes a 0 value for any unspecified time
10412 component (midnight). This differs from partial string slicing, which
10413 returns any partially matching dates.
10415 >>> df.loc['2016-01-05':'2016-01-10', :].tail()
10416 A
10417 2016-01-10 23:59:55 1
10418 2016-01-10 23:59:56 1
10419 2016-01-10 23:59:57 1
10420 2016-01-10 23:59:58 1
10421 2016-01-10 23:59:59 1
10422 """
10423 if axis is None:
10424 axis = self._stat_axis_number
10425 axis = self._get_axis_number(axis)
10426 ax = self._get_axis(axis)
10428 # GH 17935
10429 # Check that index is sorted
10430 if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
10431 raise ValueError("truncate requires a sorted index")
10433 # if we have a date index, convert to dates, otherwise
10434 # treat like a slice
10435 if ax._is_all_dates:
10436 from pandas.core.tools.datetimes import to_datetime
10438 before = to_datetime(before)
10439 after = to_datetime(after)
10441 if before is not None and after is not None and before > after:
10442 raise ValueError(f"Truncate: {after} must be after {before}")
10444 if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
10445 before, after = after, before
10447 slicer = [slice(None, None)] * self._AXIS_LEN
10448 slicer[axis] = slice(before, after)
10449 result = self.loc[tuple(slicer)]
10451 if isinstance(ax, MultiIndex):
10452 setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
10454 if copy:
10455 result = result.copy()
10457 return result
10459 @final
10460 @doc(klass=_shared_doc_kwargs["klass"])
10461 def tz_convert(
10462 self: NDFrameT, tz, axis=0, level=None, copy: bool_t = True
10463 ) -> NDFrameT:
10464 """
10465 Convert tz-aware axis to target time zone.
10467 Parameters
10468 ----------
10469 tz : str or tzinfo object
10470 axis : the axis to convert
10471 level : int, str, default None
10472 If axis is a MultiIndex, convert a specific level. Otherwise
10473 must be None.
10474 copy : bool, default True
10475 Also make a copy of the underlying data.
10477 Returns
10478 -------
10479 {klass}
10480 Object with time zone converted axis.
10482 Raises
10483 ------
10484 TypeError
10485 If the axis is tz-naive.
10486 """
10487 axis = self._get_axis_number(axis)
10488 ax = self._get_axis(axis)
10490 def _tz_convert(ax, tz):
10491 if not hasattr(ax, "tz_convert"):
10492 if len(ax) > 0:
10493 ax_name = self._get_axis_name(axis)
10494 raise TypeError(
10495 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
10496 )
10497 else:
10498 ax = DatetimeIndex([], tz=tz)
10499 else:
10500 ax = ax.tz_convert(tz)
10501 return ax
10503 # if a level is given it must be a MultiIndex level or
10504 # equivalent to the axis name
10505 if isinstance(ax, MultiIndex):
10506 level = ax._get_level_number(level)
10507 new_level = _tz_convert(ax.levels[level], tz)
10508 ax = ax.set_levels(new_level, level=level)
10509 else:
10510 if level not in (None, 0, ax.name):
10511 raise ValueError(f"The level {level} is not valid")
10512 ax = _tz_convert(ax, tz)
10514 result = self.copy(deep=copy)
10515 result = result.set_axis(ax, axis=axis, copy=False)
10516 return result.__finalize__(self, method="tz_convert")
10518 @final
10519 @doc(klass=_shared_doc_kwargs["klass"])
10520 def tz_localize(
10521 self: NDFrameT,
10522 tz,
10523 axis=0,
10524 level=None,
10525 copy: bool_t = True,
10526 ambiguous="raise",
10527 nonexistent: str = "raise",
10528 ) -> NDFrameT:
10529 """
10530 Localize tz-naive index of a Series or DataFrame to target time zone.
10532 This operation localizes the Index. To localize the values in a
10533 timezone-naive Series, use :meth:`Series.dt.tz_localize`.
10535 Parameters
10536 ----------
10537 tz : str or tzinfo
10538 axis : the axis to localize
10539 level : int, str, default None
10540 If axis ia a MultiIndex, localize a specific level. Otherwise
10541 must be None.
10542 copy : bool, default True
10543 Also make a copy of the underlying data.
10544 ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
10545 When clocks moved backward due to DST, ambiguous times may arise.
10546 For example in Central European Time (UTC+01), when going from
10547 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
10548 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
10549 `ambiguous` parameter dictates how ambiguous times should be
10550 handled.
10552 - 'infer' will attempt to infer fall dst-transition hours based on
10553 order
10554 - bool-ndarray where True signifies a DST time, False designates
10555 a non-DST time (note that this flag is only applicable for
10556 ambiguous times)
10557 - 'NaT' will return NaT where there are ambiguous times
10558 - 'raise' will raise an AmbiguousTimeError if there are ambiguous
10559 times.
10560 nonexistent : str, default 'raise'
10561 A nonexistent time does not exist in a particular timezone
10562 where clocks moved forward due to DST. Valid values are:
10564 - 'shift_forward' will shift the nonexistent time forward to the
10565 closest existing time
10566 - 'shift_backward' will shift the nonexistent time backward to the
10567 closest existing time
10568 - 'NaT' will return NaT where there are nonexistent times
10569 - timedelta objects will shift nonexistent times by the timedelta
10570 - 'raise' will raise an NonExistentTimeError if there are
10571 nonexistent times.
10573 Returns
10574 -------
10575 {klass}
10576 Same type as the input.
10578 Raises
10579 ------
10580 TypeError
10581 If the TimeSeries is tz-aware and tz is not None.
10583 Examples
10584 --------
10585 Localize local times:
10587 >>> s = pd.Series([1],
10588 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']))
10589 >>> s.tz_localize('CET')
10590 2018-09-15 01:30:00+02:00 1
10591 dtype: int64
10593 Be careful with DST changes. When there is sequential data, pandas
10594 can infer the DST time:
10596 >>> s = pd.Series(range(7),
10597 ... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
10598 ... '2018-10-28 02:00:00',
10599 ... '2018-10-28 02:30:00',
10600 ... '2018-10-28 02:00:00',
10601 ... '2018-10-28 02:30:00',
10602 ... '2018-10-28 03:00:00',
10603 ... '2018-10-28 03:30:00']))
10604 >>> s.tz_localize('CET', ambiguous='infer')
10605 2018-10-28 01:30:00+02:00 0
10606 2018-10-28 02:00:00+02:00 1
10607 2018-10-28 02:30:00+02:00 2
10608 2018-10-28 02:00:00+01:00 3
10609 2018-10-28 02:30:00+01:00 4
10610 2018-10-28 03:00:00+01:00 5
10611 2018-10-28 03:30:00+01:00 6
10612 dtype: int64
10614 In some cases, inferring the DST is impossible. In such cases, you can
10615 pass an ndarray to the ambiguous parameter to set the DST explicitly
10617 >>> s = pd.Series(range(3),
10618 ... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
10619 ... '2018-10-28 02:36:00',
10620 ... '2018-10-28 03:46:00']))
10621 >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
10622 2018-10-28 01:20:00+02:00 0
10623 2018-10-28 02:36:00+02:00 1
10624 2018-10-28 03:46:00+01:00 2
10625 dtype: int64
10627 If the DST transition causes nonexistent times, you can shift these
10628 dates forward or backward with a timedelta object or `'shift_forward'`
10629 or `'shift_backward'`.
10631 >>> s = pd.Series(range(2),
10632 ... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
10633 ... '2015-03-29 03:30:00']))
10634 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
10635 2015-03-29 03:00:00+02:00 0
10636 2015-03-29 03:30:00+02:00 1
10637 dtype: int64
10638 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
10639 2015-03-29 01:59:59.999999999+01:00 0
10640 2015-03-29 03:30:00+02:00 1
10641 dtype: int64
10642 >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
10643 2015-03-29 03:30:00+02:00 0
10644 2015-03-29 03:30:00+02:00 1
10645 dtype: int64
10646 """
10647 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
10648 if nonexistent not in nonexistent_options and not isinstance(
10649 nonexistent, timedelta
10650 ):
10651 raise ValueError(
10652 "The nonexistent argument must be one of 'raise', "
10653 "'NaT', 'shift_forward', 'shift_backward' or "
10654 "a timedelta object"
10655 )
10657 axis = self._get_axis_number(axis)
10658 ax = self._get_axis(axis)
10660 def _tz_localize(ax, tz, ambiguous, nonexistent):
10661 if not hasattr(ax, "tz_localize"):
10662 if len(ax) > 0:
10663 ax_name = self._get_axis_name(axis)
10664 raise TypeError(
10665 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
10666 )
10667 else:
10668 ax = DatetimeIndex([], tz=tz)
10669 else:
10670 ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
10671 return ax
10673 # if a level is given it must be a MultiIndex level or
10674 # equivalent to the axis name
10675 if isinstance(ax, MultiIndex):
10676 level = ax._get_level_number(level)
10677 new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
10678 ax = ax.set_levels(new_level, level=level)
10679 else:
10680 if level not in (None, 0, ax.name):
10681 raise ValueError(f"The level {level} is not valid")
10682 ax = _tz_localize(ax, tz, ambiguous, nonexistent)
10684 result = self.copy(deep=copy)
10685 result = result.set_axis(ax, axis=axis, copy=False)
10686 return result.__finalize__(self, method="tz_localize")
10688 # ----------------------------------------------------------------------
10689 # Numeric Methods
10691 @final
10692 def describe(
10693 self: NDFrameT,
10694 percentiles=None,
10695 include=None,
10696 exclude=None,
10697 datetime_is_numeric: bool_t = False,
10698 ) -> NDFrameT:
10699 """
10700 Generate descriptive statistics.
10702 Descriptive statistics include those that summarize the central
10703 tendency, dispersion and shape of a
10704 dataset's distribution, excluding ``NaN`` values.
10706 Analyzes both numeric and object series, as well
10707 as ``DataFrame`` column sets of mixed data types. The output
10708 will vary depending on what is provided. Refer to the notes
10709 below for more detail.
10711 Parameters
10712 ----------
10713 percentiles : list-like of numbers, optional
10714 The percentiles to include in the output. All should
10715 fall between 0 and 1. The default is
10716 ``[.25, .5, .75]``, which returns the 25th, 50th, and
10717 75th percentiles.
10718 include : 'all', list-like of dtypes or None (default), optional
10719 A white list of data types to include in the result. Ignored
10720 for ``Series``. Here are the options:
10722 - 'all' : All columns of the input will be included in the output.
10723 - A list-like of dtypes : Limits the results to the
10724 provided data types.
10725 To limit the result to numeric types submit
10726 ``numpy.number``. To limit it instead to object columns submit
10727 the ``numpy.object`` data type. Strings
10728 can also be used in the style of
10729 ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
10730 select pandas categorical columns, use ``'category'``
10731 - None (default) : The result will include all numeric columns.
10732 exclude : list-like of dtypes or None (default), optional,
10733 A black list of data types to omit from the result. Ignored
10734 for ``Series``. Here are the options:
10736 - A list-like of dtypes : Excludes the provided data types
10737 from the result. To exclude numeric types submit
10738 ``numpy.number``. To exclude object columns submit the data
10739 type ``numpy.object``. Strings can also be used in the style of
10740 ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
10741 exclude pandas categorical columns, use ``'category'``
10742 - None (default) : The result will exclude nothing.
10743 datetime_is_numeric : bool, default False
10744 Whether to treat datetime dtypes as numeric. This affects statistics
10745 calculated for the column. For DataFrame input, this also
10746 controls whether datetime columns are included by default.
10748 .. versionadded:: 1.1.0
10750 Returns
10751 -------
10752 Series or DataFrame
10753 Summary statistics of the Series or Dataframe provided.
10755 See Also
10756 --------
10757 DataFrame.count: Count number of non-NA/null observations.
10758 DataFrame.max: Maximum of the values in the object.
10759 DataFrame.min: Minimum of the values in the object.
10760 DataFrame.mean: Mean of the values.
10761 DataFrame.std: Standard deviation of the observations.
10762 DataFrame.select_dtypes: Subset of a DataFrame including/excluding
10763 columns based on their dtype.
10765 Notes
10766 -----
10767 For numeric data, the result's index will include ``count``,
10768 ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
10769 upper percentiles. By default the lower percentile is ``25`` and the
10770 upper percentile is ``75``. The ``50`` percentile is the
10771 same as the median.
10773 For object data (e.g. strings or timestamps), the result's index
10774 will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
10775 is the most common value. The ``freq`` is the most common value's
10776 frequency. Timestamps also include the ``first`` and ``last`` items.
10778 If multiple object values have the highest count, then the
10779 ``count`` and ``top`` results will be arbitrarily chosen from
10780 among those with the highest count.
10782 For mixed data types provided via a ``DataFrame``, the default is to
10783 return only an analysis of numeric columns. If the dataframe consists
10784 only of object and categorical data without any numeric columns, the
10785 default is to return an analysis of both the object and categorical
10786 columns. If ``include='all'`` is provided as an option, the result
10787 will include a union of attributes of each type.
10789 The `include` and `exclude` parameters can be used to limit
10790 which columns in a ``DataFrame`` are analyzed for the output.
10791 The parameters are ignored when analyzing a ``Series``.
10793 Examples
10794 --------
10795 Describing a numeric ``Series``.
10797 >>> s = pd.Series([1, 2, 3])
10798 >>> s.describe()
10799 count 3.0
10800 mean 2.0
10801 std 1.0
10802 min 1.0
10803 25% 1.5
10804 50% 2.0
10805 75% 2.5
10806 max 3.0
10807 dtype: float64
10809 Describing a categorical ``Series``.
10811 >>> s = pd.Series(['a', 'a', 'b', 'c'])
10812 >>> s.describe()
10813 count 4
10814 unique 3
10815 top a
10816 freq 2
10817 dtype: object
10819 Describing a timestamp ``Series``.
10821 >>> s = pd.Series([
10822 ... np.datetime64("2000-01-01"),
10823 ... np.datetime64("2010-01-01"),
10824 ... np.datetime64("2010-01-01")
10825 ... ])
10826 >>> s.describe(datetime_is_numeric=True)
10827 count 3
10828 mean 2006-09-01 08:00:00
10829 min 2000-01-01 00:00:00
10830 25% 2004-12-31 12:00:00
10831 50% 2010-01-01 00:00:00
10832 75% 2010-01-01 00:00:00
10833 max 2010-01-01 00:00:00
10834 dtype: object
10836 Describing a ``DataFrame``. By default only numeric fields
10837 are returned.
10839 >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),
10840 ... 'numeric': [1, 2, 3],
10841 ... 'object': ['a', 'b', 'c']
10842 ... })
10843 >>> df.describe()
10844 numeric
10845 count 3.0
10846 mean 2.0
10847 std 1.0
10848 min 1.0
10849 25% 1.5
10850 50% 2.0
10851 75% 2.5
10852 max 3.0
10854 Describing all columns of a ``DataFrame`` regardless of data type.
10856 >>> df.describe(include='all') # doctest: +SKIP
10857 categorical numeric object
10858 count 3 3.0 3
10859 unique 3 NaN 3
10860 top f NaN a
10861 freq 1 NaN 1
10862 mean NaN 2.0 NaN
10863 std NaN 1.0 NaN
10864 min NaN 1.0 NaN
10865 25% NaN 1.5 NaN
10866 50% NaN 2.0 NaN
10867 75% NaN 2.5 NaN
10868 max NaN 3.0 NaN
10870 Describing a column from a ``DataFrame`` by accessing it as
10871 an attribute.
10873 >>> df.numeric.describe()
10874 count 3.0
10875 mean 2.0
10876 std 1.0
10877 min 1.0
10878 25% 1.5
10879 50% 2.0
10880 75% 2.5
10881 max 3.0
10882 Name: numeric, dtype: float64
10884 Including only numeric columns in a ``DataFrame`` description.
10886 >>> df.describe(include=[np.number])
10887 numeric
10888 count 3.0
10889 mean 2.0
10890 std 1.0
10891 min 1.0
10892 25% 1.5
10893 50% 2.0
10894 75% 2.5
10895 max 3.0
10897 Including only string columns in a ``DataFrame`` description.
10899 >>> df.describe(include=[object]) # doctest: +SKIP
10900 object
10901 count 3
10902 unique 3
10903 top a
10904 freq 1
10906 Including only categorical columns from a ``DataFrame`` description.
10908 >>> df.describe(include=['category'])
10909 categorical
10910 count 3
10911 unique 3
10912 top d
10913 freq 1
10915 Excluding numeric columns from a ``DataFrame`` description.
10917 >>> df.describe(exclude=[np.number]) # doctest: +SKIP
10918 categorical object
10919 count 3 3
10920 unique 3 3
10921 top f a
10922 freq 1 1
10924 Excluding object columns from a ``DataFrame`` description.
10926 >>> df.describe(exclude=[object]) # doctest: +SKIP
10927 categorical numeric
10928 count 3 3.0
10929 unique 3 NaN
10930 top f NaN
10931 freq 1 NaN
10932 mean NaN 2.0
10933 std NaN 1.0
10934 min NaN 1.0
10935 25% NaN 1.5
10936 50% NaN 2.0
10937 75% NaN 2.5
10938 max NaN 3.0
10939 """
10940 return describe_ndframe(
10941 obj=self,
10942 include=include,
10943 exclude=exclude,
10944 datetime_is_numeric=datetime_is_numeric,
10945 percentiles=percentiles,
10946 )
    @final
    def pct_change(
        self: NDFrameT,
        periods=1,
        fill_method="pad",
        limit=None,
        freq=None,
        **kwargs,
    ) -> NDFrameT:
        """
        Percentage change between the current and a prior element.

        Computes the percentage change from the immediately previous row by
        default. This is useful in comparing the percentage of change in a time
        series of elements.

        Parameters
        ----------
        periods : int, default 1
            Periods to shift for forming percent change.
        fill_method : str, default 'pad'
            How to handle NAs **before** computing percent changes.
        limit : int, default None
            The number of consecutive NAs to fill before stopping.
        freq : DateOffset, timedelta, or str, optional
            Increment to use from time series API (e.g. 'M' or BDay()).
        **kwargs
            Additional keyword arguments are passed into
            `DataFrame.shift` or `Series.shift`.

        Returns
        -------
        chg : Series or DataFrame
            The same type as the calling object.

        See Also
        --------
        Series.diff : Compute the difference of two elements in a Series.
        DataFrame.diff : Compute the difference of two elements in a DataFrame.
        Series.shift : Shift the index by some number of periods.
        DataFrame.shift : Shift the index by some number of periods.

        Examples
        --------
        **Series**

        >>> s = pd.Series([90, 91, 85])
        >>> s
        0    90
        1    91
        2    85
        dtype: int64

        >>> s.pct_change()
        0         NaN
        1    0.011111
        2   -0.065934
        dtype: float64

        >>> s.pct_change(periods=2)
        0         NaN
        1         NaN
        2   -0.055556
        dtype: float64

        See the percentage change in a Series where filling NAs with last
        valid observation forward to next valid.

        >>> s = pd.Series([90, 91, None, 85])
        >>> s
        0    90.0
        1    91.0
        2     NaN
        3    85.0
        dtype: float64

        >>> s.pct_change(fill_method='ffill')
        0         NaN
        1    0.011111
        2    0.000000
        3   -0.065934
        dtype: float64

        **DataFrame**

        Percentage change in French franc, Deutsche Mark, and Italian lira from
        1980-01-01 to 1980-03-01.

        >>> df = pd.DataFrame({
        ...     'FR': [4.0405, 4.0963, 4.3149],
        ...     'GR': [1.7246, 1.7482, 1.8519],
        ...     'IT': [804.74, 810.01, 860.13]},
        ...     index=['1980-01-01', '1980-02-01', '1980-03-01'])
        >>> df
                        FR      GR      IT
        1980-01-01  4.0405  1.7246  804.74
        1980-02-01  4.0963  1.7482  810.01
        1980-03-01  4.3149  1.8519  860.13

        >>> df.pct_change()
                          FR        GR        IT
        1980-01-01       NaN       NaN       NaN
        1980-02-01  0.013810  0.013684  0.006549
        1980-03-01  0.053365  0.059318  0.061876

        Percentage of change in GOOG and APPL stock volume. Shows computing
        the percentage change between columns.

        >>> df = pd.DataFrame({
        ...     '2016': [1769950, 30586265],
        ...     '2015': [1500923, 40912316],
        ...     '2014': [1371819, 41403351]},
        ...     index=['GOOG', 'APPL'])
        >>> df
                  2016      2015      2014
        GOOG   1769950   1500923   1371819
        APPL  30586265  40912316  41403351

        >>> df.pct_change(axis='columns', periods=-1)
                  2016      2015  2014
        GOOG  0.179241  0.094112   NaN
        APPL -0.252395 -0.011860   NaN
        """
        # "axis" travels in **kwargs (it is forwarded to shift below), so pop it
        # here to resolve which axis to fill/shift along.
        axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name))
        if fill_method is None:
            data = self
        else:
            # Fill NAs first so consecutive valid observations are compared.
            _data = self.fillna(method=fill_method, axis=axis, limit=limit)
            assert _data is not None  # needed for mypy
            data = _data

        shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs)
        # Unsupported left operand type for / ("NDFrameT")
        rs = data / shifted - 1  # type: ignore[operator]
        if freq is not None:
            # Shift method is implemented differently when freq is not None
            # We want to restore the original index
            rs = rs.loc[~rs.index.duplicated()]
            rs = rs.reindex_like(data)
        return rs.__finalize__(self, method="pct_change")
11089 @final
11090 def _agg_by_level(
11091 self,
11092 name: str,
11093 axis: Axis = 0,
11094 level: Level = 0,
11095 skipna: bool_t = True,
11096 **kwargs,
11097 ):
11098 if axis is None:
11099 raise ValueError("Must specify 'axis' when aggregating by level.")
11100 grouped = self.groupby(level=level, axis=axis, sort=False)
11101 if hasattr(grouped, name) and skipna:
11102 return getattr(grouped, name)(**kwargs)
11103 axis = self._get_axis_number(axis)
11104 method = getattr(type(self), name)
11105 applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs)
11106 return grouped.aggregate(applyf)
    @final
    def _logical_func(
        self,
        name: str,
        func,
        axis: Axis = 0,
        bool_only: bool_t | None = None,
        skipna: bool_t = True,
        level: Level | None = None,
        **kwargs,
    ) -> Series | bool_t:
        """
        Shared implementation backing ``any`` and ``all``.

        Parameters
        ----------
        name : str
            Reduction name ("any" or "all"), used for validation messages and
            for dispatch in ``_agg_by_level``.
        func : callable
            The nanops reduction to apply (e.g. ``nanops.nanany``).
        axis, bool_only, skipna, level, **kwargs
            Forwarded from the public ``any``/``all`` signatures.
        """
        nv.validate_logical_func((), kwargs, fname=name)
        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
        if level is not None:
            # level= is deprecated; delegate to the groupby-based fallback.
            warnings.warn(
                "Using the level keyword in DataFrame and Series aggregations is "
                "deprecated and will be removed in a future version. Use groupby "
                "instead. df.any(level=1) should use df.groupby(level=1).any()",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            if bool_only is not None:
                raise NotImplementedError(
                    "Option bool_only is not implemented with option level."
                )
            return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)

        if self.ndim > 1 and axis is None:
            # Reduce along one dimension then the other, to simplify DataFrame._reduce
            res = self._logical_func(
                name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
            )
            return res._logical_func(name, func, skipna=skipna, **kwargs)

        if (
            self.ndim > 1
            and axis == 1
            and len(self._mgr.arrays) > 1
            # TODO(EA2D): special-case not needed
            and all(x.ndim == 2 for x in self._mgr.arrays)
            and bool_only is not None
            and not kwargs
        ):
            # Fastpath avoiding potentially expensive transpose
            obj = self
            if bool_only:
                obj = self._get_bool_data()
            return obj._reduce_axis1(name, func, skipna=skipna)

        return self._reduce(
            func,
            name=name,
            axis=axis,
            skipna=skipna,
            numeric_only=bool_only,
            filter_type="bool",
        )
11166 def any(
11167 self,
11168 axis: Axis = 0,
11169 bool_only: bool_t | None = None,
11170 skipna: bool_t = True,
11171 level: Level | None = None,
11172 **kwargs,
11173 ) -> DataFrame | Series | bool_t:
11174 return self._logical_func(
11175 "any", nanops.nanany, axis, bool_only, skipna, level, **kwargs
11176 )
11178 def all(
11179 self,
11180 axis: Axis = 0,
11181 bool_only: bool_t | None = None,
11182 skipna: bool_t = True,
11183 level: Level | None = None,
11184 **kwargs,
11185 ) -> Series | bool_t:
11186 return self._logical_func(
11187 "all", nanops.nanall, axis, bool_only, skipna, level, **kwargs
11188 )
    @final
    def _accum_func(
        self,
        name: str,
        func,
        axis: Axis | None = None,
        skipna: bool_t = True,
        *args,
        **kwargs,
    ):
        """
        Shared implementation backing cummin/cummax/cumsum/cumprod.

        ``func`` is the accumulation ufunc (e.g. ``np.cumsum``); ``*args`` /
        ``**kwargs`` exist only for numpy-signature compatibility and are
        validated away by ``nv.validate_cum_func_with_skipna``.
        """
        skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
        if axis is None:
            axis = self._stat_axis_number
        else:
            axis = self._get_axis_number(axis)

        if axis == 1:
            # Accumulate along rows by transposing, recursing on axis=0,
            # then transposing back.
            return self.T._accum_func(
                name, func, axis=0, skipna=skipna, *args, **kwargs
            ).T

        def block_accum_func(blk_values):
            # Block values are stored transposed relative to the frame;
            # un-transpose, accumulate, and re-transpose. Plain ndarrays and
            # EA blocks without .T pass through unchanged.
            values = blk_values.T if hasattr(blk_values, "T") else blk_values

            result = nanops.na_accum_func(values, func, skipna=skipna)

            result = result.T if hasattr(result, "T") else result
            return result

        # Apply per-block through the manager to preserve dtypes/layout.
        result = self._mgr.apply(block_accum_func)

        return self._constructor(result).__finalize__(self, method=name)
11223 def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11224 return self._accum_func(
11225 "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs
11226 )
11228 def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11229 return self._accum_func(
11230 "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs
11231 )
11233 def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11234 return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs)
11236 def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11237 return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs)
    @final
    def _stat_function_ddof(
        self,
        name: str,
        func,
        axis: Axis | None = None,
        skipna: bool_t = True,
        level: Level | None = None,
        ddof: int = 1,
        numeric_only: bool_t | None = None,
        **kwargs,
    ) -> Series | float:
        """
        Shared implementation backing sem/var/std (reductions that take a
        delta-degrees-of-freedom argument).
        """
        nv.validate_stat_ddof_func((), kwargs, fname=name)
        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
        if axis is None:
            axis = self._stat_axis_number
        if level is not None:
            # level= is deprecated; delegate to the groupby-based fallback.
            warnings.warn(
                "Using the level keyword in DataFrame and Series aggregations is "
                "deprecated and will be removed in a future version. Use groupby "
                "instead. df.var(level=1) should use df.groupby(level=1).var().",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            return self._agg_by_level(
                name, axis=axis, level=level, skipna=skipna, ddof=ddof
            )
        return self._reduce(
            func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
        )
11270 def sem(
11271 self,
11272 axis: Axis | None = None,
11273 skipna: bool_t = True,
11274 level: Level | None = None,
11275 ddof: int = 1,
11276 numeric_only: bool_t | None = None,
11277 **kwargs,
11278 ) -> Series | float:
11279 return self._stat_function_ddof(
11280 "sem", nanops.nansem, axis, skipna, level, ddof, numeric_only, **kwargs
11281 )
11283 def var(
11284 self,
11285 axis: Axis | None = None,
11286 skipna: bool_t = True,
11287 level: Level | None = None,
11288 ddof: int = 1,
11289 numeric_only: bool_t | None = None,
11290 **kwargs,
11291 ) -> Series | float:
11292 return self._stat_function_ddof(
11293 "var", nanops.nanvar, axis, skipna, level, ddof, numeric_only, **kwargs
11294 )
11296 def std(
11297 self,
11298 axis: Axis | None = None,
11299 skipna: bool_t = True,
11300 level: Level | None = None,
11301 ddof: int = 1,
11302 numeric_only: bool_t | None = None,
11303 **kwargs,
11304 ) -> Series | float:
11305 return self._stat_function_ddof(
11306 "std", nanops.nanstd, axis, skipna, level, ddof, numeric_only, **kwargs
11307 )
    @final
    def _stat_function(
        self,
        name: str,
        func,
        axis: Axis | None | lib.NoDefault = None,
        skipna: bool_t = True,
        level: Level | None = None,
        numeric_only: bool_t | None = None,
        **kwargs,
    ):
        """
        Shared implementation backing min/max/mean/median/skew/kurt.

        ``axis`` may be ``lib.no_default`` so we can distinguish "caller did
        not pass axis" from an explicit ``axis=None`` for deprecation
        warnings.
        """
        if name == "median":
            nv.validate_median((), kwargs)
        else:
            nv.validate_stat_func((), kwargs, fname=name)

        validate_bool_kwarg(skipna, "skipna", none_allowed=False)

        if axis is None and level is None and self.ndim > 1:
            # user must have explicitly passed axis=None
            # GH#21597
            warnings.warn(
                f"In a future version, DataFrame.{name}(axis=None) will return a "
                f"scalar {name} over the entire DataFrame. To retain the old "
                f"behavior, use 'frame.{name}(axis=0)' or just 'frame.{name}()'",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        if axis is lib.no_default:
            # No explicit axis: fall through to the default below.
            axis = None

        if axis is None:
            axis = self._stat_axis_number
        if level is not None:
            # level= is deprecated; delegate to the groupby-based fallback.
            warnings.warn(
                "Using the level keyword in DataFrame and Series aggregations is "
                "deprecated and will be removed in a future version. Use groupby "
                "instead. df.median(level=1) should use df.groupby(level=1).median().",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            return self._agg_by_level(
                name, axis=axis, level=level, skipna=skipna, numeric_only=numeric_only
            )
        return self._reduce(
            func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
        )
11357 def min(
11358 self,
11359 axis: Axis | None | lib.NoDefault = lib.no_default,
11360 skipna: bool_t = True,
11361 level: Level | None = None,
11362 numeric_only: bool_t | None = None,
11363 **kwargs,
11364 ):
11365 return self._stat_function(
11366 "min",
11367 nanops.nanmin,
11368 axis,
11369 skipna,
11370 level,
11371 numeric_only,
11372 **kwargs,
11373 )
11375 def max(
11376 self,
11377 axis: Axis | None | lib.NoDefault = lib.no_default,
11378 skipna: bool_t = True,
11379 level: Level | None = None,
11380 numeric_only: bool_t | None = None,
11381 **kwargs,
11382 ):
11383 return self._stat_function(
11384 "max",
11385 nanops.nanmax,
11386 axis,
11387 skipna,
11388 level,
11389 numeric_only,
11390 **kwargs,
11391 )
11393 def mean(
11394 self,
11395 axis: Axis | None | lib.NoDefault = lib.no_default,
11396 skipna: bool_t = True,
11397 level: Level | None = None,
11398 numeric_only: bool_t | None = None,
11399 **kwargs,
11400 ) -> Series | float:
11401 return self._stat_function(
11402 "mean", nanops.nanmean, axis, skipna, level, numeric_only, **kwargs
11403 )
11405 def median(
11406 self,
11407 axis: Axis | None | lib.NoDefault = lib.no_default,
11408 skipna: bool_t = True,
11409 level: Level | None = None,
11410 numeric_only: bool_t | None = None,
11411 **kwargs,
11412 ) -> Series | float:
11413 return self._stat_function(
11414 "median", nanops.nanmedian, axis, skipna, level, numeric_only, **kwargs
11415 )
11417 def skew(
11418 self,
11419 axis: Axis | None | lib.NoDefault = lib.no_default,
11420 skipna: bool_t = True,
11421 level: Level | None = None,
11422 numeric_only: bool_t | None = None,
11423 **kwargs,
11424 ) -> Series | float:
11425 return self._stat_function(
11426 "skew", nanops.nanskew, axis, skipna, level, numeric_only, **kwargs
11427 )
11429 def kurt(
11430 self,
11431 axis: Axis | None | lib.NoDefault = lib.no_default,
11432 skipna: bool_t = True,
11433 level: Level | None = None,
11434 numeric_only: bool_t | None = None,
11435 **kwargs,
11436 ) -> Series | float:
11437 return self._stat_function(
11438 "kurt", nanops.nankurt, axis, skipna, level, numeric_only, **kwargs
11439 )
11441 kurtosis = kurt
    @final
    def _min_count_stat_function(
        self,
        name: str,
        func,
        axis: Axis | None = None,
        skipna: bool_t = True,
        level: Level | None = None,
        numeric_only: bool_t | None = None,
        min_count: int = 0,
        **kwargs,
    ):
        """
        Shared implementation backing sum/prod (reductions that take a
        ``min_count`` argument).
        """
        if name == "sum":
            nv.validate_sum((), kwargs)
        elif name == "prod":
            nv.validate_prod((), kwargs)
        else:
            nv.validate_stat_func((), kwargs, fname=name)

        validate_bool_kwarg(skipna, "skipna", none_allowed=False)

        if axis is None:
            axis = self._stat_axis_number
        if level is not None:
            # level= is deprecated; delegate to the groupby-based fallback.
            warnings.warn(
                "Using the level keyword in DataFrame and Series aggregations is "
                "deprecated and will be removed in a future version. Use groupby "
                "instead. df.sum(level=1) should use df.groupby(level=1).sum().",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            return self._agg_by_level(
                name,
                axis=axis,
                level=level,
                skipna=skipna,
                min_count=min_count,
                numeric_only=numeric_only,
            )

        return self._reduce(
            func,
            name=name,
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            min_count=min_count,
        )
11492 def sum(
11493 self,
11494 axis: Axis | None = None,
11495 skipna: bool_t = True,
11496 level: Level | None = None,
11497 numeric_only: bool_t | None = None,
11498 min_count=0,
11499 **kwargs,
11500 ):
11501 return self._min_count_stat_function(
11502 "sum", nanops.nansum, axis, skipna, level, numeric_only, min_count, **kwargs
11503 )
11505 def prod(
11506 self,
11507 axis: Axis | None = None,
11508 skipna: bool_t = True,
11509 level: Level | None = None,
11510 numeric_only: bool_t | None = None,
11511 min_count: int = 0,
11512 **kwargs,
11513 ):
11514 return self._min_count_stat_function(
11515 "prod",
11516 nanops.nanprod,
11517 axis,
11518 skipna,
11519 level,
11520 numeric_only,
11521 min_count,
11522 **kwargs,
11523 )
11525 product = prod
11527 def mad(
11528 self,
11529 axis: Axis | None = None,
11530 skipna: bool_t = True,
11531 level: Level | None = None,
11532 ) -> Series | float:
11533 """
11534 {desc}
11536 .. deprecated:: 1.5.0
11537 mad is deprecated.
11539 Parameters
11540 ----------
11541 axis : {axis_descr}
11542 Axis for the function to be applied on.
11543 For `Series` this parameter is unused and defaults to 0.
11544 skipna : bool, default True
11545 Exclude NA/null values when computing the result.
11546 level : int or level name, default None
11547 If the axis is a MultiIndex (hierarchical), count along a
11548 particular level, collapsing into a {name1}.
11550 Returns
11551 -------
11552 {name1} or {name2} (if level specified)\
11553 {see_also}\
11554 {examples}
11555 """
11556 msg = (
11557 "The 'mad' method is deprecated and will be removed in a future version. "
11558 "To compute the same result, you may do `(df - df.mean()).abs().mean()`."
11559 )
11560 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
11562 if not is_bool(skipna):
11563 warnings.warn(
11564 "Passing None for skipna is deprecated and will raise in a future"
11565 "version. Pass True instead. Only boolean values will be allowed "
11566 "in the future.",
11567 FutureWarning,
11568 stacklevel=find_stack_level(),
11569 )
11570 skipna = True
11571 if axis is None:
11572 axis = self._stat_axis_number
11573 if level is not None:
11574 warnings.warn(
11575 "Using the level keyword in DataFrame and Series aggregations is "
11576 "deprecated and will be removed in a future version. Use groupby "
11577 "instead. df.mad(level=1) should use df.groupby(level=1).mad()",
11578 FutureWarning,
11579 stacklevel=find_stack_level(),
11580 )
11581 return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna)
11583 data = self._get_numeric_data()
11584 if axis == 0:
11585 # error: Unsupported operand types for - ("NDFrame" and "float")
11586 demeaned = data - data.mean(axis=0) # type: ignore[operator]
11587 else:
11588 demeaned = data.sub(data.mean(axis=1), axis=0)
11589 return np.abs(demeaned).mean(axis=axis, skipna=skipna)
    @classmethod
    def _add_numeric_operations(cls):
        """
        Add the operations to the cls; evaluate the doc strings again
        """
        # Rendered doc fragments depend on the concrete subclass
        # (Series vs DataFrame), so each wrapper is re-created here with
        # class-specific @doc arguments and then installed via setattr.
        axis_descr, name1, name2 = _doc_params(cls)

        @deprecate_nonkeyword_arguments(
            version=None,
            allowed_args=["self"],
            name="DataFrame.any and Series.any",
        )
        @doc(
            _bool_doc,
            desc=_any_desc,
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            see_also=_any_see_also,
            examples=_any_examples,
            empty_value=False,
        )
        def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
            return NDFrame.any(self, axis, bool_only, skipna, level, **kwargs)

        setattr(cls, "any", any)

        @doc(
            _bool_doc,
            desc=_all_desc,
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            see_also=_all_see_also,
            examples=_all_examples,
            empty_value=True,
        )
        def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
            return NDFrame.all(self, axis, bool_only, skipna, level, **kwargs)

        setattr(cls, "all", all)

        # error: Argument 1 to "doc" has incompatible type "Optional[str]"; expected
        # "Union[str, Callable[..., Any]]"
        @doc(
            NDFrame.mad.__doc__,  # type: ignore[arg-type]
            desc="Return the mean absolute deviation of the values "
            "over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            see_also="",
            examples="",
        )
        def mad(self, axis=None, skipna=True, level=None):
            return NDFrame.mad(self, axis, skipna, level)

        setattr(cls, "mad", mad)

        @doc(
            _num_ddof_doc,
            desc="Return unbiased standard error of the mean over requested "
            "axis.\n\nNormalized by N-1 by default. This can be changed "
            "using the ddof argument",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            notes="",
            examples="",
        )
        def sem(
            self,
            axis=None,
            skipna=True,
            level=None,
            ddof=1,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.sem(self, axis, skipna, level, ddof, numeric_only, **kwargs)

        setattr(cls, "sem", sem)

        @doc(
            _num_ddof_doc,
            desc="Return unbiased variance over requested axis.\n\nNormalized by "
            "N-1 by default. This can be changed using the ddof argument.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            notes="",
            examples=_var_examples,
        )
        def var(
            self,
            axis=None,
            skipna=True,
            level=None,
            ddof=1,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.var(self, axis, skipna, level, ddof, numeric_only, **kwargs)

        setattr(cls, "var", var)

        @doc(
            _num_ddof_doc,
            desc="Return sample standard deviation over requested axis."
            "\n\nNormalized by N-1 by default. This can be changed using the "
            "ddof argument.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            notes=_std_notes,
            examples=_std_examples,
        )
        def std(
            self,
            axis=None,
            skipna=True,
            level=None,
            ddof=1,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.std(self, axis, skipna, level, ddof, numeric_only, **kwargs)

        setattr(cls, "std", std)

        @doc(
            _cnum_doc,
            desc="minimum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="min",
            examples=_cummin_examples,
        )
        def cummin(self, axis=None, skipna=True, *args, **kwargs):
            return NDFrame.cummin(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cummin", cummin)

        @doc(
            _cnum_doc,
            desc="maximum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="max",
            examples=_cummax_examples,
        )
        def cummax(self, axis=None, skipna=True, *args, **kwargs):
            return NDFrame.cummax(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cummax", cummax)

        @doc(
            _cnum_doc,
            desc="sum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="sum",
            examples=_cumsum_examples,
        )
        def cumsum(self, axis=None, skipna=True, *args, **kwargs):
            return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cumsum", cumsum)

        @doc(
            _cnum_doc,
            desc="product",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="prod",
            examples=_cumprod_examples,
        )
        def cumprod(self, axis=None, skipna=True, *args, **kwargs):
            return NDFrame.cumprod(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cumprod", cumprod)

        @doc(
            _num_doc,
            desc="Return the sum of the values over the requested axis.\n\n"
            "This is equivalent to the method ``numpy.sum``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count=_min_count_stub,
            see_also=_stat_func_see_also,
            examples=_sum_examples,
        )
        def sum(
            self,
            axis=None,
            skipna=True,
            level=None,
            numeric_only=None,
            min_count=0,
            **kwargs,
        ):
            return NDFrame.sum(
                self, axis, skipna, level, numeric_only, min_count, **kwargs
            )

        setattr(cls, "sum", sum)

        @doc(
            _num_doc,
            desc="Return the product of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count=_min_count_stub,
            see_also=_stat_func_see_also,
            examples=_prod_examples,
        )
        def prod(
            self,
            axis=None,
            skipna=True,
            level=None,
            numeric_only=None,
            min_count=0,
            **kwargs,
        ):
            return NDFrame.prod(
                self, axis, skipna, level, numeric_only, min_count, **kwargs
            )

        setattr(cls, "prod", prod)
        cls.product = prod

        @doc(
            _num_doc,
            desc="Return the mean of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def mean(
            self,
            axis: int | None | lib.NoDefault = lib.no_default,
            skipna=True,
            level=None,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "mean", mean)

        @doc(
            _num_doc,
            desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def skew(
            self,
            axis: int | None | lib.NoDefault = lib.no_default,
            skipna=True,
            level=None,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "skew", skew)

        @doc(
            _num_doc,
            desc="Return unbiased kurtosis over requested axis.\n\n"
            "Kurtosis obtained using Fisher's definition of\n"
            "kurtosis (kurtosis of normal == 0.0). Normalized "
            "by N-1.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def kurt(
            self,
            axis: Axis | None | lib.NoDefault = lib.no_default,
            skipna=True,
            level=None,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "kurt", kurt)
        cls.kurtosis = kurt

        @doc(
            _num_doc,
            desc="Return the median of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def median(
            self,
            axis: int | None | lib.NoDefault = lib.no_default,
            skipna=True,
            level=None,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "median", median)

        @doc(
            _num_doc,
            desc="Return the maximum of the values over the requested axis.\n\n"
            "If you want the *index* of the maximum, use ``idxmax``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmax``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also=_stat_func_see_also,
            examples=_max_examples,
        )
        def max(
            self,
            axis: int | None | lib.NoDefault = lib.no_default,
            skipna=True,
            level=None,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "max", max)

        @doc(
            _num_doc,
            desc="Return the minimum of the values over the requested axis.\n\n"
            "If you want the *index* of the minimum, use ``idxmin``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmin``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also=_stat_func_see_also,
            examples=_min_examples,
        )
        def min(
            self,
            axis: int | None | lib.NoDefault = lib.no_default,
            skipna=True,
            level=None,
            numeric_only=None,
            **kwargs,
        ):
            return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs)

        setattr(cls, "min", min)
11969 @final
11970 @doc(Rolling)
11971 def rolling(
11972 self,
11973 window: int | timedelta | BaseOffset | BaseIndexer,
11974 min_periods: int | None = None,
11975 center: bool_t = False,
11976 win_type: str | None = None,
11977 on: str | None = None,
11978 axis: Axis = 0,
11979 closed: str | None = None,
11980 step: int | None = None,
11981 method: str = "single",
11982 ) -> Window | Rolling:
11983 axis = self._get_axis_number(axis)
11985 if win_type is not None:
11986 return Window(
11987 self,
11988 window=window,
11989 min_periods=min_periods,
11990 center=center,
11991 win_type=win_type,
11992 on=on,
11993 axis=axis,
11994 closed=closed,
11995 step=step,
11996 method=method,
11997 )
11999 return Rolling(
12000 self,
12001 window=window,
12002 min_periods=min_periods,
12003 center=center,
12004 win_type=win_type,
12005 on=on,
12006 axis=axis,
12007 closed=closed,
12008 step=step,
12009 method=method,
12010 )
12012 @final
12013 @doc(Expanding)
12014 def expanding(
12015 self,
12016 min_periods: int = 1,
12017 center: bool_t | None = None,
12018 axis: Axis = 0,
12019 method: str = "single",
12020 ) -> Expanding:
12021 axis = self._get_axis_number(axis)
12022 if center is not None:
12023 warnings.warn(
12024 "The `center` argument on `expanding` will be removed in the future.",
12025 FutureWarning,
12026 stacklevel=find_stack_level(),
12027 )
12028 else:
12029 center = False
12031 return Expanding(
12032 self, min_periods=min_periods, center=center, axis=axis, method=method
12033 )
    @final
    @doc(ExponentialMovingWindow)
    def ewm(
        self,
        com: float | None = None,
        span: float | None = None,
        halflife: float | TimedeltaConvertibleTypes | None = None,
        alpha: float | None = None,
        min_periods: int | None = 0,
        adjust: bool_t = True,
        ignore_na: bool_t = False,
        axis: Axis = 0,
        times: str | np.ndarray | DataFrame | Series | None = None,
        method: str = "single",
    ) -> ExponentialMovingWindow:
        # Thin constructor wrapper: resolve the axis, then hand every
        # parameter through unchanged. Validation of the mutually exclusive
        # decay specifications (com/span/halflife/alpha) is done by
        # ExponentialMovingWindow itself.
        axis = self._get_axis_number(axis)
        return ExponentialMovingWindow(
            self,
            com=com,
            span=span,
            halflife=halflife,
            alpha=alpha,
            min_periods=min_periods,
            adjust=adjust,
            ignore_na=ignore_na,
            axis=axis,
            times=times,
            method=method,
        )
12065 # ----------------------------------------------------------------------
12066 # Arithmetic Methods
    @final
    def _inplace_method(self, other, op):
        """
        Wrap arithmetic method to operate inplace.

        Computes ``op(self, other)`` and writes the result back into
        ``self``, returning ``self`` so the augmented-assignment rebinds to
        the same object.
        """
        result = op(self, other)

        if (
            self.ndim == 1
            and result._indexed_same(self)
            and is_dtype_equal(result.dtype, self.dtype)
        ):
            # GH#36498 this inplace op can _actually_ be inplace.
            self._values[:] = result._values
            return self

        # Delete cacher
        self._reset_cacher()

        # this makes sure that we are aligned like the input
        # we are updating inplace so we want to ignore is_copy
        self._update_inplace(
            result.reindex_like(self, copy=False), verify_is_copy=False
        )
        return self
12094 def __iadd__(self: NDFrameT, other) -> NDFrameT:
12095 # error: Unsupported left operand type for + ("Type[NDFrame]")
12096 return self._inplace_method(other, type(self).__add__) # type: ignore[operator]
12098 def __isub__(self: NDFrameT, other) -> NDFrameT:
12099 # error: Unsupported left operand type for - ("Type[NDFrame]")
12100 return self._inplace_method(other, type(self).__sub__) # type: ignore[operator]
12102 def __imul__(self: NDFrameT, other) -> NDFrameT:
12103 # error: Unsupported left operand type for * ("Type[NDFrame]")
12104 return self._inplace_method(other, type(self).__mul__) # type: ignore[operator]
12106 def __itruediv__(self: NDFrameT, other) -> NDFrameT:
12107 # error: Unsupported left operand type for / ("Type[NDFrame]")
12108 return self._inplace_method(
12109 other, type(self).__truediv__ # type: ignore[operator]
12110 )
12112 def __ifloordiv__(self: NDFrameT, other) -> NDFrameT:
12113 # error: Unsupported left operand type for // ("Type[NDFrame]")
12114 return self._inplace_method(
12115 other, type(self).__floordiv__ # type: ignore[operator]
12116 )
12118 def __imod__(self: NDFrameT, other) -> NDFrameT:
12119 # error: Unsupported left operand type for % ("Type[NDFrame]")
12120 return self._inplace_method(other, type(self).__mod__) # type: ignore[operator]
12122 def __ipow__(self: NDFrameT, other) -> NDFrameT:
12123 # error: Unsupported left operand type for ** ("Type[NDFrame]")
12124 return self._inplace_method(other, type(self).__pow__) # type: ignore[operator]
12126 def __iand__(self: NDFrameT, other) -> NDFrameT:
12127 # error: Unsupported left operand type for & ("Type[NDFrame]")
12128 return self._inplace_method(other, type(self).__and__) # type: ignore[operator]
12130 def __ior__(self: NDFrameT, other) -> NDFrameT:
12131 # error: Unsupported left operand type for | ("Type[NDFrame]")
12132 return self._inplace_method(other, type(self).__or__) # type: ignore[operator]
12134 def __ixor__(self: NDFrameT, other) -> NDFrameT:
12135 # error: Unsupported left operand type for ^ ("Type[NDFrame]")
12136 return self._inplace_method(other, type(self).__xor__) # type: ignore[operator]
12138 # ----------------------------------------------------------------------
12139 # Misc methods
12141 @final
12142 def _find_valid_index(self, *, how: str) -> Hashable | None:
12143 """
12144 Retrieves the index of the first valid value.
12146 Parameters
12147 ----------
12148 how : {'first', 'last'}
12149 Use this parameter to change between the first or last valid index.
12151 Returns
12152 -------
12153 idx_first_valid : type of index
12154 """
12155 idxpos = find_valid_index(self._values, how=how)
12156 if idxpos is None:
12157 return None
12158 return self.index[idxpos]
    @final
    @doc(position="first", klass=_shared_doc_kwargs["klass"])
    def first_valid_index(self) -> Hashable | None:
        """
        Return index for {position} non-NA value or None, if no non-NA value is found.

        Returns
        -------
        scalar : type of index

        Notes
        -----
        If all elements are NA/null, returns None.
        Also returns None for empty {klass}.
        """
        # The docstring doubles as a template: @doc substitutes {position} and
        # {klass}, and last_valid_index reuses this same text.
        return self._find_valid_index(how="first")
    @final
    @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"])
    def last_valid_index(self) -> Hashable | None:
        # Docstring is inherited from first_valid_index via @doc, with
        # {position} rendered as "last".
        return self._find_valid_index(how="last")
12183def _doc_params(cls):
12184 """Return a tuple of the doc params."""
12185 axis_descr = (
12186 f"{{{', '.join([f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS)])}}}"
12187 )
12188 name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar"
12189 name2 = cls.__name__
12190 return axis_descr, name, name2
# ---------------------------------------------------------------------------
# Shared docstring templates. These module-level strings are filled in with
# str.format (via the @doc machinery) to build the per-method docstrings of
# the Series/DataFrame reduction methods.
# ---------------------------------------------------------------------------

# Template for the plain numeric reductions (sum, mean, ...). {min_count} is
# only populated for the ops that accept a min_count argument.
_num_doc = """
{desc}

Parameters
----------
axis : {axis_descr}
    Axis for the function to be applied on.
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values when computing the result.
level : int or level name, default None
    If the axis is a MultiIndex (hierarchical), count along a
    particular level, collapsing into a {name1}.

    .. deprecated:: 1.3.0
        The level keyword is deprecated. Use groupby instead.
numeric_only : bool, default None
    Include only float, int, boolean columns. If None, will attempt to use
    everything, then use only numeric data. Not implemented for Series.

    .. deprecated:: 1.5.0
        Specifying ``numeric_only=None`` is deprecated. The default value will be
        ``False`` in a future version of pandas.

{min_count}\
**kwargs
    Additional keyword arguments to be passed to the function.

Returns
-------
{name1} or {name2} (if level specified)\
{see_also}\
{examples}
"""

# Template for reductions that take a ddof argument (std, var, sem).
_num_ddof_doc = """
{desc}

Parameters
----------
axis : {axis_descr}
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
level : int or level name, default None
    If the axis is a MultiIndex (hierarchical), count along a
    particular level, collapsing into a {name1}.

    .. deprecated:: 1.3.0
        The level keyword is deprecated. Use groupby instead.
ddof : int, default 1
    Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
    where N represents the number of elements.
numeric_only : bool, default None
    Include only float, int, boolean columns. If None, will attempt to use
    everything, then use only numeric data. Not implemented for Series.

    .. deprecated:: 1.5.0
        Specifying ``numeric_only=None`` is deprecated. The default value will be
        ``False`` in a future version of pandas.

Returns
-------
{name1} or {name2} (if level specified) \
{notes}\
{examples}
"""

# Notes section injected into the std docstring via {notes}.
_std_notes = """

Notes
-----
To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
default `ddof=1`)"""
# Examples section injected into the std docstring via {examples}.
_std_examples = """

Examples
--------
>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
...                    'age': [21, 25, 62, 43],
...                    'height': [1.61, 1.87, 1.49, 2.01]}
...                   ).set_index('person_id')
>>> df
           age  height
person_id
0           21    1.61
1           25    1.87
2           62    1.49
3           43    2.01

The standard deviation of the columns can be found as follows:

>>> df.std()
age       18.786076
height     0.237417

Alternatively, `ddof=0` can be set to normalize by N instead of N-1:

>>> df.std(ddof=0)
age       16.269219
height     0.205609"""

# Examples section injected into the var docstring via {examples}.
_var_examples = """

Examples
--------
>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
...                    'age': [21, 25, 62, 43],
...                    'height': [1.61, 1.87, 1.49, 2.01]}
...                   ).set_index('person_id')
>>> df
           age  height
person_id
0           21    1.61
1           25    1.87
2           62    1.49
3           43    2.01

>>> df.var()
age       352.916667
height      0.056367

Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:

>>> df.var(ddof=0)
age       264.687500
height      0.042275"""

# Template for the boolean reductions any/all; {empty_value} is the identity
# element of the operation (True for all, False for any).
_bool_doc = """
{desc}

Parameters
----------
axis : {{0 or 'index', 1 or 'columns', None}}, default 0
    Indicate which axis or axes should be reduced. For `Series` this parameter
    is unused and defaults to 0.

    * 0 / 'index' : reduce the index, return a Series whose index is the
      original column labels.
    * 1 / 'columns' : reduce the columns, return a Series whose index is the
      original index.
    * None : reduce all axes, return a scalar.

bool_only : bool, default None
    Include only boolean columns. If None, will attempt to use everything,
    then use only boolean data. Not implemented for Series.
skipna : bool, default True
    Exclude NA/null values. If the entire row/column is NA and skipna is
    True, then the result will be {empty_value}, as for an empty row/column.
    If skipna is False, then NA are treated as True, because these are not
    equal to zero.
level : int or level name, default None
    If the axis is a MultiIndex (hierarchical), count along a
    particular level, collapsing into a {name1}.

    .. deprecated:: 1.3.0
        The level keyword is deprecated. Use groupby instead.
**kwargs : any, default None
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    If level is specified, then, {name2} is returned; otherwise, {name1}
    is returned.

{see_also}
{examples}"""
# Description section for the `all` docstring ({desc} of _bool_doc).
_all_desc = """\
Return whether all elements are True, potentially over an axis.

Returns True unless there at least one element within a series or
along a Dataframe axis that is False or equivalent (e.g. zero or
empty)."""

# Examples section for the `all` docstring ({examples} of _bool_doc).
_all_examples = """\
Examples
--------
**Series**

>>> pd.Series([True, True]).all()
True
>>> pd.Series([True, False]).all()
False
>>> pd.Series([], dtype="float64").all()
True
>>> pd.Series([np.nan]).all()
True
>>> pd.Series([np.nan]).all(skipna=False)
True

**DataFrames**

Create a dataframe from a dictionary.

>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
>>> df
   col1   col2
0  True   True
1  True  False

Default behaviour checks if values in each column all return True.

>>> df.all()
col1     True
col2    False
dtype: bool

Specify ``axis='columns'`` to check if values in each row all return True.

>>> df.all(axis='columns')
0     True
1    False
dtype: bool

Or ``axis=None`` for whether every value is True.

>>> df.all(axis=None)
False
"""

# See Also section for the `all` docstring ({see_also} of _bool_doc).
_all_see_also = """\
See Also
--------
Series.all : Return True if all elements are True.
DataFrame.any : Return True if one (or more) elements are True.
"""

# Template for the cumulative accumulations (cummin/cummax/cumsum/cumprod).
_cnum_doc = """
Return cumulative {desc} over a DataFrame or Series axis.

Returns a DataFrame or Series of the same size containing the cumulative
{desc}.

Parameters
----------
axis : {{0 or 'index', 1 or 'columns'}}, default 0
    The index or the name of the axis. 0 is equivalent to None or 'index'.
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
*args, **kwargs
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    Return cumulative {desc} of {name1} or {name2}.

See Also
--------
core.window.expanding.Expanding.{accum_func_name} : Similar functionality
    but ignores ``NaN`` values.
{name2}.{accum_func_name} : Return the {desc} over
    {name2} axis.
{name2}.cummax : Return cumulative maximum over {name2} axis.
{name2}.cummin : Return cumulative minimum over {name2} axis.
{name2}.cumsum : Return cumulative sum over {name2} axis.
{name2}.cumprod : Return cumulative product over {name2} axis.

{examples}"""
# Examples sections for the cumulative accumulations; each is injected into
# _cnum_doc via {examples} for the corresponding method.
_cummin_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummin()
0    2.0
1    NaN
2    2.0
3   -1.0
4   -1.0
dtype: float64

To include NA values in the operation, use ``skipna=False``

>>> s.cummin(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the minimum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummin()
     A    B
0  2.0  1.0
1  2.0  NaN
2  1.0  0.0

To iterate over columns and find the minimum in each row,
use ``axis=1``

>>> df.cummin(axis=1)
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0
"""

_cumsum_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumsum()
0    2.0
1    NaN
2    7.0
3    6.0
4    6.0
dtype: float64

To include NA values in the operation, use ``skipna=False``

>>> s.cumsum(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the sum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumsum()
     A    B
0  2.0  1.0
1  5.0  NaN
2  6.0  1.0

To iterate over columns and find the sum in each row,
use ``axis=1``

>>> df.cumsum(axis=1)
     A    B
0  2.0  3.0
1  3.0  NaN
2  1.0  1.0
"""

_cumprod_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumprod()
0     2.0
1     NaN
2    10.0
3   -10.0
4    -0.0
dtype: float64

To include NA values in the operation, use ``skipna=False``

>>> s.cumprod(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the product
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumprod()
     A    B
0  2.0  1.0
1  6.0  NaN
2  6.0  0.0

To iterate over columns and find the product in each row,
use ``axis=1``

>>> df.cumprod(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  0.0
"""

_cummax_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummax()
0    2.0
1    NaN
2    5.0
3    5.0
4    5.0
dtype: float64

To include NA values in the operation, use ``skipna=False``

>>> s.cummax(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the maximum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummax()
     A    B
0  2.0  1.0
1  3.0  NaN
2  3.0  1.0

To iterate over columns and find the maximum in each row,
use ``axis=1``

>>> df.cummax(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  1.0
"""
# See Also section for the `any` docstring ({see_also} of _bool_doc).
_any_see_also = """\
See Also
--------
numpy.any : Numpy version of this method.
Series.any : Return whether any element is True.
Series.all : Return whether all elements are True.
DataFrame.any : Return whether any element is True over requested axis.
DataFrame.all : Return whether all elements are True over requested axis.
"""

# Description section for the `any` docstring ({desc} of _bool_doc).
_any_desc = """\
Return whether any element is True, potentially over an axis.

Returns False unless there is at least one element within a series or
along a Dataframe axis that is True or equivalent (e.g. non-zero or
non-empty)."""

# Examples section for the `any` docstring ({examples} of _bool_doc).
_any_examples = """\
Examples
--------
**Series**

For Series input, the output is a scalar indicating whether any element
is True.

>>> pd.Series([False, False]).any()
False
>>> pd.Series([True, False]).any()
True
>>> pd.Series([], dtype="float64").any()
False
>>> pd.Series([np.nan]).any()
False
>>> pd.Series([np.nan]).any(skipna=False)
True

**DataFrame**

Whether each column contains at least one True element (the default).

>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
>>> df
   A  B  C
0  1  0  0
1  2  2  0

>>> df.any()
A     True
B     True
C    False
dtype: bool

Aggregating over the columns.

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
>>> df
       A  B
0   True  1
1  False  2

>>> df.any(axis='columns')
0    True
1    True
dtype: bool

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
>>> df
       A  B
0   True  1
1  False  0

>>> df.any(axis='columns')
0    True
1    False
dtype: bool

Aggregating over the entire DataFrame with ``axis=None``.

>>> df.any(axis=None)
True

`any` for an empty DataFrame is an empty Series.

>>> pd.DataFrame([]).any()
Series([], dtype: bool)
"""
# Common example skeleton for sum/min/max; specialized below via .format().
_shared_docs[
    "stat_func_example"
] = """

Examples
--------
>>> idx = pd.MultiIndex.from_arrays([
...     ['warm', 'warm', 'cold', 'cold'],
...     ['dog', 'falcon', 'fish', 'spider']],
...     names=['blooded', 'animal'])
>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
>>> s
blooded  animal
warm     dog       4
         falcon    2
cold     fish      0
         spider    8
Name: legs, dtype: int64

>>> s.{stat_func}()
{default_output}"""

_sum_examples = _shared_docs["stat_func_example"].format(
    stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
)

# Extra min_count-specific examples appended only for sum.
_sum_examples += """

By default, the sum of an empty or all-NA Series is ``0``.

>>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
0.0

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.

>>> pd.Series([], dtype="float64").sum(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).sum()
0.0

>>> pd.Series([np.nan]).sum(min_count=1)
nan"""

_max_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
)

_min_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
)

# See Also section shared by the min/max/sum docstrings.
_stat_func_see_also = """

See Also
--------
Series.sum : Return the sum.
Series.min : Return the minimum.
Series.max : Return the maximum.
Series.idxmin : Return the index of the minimum.
Series.idxmax : Return the index of the maximum.
DataFrame.sum : Return the sum over the requested axis.
DataFrame.min : Return the minimum over the requested axis.
DataFrame.max : Return the maximum over the requested axis.
DataFrame.idxmin : Return the index of the minimum over the requested axis.
DataFrame.idxmax : Return the index of the maximum over the requested axis."""

# Examples section for prod (the product's empty identity is 1).
_prod_examples = """

Examples
--------
By default, the product of an empty or all-NA Series is ``1``

>>> pd.Series([], dtype="float64").prod()
1.0

This can be controlled with the ``min_count`` parameter

>>> pd.Series([], dtype="float64").prod(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).prod()
1.0

>>> pd.Series([np.nan]).prod(min_count=1)
nan"""

# Parameter stub spliced into _num_doc's {min_count} slot for sum/prod.
_min_count_stub = """\
min_count : int, default 0
    The required number of valid values to perform the operation. If fewer than
    ``min_count`` non-NA values are present the result will be NA.
"""
12909def _align_as_utc(
12910 left: NDFrameT, right: NDFrameT, join_index: Index | None
12911) -> tuple[NDFrameT, NDFrameT]:
12912 """
12913 If we are aligning timezone-aware DatetimeIndexes and the timezones
12914 do not match, convert both to UTC.
12915 """
12916 if is_datetime64tz_dtype(left.index.dtype):
12917 if left.index.tz != right.index.tz:
12918 if join_index is not None:
12919 # GH#33671 ensure we don't change the index on
12920 # our original Series (NB: by default deep=False)
12921 left = left.copy()
12922 right = right.copy()
12923 left.index = join_index
12924 right.index = join_index
12926 return left, right