Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/frame.py: 14%
2312 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2DataFrame
3---------
4An efficient 2D container for potentially mixed-type time series or other
5labeled data series.
7Similar to its R counterpart, data.frame, except providing automatic data
8alignment and a host of useful data manipulation methods having to do with the
9labeling information
10"""
11from __future__ import annotations
13import collections
14from collections import abc
15import datetime
16import functools
17from io import StringIO
18import itertools
19from textwrap import dedent
20from typing import (
21 TYPE_CHECKING,
22 Any,
23 Callable,
24 Hashable,
25 Iterable,
26 Iterator,
27 Literal,
28 Mapping,
29 Sequence,
30 cast,
31 overload,
32)
33import warnings
35import numpy as np
36import numpy.ma as ma
38from pandas._config import get_option
40from pandas._libs import (
41 algos as libalgos,
42 lib,
43 properties,
44)
45from pandas._libs.hashtable import duplicated
46from pandas._libs.lib import (
47 NoDefault,
48 no_default,
49)
50from pandas._typing import (
51 AggFuncType,
52 AnyArrayLike,
53 ArrayLike,
54 Axes,
55 Axis,
56 ColspaceArgType,
57 CompressionOptions,
58 Dtype,
59 DtypeObj,
60 FilePath,
61 FillnaOptions,
62 FloatFormatType,
63 FormattersType,
64 Frequency,
65 IgnoreRaise,
66 IndexKeyFunc,
67 IndexLabel,
68 Level,
69 NaPosition,
70 PythonFuncType,
71 QuantileInterpolation,
72 ReadBuffer,
73 Renamer,
74 Scalar,
75 SortKind,
76 StorageOptions,
77 Suffixes,
78 TimedeltaConvertibleTypes,
79 TimestampConvertibleTypes,
80 ValueKeyFunc,
81 WriteBuffer,
82 npt,
83)
84from pandas.compat._optional import import_optional_dependency
85from pandas.compat.numpy import (
86 function as nv,
87 np_percentile_argname,
88)
89from pandas.errors import InvalidIndexError
90from pandas.util._decorators import (
91 Appender,
92 Substitution,
93 deprecate_kwarg,
94 deprecate_nonkeyword_arguments,
95 doc,
96 rewrite_axis_style_signature,
97)
98from pandas.util._exceptions import find_stack_level
99from pandas.util._validators import (
100 validate_ascending,
101 validate_axis_style_args,
102 validate_bool_kwarg,
103 validate_percentile,
104)
106from pandas.core.dtypes.cast import (
107 can_hold_element,
108 construct_1d_arraylike_from_scalar,
109 construct_2d_arraylike_from_scalar,
110 find_common_type,
111 infer_dtype_from_scalar,
112 invalidate_string_dtypes,
113 maybe_box_native,
114 maybe_downcast_to_dtype,
115)
116from pandas.core.dtypes.common import (
117 ensure_platform_int,
118 infer_dtype_from_object,
119 is_1d_only_ea_dtype,
120 is_bool_dtype,
121 is_dataclass,
122 is_datetime64_any_dtype,
123 is_dict_like,
124 is_dtype_equal,
125 is_extension_array_dtype,
126 is_float,
127 is_float_dtype,
128 is_hashable,
129 is_integer,
130 is_integer_dtype,
131 is_iterator,
132 is_list_like,
133 is_numeric_dtype,
134 is_object_dtype,
135 is_scalar,
136 is_sequence,
137 needs_i8_conversion,
138 pandas_dtype,
139)
140from pandas.core.dtypes.dtypes import ExtensionDtype
141from pandas.core.dtypes.missing import (
142 isna,
143 notna,
144)
146from pandas.core import (
147 algorithms,
148 common as com,
149 nanops,
150 ops,
151)
152from pandas.core.accessor import CachedAccessor
153from pandas.core.apply import (
154 reconstruct_func,
155 relabel_result,
156)
157from pandas.core.array_algos.take import take_2d_multi
158from pandas.core.arraylike import OpsMixin
159from pandas.core.arrays import (
160 DatetimeArray,
161 ExtensionArray,
162 PeriodArray,
163 TimedeltaArray,
164)
165from pandas.core.arrays.sparse import SparseFrameAccessor
166from pandas.core.construction import (
167 extract_array,
168 sanitize_array,
169 sanitize_masked_array,
170)
171from pandas.core.generic import NDFrame
172from pandas.core.indexers import check_key_length
173from pandas.core.indexes.api import (
174 DatetimeIndex,
175 Index,
176 PeriodIndex,
177 default_index,
178 ensure_index,
179 ensure_index_from_sequences,
180)
181from pandas.core.indexes.multi import (
182 MultiIndex,
183 maybe_droplevels,
184)
185from pandas.core.indexing import (
186 check_bool_indexer,
187 check_deprecated_indexers,
188 convert_to_index_sliceable,
189)
190from pandas.core.internals import (
191 ArrayManager,
192 BlockManager,
193)
194from pandas.core.internals.construction import (
195 arrays_to_mgr,
196 dataclasses_to_dicts,
197 dict_to_mgr,
198 mgr_to_mgr,
199 ndarray_to_mgr,
200 nested_data_to_arrays,
201 rec_array_to_mgr,
202 reorder_arrays,
203 to_arrays,
204 treat_as_nested,
205)
206from pandas.core.reshape.melt import melt
207from pandas.core.series import Series
208from pandas.core.shared_docs import _shared_docs
209from pandas.core.sorting import (
210 get_group_index,
211 lexsort_indexer,
212 nargsort,
213)
215from pandas.io.common import get_handle
216from pandas.io.formats import (
217 console,
218 format as fmt,
219)
220from pandas.io.formats.info import (
221 INFO_DOCSTRING,
222 DataFrameInfo,
223 frame_sub_kwargs,
224)
225import pandas.plotting
227if TYPE_CHECKING: 227 ↛ 229line 227 didn't jump to line 229, because the condition on line 227 was never true
229 from pandas.core.groupby.generic import DataFrameGroupBy
230 from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
231 from pandas.core.internals import SingleDataManager
232 from pandas.core.resample import Resampler
234 from pandas.io.formats.style import Styler
# ---------------------------------------------------------------------
# Docstring templates

# Substitution values injected into the docstrings of many DataFrame methods
# via the @doc/@Substitution decorators.
_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "inplace": """
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by.

    - if `axis` is 0 or `'index'` then `by` may contain index
      levels and/or column labels.
    - if `axis` is 1 or `'columns'` then `by` may contain column
      levels and/or index labels.""",
    "optional_labels": """labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    "optional_axis": """axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}

# Shared description of the ``numeric_only`` parameter.
_numeric_only_doc = """numeric_only : bool or None, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data
"""

# Full docstring for DataFrame.merge / pd.merge; ``%s`` receives the
# left-frame parameter description when used for the module-level function.
_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

A named Series object is treated as a DataFrame with a single named column.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

.. warning::

    If both key columns contain rows where the key is a null value, those
    rows will be matched against each other. This is different from usual SQL
    join behaviour and can lead to unexpected results.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

      .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0
Support for merging named Series objects was added in version 0.24.0

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1
      a  b
0   foo  1
1   bar  2
>>> df2
      a  c
0   foo  3
1   baz  4

>>> df1.merge(df2, how='inner', on='a')
      a  b  c
0   foo  1  3

>>> df1.merge(df2, how='left', on='a')
      a  b  c
0   foo  1  3.0
1   bar  2  NaN

>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
>>> df2 = pd.DataFrame({'right': [7, 8]})
>>> df1
    left
0   foo
1   bar
>>> df2
    right
0   7
1   8

>>> df1.merge(df2, how='cross')
   left  right
0   foo      7
1   foo      8
2   bar      7
3   bar      8
"""
470# -----------------------------------------------------------------------
471# DataFrame class
class DataFrame(NDFrame, OpsMixin):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, dataclass or list-like objects. If
        data is a dict, column order follows insertion-order. If a dict contains Series
        which have an index defined, it is aligned by its index.

        .. versionchanged:: 0.25.0
           If data is a list of dicts, column order follows insertion-order.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided.
    columns : Index or array-like
        Column labels to use for resulting frame when data does not have them,
        defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
        will perform column selection instead.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool or None, default None
        Copy data from inputs.
        For dict data, the default of None behaves like ``copy=True``. For DataFrame
        or 2d ndarray input, the default of None behaves like ``copy=False``.
        If data is a dict containing one or more Series (possibly of different dtypes),
        ``copy=False`` will ensure that these inputs are not copied.

        .. versionchanged:: 1.3.0

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_table : Read general delimited file into DataFrame.
    read_clipboard : Read text from clipboard into DataFrame.

    Notes
    -----
    Please reference the :ref:`User Guide <basics.dataframe>` for more information.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from a dictionary including Series:

    >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
       col1  col2
    0     0   NaN
    1     1   NaN
    2     2   2.0
    3     3   3.0

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9

    Constructing DataFrame from a numpy ndarray that has labeled columns:

    >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
    ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
    >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
    ...
    >>> df3
       c  a
    0  3  1
    1  6  4
    2  9  7

    Constructing DataFrame from dataclass:

    >>> from dataclasses import make_dataclass
    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
       x  y
    0  0  0
    1  0  3
    2  2  3
    """
    # DataFrame-specific internal attribute names, in addition to those
    # inherited from NDFrame.
    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
    _typ = "dataframe"
    # Types handled by the OpsMixin/array-ufunc machinery.
    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
    # Accessor namespaces registered on the class (e.g. ``df.sparse``).
    _accessors: set[str] = {"sparse"}
    _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
    _mgr: BlockManager | ArrayManager  # the internal data manager

    @property
    def _constructor(self) -> Callable[..., DataFrame]:
        # Class used to build new 2D results from operations on this object.
        return DataFrame

    # Class used when an operation reduces to 1D (a single row or column).
    _constructor_sliced: Callable[..., Series] = Series
605 # ----------------------------------------------------------------------
606 # Constructors
    def __init__(
        self,
        data=None,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
        copy: bool | None = None,
    ) -> None:
        # Dispatch on the type of ``data`` (manager, DataFrame, dict, masked
        # array, ndarray/Series/Index/EA, generic list-like, or scalar) and
        # convert it into an internal manager (``mgr``) for NDFrame.__init__.
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            # Share (or later copy) the other frame's internal manager.
            data = data._mgr

        if isinstance(data, (BlockManager, ArrayManager)):
            # first check if a Manager is passed without any other arguments
            # -> use fastpath (without checking Manager type)
            if index is None and columns is None and dtype is None and not copy:
                # GH#33357 fastpath
                NDFrame.__init__(self, data)
                return

        manager = get_option("mode.data_manager")

        # GH47215: sets are unordered, so they are not valid axis input.
        if index is not None and isinstance(index, set):
            raise ValueError("index cannot be a set")
        if columns is not None and isinstance(columns, set):
            raise ValueError("columns cannot be a set")

        if copy is None:
            if isinstance(data, dict):
                # retain pre-GH#38939 default behavior
                copy = True
            elif (
                manager == "array"
                and isinstance(data, (np.ndarray, ExtensionArray))
                and data.ndim == 2
            ):
                # INFO(ArrayManager) by default copy the 2D input array to get
                # contiguous 1D arrays
                copy = True
            else:
                copy = False

        if isinstance(data, (BlockManager, ArrayManager)):
            mgr = self._init_mgr(
                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
            )

        elif isinstance(data, dict):
            # GH#38939 de facto copy defaults to False only in non-dict cases
            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
                warnings.warn(
                    "Support for MaskedRecords is deprecated and will be "
                    "removed in a future version. Pass "
                    "{name: data[name] for name in data.dtype.names} instead.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

            # a masked array
            else:
                # Masked entries become NaN/NaT before normal ndarray handling.
                data = sanitize_masked_array(data)
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
            if data.dtype.names:
                # i.e. numpy structured array
                data = cast(np.ndarray, data)
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
            elif getattr(data, "name", None) is not None:
                # i.e. Series/Index with non-None name
                mgr = dict_to_mgr(
                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
                    # attribute "name"
                    {data.name: data},  # type: ignore[union-attr]
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                )
            else:
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        # For data is list-like, or Iterable (will consume into list)
        elif is_list_like(data):
            if not isinstance(data, (abc.Sequence, ExtensionArray)):
                if hasattr(data, "__array__"):
                    # GH#44616 big perf improvement for e.g. pytorch tensor
                    data = np.asarray(data)
                else:
                    data = list(data)
            if len(data) > 0:
                if is_dataclass(data[0]):
                    data = dataclasses_to_dicts(data)
                if not isinstance(data, np.ndarray) and treat_as_nested(data):
                    # exclude ndarray as we may have cast it a few lines above
                    if columns is not None:
                        columns = ensure_index(columns)
                    arrays, columns, index = nested_data_to_arrays(
                        # error: Argument 3 to "nested_data_to_arrays" has incompatible
                        # type "Optional[Collection[Any]]"; expected "Optional[Index]"
                        data,
                        columns,
                        index,  # type: ignore[arg-type]
                        dtype,
                    )
                    mgr = arrays_to_mgr(
                        arrays,
                        columns,
                        index,
                        dtype=dtype,
                        typ=manager,
                    )
                else:
                    mgr = ndarray_to_mgr(
                        data,
                        index,
                        columns,
                        dtype=dtype,
                        copy=copy,
                        typ=manager,
                    )
            else:
                # Empty list-like -> empty frame (possibly with given axes).
                mgr = dict_to_mgr(
                    {},
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                )
        # For data is scalar
        else:
            # A scalar can only be broadcast when both axes are provided.
            if index is None or columns is None:
                raise ValueError("DataFrame constructor not properly called!")

            index = ensure_index(index)
            columns = ensure_index(columns)

            if not dtype:
                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

            # For data is a scalar extension dtype
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): special case not needed with 2D EAs

                values = [
                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
                    for _ in range(len(columns))
                ]
                mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
            else:
                arr2d = construct_2d_arraylike_from_scalar(
                    data,
                    len(index),
                    len(columns),
                    dtype,
                    copy,
                )

                mgr = ndarray_to_mgr(
                    arr2d,
                    index,
                    columns,
                    dtype=arr2d.dtype,
                    copy=False,
                    typ=manager,
                )

        # ensure correct Manager type according to settings
        mgr = mgr_to_mgr(mgr, typ=manager)

        NDFrame.__init__(self, mgr)
820 # ----------------------------------------------------------------------
821 def __dataframe__(
822 self, nan_as_null: bool = False, allow_copy: bool = True
823 ) -> DataFrameXchg:
824 """
825 Return the dataframe interchange object implementing the interchange protocol.
827 Parameters
828 ----------
829 nan_as_null : bool, default False
830 Whether to tell the DataFrame to overwrite null values in the data
831 with ``NaN`` (or ``NaT``).
832 allow_copy : bool, default True
833 Whether to allow memory copying when exporting. If set to False
834 it would cause non-zero-copy exports to fail.
836 Returns
837 -------
838 DataFrame interchange object
839 The object which consuming library can use to ingress the dataframe.
841 Notes
842 -----
843 Details on the interchange protocol:
844 https://data-apis.org/dataframe-protocol/latest/index.html
846 `nan_as_null` currently has no effect; once support for nullable extension
847 dtypes is added, this value should be propagated to columns.
848 """
850 from pandas.core.interchange.dataframe import PandasDataFrameXchg
852 return PandasDataFrameXchg(self, nan_as_null, allow_copy)
854 # ----------------------------------------------------------------------
    @property
    def axes(self) -> list[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        # Row labels first, column labels second.
        return [self.index, self.columns]
873 @property
874 def shape(self) -> tuple[int, int]:
875 """
876 Return a tuple representing the dimensionality of the DataFrame.
878 See Also
879 --------
880 ndarray.shape : Tuple of array dimensions.
882 Examples
883 --------
884 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
885 >>> df.shape
886 (2, 2)
888 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
889 ... 'col3': [5, 6]})
890 >>> df.shape
891 (2, 3)
892 """
893 return len(self.index), len(self.columns)
895 @property
896 def _is_homogeneous_type(self) -> bool:
897 """
898 Whether all the columns in a DataFrame have the same type.
900 Returns
901 -------
902 bool
904 See Also
905 --------
906 Index._is_homogeneous_type : Whether the object has a single
907 dtype.
908 MultiIndex._is_homogeneous_type : Whether all the levels of a
909 MultiIndex have the same dtype.
911 Examples
912 --------
913 >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
914 True
915 >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
916 False
918 Items with the same type but different sizes are considered
919 different types.
921 >>> DataFrame({
922 ... "A": np.array([1, 2], dtype=np.int32),
923 ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
924 False
925 """
926 if isinstance(self._mgr, ArrayManager):
927 return len({arr.dtype for arr in self._mgr.arrays}) == 1
928 if self._mgr.any_extension_types:
929 return len({block.dtype for block in self._mgr.blocks}) == 1
930 else:
931 return not self._is_mixed_type
933 @property
934 def _can_fast_transpose(self) -> bool:
935 """
936 Can we transpose this DataFrame without creating any new array objects.
937 """
938 if isinstance(self._mgr, ArrayManager):
939 return False
940 blocks = self._mgr.blocks
941 if len(blocks) != 1:
942 return False
944 dtype = blocks[0].dtype
945 # TODO(EA2D) special case would be unnecessary with 2D EAs
946 return not is_1d_only_ea_dtype(dtype)
    # error: Return type "Union[ndarray, DatetimeArray, TimedeltaArray]" of
    # "_values" incompatible with return type "ndarray" in supertype "NDFrame"
    @property
    def _values(  # type: ignore[override]
        self,
    ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
        """
        Analogue to ._values that may return a 2D ExtensionArray.

        Falls back to ``self.values`` (which may copy/upcast) unless the data
        is backed by exactly one 2D block/array, in which case the underlying
        storage is returned directly (transposed for BlockManager layout).
        """
        self._consolidate_inplace()

        mgr = self._mgr

        if isinstance(mgr, ArrayManager):
            if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
                # has no attribute "reshape"
                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
            return self.values

        blocks = mgr.blocks
        if len(blocks) != 1:
            return self.values

        arr = blocks[0].values
        if arr.ndim == 1:
            # non-2D ExtensionArray
            return self.values

        # more generally, whatever we allow in NDArrayBackedExtensionBlock
        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
        return arr.T
981 # ----------------------------------------------------------------------
982 # Rendering Methods
984 def _repr_fits_vertical_(self) -> bool:
985 """
986 Check length against max_rows.
987 """
988 max_rows = get_option("display.max_rows")
989 return len(self) <= max_rows
    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or width is None or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:  # unlimited rows
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width
1044 def _info_repr(self) -> bool:
1045 """
1046 True if the repr should show the info view.
1047 """
1048 info_repr_option = get_option("display.large_repr") == "info"
1049 return info_repr_option and not (
1050 self._repr_fits_horizontal_() and self._repr_fits_vertical_()
1051 )
1053 def __repr__(self) -> str:
1054 """
1055 Return a string representation for a particular DataFrame.
1056 """
1057 if self._info_repr():
1058 buf = StringIO()
1059 self.info(buf=buf)
1060 return buf.getvalue()
1062 repr_params = fmt.get_dataframe_repr_params()
1063 return self.to_string(**repr_params)
1065 def _repr_html_(self) -> str | None:
1066 """
1067 Return a html representation for a particular DataFrame.
1069 Mainly for IPython notebook.
1070 """
1071 if self._info_repr():
1072 buf = StringIO()
1073 self.info(buf=buf)
1074 # need to escape the <class>, should be the first line.
1075 val = buf.getvalue().replace("<", r"<", 1)
1076 val = val.replace(">", r">", 1)
1077 return "<pre>" + val + "</pre>"
1079 if get_option("display.notebook_repr_html"):
1080 max_rows = get_option("display.max_rows")
1081 min_rows = get_option("display.min_rows")
1082 max_cols = get_option("display.max_columns")
1083 show_dimensions = get_option("display.show_dimensions")
1085 formatter = fmt.DataFrameFormatter(
1086 self,
1087 columns=None,
1088 col_space=None,
1089 na_rep="NaN",
1090 formatters=None,
1091 float_format=None,
1092 sparsify=None,
1093 justify=None,
1094 index_names=True,
1095 header=True,
1096 index=True,
1097 bold_rows=True,
1098 escape=True,
1099 max_rows=max_rows,
1100 min_rows=min_rows,
1101 max_cols=max_cols,
1102 show_dimensions=show_dimensions,
1103 decimal=".",
1104 )
1105 return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
1106 else:
1107 return None
    # typing overload: with the default ``buf=None`` the rendered table is
    # returned as a ``str``.
    @overload
    def to_string(
        self,
        buf: None = ...,
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> str:
        ...
    # Overload: when ``buf`` is a path or writable buffer the output is
    # written there and the method returns ``None``.
    @overload
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> None:
        ...
1159 @Substitution(
1160 header_type="bool or sequence of str",
1161 header="Write out the column names. If a list of strings "
1162 "is given, it is assumed to be aliases for the "
1163 "column names",
1164 col_space_type="int, list or dict of int",
1165 col_space="The minimum width of each column. If a list of ints is given "
1166 "every integers corresponds with one column. If a dict is given, the key "
1167 "references the column, while the value defines the space to use.",
1168 )
1169 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
1170 def to_string(
1171 self,
1172 buf: FilePath | WriteBuffer[str] | None = None,
1173 columns: Sequence[str] | None = None,
1174 col_space: int | list[int] | dict[Hashable, int] | None = None,
1175 header: bool | Sequence[str] = True,
1176 index: bool = True,
1177 na_rep: str = "NaN",
1178 formatters: fmt.FormattersType | None = None,
1179 float_format: fmt.FloatFormatType | None = None,
1180 sparsify: bool | None = None,
1181 index_names: bool = True,
1182 justify: str | None = None,
1183 max_rows: int | None = None,
1184 max_cols: int | None = None,
1185 show_dimensions: bool = False,
1186 decimal: str = ".",
1187 line_width: int | None = None,
1188 min_rows: int | None = None,
1189 max_colwidth: int | None = None,
1190 encoding: str | None = None,
1191 ) -> str | None:
1192 """
1193 Render a DataFrame to a console-friendly tabular output.
1194 %(shared_params)s
1195 line_width : int, optional
1196 Width to wrap a line in characters.
1197 min_rows : int, optional
1198 The number of rows to display in the console in a truncated repr
1199 (when number of rows is above `max_rows`).
1200 max_colwidth : int, optional
1201 Max width to truncate each column in characters. By default, no limit.
1203 .. versionadded:: 1.0.0
1204 encoding : str, default "utf-8"
1205 Set character encoding.
1207 .. versionadded:: 1.0
1208 %(returns)s
1209 See Also
1210 --------
1211 to_html : Convert DataFrame to HTML.
1213 Examples
1214 --------
1215 >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
1216 >>> df = pd.DataFrame(d)
1217 >>> print(df.to_string())
1218 col1 col2
1219 0 1 4
1220 1 2 5
1221 2 3 6
1222 """
1223 from pandas import option_context
1225 with option_context("display.max_colwidth", max_colwidth):
1226 formatter = fmt.DataFrameFormatter(
1227 self,
1228 columns=columns,
1229 col_space=col_space,
1230 na_rep=na_rep,
1231 formatters=formatters,
1232 float_format=float_format,
1233 sparsify=sparsify,
1234 justify=justify,
1235 index_names=index_names,
1236 header=header,
1237 index=index,
1238 min_rows=min_rows,
1239 max_rows=max_rows,
1240 max_cols=max_cols,
1241 show_dimensions=show_dimensions,
1242 decimal=decimal,
1243 )
1244 return fmt.DataFrameRenderer(formatter).to_string(
1245 buf=buf,
1246 encoding=encoding,
1247 line_width=line_width,
1248 )
1250 # ----------------------------------------------------------------------
1252 @property
1253 def style(self) -> Styler:
1254 """
1255 Returns a Styler object.
1257 Contains methods for building a styled HTML representation of the DataFrame.
1259 See Also
1260 --------
1261 io.formats.style.Styler : Helps style a DataFrame or Series according to the
1262 data with HTML and CSS.
1263 """
1264 from pandas.io.formats.style import Styler
1266 return Styler(self)
    # Shared docstring for ``DataFrame.items``; attached to the method below
    # via the @Appender decorator so it can be reused elsewhere.
    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                   'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
            species   population
    panda    bear      1864
    polar    bear      22000
    koala    marsupial 80000
    >>> for label, content in df.items():
    ...     print(f'label: {label}')
    ...     print(f'content: {content}', sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """
1318 @Appender(_shared_docs["items"])
1319 def items(self) -> Iterable[tuple[Hashable, Series]]:
1320 if self.columns.is_unique and hasattr(self, "_item_cache"):
1321 for k in self.columns:
1322 yield k, self._get_item_cache(k)
1323 else:
1324 for i, k in enumerate(self.columns):
1325 yield k, self._ixs(i, axis=1)
1327 _shared_docs[
1328 "iteritems"
1329 ] = r"""
1330 Iterate over (column name, Series) pairs.
1332 .. deprecated:: 1.5.0
1333 iteritems is deprecated and will be removed in a future version.
1334 Use .items instead.
1336 Iterates over the DataFrame columns, returning a tuple with
1337 the column name and the content as a Series.
1339 Yields
1340 ------
1341 label : object
1342 The column names for the DataFrame being iterated over.
1343 content : Series
1344 The column entries belonging to each label, as a Series.
1346 See Also
1347 --------
1348 DataFrame.iter : Recommended alternative.
1349 DataFrame.iterrows : Iterate over DataFrame rows as
1350 (index, Series) pairs.
1351 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
1352 of the values.
1353 """
1355 @Appender(_shared_docs["iteritems"])
1356 def iteritems(self) -> Iterable[tuple[Hashable, Series]]:
1357 warnings.warn(
1358 "iteritems is deprecated and will be removed in a future version. "
1359 "Use .items instead.",
1360 FutureWarning,
1361 stacklevel=find_stack_level(),
1362 )
1363 yield from self.items()
1365 def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
1366 """
1367 Iterate over DataFrame rows as (index, Series) pairs.
1369 Yields
1370 ------
1371 index : label or tuple of label
1372 The index of the row. A tuple for a `MultiIndex`.
1373 data : Series
1374 The data of the row as a Series.
1376 See Also
1377 --------
1378 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
1379 DataFrame.items : Iterate over (column name, Series) pairs.
1381 Notes
1382 -----
1383 1. Because ``iterrows`` returns a Series for each row,
1384 it does **not** preserve dtypes across the rows (dtypes are
1385 preserved across columns for DataFrames). For example,
1387 >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
1388 >>> row = next(df.iterrows())[1]
1389 >>> row
1390 int 1.0
1391 float 1.5
1392 Name: 0, dtype: float64
1393 >>> print(row['int'].dtype)
1394 float64
1395 >>> print(df['int'].dtype)
1396 int64
1398 To preserve dtypes while iterating over the rows, it is better
1399 to use :meth:`itertuples` which returns namedtuples of the values
1400 and which is generally faster than ``iterrows``.
1402 2. You should **never modify** something you are iterating over.
1403 This is not guaranteed to work in all cases. Depending on the
1404 data types, the iterator returns a copy and not a view, and writing
1405 to it will have no effect.
1406 """
1407 columns = self.columns
1408 klass = self._constructor_sliced
1409 for k, v in zip(self.index, self.values):
1410 s = klass(v, index=columns, name=k).__finalize__(self)
1411 yield k, s
1413 def itertuples(
1414 self, index: bool = True, name: str | None = "Pandas"
1415 ) -> Iterable[tuple[Any, ...]]:
1416 """
1417 Iterate over DataFrame rows as namedtuples.
1419 Parameters
1420 ----------
1421 index : bool, default True
1422 If True, return the index as the first element of the tuple.
1423 name : str or None, default "Pandas"
1424 The name of the returned namedtuples or None to return regular
1425 tuples.
1427 Returns
1428 -------
1429 iterator
1430 An object to iterate over namedtuples for each row in the
1431 DataFrame with the first field possibly being the index and
1432 following fields being the column values.
1434 See Also
1435 --------
1436 DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
1437 pairs.
1438 DataFrame.items : Iterate over (column name, Series) pairs.
1440 Notes
1441 -----
1442 The column names will be renamed to positional names if they are
1443 invalid Python identifiers, repeated, or start with an underscore.
1445 Examples
1446 --------
1447 >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
1448 ... index=['dog', 'hawk'])
1449 >>> df
1450 num_legs num_wings
1451 dog 4 0
1452 hawk 2 2
1453 >>> for row in df.itertuples():
1454 ... print(row)
1455 ...
1456 Pandas(Index='dog', num_legs=4, num_wings=0)
1457 Pandas(Index='hawk', num_legs=2, num_wings=2)
1459 By setting the `index` parameter to False we can remove the index
1460 as the first element of the tuple:
1462 >>> for row in df.itertuples(index=False):
1463 ... print(row)
1464 ...
1465 Pandas(num_legs=4, num_wings=0)
1466 Pandas(num_legs=2, num_wings=2)
1468 With the `name` parameter set we set a custom name for the yielded
1469 namedtuples:
1471 >>> for row in df.itertuples(name='Animal'):
1472 ... print(row)
1473 ...
1474 Animal(Index='dog', num_legs=4, num_wings=0)
1475 Animal(Index='hawk', num_legs=2, num_wings=2)
1476 """
1477 arrays = []
1478 fields = list(self.columns)
1479 if index:
1480 arrays.append(self.index)
1481 fields.insert(0, "Index")
1483 # use integer indexing because of possible duplicate column names
1484 arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
1486 if name is not None:
1487 # https://github.com/python/mypy/issues/9046
1488 # error: namedtuple() expects a string literal as the first argument
1489 itertuple = collections.namedtuple( # type: ignore[misc]
1490 name, fields, rename=True
1491 )
1492 return map(itertuple._make, zip(*arrays))
1494 # fallback to regular tuples
1495 return zip(*arrays)
1497 def __len__(self) -> int:
1498 """
1499 Returns length of info axis, but here we use the index.
1500 """
1501 return len(self.index)
    # Overloads: ``dot`` narrows its return type on the type of ``other`` —
    # a Series operand yields a Series, everything else yields a DataFrame.
    @overload
    def dot(self, other: Series) -> Series:
        ...

    @overload
    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
        ...
1511 def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
1512 """
1513 Compute the matrix multiplication between the DataFrame and other.
1515 This method computes the matrix product between the DataFrame and the
1516 values of an other Series, DataFrame or a numpy array.
1518 It can also be called using ``self @ other`` in Python >= 3.5.
1520 Parameters
1521 ----------
1522 other : Series, DataFrame or array-like
1523 The other object to compute the matrix product with.
1525 Returns
1526 -------
1527 Series or DataFrame
1528 If other is a Series, return the matrix product between self and
1529 other as a Series. If other is a DataFrame or a numpy.array, return
1530 the matrix product of self and other in a DataFrame of a np.array.
1532 See Also
1533 --------
1534 Series.dot: Similar method for Series.
1536 Notes
1537 -----
1538 The dimensions of DataFrame and other must be compatible in order to
1539 compute the matrix multiplication. In addition, the column names of
1540 DataFrame and the index of other must contain the same values, as they
1541 will be aligned prior to the multiplication.
1543 The dot method for Series computes the inner product, instead of the
1544 matrix product here.
1546 Examples
1547 --------
1548 Here we multiply a DataFrame with a Series.
1550 >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
1551 >>> s = pd.Series([1, 1, 2, 1])
1552 >>> df.dot(s)
1553 0 -4
1554 1 5
1555 dtype: int64
1557 Here we multiply a DataFrame with another DataFrame.
1559 >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
1560 >>> df.dot(other)
1561 0 1
1562 0 1 4
1563 1 2 2
1565 Note that the dot method give the same result as @
1567 >>> df @ other
1568 0 1
1569 0 1 4
1570 1 2 2
1572 The dot method works also if other is an np.array.
1574 >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
1575 >>> df.dot(arr)
1576 0 1
1577 0 1 4
1578 1 2 2
1580 Note how shuffling of the objects does not change the result.
1582 >>> s2 = s.reindex([1, 0, 2, 3])
1583 >>> df.dot(s2)
1584 0 -4
1585 1 5
1586 dtype: int64
1587 """
1588 if isinstance(other, (Series, DataFrame)):
1589 common = self.columns.union(other.index)
1590 if len(common) > len(self.columns) or len(common) > len(other.index):
1591 raise ValueError("matrices are not aligned")
1593 left = self.reindex(columns=common, copy=False)
1594 right = other.reindex(index=common, copy=False)
1595 lvals = left.values
1596 rvals = right._values
1597 else:
1598 left = self
1599 lvals = self.values
1600 rvals = np.asarray(other)
1601 if lvals.shape[1] != rvals.shape[0]:
1602 raise ValueError(
1603 f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
1604 )
1606 if isinstance(other, DataFrame):
1607 return self._constructor(
1608 np.dot(lvals, rvals), index=left.index, columns=other.columns
1609 )
1610 elif isinstance(other, Series):
1611 return self._constructor_sliced(np.dot(lvals, rvals), index=left.index)
1612 elif isinstance(rvals, (np.ndarray, Index)):
1613 result = np.dot(lvals, rvals)
1614 if result.ndim == 2:
1615 return self._constructor(result, index=left.index)
1616 else:
1617 return self._constructor_sliced(result, index=left.index)
1618 else: # pragma: no cover
1619 raise TypeError(f"unsupported type: {type(other)}")
    # Overloads: mirror ``dot`` — a Series operand yields a Series.
    @overload
    def __matmul__(self, other: Series) -> Series:
        ...

    @overload
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        ...
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        # ``df @ other`` delegates to DataFrame.dot; alignment rules and
        # result-type dispatch are documented there.
        return self.dot(other)
1635 def __rmatmul__(self, other) -> DataFrame:
1636 """
1637 Matrix multiplication using binary `@` operator in Python>=3.5.
1638 """
1639 try:
1640 return self.T.dot(np.transpose(other)).T
1641 except ValueError as err:
1642 if "shape mismatch" not in str(err):
1643 raise
1644 # GH#21581 give exception message for original shapes
1645 msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
1646 raise ValueError(msg) from err
1648 # ----------------------------------------------------------------------
1649 # IO methods (to / from other formats)
1651 @classmethod
1652 def from_dict(
1653 cls,
1654 data: dict,
1655 orient: str = "columns",
1656 dtype: Dtype | None = None,
1657 columns: Axes | None = None,
1658 ) -> DataFrame:
1659 """
1660 Construct DataFrame from dict of array-like or dicts.
1662 Creates DataFrame object from dictionary by columns or by index
1663 allowing dtype specification.
1665 Parameters
1666 ----------
1667 data : dict
1668 Of the form {field : array-like} or {field : dict}.
1669 orient : {'columns', 'index', 'tight'}, default 'columns'
1670 The "orientation" of the data. If the keys of the passed dict
1671 should be the columns of the resulting DataFrame, pass 'columns'
1672 (default). Otherwise if the keys should be rows, pass 'index'.
1673 If 'tight', assume a dict with keys ['index', 'columns', 'data',
1674 'index_names', 'column_names'].
1676 .. versionadded:: 1.4.0
1677 'tight' as an allowed value for the ``orient`` argument
1679 dtype : dtype, default None
1680 Data type to force, otherwise infer.
1681 columns : list, default None
1682 Column labels to use when ``orient='index'``. Raises a ValueError
1683 if used with ``orient='columns'`` or ``orient='tight'``.
1685 Returns
1686 -------
1687 DataFrame
1689 See Also
1690 --------
1691 DataFrame.from_records : DataFrame from structured ndarray, sequence
1692 of tuples or dicts, or DataFrame.
1693 DataFrame : DataFrame object creation using constructor.
1694 DataFrame.to_dict : Convert the DataFrame to a dictionary.
1696 Examples
1697 --------
1698 By default the keys of the dict become the DataFrame columns:
1700 >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
1701 >>> pd.DataFrame.from_dict(data)
1702 col_1 col_2
1703 0 3 a
1704 1 2 b
1705 2 1 c
1706 3 0 d
1708 Specify ``orient='index'`` to create the DataFrame using dictionary
1709 keys as rows:
1711 >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
1712 >>> pd.DataFrame.from_dict(data, orient='index')
1713 0 1 2 3
1714 row_1 3 2 1 0
1715 row_2 a b c d
1717 When using the 'index' orientation, the column names can be
1718 specified manually:
1720 >>> pd.DataFrame.from_dict(data, orient='index',
1721 ... columns=['A', 'B', 'C', 'D'])
1722 A B C D
1723 row_1 3 2 1 0
1724 row_2 a b c d
1726 Specify ``orient='tight'`` to create the DataFrame using a 'tight'
1727 format:
1729 >>> data = {'index': [('a', 'b'), ('a', 'c')],
1730 ... 'columns': [('x', 1), ('y', 2)],
1731 ... 'data': [[1, 3], [2, 4]],
1732 ... 'index_names': ['n1', 'n2'],
1733 ... 'column_names': ['z1', 'z2']}
1734 >>> pd.DataFrame.from_dict(data, orient='tight')
1735 z1 x y
1736 z2 1 2
1737 n1 n2
1738 a b 1 3
1739 c 2 4
1740 """
1741 index = None
1742 orient = orient.lower()
1743 if orient == "index":
1744 if len(data) > 0:
1745 # TODO speed up Series case
1746 if isinstance(list(data.values())[0], (Series, dict)):
1747 data = _from_nested_dict(data)
1748 else:
1749 index = list(data.keys())
1750 # error: Incompatible types in assignment (expression has type
1751 # "List[Any]", variable has type "Dict[Any, Any]")
1752 data = list(data.values()) # type: ignore[assignment]
1753 elif orient == "columns" or orient == "tight":
1754 if columns is not None:
1755 raise ValueError(f"cannot use columns parameter with orient='{orient}'")
1756 else: # pragma: no cover
1757 raise ValueError(
1758 f"Expected 'index', 'columns' or 'tight' for orient parameter. "
1759 f"Got '{orient}' instead"
1760 )
1762 if orient != "tight":
1763 return cls(data, index=index, columns=columns, dtype=dtype)
1764 else:
1765 realdata = data["data"]
1767 def create_index(indexlist, namelist):
1768 index: Index
1769 if len(namelist) > 1:
1770 index = MultiIndex.from_tuples(indexlist, names=namelist)
1771 else:
1772 index = Index(indexlist, name=namelist[0])
1773 return index
1775 index = create_index(data["index"], data["index_names"])
1776 columns = create_index(data["columns"], data["column_names"])
1777 return cls(realdata, index=index, columns=columns, dtype=dtype)
1779 def to_numpy(
1780 self,
1781 dtype: npt.DTypeLike | None = None,
1782 copy: bool = False,
1783 na_value: object = lib.no_default,
1784 ) -> np.ndarray:
1785 """
1786 Convert the DataFrame to a NumPy array.
1788 By default, the dtype of the returned array will be the common NumPy
1789 dtype of all types in the DataFrame. For example, if the dtypes are
1790 ``float16`` and ``float32``, the results dtype will be ``float32``.
1791 This may require copying data and coercing values, which may be
1792 expensive.
1794 Parameters
1795 ----------
1796 dtype : str or numpy.dtype, optional
1797 The dtype to pass to :meth:`numpy.asarray`.
1798 copy : bool, default False
1799 Whether to ensure that the returned value is not a view on
1800 another array. Note that ``copy=False`` does not *ensure* that
1801 ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
1802 a copy is made, even if not strictly necessary.
1803 na_value : Any, optional
1804 The value to use for missing values. The default value depends
1805 on `dtype` and the dtypes of the DataFrame columns.
1807 .. versionadded:: 1.1.0
1809 Returns
1810 -------
1811 numpy.ndarray
1813 See Also
1814 --------
1815 Series.to_numpy : Similar method for Series.
1817 Examples
1818 --------
1819 >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
1820 array([[1, 3],
1821 [2, 4]])
1823 With heterogeneous data, the lowest common type will have to
1824 be used.
1826 >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
1827 >>> df.to_numpy()
1828 array([[1. , 3. ],
1829 [2. , 4.5]])
1831 For a mix of numeric and non-numeric types, the output array will
1832 have object dtype.
1834 >>> df['C'] = pd.date_range('2000', periods=2)
1835 >>> df.to_numpy()
1836 array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
1837 [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
1838 """
1839 self._consolidate_inplace()
1840 if dtype is not None:
1841 dtype = np.dtype(dtype)
1842 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
1843 if result.dtype is not dtype:
1844 result = np.array(result, dtype=dtype, copy=False)
1846 return result
1848 @overload
1849 def to_dict(
1850 self,
1851 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
1852 into: type[dict] = ...,
1853 ) -> dict:
1854 ...
1856 @overload
1857 def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
1858 ...
1860 def to_dict(
1861 self,
1862 orient: Literal[
1863 "dict", "list", "series", "split", "tight", "records", "index"
1864 ] = "dict",
1865 into: type[dict] = dict,
1866 ) -> dict | list[dict]:
1867 """
1868 Convert the DataFrame to a dictionary.
1870 The type of the key-value pairs can be customized with the parameters
1871 (see below).
1873 Parameters
1874 ----------
1875 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
1876 Determines the type of the values of the dictionary.
1878 - 'dict' (default) : dict like {column -> {index -> value}}
1879 - 'list' : dict like {column -> [values]}
1880 - 'series' : dict like {column -> Series(values)}
1881 - 'split' : dict like
1882 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
1883 - 'tight' : dict like
1884 {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
1885 'index_names' -> [index.names], 'column_names' -> [column.names]}
1886 - 'records' : list like
1887 [{column -> value}, ... , {column -> value}]
1888 - 'index' : dict like {index -> {column -> value}}
1890 Abbreviations are allowed. `s` indicates `series` and `sp`
1891 indicates `split`.
1893 .. versionadded:: 1.4.0
1894 'tight' as an allowed value for the ``orient`` argument
1896 into : class, default dict
1897 The collections.abc.Mapping subclass used for all Mappings
1898 in the return value. Can be the actual class or an empty
1899 instance of the mapping type you want. If you want a
1900 collections.defaultdict, you must pass it initialized.
1902 Returns
1903 -------
1904 dict, list or collections.abc.Mapping
1905 Return a collections.abc.Mapping object representing the DataFrame.
1906 The resulting transformation depends on the `orient` parameter.
1908 See Also
1909 --------
1910 DataFrame.from_dict: Create a DataFrame from a dictionary.
1911 DataFrame.to_json: Convert a DataFrame to JSON format.
1913 Examples
1914 --------
1915 >>> df = pd.DataFrame({'col1': [1, 2],
1916 ... 'col2': [0.5, 0.75]},
1917 ... index=['row1', 'row2'])
1918 >>> df
1919 col1 col2
1920 row1 1 0.50
1921 row2 2 0.75
1922 >>> df.to_dict()
1923 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
1925 You can specify the return orientation.
1927 >>> df.to_dict('series')
1928 {'col1': row1 1
1929 row2 2
1930 Name: col1, dtype: int64,
1931 'col2': row1 0.50
1932 row2 0.75
1933 Name: col2, dtype: float64}
1935 >>> df.to_dict('split')
1936 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1937 'data': [[1, 0.5], [2, 0.75]]}
1939 >>> df.to_dict('records')
1940 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
1942 >>> df.to_dict('index')
1943 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
1945 >>> df.to_dict('tight')
1946 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1947 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
1949 You can also specify the mapping type.
1951 >>> from collections import OrderedDict, defaultdict
1952 >>> df.to_dict(into=OrderedDict)
1953 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
1954 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
1956 If you want a `defaultdict`, you need to initialize it:
1958 >>> dd = defaultdict(list)
1959 >>> df.to_dict('records', into=dd)
1960 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
1961 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
1962 """
1963 if not self.columns.is_unique:
1964 warnings.warn(
1965 "DataFrame columns are not unique, some columns will be omitted.",
1966 UserWarning,
1967 stacklevel=find_stack_level(),
1968 )
1969 # GH16122
1970 into_c = com.standardize_mapping(into)
1972 # error: Incompatible types in assignment (expression has type "str",
1973 # variable has type "Literal['dict', 'list', 'series', 'split', 'tight',
1974 # 'records', 'index']")
1975 orient = orient.lower() # type: ignore[assignment]
1976 # GH32515
1977 if orient.startswith(("d", "l", "s", "r", "i")) and orient not in {
1978 "dict",
1979 "list",
1980 "series",
1981 "split",
1982 "records",
1983 "index",
1984 }:
1985 warnings.warn(
1986 "Using short name for 'orient' is deprecated. Only the "
1987 "options: ('dict', list, 'series', 'split', 'records', 'index') "
1988 "will be used in a future version. Use one of the above "
1989 "to silence this warning.",
1990 FutureWarning,
1991 stacklevel=find_stack_level(),
1992 )
1994 if orient.startswith("d"):
1995 orient = "dict"
1996 elif orient.startswith("l"):
1997 orient = "list"
1998 elif orient.startswith("sp"):
1999 orient = "split"
2000 elif orient.startswith("s"):
2001 orient = "series"
2002 elif orient.startswith("r"):
2003 orient = "records"
2004 elif orient.startswith("i"):
2005 orient = "index"
2007 if orient == "dict":
2008 return into_c((k, v.to_dict(into)) for k, v in self.items())
2010 elif orient == "list":
2011 return into_c(
2012 (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
2013 )
2015 elif orient == "split":
2016 return into_c(
2017 (
2018 ("index", self.index.tolist()),
2019 ("columns", self.columns.tolist()),
2020 (
2021 "data",
2022 [
2023 list(map(maybe_box_native, t))
2024 for t in self.itertuples(index=False, name=None)
2025 ],
2026 ),
2027 )
2028 )
2030 elif orient == "tight":
2031 return into_c(
2032 (
2033 ("index", self.index.tolist()),
2034 ("columns", self.columns.tolist()),
2035 (
2036 "data",
2037 [
2038 list(map(maybe_box_native, t))
2039 for t in self.itertuples(index=False, name=None)
2040 ],
2041 ),
2042 ("index_names", list(self.index.names)),
2043 ("column_names", list(self.columns.names)),
2044 )
2045 )
2047 elif orient == "series":
2048 return into_c((k, v) for k, v in self.items())
2050 elif orient == "records":
2051 columns = self.columns.tolist()
2052 rows = (
2053 dict(zip(columns, row))
2054 for row in self.itertuples(index=False, name=None)
2055 )
2056 return [
2057 into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
2058 ]
2060 elif orient == "index":
2061 if not self.index.is_unique:
2062 raise ValueError("DataFrame index must be unique for orient='index'.")
2063 return into_c(
2064 (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
2065 for t in self.itertuples(name=None)
2066 )
2068 else:
2069 raise ValueError(f"orient '{orient}' not understood")
2071 def to_gbq(
2072 self,
2073 destination_table: str,
2074 project_id: str | None = None,
2075 chunksize: int | None = None,
2076 reauth: bool = False,
2077 if_exists: str = "fail",
2078 auth_local_webserver: bool = True,
2079 table_schema: list[dict[str, str]] | None = None,
2080 location: str | None = None,
2081 progress_bar: bool = True,
2082 credentials=None,
2083 ) -> None:
2084 """
2085 Write a DataFrame to a Google BigQuery table.
2087 This function requires the `pandas-gbq package
2088 <https://pandas-gbq.readthedocs.io>`__.
2090 See the `How to authenticate with Google BigQuery
2091 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
2092 guide for authentication instructions.
2094 Parameters
2095 ----------
2096 destination_table : str
2097 Name of table to be written, in the form ``dataset.tablename``.
2098 project_id : str, optional
2099 Google BigQuery Account project ID. Optional when available from
2100 the environment.
2101 chunksize : int, optional
2102 Number of rows to be inserted in each chunk from the dataframe.
2103 Set to ``None`` to load the whole dataframe at once.
2104 reauth : bool, default False
2105 Force Google BigQuery to re-authenticate the user. This is useful
2106 if multiple accounts are used.
2107 if_exists : str, default 'fail'
2108 Behavior when the destination table exists. Value can be one of:
2110 ``'fail'``
2111 If table exists raise pandas_gbq.gbq.TableCreationError.
2112 ``'replace'``
2113 If table exists, drop it, recreate it, and insert data.
2114 ``'append'``
2115 If table exists, insert data. Create if does not exist.
2116 auth_local_webserver : bool, default True
2117 Use the `local webserver flow`_ instead of the `console flow`_
2118 when getting user credentials.
2120 .. _local webserver flow:
2121 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
2122 .. _console flow:
2123 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
2125 *New in version 0.2.0 of pandas-gbq*.
2127 .. versionchanged:: 1.5.0
2128 Default value is changed to ``True``. Google has deprecated the
2129 ``auth_local_webserver = False`` `"out of band" (copy-paste)
2130 flow
2131 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
2132 table_schema : list of dicts, optional
2133 List of BigQuery table fields to which according DataFrame
2134 columns conform to, e.g. ``[{'name': 'col1', 'type':
2135 'STRING'},...]``. If schema is not provided, it will be
2136 generated according to dtypes of DataFrame columns. See
2137 BigQuery API documentation on available names of a field.
2139 *New in version 0.3.1 of pandas-gbq*.
2140 location : str, optional
2141 Location where the load job should run. See the `BigQuery locations
2142 documentation
2143 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
2144 list of available locations. The location must match that of the
2145 target dataset.
2147 *New in version 0.5.0 of pandas-gbq*.
2148 progress_bar : bool, default True
2149 Use the library `tqdm` to show the progress bar for the upload,
2150 chunk by chunk.
2152 *New in version 0.5.0 of pandas-gbq*.
2153 credentials : google.auth.credentials.Credentials, optional
2154 Credentials for accessing Google APIs. Use this parameter to
2155 override default credentials, such as to use Compute Engine
2156 :class:`google.auth.compute_engine.Credentials` or Service
2157 Account :class:`google.oauth2.service_account.Credentials`
2158 directly.
2160 *New in version 0.8.0 of pandas-gbq*.
2162 See Also
2163 --------
2164 pandas_gbq.to_gbq : This function in the pandas-gbq library.
2165 read_gbq : Read a DataFrame from Google BigQuery.
2166 """
2167 from pandas.io import gbq
2169 gbq.to_gbq(
2170 self,
2171 destination_table,
2172 project_id=project_id,
2173 chunksize=chunksize,
2174 reauth=reauth,
2175 if_exists=if_exists,
2176 auth_local_webserver=auth_local_webserver,
2177 table_schema=table_schema,
2178 location=location,
2179 progress_bar=progress_bar,
2180 credentials=credentials,
2181 )
    @classmethod
    def from_records(
        cls,
        data,
        index=None,
        exclude=None,
        columns=None,
        coerce_float: bool = False,
        nrows: int | None = None,
    ) -> DataFrame:
        """
        Convert structured or record ndarray to DataFrame.

        Creates a DataFrame object from a structured ndarray, sequence of
        tuples or dicts, or DataFrame.

        Parameters
        ----------
        data : structured ndarray, sequence of tuples or dicts, or DataFrame
            Structured input data.
        index : str, list of fields, array-like
            Field of array to use as the index, alternately a specific set of
            input labels to use.
        exclude : sequence, default None
            Columns or fields to exclude.
        columns : sequence, default None
            Column names to use. If the passed data do not have names
            associated with them, this argument provides names for the
            columns. Otherwise this argument indicates the order of the columns
            in the result (any names not found in the data will become all-NA
            columns).
        coerce_float : bool, default False
            Attempt to convert values of non-string, non-numeric objects (like
            decimal.Decimal) to floating point, useful for SQL result sets.
        nrows : int, default None
            Number of rows to read if data is an iterator.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_dict : DataFrame from dict of array-like or dicts.
        DataFrame : DataFrame object creation using constructor.

        Examples
        --------
        Data can be provided as a structured ndarray:

        >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
        ...                 dtype=[('col_1', 'i4'), ('col_2', 'U1')])
        >>> pd.DataFrame.from_records(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Data can be provided as a list of dicts:

        >>> data = [{'col_1': 3, 'col_2': 'a'},
        ...         {'col_1': 2, 'col_2': 'b'},
        ...         {'col_1': 1, 'col_2': 'c'},
        ...         {'col_1': 0, 'col_2': 'd'}]
        >>> pd.DataFrame.from_records(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Data can be provided as a list of tuples with corresponding columns:

        >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
        >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d
        """
        result_index = None

        # Make a copy of the input columns so we can modify it
        if columns is not None:
            columns = ensure_index(columns)

        def maybe_reorder(
            arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
        ) -> tuple[list[ArrayLike], Index, Index | None]:
            """
            If our desired 'columns' do not match the data's pre-existing 'arr_columns',
            we re-order our arrays. This is like a pre-emptive (cheap) reindex.
            """
            if len(arrays):
                length = len(arrays[0])
            else:
                length = 0

            result_index = None
            if len(arrays) == 0 and index is None and length == 0:
                # for backward compat use an object Index instead of RangeIndex
                result_index = Index([])

            arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
            return arrays, arr_columns, result_index

        # Materialize an iterator of rows up front (honoring ``nrows``) so the
        # rest of the function can treat ``data`` as a concrete sequence.
        if is_iterator(data):
            if nrows == 0:
                return cls()

            try:
                first_row = next(data)
            except StopIteration:
                # Empty iterator: build an empty frame with the requested labels.
                return cls(index=index, columns=columns)

            dtype = None
            if hasattr(first_row, "dtype") and first_row.dtype.names:
                # Rows of a structured ndarray carry their (named-field) dtype.
                dtype = first_row.dtype

            values = [first_row]

            if nrows is None:
                values += data
            else:
                # One row was already consumed above, so take nrows - 1 more.
                values.extend(itertools.islice(data, nrows - 1))

            if dtype is not None:
                # Reassemble structured rows into a structured ndarray.
                data = np.array(values, dtype=dtype)
            else:
                data = values

        # dict of column -> values: keep only the requested columns (or all,
        # sorted, when none were requested).
        if isinstance(data, dict):
            if columns is None:
                columns = arr_columns = ensure_index(sorted(data))
                arrays = [data[k] for k in columns]
            else:
                arrays = []
                arr_columns_list = []
                for k, v in data.items():
                    if k in columns:
                        arr_columns_list.append(k)
                        arrays.append(v)

                arr_columns = Index(arr_columns_list)
                arrays, arr_columns, result_index = maybe_reorder(
                    arrays, arr_columns, columns, index
                )

        elif isinstance(data, (np.ndarray, DataFrame)):
            # to_arrays resolves the column labels itself for these inputs.
            arrays, columns = to_arrays(data, columns)
            arr_columns = columns
        else:
            # Sequence of tuples/dicts/records.
            arrays, arr_columns = to_arrays(data, columns)
            if coerce_float:
                for i, arr in enumerate(arrays):
                    if arr.dtype == object:
                        # error: Argument 1 to "maybe_convert_objects" has
                        # incompatible type "Union[ExtensionArray, ndarray]";
                        # expected "ndarray"
                        arrays[i] = lib.maybe_convert_objects(
                            arr,  # type: ignore[arg-type]
                            try_float=True,
                        )

            arr_columns = ensure_index(arr_columns)
            if columns is None:
                columns = arr_columns
            else:
                arrays, arr_columns, result_index = maybe_reorder(
                    arrays, arr_columns, columns, index
                )

        # Normalize ``exclude`` to a set for membership tests below.
        if exclude is None:
            exclude = set()
        else:
            exclude = set(exclude)

        if index is not None:
            if isinstance(index, str) or not hasattr(index, "__iter__"):
                # Single field name: promote that column to the index and
                # drop it from the data columns.
                i = columns.get_loc(index)
                exclude.add(index)
                if len(arrays) > 0:
                    result_index = Index(arrays[i], name=index)
                else:
                    result_index = Index([], name=index)
            else:
                # List of field names: build a (Multi)Index from those columns.
                try:
                    index_data = [arrays[arr_columns.get_loc(field)] for field in index]
                except (KeyError, TypeError):
                    # raised by get_loc, see GH#29258
                    # ``index`` is not made of field names; use it as labels.
                    result_index = index
                else:
                    result_index = ensure_index_from_sequences(index_data, names=index)
                    exclude.update(index)

        # NOTE(review): ``any(exclude)`` is falsy when every excluded label is
        # itself falsy (e.g. 0 or ""), in which case such labels would not be
        # dropped — confirm whether ``len(exclude)`` was intended here.
        if any(exclude):
            arr_exclude = [x for x in exclude if x in arr_columns]
            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]

            columns = columns.drop(exclude)

        # Assemble the column arrays into the configured internal manager.
        manager = get_option("mode.data_manager")
        mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)

        return cls(mgr)
2392 def to_records(
2393 self, index: bool = True, column_dtypes=None, index_dtypes=None
2394 ) -> np.recarray:
2395 """
2396 Convert DataFrame to a NumPy record array.
2398 Index will be included as the first field of the record array if
2399 requested.
2401 Parameters
2402 ----------
2403 index : bool, default True
2404 Include index in resulting record array, stored in 'index'
2405 field or using the index label, if set.
2406 column_dtypes : str, type, dict, default None
2407 If a string or type, the data type to store all columns. If
2408 a dictionary, a mapping of column names and indices (zero-indexed)
2409 to specific data types.
2410 index_dtypes : str, type, dict, default None
2411 If a string or type, the data type to store all index levels. If
2412 a dictionary, a mapping of index level names and indices
2413 (zero-indexed) to specific data types.
2415 This mapping is applied only if `index=True`.
2417 Returns
2418 -------
2419 numpy.recarray
2420 NumPy ndarray with the DataFrame labels as fields and each row
2421 of the DataFrame as entries.
2423 See Also
2424 --------
2425 DataFrame.from_records: Convert structured or record ndarray
2426 to DataFrame.
2427 numpy.recarray: An ndarray that allows field access using
2428 attributes, analogous to typed columns in a
2429 spreadsheet.
2431 Examples
2432 --------
2433 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
2434 ... index=['a', 'b'])
2435 >>> df
2436 A B
2437 a 1 0.50
2438 b 2 0.75
2439 >>> df.to_records()
2440 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2441 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
2443 If the DataFrame index has no label then the recarray field name
2444 is set to 'index'. If the index has a label then this is used as the
2445 field name:
2447 >>> df.index = df.index.rename("I")
2448 >>> df.to_records()
2449 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2450 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
2452 The index can be excluded from the record array:
2454 >>> df.to_records(index=False)
2455 rec.array([(1, 0.5 ), (2, 0.75)],
2456 dtype=[('A', '<i8'), ('B', '<f8')])
2458 Data types can be specified for the columns:
2460 >>> df.to_records(column_dtypes={"A": "int32"})
2461 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2462 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
2464 As well as for the index:
2466 >>> df.to_records(index_dtypes="<S2")
2467 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2468 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
2470 >>> index_dtypes = f"<S{df.index.str.len().max()}"
2471 >>> df.to_records(index_dtypes=index_dtypes)
2472 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2473 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
2474 """
2475 if index:
2476 ix_vals = [
2477 np.asarray(self.index.get_level_values(i))
2478 for i in range(self.index.nlevels)
2479 ]
2481 arrays = ix_vals + [
2482 np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
2483 ]
2485 index_names = list(self.index.names)
2487 if isinstance(self.index, MultiIndex):
2488 index_names = com.fill_missing_names(index_names)
2489 elif index_names[0] is None:
2490 index_names = ["index"]
2492 names = [str(name) for name in itertools.chain(index_names, self.columns)]
2493 else:
2494 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
2495 names = [str(c) for c in self.columns]
2496 index_names = []
2498 index_len = len(index_names)
2499 formats = []
2501 for i, v in enumerate(arrays):
2502 index_int = i
2504 # When the names and arrays are collected, we
2505 # first collect those in the DataFrame's index,
2506 # followed by those in its columns.
2507 #
2508 # Thus, the total length of the array is:
2509 # len(index_names) + len(DataFrame.columns).
2510 #
2511 # This check allows us to see whether we are
2512 # handling a name / array in the index or column.
2513 if index_int < index_len:
2514 dtype_mapping = index_dtypes
2515 name = index_names[index_int]
2516 else:
2517 index_int -= index_len
2518 dtype_mapping = column_dtypes
2519 name = self.columns[index_int]
2521 # We have a dictionary, so we get the data type
2522 # associated with the index or column (which can
2523 # be denoted by its name in the DataFrame or its
2524 # position in DataFrame's array of indices or
2525 # columns, whichever is applicable.
2526 if is_dict_like(dtype_mapping):
2527 if name in dtype_mapping:
2528 dtype_mapping = dtype_mapping[name]
2529 elif index_int in dtype_mapping:
2530 dtype_mapping = dtype_mapping[index_int]
2531 else:
2532 dtype_mapping = None
2534 # If no mapping can be found, use the array's
2535 # dtype attribute for formatting.
2536 #
2537 # A valid dtype must either be a type or
2538 # string naming a type.
2539 if dtype_mapping is None:
2540 formats.append(v.dtype)
2541 elif isinstance(dtype_mapping, (type, np.dtype, str)):
2542 # error: Argument 1 to "append" of "list" has incompatible
2543 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
2544 formats.append(dtype_mapping) # type: ignore[arg-type]
2545 else:
2546 element = "row" if i < index_len else "column"
2547 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
2548 raise ValueError(msg)
2550 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
2552 @classmethod
2553 def _from_arrays(
2554 cls,
2555 arrays,
2556 columns,
2557 index,
2558 dtype: Dtype | None = None,
2559 verify_integrity: bool = True,
2560 ) -> DataFrame:
2561 """
2562 Create DataFrame from a list of arrays corresponding to the columns.
2564 Parameters
2565 ----------
2566 arrays : list-like of arrays
2567 Each array in the list corresponds to one column, in order.
2568 columns : list-like, Index
2569 The column names for the resulting DataFrame.
2570 index : list-like, Index
2571 The rows labels for the resulting DataFrame.
2572 dtype : dtype, optional
2573 Optional dtype to enforce for all arrays.
2574 verify_integrity : bool, default True
2575 Validate and homogenize all input. If set to False, it is assumed
2576 that all elements of `arrays` are actual arrays how they will be
2577 stored in a block (numpy ndarray or ExtensionArray), have the same
2578 length as and are aligned with the index, and that `columns` and
2579 `index` are ensured to be an Index object.
2581 Returns
2582 -------
2583 DataFrame
2584 """
2585 if dtype is not None:
2586 dtype = pandas_dtype(dtype)
2588 manager = get_option("mode.data_manager")
2589 columns = ensure_index(columns)
2590 if len(columns) != len(arrays):
2591 raise ValueError("len(columns) must match len(arrays)")
2592 mgr = arrays_to_mgr(
2593 arrays,
2594 columns,
2595 index,
2596 dtype=dtype,
2597 verify_integrity=verify_integrity,
2598 typ=manager,
2599 )
2600 return cls(mgr)
2602 @doc(
2603 storage_options=_shared_docs["storage_options"],
2604 compression_options=_shared_docs["compression_options"] % "path",
2605 )
2606 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2607 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "path"])
2608 def to_stata(
2609 self,
2610 path: FilePath | WriteBuffer[bytes],
2611 convert_dates: dict[Hashable, str] | None = None,
2612 write_index: bool = True,
2613 byteorder: str | None = None,
2614 time_stamp: datetime.datetime | None = None,
2615 data_label: str | None = None,
2616 variable_labels: dict[Hashable, str] | None = None,
2617 version: int | None = 114,
2618 convert_strl: Sequence[Hashable] | None = None,
2619 compression: CompressionOptions = "infer",
2620 storage_options: StorageOptions = None,
2621 *,
2622 value_labels: dict[Hashable, dict[float, str]] | None = None,
2623 ) -> None:
2624 """
2625 Export DataFrame object to Stata dta format.
2627 Writes the DataFrame to a Stata dataset file.
2628 "dta" files contain a Stata dataset.
2630 Parameters
2631 ----------
2632 path : str, path object, or buffer
2633 String, path object (implementing ``os.PathLike[str]``), or file-like
2634 object implementing a binary ``write()`` function.
2636 .. versionchanged:: 1.0.0
2638 Previously this was "fname"
2640 convert_dates : dict
2641 Dictionary mapping columns containing datetime types to stata
2642 internal format to use when writing the dates. Options are 'tc',
2643 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
2644 or a name. Datetime columns that do not have a conversion type
2645 specified will be converted to 'tc'. Raises NotImplementedError if
2646 a datetime column has timezone information.
2647 write_index : bool
2648 Write the index to Stata dataset.
2649 byteorder : str
2650 Can be ">", "<", "little", or "big". default is `sys.byteorder`.
2651 time_stamp : datetime
2652 A datetime to use as file creation date. Default is the current
2653 time.
2654 data_label : str, optional
2655 A label for the data set. Must be 80 characters or smaller.
2656 variable_labels : dict
2657 Dictionary containing columns as keys and variable labels as
2658 values. Each label must be 80 characters or smaller.
2659 version : {{114, 117, 118, 119, None}}, default 114
2660 Version to use in the output dta file. Set to None to let pandas
2661 decide between 118 or 119 formats depending on the number of
2662 columns in the frame. Version 114 can be read by Stata 10 and
2663 later. Version 117 can be read by Stata 13 or later. Version 118
2664 is supported in Stata 14 and later. Version 119 is supported in
2665 Stata 15 and later. Version 114 limits string variables to 244
2666 characters or fewer while versions 117 and later allow strings
2667 with lengths up to 2,000,000 characters. Versions 118 and 119
2668 support Unicode characters, and version 119 supports more than
2669 32,767 variables.
2671 Version 119 should usually only be used when the number of
2672 variables exceeds the capacity of dta format 118. Exporting
2673 smaller datasets in format 119 may have unintended consequences,
2674 and, as of November 2020, Stata SE cannot read version 119 files.
2676 .. versionchanged:: 1.0.0
2678 Added support for formats 118 and 119.
2680 convert_strl : list, optional
2681 List of column names to convert to string columns to Stata StrL
2682 format. Only available if version is 117. Storing strings in the
2683 StrL format can produce smaller dta files if strings have more than
2684 8 characters and values are repeated.
2685 {compression_options}
2687 .. versionadded:: 1.1.0
2689 .. versionchanged:: 1.4.0 Zstandard support.
2691 {storage_options}
2693 .. versionadded:: 1.2.0
2695 value_labels : dict of dicts
2696 Dictionary containing columns as keys and dictionaries of column value
2697 to labels as values. Labels for a single variable must be 32,000
2698 characters or smaller.
2700 .. versionadded:: 1.4.0
2702 Raises
2703 ------
2704 NotImplementedError
2705 * If datetimes contain timezone information
2706 * Column dtype is not representable in Stata
2707 ValueError
2708 * Columns listed in convert_dates are neither datetime64[ns]
2709 or datetime.datetime
2710 * Column listed in convert_dates is not in DataFrame
2711 * Categorical label contains more than 32,000 characters
2713 See Also
2714 --------
2715 read_stata : Import Stata data files.
2716 io.stata.StataWriter : Low-level writer for Stata data files.
2717 io.stata.StataWriter117 : Low-level writer for version 117 files.
2719 Examples
2720 --------
2721 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
2722 ... 'parrot'],
2723 ... 'speed': [350, 18, 361, 15]}})
2724 >>> df.to_stata('animals.dta') # doctest: +SKIP
2725 """
2726 if version not in (114, 117, 118, 119, None):
2727 raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
2728 if version == 114:
2729 if convert_strl is not None:
2730 raise ValueError("strl is not supported in format 114")
2731 from pandas.io.stata import StataWriter as statawriter
2732 elif version == 117:
2733 # mypy: Name 'statawriter' already defined (possibly by an import)
2734 from pandas.io.stata import ( # type: ignore[no-redef]
2735 StataWriter117 as statawriter,
2736 )
2737 else: # versions 118 and 119
2738 # mypy: Name 'statawriter' already defined (possibly by an import)
2739 from pandas.io.stata import ( # type: ignore[no-redef]
2740 StataWriterUTF8 as statawriter,
2741 )
2743 kwargs: dict[str, Any] = {}
2744 if version is None or version >= 117:
2745 # strl conversion is only supported >= 117
2746 kwargs["convert_strl"] = convert_strl
2747 if version is None or version >= 118:
2748 # Specifying the version is only supported for UTF8 (118 or 119)
2749 kwargs["version"] = version
2751 writer = statawriter(
2752 path,
2753 self,
2754 convert_dates=convert_dates,
2755 byteorder=byteorder,
2756 time_stamp=time_stamp,
2757 data_label=data_label,
2758 write_index=write_index,
2759 variable_labels=variable_labels,
2760 compression=compression,
2761 storage_options=storage_options,
2762 value_labels=value_labels,
2763 **kwargs,
2764 )
2765 writer.write_file()
2767 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2768 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
2769 """
2770 Write a DataFrame to the binary Feather format.
2772 Parameters
2773 ----------
2774 path : str, path object, file-like object
2775 String, path object (implementing ``os.PathLike[str]``), or file-like
2776 object implementing a binary ``write()`` function. If a string or a path,
2777 it will be used as Root Directory path when writing a partitioned dataset.
2778 **kwargs :
2779 Additional keywords passed to :func:`pyarrow.feather.write_feather`.
2780 Starting with pyarrow 0.17, this includes the `compression`,
2781 `compression_level`, `chunksize` and `version` keywords.
2783 .. versionadded:: 1.1.0
2785 Notes
2786 -----
2787 This function writes the dataframe as a `feather file
2788 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
2789 index. For saving the DataFrame with your custom index use a method that
2790 supports custom indices e.g. `to_parquet`.
2791 """
2792 from pandas.io.feather_format import to_feather
2794 to_feather(self, path, **kwargs)
2796 @doc(
2797 Series.to_markdown,
2798 klass=_shared_doc_kwargs["klass"],
2799 storage_options=_shared_docs["storage_options"],
2800 examples="""Examples
2801 --------
2802 >>> df = pd.DataFrame(
2803 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
2804 ... )
2805 >>> print(df.to_markdown())
2806 | | animal_1 | animal_2 |
2807 |---:|:-----------|:-----------|
2808 | 0 | elk | dog |
2809 | 1 | pig | quetzal |
2811 Output markdown with a tabulate option.
2813 >>> print(df.to_markdown(tablefmt="grid"))
2814 +----+------------+------------+
2815 | | animal_1 | animal_2 |
2816 +====+============+============+
2817 | 0 | elk | dog |
2818 +----+------------+------------+
2819 | 1 | pig | quetzal |
2820 +----+------------+------------+""",
2821 )
2822 def to_markdown(
2823 self,
2824 buf: FilePath | WriteBuffer[str] | None = None,
2825 mode: str = "wt",
2826 index: bool = True,
2827 storage_options: StorageOptions = None,
2828 **kwargs,
2829 ) -> str | None:
2830 if "showindex" in kwargs:
2831 warnings.warn(
2832 "'showindex' is deprecated. Only 'index' will be used "
2833 "in a future version. Use 'index' to silence this warning.",
2834 FutureWarning,
2835 stacklevel=find_stack_level(),
2836 )
2838 kwargs.setdefault("headers", "keys")
2839 kwargs.setdefault("tablefmt", "pipe")
2840 kwargs.setdefault("showindex", index)
2841 tabulate = import_optional_dependency("tabulate")
2842 result = tabulate.tabulate(self, **kwargs)
2843 if buf is None:
2844 return result
2846 with get_handle(buf, mode, storage_options=storage_options) as handles:
2847 handles.handle.write(result)
2848 return None
2850 @overload
2851 def to_parquet(
2852 self,
2853 path: None = ...,
2854 engine: str = ...,
2855 compression: str | None = ...,
2856 index: bool | None = ...,
2857 partition_cols: list[str] | None = ...,
2858 storage_options: StorageOptions = ...,
2859 **kwargs,
2860 ) -> bytes:
2861 ...
2863 @overload
2864 def to_parquet(
2865 self,
2866 path: FilePath | WriteBuffer[bytes],
2867 engine: str = ...,
2868 compression: str | None = ...,
2869 index: bool | None = ...,
2870 partition_cols: list[str] | None = ...,
2871 storage_options: StorageOptions = ...,
2872 **kwargs,
2873 ) -> None:
2874 ...
2876 @doc(storage_options=_shared_docs["storage_options"])
2877 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2878 def to_parquet(
2879 self,
2880 path: FilePath | WriteBuffer[bytes] | None = None,
2881 engine: str = "auto",
2882 compression: str | None = "snappy",
2883 index: bool | None = None,
2884 partition_cols: list[str] | None = None,
2885 storage_options: StorageOptions = None,
2886 **kwargs,
2887 ) -> bytes | None:
2888 """
2889 Write a DataFrame to the binary parquet format.
2891 This function writes the dataframe as a `parquet file
2892 <https://parquet.apache.org/>`_. You can choose different parquet
2893 backends, and have the option of compression. See
2894 :ref:`the user guide <io.parquet>` for more details.
2896 Parameters
2897 ----------
2898 path : str, path object, file-like object, or None, default None
2899 String, path object (implementing ``os.PathLike[str]``), or file-like
2900 object implementing a binary ``write()`` function. If None, the result is
2901 returned as bytes. If a string or path, it will be used as Root Directory
2902 path when writing a partitioned dataset.
2904 .. versionchanged:: 1.2.0
2906 Previously this was "fname"
2908 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
2909 Parquet library to use. If 'auto', then the option
2910 ``io.parquet.engine`` is used. The default ``io.parquet.engine``
2911 behavior is to try 'pyarrow', falling back to 'fastparquet' if
2912 'pyarrow' is unavailable.
2913 compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
2914 Name of the compression to use. Use ``None`` for no compression.
2915 index : bool, default None
2916 If ``True``, include the dataframe's index(es) in the file output.
2917 If ``False``, they will not be written to the file.
2918 If ``None``, similar to ``True`` the dataframe's index(es)
2919 will be saved. However, instead of being saved as values,
2920 the RangeIndex will be stored as a range in the metadata so it
2921 doesn't require much space and is faster. Other indexes will
2922 be included as columns in the file output.
2923 partition_cols : list, optional, default None
2924 Column names by which to partition the dataset.
2925 Columns are partitioned in the order they are given.
2926 Must be None if path is not a string.
2927 {storage_options}
2929 .. versionadded:: 1.2.0
2931 **kwargs
2932 Additional arguments passed to the parquet library. See
2933 :ref:`pandas io <io.parquet>` for more details.
2935 Returns
2936 -------
2937 bytes if no path argument is provided else None
2939 See Also
2940 --------
2941 read_parquet : Read a parquet file.
2942 DataFrame.to_orc : Write an orc file.
2943 DataFrame.to_csv : Write a csv file.
2944 DataFrame.to_sql : Write to a sql table.
2945 DataFrame.to_hdf : Write to hdf.
2947 Notes
2948 -----
2949 This function requires either the `fastparquet
2950 <https://pypi.org/project/fastparquet>`_ or `pyarrow
2951 <https://arrow.apache.org/docs/python/>`_ library.
2953 Examples
2954 --------
2955 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
2956 >>> df.to_parquet('df.parquet.gzip',
2957 ... compression='gzip') # doctest: +SKIP
2958 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
2959 col1 col2
2960 0 1 3
2961 1 2 4
2963 If you want to get a buffer to the parquet content you can use a io.BytesIO
2964 object, as long as you don't use partition_cols, which creates multiple files.
2966 >>> import io
2967 >>> f = io.BytesIO()
2968 >>> df.to_parquet(f)
2969 >>> f.seek(0)
2970 0
2971 >>> content = f.read()
2972 """
2973 from pandas.io.parquet import to_parquet
2975 return to_parquet(
2976 self,
2977 path,
2978 engine,
2979 compression=compression,
2980 index=index,
2981 partition_cols=partition_cols,
2982 storage_options=storage_options,
2983 **kwargs,
2984 )
2986 def to_orc(
2987 self,
2988 path: FilePath | WriteBuffer[bytes] | None = None,
2989 *,
2990 engine: Literal["pyarrow"] = "pyarrow",
2991 index: bool | None = None,
2992 engine_kwargs: dict[str, Any] | None = None,
2993 ) -> bytes | None:
2994 """
2995 Write a DataFrame to the ORC format.
2997 .. versionadded:: 1.5.0
2999 Parameters
3000 ----------
3001 path : str, file-like object or None, default None
3002 If a string, it will be used as Root Directory path
3003 when writing a partitioned dataset. By file-like object,
3004 we refer to objects with a write() method, such as a file handle
3005 (e.g. via builtin open function). If path is None,
3006 a bytes object is returned.
3007 engine : str, default 'pyarrow'
3008 ORC library to use. Pyarrow must be >= 7.0.0.
3009 index : bool, optional
3010 If ``True``, include the dataframe's index(es) in the file output.
3011 If ``False``, they will not be written to the file.
3012 If ``None``, similar to ``infer`` the dataframe's index(es)
3013 will be saved. However, instead of being saved as values,
3014 the RangeIndex will be stored as a range in the metadata so it
3015 doesn't require much space and is faster. Other indexes will
3016 be included as columns in the file output.
3017 engine_kwargs : dict[str, Any] or None, default None
3018 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
3020 Returns
3021 -------
3022 bytes if no path argument is provided else None
3024 Raises
3025 ------
3026 NotImplementedError
3027 Dtype of one or more columns is category, unsigned integers, interval,
3028 period or sparse.
3029 ValueError
3030 engine is not pyarrow.
3032 See Also
3033 --------
3034 read_orc : Read a ORC file.
3035 DataFrame.to_parquet : Write a parquet file.
3036 DataFrame.to_csv : Write a csv file.
3037 DataFrame.to_sql : Write to a sql table.
3038 DataFrame.to_hdf : Write to hdf.
3040 Notes
3041 -----
3042 * Before using this function you should read the :ref:`user guide about
3043 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
3044 * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
3045 library.
3046 * For supported dtypes please refer to `supported ORC features in Arrow
3047 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
3048 * Currently timezones in datetime columns are not preserved when a
3049 dataframe is converted into ORC files.
3051 Examples
3052 --------
3053 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
3054 >>> df.to_orc('df.orc') # doctest: +SKIP
3055 >>> pd.read_orc('df.orc') # doctest: +SKIP
3056 col1 col2
3057 0 1 4
3058 1 2 3
3060 If you want to get a buffer to the orc content you can write it to io.BytesIO
3061 >>> import io
3062 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
3063 >>> b.seek(0) # doctest: +SKIP
3064 0
3065 >>> content = b.read() # doctest: +SKIP
3066 """
3067 from pandas.io.orc import to_orc
3069 return to_orc(
3070 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
3071 )
    # Overload: a concrete ``buf`` is supplied, so the rendered HTML is
    # written to it and the method returns ``None``.
    @overload
    def to_html(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[Level] | None = ...,
        col_space: ColspaceArgType | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: FormattersType | None = ...,
        float_format: FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool | str = ...,
        decimal: str = ...,
        bold_rows: bool = ...,
        classes: str | list | tuple | None = ...,
        escape: bool = ...,
        notebook: bool = ...,
        border: int | bool | None = ...,
        table_id: str | None = ...,
        render_links: bool = ...,
        encoding: str | None = ...,
    ) -> None:
        ...
    # Overload: ``buf`` is omitted (None), so the rendered HTML is returned
    # as a ``str``.
    @overload
    def to_html(
        self,
        buf: None = ...,
        columns: Sequence[Level] | None = ...,
        col_space: ColspaceArgType | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: FormattersType | None = ...,
        float_format: FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool | str = ...,
        decimal: str = ...,
        bold_rows: bool = ...,
        classes: str | list | tuple | None = ...,
        escape: bool = ...,
        notebook: bool = ...,
        border: int | bool | None = ...,
        table_id: str | None = ...,
        render_links: bool = ...,
        encoding: str | None = ...,
    ) -> str:
        ...
@Substitution(
    header_type="bool",
    header="Whether to print column labels, default True",
    col_space_type="str or int, list or dict of int or str",
    col_space="The minimum width of each column in CSS length "
    "units. An int is assumed to be px units.\n\n"
    " .. versionadded:: 0.25.0\n"
    " Ability to use str",
)
@Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
def to_html(
    self,
    buf: FilePath | WriteBuffer[str] | None = None,
    columns: Sequence[Level] | None = None,
    col_space: ColspaceArgType | None = None,
    header: bool | Sequence[str] = True,
    index: bool = True,
    na_rep: str = "NaN",
    formatters: FormattersType | None = None,
    float_format: FloatFormatType | None = None,
    sparsify: bool | None = None,
    index_names: bool = True,
    justify: str | None = None,
    max_rows: int | None = None,
    max_cols: int | None = None,
    show_dimensions: bool | str = False,
    decimal: str = ".",
    bold_rows: bool = True,
    classes: str | list | tuple | None = None,
    escape: bool = True,
    notebook: bool = False,
    border: int | bool | None = None,
    table_id: str | None = None,
    render_links: bool = False,
    encoding: str | None = None,
) -> str | None:
    """
    Render a DataFrame as an HTML table.
    %(shared_params)s
    bold_rows : bool, default True
        Make the row labels bold in the output.
    classes : str or list or tuple, default None
        CSS class(es) to apply to the resulting html table.
    escape : bool, default True
        Convert the characters <, >, and & to HTML-safe sequences.
    notebook : {True, False}, default False
        Whether the generated HTML is for IPython Notebook.
    border : int
        A ``border=border`` attribute is included in the opening
        `<table>` tag. Default ``pd.options.display.html.border``.
    table_id : str, optional
        A css id is included in the opening `<table>` tag if specified.
    render_links : bool, default False
        Convert URLs to HTML links.
    encoding : str, default "utf-8"
        Set character encoding.

        .. versionadded:: 1.0
    %(returns)s
    See Also
    --------
    to_string : Convert DataFrame to a string.
    """
    # Reject unknown justify values up front; ``None`` means "use default".
    if not (justify is None or justify in fmt._VALID_JUSTIFY_PARAMETERS):
        raise ValueError("Invalid value for justify parameter")

    # The formatter owns everything about how cell values are rendered;
    # the HTML-only options are handed to the renderer below.
    html_formatter = fmt.DataFrameFormatter(
        self,
        columns=columns,
        col_space=col_space,
        na_rep=na_rep,
        header=header,
        index=index,
        formatters=formatters,
        float_format=float_format,
        bold_rows=bold_rows,
        sparsify=sparsify,
        justify=justify,
        index_names=index_names,
        escape=escape,
        decimal=decimal,
        max_rows=max_rows,
        max_cols=max_cols,
        show_dimensions=show_dimensions,
    )
    # TODO: a generic formatter would belong in DataFrameFormatter
    renderer = fmt.DataFrameRenderer(html_formatter)
    return renderer.to_html(
        buf=buf,
        classes=classes,
        notebook=notebook,
        border=border,
        encoding=encoding,
        table_id=table_id,
        render_links=render_links,
    )
@doc(
    storage_options=_shared_docs["storage_options"],
    compression_options=_shared_docs["compression_options"] % "path_or_buffer",
)
def to_xml(
    self,
    path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
    index: bool = True,
    root_name: str | None = "data",
    row_name: str | None = "row",
    na_rep: str | None = None,
    attr_cols: list[str] | None = None,
    elem_cols: list[str] | None = None,
    namespaces: dict[str | None, str] | None = None,
    prefix: str | None = None,
    encoding: str = "utf-8",
    xml_declaration: bool | None = True,
    pretty_print: bool | None = True,
    parser: str | None = "lxml",
    stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> str | None:
    """
    Render a DataFrame to an XML document.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, file-like object, or None, default None
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``write()`` function. If None, the result is returned
        as a string.
    index : bool, default True
        Whether to include index in XML document.
    root_name : str, default 'data'
        The name of root element in XML document.
    row_name : str, default 'row'
        The name of row element in XML document.
    na_rep : str, optional
        Missing data representation.
    attr_cols : list-like, optional
        List of columns to write as attributes in row element.
        Hierarchical columns will be flattened with underscore
        delimiting the different levels.
    elem_cols : list-like, optional
        List of columns to write as children in row element. By default,
        all columns output as children of row element. Hierarchical
        columns will be flattened with underscore delimiting the
        different levels.
    namespaces : dict, optional
        All namespaces to be defined in root element. Keys of dict
        should be prefix names and values of dict corresponding URIs.
        Default namespaces should be given empty string key. For
        example, ::

            namespaces = {{"": "https://example.com"}}

    prefix : str, optional
        Namespace prefix to be used for every element and/or attribute
        in document. This should be one of the keys in ``namespaces``
        dict.
    encoding : str, default 'utf-8'
        Encoding of the resulting document.
    xml_declaration : bool, default True
        Whether to include the XML declaration at start of document.
    pretty_print : bool, default True
        Whether output should be pretty printed with indentation and
        line breaks.
    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for building of tree. Only 'lxml' and
        'etree' are supported. With 'lxml', the ability to use XSLT
        stylesheet is supported.
    stylesheet : str, path object or file-like object, optional
        A URL, file-like object, or a raw string containing an XSLT
        script used to transform the raw XML output. Script should use
        layout of elements and attributes from original output. This
        argument requires ``lxml`` to be installed. Only XSLT 1.0
        scripts and not later versions is currently supported.
    {compression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    Returns
    -------
    None or str
        If ``io`` is None, returns the resulting XML format as a
        string. Otherwise returns None.

    See Also
    --------
    to_json : Convert the pandas object to a JSON string.
    to_html : Convert DataFrame to a html.

    Examples
    --------
    >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
    ...                    'degrees': [360, 360, 180],
    ...                    'sides': [4, np.nan, 3]}})

    >>> df.to_xml()  # doctest: +SKIP
    <?xml version='1.0' encoding='utf-8'?>
    <data>
      <row>
        <index>0</index>
        <shape>square</shape>
        <degrees>360</degrees>
        <sides>4.0</sides>
      </row>
      <row>
        <index>1</index>
        <shape>circle</shape>
        <degrees>360</degrees>
        <sides/>
      </row>
      <row>
        <index>2</index>
        <shape>triangle</shape>
        <degrees>180</degrees>
        <sides>3.0</sides>
      </row>
    </data>

    >>> df.to_xml(attr_cols=[
    ...           'index', 'shape', 'degrees', 'sides'
    ...           ])  # doctest: +SKIP
    <?xml version='1.0' encoding='utf-8'?>
    <data>
      <row index="0" shape="square" degrees="360" sides="4.0"/>
      <row index="1" shape="circle" degrees="360"/>
      <row index="2" shape="triangle" degrees="180" sides="3.0"/>
    </data>

    >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
    ...           prefix="doc")  # doctest: +SKIP
    <?xml version='1.0' encoding='utf-8'?>
    <doc:data xmlns:doc="https://example.com">
      <doc:row>
        <doc:index>0</doc:index>
        <doc:shape>square</doc:shape>
        <doc:degrees>360</doc:degrees>
        <doc:sides>4.0</doc:sides>
      </doc:row>
      <doc:row>
        <doc:index>1</doc:index>
        <doc:shape>circle</doc:shape>
        <doc:degrees>360</doc:degrees>
        <doc:sides/>
      </doc:row>
      <doc:row>
        <doc:index>2</doc:index>
        <doc:shape>triangle</doc:shape>
        <doc:degrees>180</doc:degrees>
        <doc:sides>3.0</doc:sides>
      </doc:row>
    </doc:data>
    """
    # Local import keeps the XML formatters (and their optional lxml
    # dependency) off the module import path until actually needed.
    from pandas.io.formats.xml import (
        EtreeXMLFormatter,
        LxmlXMLFormatter,
    )

    # errors="ignore" -> returns None instead of raising when lxml is absent.
    lxml = import_optional_dependency("lxml.etree", errors="ignore")

    TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]

    # Choose the formatter implementation from the requested parser;
    # "lxml" is only usable when the optional dependency imported above.
    if parser == "lxml":
        if lxml is not None:
            TreeBuilder = LxmlXMLFormatter
        else:
            raise ImportError(
                "lxml not found, please install or use the etree parser."
            )

    elif parser == "etree":
        TreeBuilder = EtreeXMLFormatter

    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    xml_formatter = TreeBuilder(
        self,
        path_or_buffer=path_or_buffer,
        index=index,
        root_name=root_name,
        row_name=row_name,
        na_rep=na_rep,
        attr_cols=attr_cols,
        elem_cols=elem_cols,
        namespaces=namespaces,
        prefix=prefix,
        encoding=encoding,
        xml_declaration=xml_declaration,
        pretty_print=pretty_print,
        stylesheet=stylesheet,
        compression=compression,
        storage_options=storage_options,
    )

    # Writes to path_or_buffer when given, otherwise returns the XML string.
    return xml_formatter.write_output()
# ----------------------------------------------------------------------
@doc(INFO_DOCSTRING, **frame_sub_kwargs)
def info(
    self,
    verbose: bool | None = None,
    buf: WriteBuffer[str] | None = None,
    max_cols: int | None = None,
    memory_usage: bool | str | None = None,
    show_counts: bool | None = None,
    null_counts: bool | None = None,
) -> None:
    # ``null_counts`` is the deprecated spelling of ``show_counts``:
    # supplying both is ambiguous and rejected, supplying only the old
    # one warns and is folded into the new parameter.
    if null_counts is not None:
        if show_counts is not None:
            raise ValueError("null_counts used with show_counts. Use show_counts.")
        warnings.warn(
            "null_counts is deprecated. Use show_counts instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        show_counts = null_counts

    # Delegate the actual summary construction and printing.
    DataFrameInfo(data=self, memory_usage=memory_usage).render(
        buf=buf,
        max_cols=max_cols,
        verbose=verbose,
        show_counts=show_counts,
    )
def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
    """
    Return the memory usage of each column in bytes.

    The memory usage can optionally include the contribution of
    the index and elements of `object` dtype.

    This value is displayed in `DataFrame.info` by default. This can be
    suppressed by setting ``pandas.options.display.memory_usage`` to False.

    Parameters
    ----------
    index : bool, default True
        Specifies whether to include the memory usage of the DataFrame's
        index in returned Series. If ``index=True``, the memory usage of
        the index is the first item in the output.
    deep : bool, default False
        If True, introspect the data deeply by interrogating
        `object` dtypes for system-level memory consumption, and include
        it in the returned values.

    Returns
    -------
    Series
        A Series whose index is the original column names and whose values
        is the memory usage of each column in bytes.

    See Also
    --------
    numpy.ndarray.nbytes : Total bytes consumed by the elements of an
        ndarray.
    Series.memory_usage : Bytes consumed by a Series.
    Categorical : Memory-efficient array for string values with
        many repeated values.
    DataFrame.info : Concise summary of a DataFrame.

    Notes
    -----
    See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
    details.

    Examples
    --------
    >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
    >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
    ...              for t in dtypes])
    >>> df = pd.DataFrame(data)
    >>> df.head()
       int64  float64  complex128  object  bool
    0      1      1.0    1.0+0.0j       1  True
    1      1      1.0    1.0+0.0j       1  True
    2      1      1.0    1.0+0.0j       1  True
    3      1      1.0    1.0+0.0j       1  True
    4      1      1.0    1.0+0.0j       1  True

    >>> df.memory_usage()
    Index           128
    int64         40000
    float64       40000
    complex128    80000
    object        40000
    bool           5000
    dtype: int64

    >>> df.memory_usage(index=False)
    int64         40000
    float64       40000
    complex128    80000
    object        40000
    bool           5000
    dtype: int64

    The memory footprint of `object` dtype columns is ignored by default:

    >>> df.memory_usage(deep=True)
    Index            128
    int64          40000
    float64        40000
    complex128     80000
    object        180000
    bool            5000
    dtype: int64

    Use a Categorical for efficient storage of an object-dtype column with
    many repeated values.

    >>> df['object'].astype('category').memory_usage(deep=True)
    5244
    """
    # Per-column usage; the column label from items() is unused here
    # (the labels are supplied separately via index=self.columns), so
    # bind it to ``_`` rather than a dead variable.
    result = self._constructor_sliced(
        [c.memory_usage(index=False, deep=deep) for _, c in self.items()],
        index=self.columns,
    )
    if index:
        # Prepend the index's own footprint under the label "Index" so it
        # is the first item of the returned Series.
        index_memory_usage = self._constructor_sliced(
            self.index.memory_usage(deep=deep), index=["Index"]
        )
        result = index_memory_usage._append(result)
    return result
def transpose(self, *args, copy: bool = False) -> DataFrame:
    """
    Transpose index and columns.

    Reflect the DataFrame over its main diagonal by writing rows as columns
    and vice-versa. The property :attr:`.T` is an accessor to the method
    :meth:`transpose`.

    Parameters
    ----------
    *args : tuple, optional
        Accepted for compatibility with NumPy.
    copy : bool, default False
        Whether to copy the data after transposing, even for DataFrames
        with a single dtype.

        Note that a copy is always required for mixed dtype DataFrames,
        or for DataFrames with any extension types.

    Returns
    -------
    DataFrame
        The transposed DataFrame.

    See Also
    --------
    numpy.transpose : Permute the dimensions of a given array.

    Notes
    -----
    Transposing a DataFrame with mixed dtypes will result in a homogeneous
    DataFrame with the `object` dtype. In such a case, a copy of the data
    is always made.

    Examples
    --------
    **Square DataFrame with homogeneous dtype**

    >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df1 = pd.DataFrame(data=d1)
    >>> df1
       col1  col2
    0     1     3
    1     2     4

    >>> df1_transposed = df1.T  # or df1.transpose()
    >>> df1_transposed
          0  1
    col1  1  2
    col2  3  4

    When the dtype is homogeneous in the original DataFrame, we get a
    transposed DataFrame with the same dtype:

    >>> df1.dtypes
    col1    int64
    col2    int64
    dtype: object
    >>> df1_transposed.dtypes
    0    int64
    1    int64
    dtype: object

    **Non-square DataFrame with mixed dtypes**

    >>> d2 = {'name': ['Alice', 'Bob'],
    ...       'score': [9.5, 8],
    ...       'employed': [False, True],
    ...       'kids': [0, 0]}
    >>> df2 = pd.DataFrame(data=d2)
    >>> df2
        name  score  employed  kids
    0  Alice    9.5     False     0
    1    Bob    8.0      True     0

    >>> df2_transposed = df2.T  # or df2.transpose()
    >>> df2_transposed
                  0     1
    name      Alice   Bob
    score       9.5   8.0
    employed  False  True
    kids          0     0

    When the DataFrame has mixed dtypes, we get a transposed DataFrame with
    the `object` dtype:

    >>> df2.dtypes
    name         object
    score       float64
    employed       bool
    kids          int64
    dtype: object
    >>> df2_transposed.dtypes
    0    object
    1    object
    dtype: object
    """
    nv.validate_transpose(args, {})
    # construct the args

    dtypes = list(self.dtypes)

    # Three paths, checked in order of decreasing speed:
    # 1) single-block fast path, 2) homogeneous extension-array path that
    # preserves the EA dtype, 3) generic object fallback via .values.
    if self._can_fast_transpose:
        # Note: tests pass without this, but this improves perf quite a bit.
        new_vals = self._values.T
        if copy:
            new_vals = new_vals.copy()

        result = self._constructor(new_vals, index=self.columns, columns=self.index)

    elif (
        self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0])
    ):
        # We have EAs with the same dtype. We can preserve that dtype in transpose.
        dtype = dtypes[0]
        arr_type = dtype.construct_array_type()
        values = self.values

        # Each row of the 2D values becomes one EA column of the result.
        new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
        result = type(self)._from_arrays(
            new_values, index=self.columns, columns=self.index
        )

    else:
        new_arr = self.values.T
        if copy:
            new_arr = new_arr.copy()
        result = self._constructor(new_arr, index=self.columns, columns=self.index)

    return result.__finalize__(self, method="transpose")
@property
def T(self) -> DataFrame:
    # Property accessor for :meth:`transpose` with default arguments.
    return self.transpose()
# ----------------------------------------------------------------------
# Indexing Methods

def _ixs(self, i: int, axis: int = 0) -> Series:
    """
    Positionally retrieve the i-th row (axis=0) or column (axis=1) as a Series.

    Parameters
    ----------
    i : int
        Position along the given axis.
    axis : int
        0 for a row, anything else for a column.

    Returns
    -------
    Series
    """
    # irow
    if axis == 0:
        new_mgr = self._mgr.fast_xs(i)

        # if we are a copy, mark as such
        # (a base-less ndarray means fast_xs had to materialize a copy)
        copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
        result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
            self
        )
        result._set_is_copy(self, copy=copy)
        return result

    # icol
    else:
        label = self.columns[i]

        col_mgr = self._mgr.iget(i)
        result = self._box_col_values(col_mgr, i)

        # this is a cached value, mark it so
        result._set_as_cached(label, self)
        return result
def _get_column_array(self, i: int) -> ArrayLike:
    """
    Get the values of the i'th column (ndarray or ExtensionArray, as stored
    in the Block)

    Warning! The returned array is a view but doesn't handle Copy-on-Write,
    so this should be used with caution (for read-only purposes).
    """
    # Thin positional delegation to the block manager.
    return self._mgr.iget_values(i)
3745 def _iter_column_arrays(self) -> Iterator[ArrayLike]:
3746 """
3747 Iterate over the arrays of all columns in order.
3748 This returns the values as stored in the Block (ndarray or ExtensionArray).
3750 Warning! The returned array is a view but doesn't handle Copy-on-Write,
3751 so this should be used with caution (for read-only purposes).
3752 """
3753 for i in range(len(self.columns)):
3754 yield self._get_column_array(i)
def __getitem__(self, key):
    # Dispatch order matters: hashable column fast-path, then row slicers,
    # then boolean masks, then generic single-key / key-collection lookup.
    check_deprecated_indexers(key)
    key = lib.item_from_zerodim(key)
    key = com.apply_if_callable(key, self)

    if is_hashable(key) and not is_iterator(key):
        # is_iterator to exclude generator e.g. test_getitem_listlike
        # shortcut if the key is in columns
        is_mi = isinstance(self.columns, MultiIndex)
        # GH#45316 Return view if key is not duplicated
        # Only use drop_duplicates with duplicates for performance
        if not is_mi and (
            self.columns.is_unique
            and key in self.columns
            or key in self.columns.drop_duplicates(keep=False)
        ):
            return self._get_item_cache(key)

        elif is_mi and self.columns.is_unique and key in self.columns:
            return self._getitem_multilevel(key)

    # Do we have a slicer (on rows)?
    indexer = convert_to_index_sliceable(self, key)
    if indexer is not None:
        if isinstance(indexer, np.ndarray):
            # Try to collapse the positions into a real slice first.
            indexer = lib.maybe_indices_to_slice(
                indexer.astype(np.intp, copy=False), len(self)
            )
            if isinstance(indexer, np.ndarray):
                # GH#43223 If we can not convert, use take
                return self.take(indexer, axis=0)
        # either we have a slice or we have a string that can be converted
        # to a slice for partial-string date indexing
        return self._slice(indexer, axis=0)

    # Do we have a (boolean) DataFrame?
    if isinstance(key, DataFrame):
        return self.where(key)

    # Do we have a (boolean) 1d indexer?
    if com.is_bool_indexer(key):
        return self._getitem_bool_array(key)

    # We are left with two options: a single key, and a collection of keys,
    # We interpret tuples as collections only for non-MultiIndex
    is_single_key = isinstance(key, tuple) or not is_list_like(key)

    if is_single_key:
        if self.columns.nlevels > 1:
            return self._getitem_multilevel(key)
        indexer = self.columns.get_loc(key)
        if is_integer(indexer):
            indexer = [indexer]
    else:
        if is_iterator(key):
            key = list(key)
        indexer = self.columns._get_indexer_strict(key, "columns")[1]

    # take() does not accept boolean indexers
    if getattr(indexer, "dtype", None) == bool:
        indexer = np.where(indexer)[0]

    data = self._take_with_is_copy(indexer, axis=1)

    if is_single_key:
        # What does looking for a single key in a non-unique index return?
        # The behavior is inconsistent. It returns a Series, except when
        # - the key itself is repeated (test on data.shape, #9519), or
        # - we have a MultiIndex on columns (test on self.columns, #21309)
        if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
            # GH#26490 using data[key] can cause RecursionError
            return data._get_item_cache(key)

    return data
def _getitem_bool_array(self, key):
    # A boolean key selects rows.  A Series key whose index differs from
    # ours is reindexed (with a warning) to mirror __setitem__, which has
    # always reindexed; keeping get/set consistent is the deliberate choice.
    # also raises Exception if object array with NA values
    if isinstance(key, Series) and not key.index.equals(self.index):
        warnings.warn(
            "Boolean Series key will be reindexed to match DataFrame index.",
            UserWarning,
            stacklevel=find_stack_level(),
        )
    elif len(key) != len(self.index):
        raise ValueError(
            f"Item wrong length {len(key)} instead of {len(self.index)}."
        )

    # check_bool_indexer raises if a Series key cannot be reindexed to
    # match the DataFrame's rows.
    key = check_bool_indexer(self.index, key)
    row_positions = key.nonzero()[0]
    return self._take_with_is_copy(row_positions, axis=0)
def _getitem_multilevel(self, key):
    # self.columns is a MultiIndex
    loc = self.columns.get_loc(key)
    if isinstance(loc, (slice, np.ndarray)):
        # Partial key: the result keeps multiple columns, with the matched
        # level(s) dropped from the column labels.
        new_columns = self.columns[loc]
        result_columns = maybe_droplevels(new_columns, key)
        if self._is_mixed_type:
            result = self.reindex(columns=new_columns)
            result.columns = result_columns
        else:
            new_values = self.values[:, loc]
            result = self._constructor(
                new_values, index=self.index, columns=result_columns
            )
            result = result.__finalize__(self)

        # If there is only one column being returned, and its name is
        # either an empty string, or a tuple with an empty string as its
        # first element, then treat the empty string as a placeholder
        # and return the column as if the user had provided that empty
        # string in the key. If the result is a Series, exclude the
        # implied empty string from its name.
        if len(result.columns) == 1:
            top = result.columns[0]
            if isinstance(top, tuple):
                top = top[0]
            if top == "":
                result = result[""]
                if isinstance(result, Series):
                    result = self._constructor_sliced(
                        result, index=self.index, name=key
                    )

        result._set_is_copy(self)
        return result
    else:
        # loc is neither a slice nor ndarray, so must be an int
        return self._ixs(loc, axis=1)
def _get_value(self, index, col, takeable: bool = False) -> Scalar:
    """
    Quickly retrieve single value at passed column and index.

    Parameters
    ----------
    index : row label
    col : column label
    takeable : interpret the index/col as indexers, default False

    Returns
    -------
    scalar

    Notes
    -----
    Assumes that both `self.index._index_as_unique` and
    `self.columns._index_as_unique`; Caller is responsible for checking.
    """
    if takeable:
        # Positional fast path: both index and col are integer positions.
        series = self._ixs(col, axis=1)
        return series._values[index]

    series = self._get_item_cache(col)
    engine = self.index._engine

    if not isinstance(self.index, MultiIndex):
        # CategoricalIndex: Trying to use the engine fastpath may give incorrect
        # results if our categories are integers that dont match our codes
        # IntervalIndex: IntervalTree has no get_loc
        row = self.index.get_loc(index)
        return series._values[row]

    # For MultiIndex going through engine effectively restricts us to
    # same-length tuples; see test_get_set_value_no_partial_indexing
    loc = engine.get_loc(index)
    return series._values[loc]
def isetitem(self, loc, value) -> None:
    """
    Set the given value in the column with position 'loc'.

    This is a positional analogue to __setitem__.

    Parameters
    ----------
    loc : int or sequence of ints
    value : scalar or arraylike

    Notes
    -----
    Unlike `frame.iloc[:, i] = value`, `frame.isetitem(loc, value)` will
    _never_ try to set the values in place, but will always insert a new
    array.

    In cases where `frame.columns` is unique, this is equivalent to
    `frame[frame.columns[i]] = value`.
    """
    # Sanitize to a proper array, then replace (not mutate) the column(s).
    arraylike = self._sanitize_column(value)
    self._iset_item_mgr(loc, arraylike, inplace=False)
def __setitem__(self, key, value):
    # Dispatch in priority order: row slicers, 2D boolean masks,
    # array-like column keys, DataFrame values, duplicated-column
    # broadcast, and finally plain single-column assignment.
    key = com.apply_if_callable(key, self)

    # see if we can slice the rows
    indexer = convert_to_index_sliceable(self, key)
    if indexer is not None:
        # either we have a slice or we have a string that can be converted
        # to a slice for partial-string date indexing
        return self._setitem_slice(indexer, value)

    if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
        self._setitem_frame(key, value)
    elif isinstance(key, (Series, np.ndarray, list, Index)):
        self._setitem_array(key, value)
    elif isinstance(value, DataFrame):
        self._set_item_frame_value(key, value)
    elif (
        is_list_like(value)
        and not self.columns.is_unique
        # chained comparison: more than one matching column AND the value
        # length equals the number of matches
        and 1 < len(self.columns.get_indexer_for([key])) == len(value)
    ):
        # Column to set is duplicated
        self._setitem_array([key], value)
    else:
        # set column
        self._set_item(key, value)
def _setitem_slice(self, key: slice, value):
    # Positional row assignment for df[slice] = value.
    # NB: we can't just use self.loc[key] = value because that
    # operates on labels and we need to operate positional for
    # backwards-compat, xref GH#31469
    self._check_setitem_copy()
    self.iloc[key] = value
def _setitem_array(self, key, value):
    # List-like key: either a boolean row mask or a collection of column
    # labels; also raises Exception if object array with NA values
    if com.is_bool_indexer(key):
        # bool indexer is indexing along rows
        if len(key) != len(self.index):
            raise ValueError(
                f"Item wrong length {len(key)} instead of {len(self.index)}!"
            )
        key = check_bool_indexer(self.index, key)
        indexer = key.nonzero()[0]
        self._check_setitem_copy()
        if isinstance(value, DataFrame):
            # GH#39931 reindex since iloc does not align
            value = value.reindex(self.index.take(indexer))
        self.iloc[indexer] = value

    else:
        # Note: unlike self.iloc[:, indexer] = value, this will
        # never try to overwrite values inplace

        if isinstance(value, DataFrame):
            # Column-by-column copy, pairing our keys with value's columns.
            check_key_length(self.columns, key, value)
            for k1, k2 in zip(key, value.columns):
                self[k1] = value[k2]

        elif not is_list_like(value):
            # Scalar broadcast to every listed column.
            for col in key:
                self[col] = value

        elif isinstance(value, np.ndarray) and value.ndim == 2:
            self._iset_not_inplace(key, value)

        elif np.ndim(value) > 1:
            # list of lists
            value = DataFrame(value).values
            return self._setitem_array(key, value)

        else:
            self._iset_not_inplace(key, value)
def _iset_not_inplace(self, key, value):
    # GH#39510 when setting with df[key] = obj with a list-like key and
    # list-like value, we iterate over those listlikes and set columns
    # one at a time. This is different from dispatching to
    # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
    # data inplace, whereas this will insert new arrays.

    def igetitem(obj, i: int):
        # Positionally extract the i-th "column" of the value.
        # Note: we catch DataFrame obj before getting here, but
        # hypothetically would return obj.iloc[:, i]
        if isinstance(obj, np.ndarray):
            return obj[..., i]
        else:
            return obj[i]

    if self.columns.is_unique:
        if np.shape(value)[-1] != len(key):
            raise ValueError("Columns must be same length as key")

        for i, col in enumerate(key):
            self[col] = igetitem(value, i)

    else:
        # Duplicate column labels: resolve each key to its positions first.
        ilocs = self.columns.get_indexer_non_unique(key)[0]
        if (ilocs < 0).any():
            # key entries not in self.columns
            raise NotImplementedError

        if np.shape(value)[-1] != len(ilocs):
            raise ValueError("Columns must be same length as key")

        assert np.ndim(value) <= 2

        orig_columns = self.columns

        # Using self.iloc[:, i] = ... may set values inplace, which
        # by convention we do not do in __setitem__
        # Temporarily relabel columns positionally so `self[iloc] = ...`
        # targets exactly one column; always restore the real labels.
        try:
            self.columns = Index(range(len(self.columns)))
            for i, iloc in enumerate(ilocs):
                self[iloc] = igetitem(value, i)
        finally:
            self.columns = orig_columns
def _setitem_frame(self, key, value):
    # support boolean setting with DataFrame input, e.g.
    # df[df > df2] = 0
    if isinstance(key, np.ndarray):
        if key.shape != self.shape:
            raise ValueError("Array conditional must be same shape as self")
        # Wrap the raw mask in a DataFrame aligned to our axes.
        key = self._constructor(key, **self._construct_axes_dict())

    if key.size and not is_bool_dtype(key.values):
        raise TypeError(
            "Must pass DataFrame or 2-d ndarray with boolean values only"
        )

    self._check_inplace_setting(value)
    self._check_setitem_copy()
    # Assign where the mask holds: equivalent to masking with ~key.
    self._where(-key, value, inplace=True)
def _set_item_frame_value(self, key, value: DataFrame) -> None:
    # Assign a DataFrame-valued right-hand side into the column(s) `key`.
    self._ensure_valid_index(value)

    # align columns
    if key in self.columns:
        loc = self.columns.get_loc(key)
        cols = self.columns[loc]
        len_cols = 1 if is_scalar(cols) else len(cols)
        if len_cols != len(value.columns):
            raise ValueError("Columns must be same length as key")

        # align right-hand-side columns if self.columns
        # is multi-index and self[key] is a sub-frame
        if isinstance(self.columns, MultiIndex) and isinstance(
            loc, (slice, Series, np.ndarray, Index)
        ):
            cols_droplevel = maybe_droplevels(cols, key)
            if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
                value = value.reindex(cols_droplevel, axis=1)

            for col, col_droplevel in zip(cols, cols_droplevel):
                self[col] = value[col_droplevel]
            return

        if is_scalar(cols):
            # Single existing column: take value's only column.
            self[cols] = value[value.columns[0]]
            return

        # now align rows
        arraylike = _reindex_for_setitem(value, self.index)
        self._set_item_mgr(key, arraylike)
        return

    # New column: only a single-column DataFrame can be assigned.
    if len(value.columns) != 1:
        raise ValueError(
            "Cannot set a DataFrame with multiple columns to the single "
            f"column {key}"
        )

    self[key] = value[value.columns[0]]
def _iset_item_mgr(
    self, loc: int | slice | np.ndarray, value, inplace: bool = False
) -> None:
    # Positionally set column(s) directly on the manager and drop any
    # cached column Series that may now be stale.
    # when called from _set_item_mgr loc can be anything returned from get_loc
    self._mgr.iset(loc, value, inplace=inplace)
    self._clear_item_cache()
def _set_item_mgr(self, key, value: ArrayLike) -> None:
    # Label-based set on the manager: replace the column if the label
    # exists, otherwise append it at the end.
    try:
        loc = self._info_axis.get_loc(key)
    except KeyError:
        # This item wasn't present, just insert at end
        self._mgr.insert(len(self._info_axis), key, value)
    else:
        self._iset_item_mgr(loc, value)

    # check if we are modifying a copy
    # try to set first as we want an invalid
    # value exception to occur first
    if len(self):
        self._check_setitem_copy()
def _iset_item(self, loc: int, value) -> None:
    # Sanitize then positionally set a single column, allowing in-place
    # writes on the existing arrays (contrast with isetitem/inplace=False).
    arraylike = self._sanitize_column(value)
    self._iset_item_mgr(loc, arraylike, inplace=True)

    # check if we are modifying a copy
    # try to set first as we want an invalid
    # value exception to occur first
    if len(self):
        self._check_setitem_copy()
def _set_item(self, key, value) -> None:
    """
    Add series to DataFrame in specified column.

    If series is a numpy-array (not a Series/TimeSeries), it must be the
    same length as the DataFrames index or an error will be thrown.

    Series/TimeSeries will be conformed to the DataFrames index to
    ensure homogeneity.
    """
    value = self._sanitize_column(value)

    if (
        key in self.columns
        and value.ndim == 1
        and not is_extension_array_dtype(value)
    ):
        # broadcast across multiple columns if necessary
        if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
            existing_piece = self[key]
            if isinstance(existing_piece, DataFrame):
                # Tile the 1D value into one column per duplicate label.
                value = np.tile(value, (len(existing_piece.columns), 1)).T

    self._set_item_mgr(key, value)
    def _set_value(
        self, index: IndexLabel, col, value: Scalar, takeable: bool = False
    ) -> None:
        """
        Put single value at passed column and index.

        Parameters
        ----------
        index : Label
            row label
        col : Label
            column label
        value : scalar
        takeable : bool, default False
            Sets whether or not index/col interpreted as indexers
        """
        try:
            if takeable:
                # Positional path: ``index``/``col`` are already integer locations.
                icol = col
                iindex = cast(int, index)
            else:
                icol = self.columns.get_loc(col)
                iindex = self.index.get_loc(index)
            self._mgr.column_setitem(icol, iindex, value)
            # The cached Series for this column is now stale.
            self._clear_item_cache()

        except (KeyError, TypeError, ValueError):
            # get_loc might raise a KeyError for missing labels (falling back
            # to (i)loc will do expansion of the index)
            # column_setitem will do validation that may raise TypeError or ValueError
            # set using a non-recursive method & reset the cache
            if takeable:
                self.iloc[index, col] = value
            else:
                self.loc[index, col] = value
            self._item_cache.pop(col, None)

        except InvalidIndexError as ii_err:
            # GH48729: Seems like you are trying to assign a value to a
            # row when only scalar options are permitted
            raise InvalidIndexError(
                f"You can only assign a scalar value not a {type(value)}"
            ) from ii_err
    def _ensure_valid_index(self, value) -> None:
        """
        Ensure that if we don't have an index, that we can create one from the
        passed value.
        """
        # GH5632, make sure that we are a Series convertible
        if not len(self.index) and is_list_like(value) and len(value):
            if not isinstance(value, DataFrame):
                try:
                    value = Series(value)
                except (ValueError, NotImplementedError, TypeError) as err:
                    raise ValueError(
                        "Cannot set a frame with no defined index "
                        "and a value that cannot be converted to a Series"
                    ) from err

            # GH31368 preserve name of index
            index_copy = value.index.copy()
            if self.index.name is not None:
                index_copy.name = self.index.name

            # NOTE: manager ``axis=1`` corresponds to the frame's rows here
            # (manager axes are transposed relative to the DataFrame).
            self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
4254 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
4255 """
4256 Provide boxed values for a column.
4257 """
4258 # Lookup in columns so that if e.g. a str datetime was passed
4259 # we attach the Timestamp object as the name.
4260 name = self.columns[loc]
4261 klass = self._constructor_sliced
4262 # We get index=self.index bc values is a SingleDataManager
4263 return klass(values, name=name, fastpath=True).__finalize__(self)
4265 # ----------------------------------------------------------------------
4266 # Lookup Caching
    def _clear_item_cache(self) -> None:
        # Drop every cached column Series; callers invalidate the cache after
        # any mutation that could make the cached views stale.
        self._item_cache.clear()
    def _get_item_cache(self, item: Hashable) -> Series:
        """Return the cached item, item represents a label indexer."""
        cache = self._item_cache
        res = cache.get(item)
        if res is None:
            # All places that call _get_item_cache have unique columns,
            # pending resolution of GH#33047
            loc = self.columns.get_loc(item)
            res = self._ixs(loc, axis=1)

            cache[item] = res

        # for a chain
        # Propagate the parent's copy flag so chained-assignment warnings
        # still fire on the cached Series.
        res._is_copy = self._is_copy
        return res
    def _reset_cacher(self) -> None:
        """Reset any cacher reference; intentionally a no-op for DataFrame."""
        # no-op for DataFrame
        pass
    def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
        """
        The object has called back to us saying maybe it has changed.
        """
        loc = self._info_axis.get_loc(item)
        arraylike = value._values

        old = self._ixs(loc, axis=1)
        if old._values is value._values and inplace:
            # GH#46149 avoid making unnecessary copies/block-splitting
            # (identity check: the stored array IS the caller's array).
            return

        self._mgr.iset(loc, arraylike, inplace=inplace)
4306 # ----------------------------------------------------------------------
4307 # Unsorted
    # Typing overloads: ``query`` returns None when inplace=True, otherwise a
    # DataFrame; the final overload covers a runtime-determined bool.
    @overload
    def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
        ...

    @overload
    def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
        ...

    @overload
    def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
        ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "expr"])
    def query(self, expr: str, inplace: bool = False, **kwargs) -> DataFrame | None:
        """
        Query the columns of a DataFrame with a boolean expression.

        Parameters
        ----------
        expr : str
            The query string to evaluate.

            You can refer to variables
            in the environment by prefixing them with an '@' character like
            ``@a + b``.

            You can refer to column names that are not valid Python variable names
            by surrounding them in backticks. Thus, column names containing spaces
            or punctuations (besides underscores) or starting with digits must be
            surrounded by backticks. (For example, a column named "Area (cm^2)" would
            be referenced as ```Area (cm^2)```). Column names which are Python keywords
            (like "list", "for", "import", etc) cannot be used.

            For example, if one of your columns is called ``a a`` and you want
            to sum it with ``b``, your query should be ```a a` + b``.

            .. versionadded:: 0.25.0
                Backtick quoting introduced.

            .. versionadded:: 1.0.0
                Expanding functionality of backtick quoting for more than only spaces.

        inplace : bool
            Whether to modify the DataFrame rather than creating a new one.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by :meth:`DataFrame.query`.

        Returns
        -------
        DataFrame or None
            DataFrame resulting from the provided query expression or
            None if ``inplace=True``.

        See Also
        --------
        eval : Evaluate a string describing operations on
            DataFrame columns.
        DataFrame.eval : Evaluate a string describing operations on
            DataFrame columns.

        Notes
        -----
        The result of the evaluation of this expression is first passed to
        :attr:`DataFrame.loc` and if that fails because of a
        multidimensional key (e.g., a DataFrame) then the result will be passed
        to :meth:`DataFrame.__getitem__`.

        This method uses the top-level :func:`eval` function to
        evaluate the passed query.

        The :meth:`~pandas.DataFrame.query` method uses a slightly
        modified Python syntax by default. For example, the ``&`` and ``|``
        (bitwise) operators have the precedence of their boolean cousins,
        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
        however the semantics are different.

        You can change the semantics of the expression by passing the keyword
        argument ``parser='python'``. This enforces the same semantics as
        evaluation in Python space. Likewise, you can pass ``engine='python'``
        to evaluate an expression using Python itself as a backend. This is not
        recommended as it is inefficient compared to using ``numexpr`` as the
        engine.

        The :attr:`DataFrame.index` and
        :attr:`DataFrame.columns` attributes of the
        :class:`~pandas.DataFrame` instance are placed in the query namespace
        by default, which allows you to treat both the index and columns of the
        frame as a column in the frame.
        The identifier ``index`` is used for the frame index; you can also
        use the name of the index to identify it in a query. Please note that
        Python keywords may not be used as identifiers.

        For further details and examples see the ``query`` documentation in
        :ref:`indexing <indexing.query>`.

        *Backtick quoted variables*

        Backtick quoted variables are parsed as literal Python code and
        are converted internally to a Python valid identifier.
        This can lead to the following problems.

        During parsing a number of disallowed characters inside the backtick
        quoted string are replaced by strings that are allowed as a Python identifier.
        These characters include all operators in Python, the space character, the
        question mark, the exclamation mark, the dollar sign, and the euro sign.
        For other characters that fall outside the ASCII range (U+0001..U+007F)
        and those that are not further specified in PEP 3131,
        the query parser will raise an error.
        This excludes whitespace different than the space character,
        but also the hashtag (as it is used for comments) and the backtick
        itself (backtick can also not be escaped).

        In a special case, quotes that make a pair around a backtick can
        confuse the parser.
        For example, ```it's` > `that's``` will raise an error,
        as it forms a quoted string (``'s > `that'``) with a backtick inside.

        See also the Python documentation about lexical analysis
        (https://docs.python.org/3/reference/lexical_analysis.html)
        in combination with the source code in :mod:`pandas.core.computation.parsing`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6),
        ...                    'B': range(10, 0, -2),
        ...                    'C C': range(10, 5, -1)})
        >>> df
           A   B  C C
        0  1  10   10
        1  2   8    9
        2  3   6    8
        3  4   4    7
        4  5   2    6
        >>> df.query('A > B')
           A  B  C C
        4  5  2    6

        The previous expression is equivalent to

        >>> df[df.A > df.B]
           A  B  C C
        4  5  2    6

        For columns with spaces in their name, you can use backtick quoting.

        >>> df.query('B == `C C`')
           A   B  C C
        0  1  10   10

        The previous expression is equivalent to

        >>> df[df.B == df['C C']]
           A   B  C C
        0  1  10   10
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        if not isinstance(expr, str):
            msg = f"expr must be a string to be evaluated, {type(expr)} given"
            raise ValueError(msg)
        # Bump the stack level so that eval resolves @-prefixed variables in
        # the *caller's* frame (two frames up: this method plus self.eval).
        kwargs["level"] = kwargs.pop("level", 0) + 2
        kwargs["target"] = None
        res = self.eval(expr, **kwargs)

        try:
            result = self.loc[res]
        except ValueError:
            # when res is multi-dimensional loc raises, but this is sometimes a
            # valid query
            result = self[res]

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result
    # Typing overloads: ``eval`` returns None when inplace=True, otherwise the
    # evaluation result (ndarray, scalar, or pandas object).
    @overload
    def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
        ...

    @overload
    def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
        ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "expr"])
    def eval(self, expr: str, inplace: bool = False, **kwargs) -> Any | None:
        """
        Evaluate a string describing operations on DataFrame columns.

        Operates on columns only, not specific rows or elements. This allows
        `eval` to run arbitrary code, which can make you vulnerable to code
        injection if you pass user input to this function.

        Parameters
        ----------
        expr : str
            The expression string to evaluate.
        inplace : bool, default False
            If the expression contains an assignment, whether to perform the
            operation inplace and mutate the existing DataFrame. Otherwise,
            a new DataFrame is returned.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by
            :meth:`~pandas.DataFrame.query`.

        Returns
        -------
        ndarray, scalar, pandas object, or None
            The result of the evaluation or None if ``inplace=True``.

        See Also
        --------
        DataFrame.query : Evaluates a boolean expression to query the columns
            of a frame.
        DataFrame.assign : Can evaluate an expression or function to create new
            values for a column.
        eval : Evaluate a Python expression as a string using various
            backends.

        Notes
        -----
        For more details see the API documentation for :func:`~eval`.
        For detailed examples see :ref:`enhancing performance with eval
        <enhancingperf.eval>`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
        >>> df
           A   B
        0  1  10
        1  2   8
        2  3   6
        3  4   4
        4  5   2
        >>> df.eval('A + B')
        0    11
        1    10
        2     9
        3     8
        4     7
        dtype: int64

        Assignment is allowed though by default the original DataFrame is not
        modified.

        >>> df.eval('C = A + B')
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7
        >>> df
           A   B
        0  1  10
        1  2   8
        2  3   6
        3  4   4
        4  5   2

        Use ``inplace=True`` to modify the original DataFrame.

        >>> df.eval('C = A + B', inplace=True)
        >>> df
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7

        Multiple columns can be assigned to using multi-line expressions:

        >>> df.eval(
        ...     '''
        ... C = A + B
        ... D = A - B
        ... '''
        ... )
           A   B   C  D
        0  1  10  11 -9
        1  2   8  10 -6
        2  3   6   9 -3
        3  4   4   8  0
        4  5   2   7  3
        """
        from pandas.core.computation.eval import eval as _eval

        inplace = validate_bool_kwarg(inplace, "inplace")
        # Bump the stack level so @-variable resolution sees the caller's frame.
        kwargs["level"] = kwargs.pop("level", 0) + 2
        index_resolvers = self._get_index_resolvers()
        column_resolvers = self._get_cleaned_column_resolvers()
        # Column resolvers come first so column names shadow index names.
        resolvers = column_resolvers, index_resolvers
        if "target" not in kwargs:
            kwargs["target"] = self
        kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers

        return _eval(expr, inplace=inplace, **kwargs)
    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
        """
        Return a subset of the DataFrame's columns based on the column dtypes.

        Parameters
        ----------
        include, exclude : scalar or list-like
            A selection of dtypes or strings to be included/excluded. At least
            one of these parameters must be supplied.

        Returns
        -------
        DataFrame
            The subset of the frame including the dtypes in ``include`` and
            excluding the dtypes in ``exclude``.

        Raises
        ------
        ValueError
            * If both of ``include`` and ``exclude`` are empty
            * If ``include`` and ``exclude`` have overlapping elements
            * If any kind of string dtype is passed in.

        See Also
        --------
        DataFrame.dtypes: Return Series with the data type of each column.

        Notes
        -----
        * To select all *numeric* types, use ``np.number`` or ``'number'``
        * To select strings you must use the ``object`` dtype, but note that
          this will return *all* object dtype columns
        * See the `numpy dtype hierarchy
          <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
        * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
          ``'datetime64'``
        * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
          ``'timedelta64'``
        * To select Pandas categorical dtypes, use ``'category'``
        * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
          0.20.0) or ``'datetime64[ns, tz]'``

        Examples
        --------
        >>> df = pd.DataFrame({'a': [1, 2] * 3,
        ...                    'b': [True, False] * 3,
        ...                    'c': [1.0, 2.0] * 3})
        >>> df
                a      b  c
        0       1   True  1.0
        1       2  False  2.0
        2       1   True  1.0
        3       2  False  2.0
        4       1   True  1.0
        5       2  False  2.0

        >>> df.select_dtypes(include='bool')
           b
        0  True
        1  False
        2  True
        3  False
        4  True
        5  False

        >>> df.select_dtypes(include=['float64'])
           c
        0  1.0
        1  2.0
        2  1.0
        3  2.0
        4  1.0
        5  2.0

        >>> df.select_dtypes(exclude=['int64'])
               b    c
        0   True  1.0
        1  False  2.0
        2   True  1.0
        3  False  2.0
        4   True  1.0
        5  False  2.0
        """
        # Normalize scalar arguments to tuples so both become iterables.
        if not is_list_like(include):
            include = (include,) if include is not None else ()
        if not is_list_like(exclude):
            exclude = (exclude,) if exclude is not None else ()

        selection = (frozenset(include), frozenset(exclude))

        if not any(selection):
            raise ValueError("at least one of include or exclude must be nonempty")

        # convert the myriad valid dtypes object to a single representation
        def check_int_infer_dtype(dtypes):
            converted_dtypes: list[type] = []
            for dtype in dtypes:
                # Numpy maps int to different types (int32, in64) on Windows and Linux
                # see https://github.com/numpy/numpy/issues/9464
                if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
                    converted_dtypes.append(np.int32)
                    converted_dtypes.append(np.int64)
                elif dtype == "float" or dtype is float:
                    # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
                    converted_dtypes.extend([np.float64, np.float32])
                else:
                    converted_dtypes.append(infer_dtype_from_object(dtype))
            return frozenset(converted_dtypes)

        include = check_int_infer_dtype(include)
        exclude = check_int_infer_dtype(exclude)

        for dtypes in (include, exclude):
            invalidate_string_dtypes(dtypes)

        # can't both include AND exclude!
        if not include.isdisjoint(exclude):
            raise ValueError(f"include and exclude overlap on {(include & exclude)}")

        def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
            # GH 46870: BooleanDtype._is_numeric == True but should be excluded
            return issubclass(dtype.type, tuple(dtypes_set)) or (
                np.number in dtypes_set
                and getattr(dtype, "_is_numeric", False)
                and not is_bool_dtype(dtype)
            )

        def predicate(arr: ArrayLike) -> bool:
            # True if the column should be kept: matches include (when given)
            # and does not match exclude (when given).
            dtype = arr.dtype
            if include:
                if not dtype_predicate(dtype, include):
                    return False

            if exclude:
                if dtype_predicate(dtype, exclude):
                    return False

            return True

        mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
        return type(self)(mgr).__finalize__(self)
4753 def insert(
4754 self,
4755 loc: int,
4756 column: Hashable,
4757 value: Scalar | AnyArrayLike,
4758 allow_duplicates: bool | lib.NoDefault = lib.no_default,
4759 ) -> None:
4760 """
4761 Insert column into DataFrame at specified location.
4763 Raises a ValueError if `column` is already contained in the DataFrame,
4764 unless `allow_duplicates` is set to True.
4766 Parameters
4767 ----------
4768 loc : int
4769 Insertion index. Must verify 0 <= loc <= len(columns).
4770 column : str, number, or hashable object
4771 Label of the inserted column.
4772 value : Scalar, Series, or array-like
4773 allow_duplicates : bool, optional, default lib.no_default
4775 See Also
4776 --------
4777 Index.insert : Insert new item by index.
4779 Examples
4780 --------
4781 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
4782 >>> df
4783 col1 col2
4784 0 1 3
4785 1 2 4
4786 >>> df.insert(1, "newcol", [99, 99])
4787 >>> df
4788 col1 newcol col2
4789 0 1 99 3
4790 1 2 99 4
4791 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
4792 >>> df
4793 col1 col1 newcol col2
4794 0 100 1 99 3
4795 1 100 2 99 4
4797 Notice that pandas uses index alignment in case of `value` from type `Series`:
4799 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
4800 >>> df
4801 col0 col1 col1 newcol col2
4802 0 NaN 100 1 99 3
4803 1 5.0 100 2 99 4
4804 """
4805 if allow_duplicates is lib.no_default:
4806 allow_duplicates = False
4807 if allow_duplicates and not self.flags.allows_duplicate_labels:
4808 raise ValueError(
4809 "Cannot specify 'allow_duplicates=True' when "
4810 "'self.flags.allows_duplicate_labels' is False."
4811 )
4812 if not allow_duplicates and column in self.columns:
4813 # Should this be a different kind of error??
4814 raise ValueError(f"cannot insert {column}, already exists")
4815 if not isinstance(loc, int):
4816 raise TypeError("loc must be int")
4818 value = self._sanitize_column(value)
4819 self._mgr.insert(loc, column, value)
    def assign(self, **kwargs) -> DataFrame:
        r"""
        Assign new columns to a DataFrame.

        Returns a new object with all original columns in addition to new ones.
        Existing columns that are re-assigned will be overwritten.

        Parameters
        ----------
        **kwargs : dict of {str: callable or Series}
            The column names are keywords. If the values are
            callable, they are computed on the DataFrame and
            assigned to the new columns. The callable must not
            change input DataFrame (though pandas doesn't check it).
            If the values are not callable, (e.g. a Series, scalar, or array),
            they are simply assigned.

        Returns
        -------
        DataFrame
            A new DataFrame with the new columns in addition to
            all the existing columns.

        Notes
        -----
        Assigning multiple columns within the same ``assign`` is possible.
        Later items in '\*\*kwargs' may refer to newly created or modified
        columns in 'df'; items are computed and assigned into 'df' in order.

        Examples
        --------
        >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
        ...                   index=['Portland', 'Berkeley'])
        >>> df
                  temp_c
        Portland    17.0
        Berkeley    25.0

        Where the value is a callable, evaluated on `df`:

        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        Alternatively, the same behavior can be achieved by directly
        referencing an existing Series or sequence:

        >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        You can create multiple columns within the same assign where one
        of the columns depends on another one defined within the same assign:

        >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
        ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
                  temp_c  temp_f  temp_k
        Portland    17.0    62.6  290.15
        Berkeley    25.0    77.0  298.15
        """
        data = self.copy()

        # Apply left-to-right so later kwargs can see columns created by
        # earlier ones (kwargs preserve insertion order on Python 3.7+).
        for k, v in kwargs.items():
            data[k] = com.apply_if_callable(v, data)
        return data
    def _sanitize_column(self, value) -> ArrayLike:
        """
        Ensures new columns (which go into the BlockManager as new blocks) are
        always copied and converted into an array.

        Parameters
        ----------
        value : scalar, Series, or array-like

        Returns
        -------
        numpy.ndarray or ExtensionArray
        """
        self._ensure_valid_index(value)

        # We can get there through isetitem with a DataFrame
        # or through loc single_block_path
        if isinstance(value, DataFrame):
            return _reindex_for_setitem(value, self.index)
        elif is_dict_like(value):
            # dict-likes are aligned by key via an intermediate Series.
            return _reindex_for_setitem(Series(value), self.index)

        if is_list_like(value):
            com.require_length_match(value, self.index)
        # Scalars also reach here; sanitize_array converts against self.index.
        return sanitize_array(value, self.index, copy=True, allow_2d=True)
4915 @property
4916 def _series(self):
4917 return {
4918 item: Series(
4919 self._mgr.iget(idx), index=self.index, name=item, fastpath=True
4920 )
4921 for idx, item in enumerate(self.columns)
4922 }
    def lookup(
        self, row_labels: Sequence[IndexLabel], col_labels: Sequence[IndexLabel]
    ) -> np.ndarray:
        """
        Label-based "fancy indexing" function for DataFrame.

        .. deprecated:: 1.2.0
            DataFrame.lookup is deprecated,
            use pandas.factorize and NumPy indexing instead.
            For further details see
            :ref:`Looking up values by index/column labels <indexing.lookup>`.

        Given equal-length arrays of row and column labels, return an
        array of the values corresponding to each (row, col) pair.

        Parameters
        ----------
        row_labels : sequence
            The row labels to use for lookup.
        col_labels : sequence
            The column labels to use for lookup.

        Returns
        -------
        numpy.ndarray
            The found values.
        """
        # NOTE(review): this warning suggests melt/loc while the docstring's
        # deprecation note recommends factorize + NumPy indexing — the two
        # recommendations should probably agree.
        msg = (
            "The 'lookup' method is deprecated and will be "
            "removed in a future version. "
            "You can use DataFrame.melt and DataFrame.loc "
            "as a substitute."
        )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())

        n = len(row_labels)
        if n != len(col_labels):
            raise ValueError("Row labels must have same size as column labels")
        if not (self.index.is_unique and self.columns.is_unique):
            # GH#33041
            raise ValueError("DataFrame.lookup requires unique index and columns")

        thresh = 1000
        if not self._is_mixed_type or n > thresh:
            # Fast path: flatten (row, col) positions into indices of the
            # ravelled values array.
            values = self.values
            ridx = self.index.get_indexer(row_labels)
            cidx = self.columns.get_indexer(col_labels)
            if (ridx == -1).any():
                raise KeyError("One or more row labels was not found")
            if (cidx == -1).any():
                raise KeyError("One or more column labels was not found")
            flat_index = ridx * len(self.columns) + cidx
            result = values.flat[flat_index]
        else:
            # Mixed dtypes with few lookups: fetch element by element to
            # avoid materializing an expensive object-dtype ``.values``.
            result = np.empty(n, dtype="O")
            for i, (r, c) in enumerate(zip(row_labels, col_labels)):
                result[i] = self._get_value(r, c)

        if is_object_dtype(result):
            result = lib.maybe_convert_objects(result)

        return result
4987 # ----------------------------------------------------------------------
4988 # Reindexing and alignment
4990 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
4991 frame = self
4993 columns = axes["columns"]
4994 if columns is not None:
4995 frame = frame._reindex_columns(
4996 columns, method, copy, level, fill_value, limit, tolerance
4997 )
4999 index = axes["index"]
5000 if index is not None:
5001 frame = frame._reindex_index(
5002 index, method, copy, level, fill_value, limit, tolerance
5003 )
5005 return frame
5007 def _reindex_index(
5008 self,
5009 new_index,
5010 method,
5011 copy: bool,
5012 level: Level,
5013 fill_value=np.nan,
5014 limit=None,
5015 tolerance=None,
5016 ):
5017 new_index, indexer = self.index.reindex(
5018 new_index, method=method, level=level, limit=limit, tolerance=tolerance
5019 )
5020 return self._reindex_with_indexers(
5021 {0: [new_index, indexer]},
5022 copy=copy,
5023 fill_value=fill_value,
5024 allow_dups=False,
5025 )
5027 def _reindex_columns(
5028 self,
5029 new_columns,
5030 method,
5031 copy: bool,
5032 level: Level,
5033 fill_value=None,
5034 limit=None,
5035 tolerance=None,
5036 ):
5037 new_columns, indexer = self.columns.reindex(
5038 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
5039 )
5040 return self._reindex_with_indexers(
5041 {1: [new_columns, indexer]},
5042 copy=copy,
5043 fill_value=fill_value,
5044 allow_dups=False,
5045 )
    def _reindex_multi(
        self, axes: dict[str, Index], copy: bool, fill_value
    ) -> DataFrame:
        """
        We are guaranteed non-Nones in the axes.
        """

        new_index, row_indexer = self.index.reindex(axes["index"])
        new_columns, col_indexer = self.columns.reindex(axes["columns"])

        if row_indexer is not None and col_indexer is not None:
            # Fastpath. By doing two 'take's at once we avoid making an
            # unnecessary copy.
            # We only get here with `not self._is_mixed_type`, which (almost)
            # ensures that self.values is cheap. It may be worth making this
            # condition more specific.
            indexer = row_indexer, col_indexer
            new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
            return self._constructor(new_values, index=new_index, columns=new_columns)
        else:
            # A None indexer means that axis is already in the right order;
            # fall back to the generic per-axis reindexing path.
            return self._reindex_with_indexers(
                {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
                copy=copy,
                fill_value=fill_value,
            )
    @doc(NDFrame.align, **_shared_doc_kwargs)
    def align(
        self,
        other: DataFrame,
        join: Literal["outer", "inner", "left", "right"] = "outer",
        axis: Axis | None = None,
        level: Level = None,
        copy: bool = True,
        fill_value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
        fill_axis: Axis = 0,
        broadcast_axis: Axis | None = None,
    ) -> DataFrame:
        # Pure delegation to NDFrame.align; the @doc decorator attaches the
        # shared docstring. This override only narrows the type signature.
        return super().align(
            other,
            join=join,
            axis=axis,
            level=level,
            copy=copy,
            fill_value=fill_value,
            method=method,
            limit=limit,
            fill_axis=fill_axis,
            broadcast_axis=broadcast_axis,
        )
    # Typing overloads: ``set_axis`` returns None when inplace=True, otherwise
    # a DataFrame; the final overload covers a runtime-determined bool.
    @overload
    def set_axis(
        self,
        labels,
        *,
        axis: Axis = ...,
        inplace: Literal[False] | lib.NoDefault = ...,
        copy: bool | lib.NoDefault = ...,
    ) -> DataFrame:
        ...

    @overload
    def set_axis(
        self,
        labels,
        *,
        axis: Axis = ...,
        inplace: Literal[True],
        copy: bool | lib.NoDefault = ...,
    ) -> None:
        ...

    @overload
    def set_axis(
        self,
        labels,
        *,
        axis: Axis = ...,
        inplace: bool | lib.NoDefault = ...,
        copy: bool | lib.NoDefault = ...,
    ) -> DataFrame | None:
        ...
    # error: Signature of "set_axis" incompatible with supertype "NDFrame"
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
    @Appender(
        """
        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        Change the row labels.

        >>> df.set_axis(['a', 'b', 'c'], axis='index')
           A  B
        a  1  4
        b  2  5
        c  3  6

        Change the column labels.

        >>> df.set_axis(['I', 'II'], axis='columns')
           I  II
        0  1   4
        1  2   5
        2  3   6

        Now, update the labels without copying the underlying data.

        >>> df.set_axis(['i', 'ii'], axis='columns', copy=False)
           i  ii
        0  1   4
        1  2   5
        2  3   6
        """
    )
    @Substitution(
        **_shared_doc_kwargs,
        extended_summary_sub=" column or",
        axis_description_sub=", and 1 identifies the columns",
        see_also_sub=" or columns",
    )
    @Appender(NDFrame.set_axis.__doc__)
    def set_axis(
        self,
        labels,
        axis: Axis = 0,
        inplace: bool | lib.NoDefault = lib.no_default,
        *,
        copy: bool | lib.NoDefault = lib.no_default,
    ):
        # Delegation: the docstring is assembled from NDFrame.set_axis plus
        # the examples appended above.
        return super().set_axis(labels, axis=axis, inplace=inplace, copy=copy)
    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.reindex.__doc__)
    @rewrite_axis_style_signature(
        "labels",
        [
            ("method", None),
            ("copy", None),
            ("level", None),
            ("fill_value", np.nan),
            ("limit", None),
            ("tolerance", None),
        ],
    )
    def reindex(self, *args, **kwargs) -> DataFrame:
        # Translate positional/axis-style arguments into per-axis entries.
        axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex")
        kwargs.update(axes)
        # Pop these, since the values are in `kwargs` under different names
        kwargs.pop("axis", None)
        kwargs.pop("labels", None)
        return super().reindex(**kwargs)
    # Typing overloads: ``drop`` returns None when inplace=True, otherwise a
    # DataFrame; the final overload covers a runtime-determined bool.
    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: Literal[True],
        errors: IgnoreRaise = ...,
    ) -> None:
        ...

    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: Literal[False] = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame:
        ...

    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: bool = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame | None:
        ...
    # error: Signature of "drop" incompatible with supertype "NDFrame"
    # github.com/python/mypy/issues/12387
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
    def drop(  # type: ignore[override]
        self,
        labels: IndexLabel = None,
        axis: Axis = 0,
        index: IndexLabel = None,
        columns: IndexLabel = None,
        level: Level = None,
        inplace: bool = False,
        errors: IgnoreRaise = "raise",
    ) -> DataFrame | None:
        """
        Drop specified labels from rows or columns.

        Remove rows or columns by specifying label names and corresponding
        axis, or by specifying directly index or column names. When using a
        multi-index, labels on different levels can be removed by specifying
        the level. See the :ref:`user guide <advanced.shown_levels>`
        for more information about the now unused levels.

        Parameters
        ----------
        labels : single label or list-like
            Index or column labels to drop. A tuple will be used as a single
            label and not treated as a list-like.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Whether to drop labels from the index (0 or 'index') or
            columns (1 or 'columns').
        index : single label or list-like
            Alternative to specifying axis (``labels, axis=0``
            is equivalent to ``index=labels``).
        columns : single label or list-like
            Alternative to specifying axis (``labels, axis=1``
            is equivalent to ``columns=labels``).
        level : int or level name, optional
            For MultiIndex, level from which the labels will be removed.
        inplace : bool, default False
            If False, return a copy. Otherwise, do operation
            inplace and return None.
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and only existing labels are
            dropped.

        Returns
        -------
        DataFrame or None
            DataFrame without the removed index or column labels or
            None if ``inplace=True``.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis.

        See Also
        --------
        DataFrame.loc : Label-location based indexer for selection by label.
        DataFrame.dropna : Return DataFrame with labels on given axis omitted
            where (all or any) data are missing.
        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
            removed, optionally only considering certain columns.
        Series.drop : Return Series with specified index labels removed.

        Examples
        --------
        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
        ...                   columns=['A', 'B', 'C', 'D'])
        >>> df
           A  B   C   D
        0  0  1   2   3
        1  4  5   6   7
        2  8  9  10  11

        Drop columns

        >>> df.drop(['B', 'C'], axis=1)
           A   D
        0  0   3
        1  4   7
        2  8  11

        >>> df.drop(columns=['B', 'C'])
           A   D
        0  0   3
        1  4   7
        2  8  11

        Drop a row by index

        >>> df.drop([0, 1])
           A  B   C   D
        2  8  9  10  11

        Drop columns and/or rows of MultiIndex DataFrame

        >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
        ...                              ['speed', 'weight', 'length']],
        ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
        >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
        ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
        ...                         [250, 150], [1.5, 0.8], [320, 250],
        ...                         [1, 0.8], [0.3, 0.2]])
        >>> df
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
                length  1.5     1.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
                length  1.5     0.8
        falcon  speed   320.0   250.0
                weight  1.0     0.8
                length  0.3     0.2

        Drop a specific index combination from the MultiIndex
        DataFrame, i.e., drop the combination ``'falcon'`` and
        ``'weight'``, which deletes only the corresponding row

        >>> df.drop(index=('falcon', 'weight'))
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
                length  1.5     1.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
                length  1.5     0.8
        falcon  speed   320.0   250.0
                length  0.3     0.2

        >>> df.drop(index='cow', columns='small')
                        big
        lama    speed   45.0
                weight  200.0
                length  1.5
        falcon  speed   320.0
                weight  1.0
                length  0.3

        >>> df.drop(index='length', level=1)
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
        falcon  speed   320.0   250.0
                weight  1.0     0.8
        """
        # Heavy lifting (axis resolution, label lookup, error handling) lives
        # in NDFrame.drop; this override only provides the DataFrame-specific
        # signature and docstring.
        return super().drop(
            labels=labels,
            axis=axis,
            index=index,
            columns=columns,
            level=level,
            inplace=inplace,
            errors=errors,
        )
    # Typing overloads for `rename`: `inplace=True` returns None,
    # `inplace=False` (the default) returns a DataFrame, a plain `bool`
    # yields `DataFrame | None`.
    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: Literal[True],
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> None:
        ...

    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: Literal[False] = ...,
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame:
        ...

    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: bool = ...,
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame | None:
        ...
    def rename(
        self,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool | None = None,
        inplace: bool = False,
        level: Level = None,
        errors: IgnoreRaise = "ignore",
    ) -> DataFrame | None:
        """
        Alter axes labels.

        Function / dict values must be unique (1-to-1). Labels not contained in
        a dict / Series will be left as-is. Extra labels listed don't throw an
        error.

        See the :ref:`user guide <basics.rename>` for more.

        Parameters
        ----------
        mapper : dict-like or function
            Dict-like or function transformations to apply to
            that axis' values. Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index`` and
            ``columns``.
        index : dict-like or function
            Alternative to specifying axis (``mapper, axis=0``
            is equivalent to ``index=mapper``).
        columns : dict-like or function
            Alternative to specifying axis (``mapper, axis=1``
            is equivalent to ``columns=mapper``).
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis to target with ``mapper``. Can be either the axis name
            ('index', 'columns') or number (0, 1). The default is 'index'.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
            If True then value of copy is ignored.
        level : int or level name, default None
            In case of a MultiIndex, only rename labels in the specified
            level.
        errors : {'ignore', 'raise'}, default 'ignore'
            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
            or `columns` contains labels that are not present in the Index
            being transformed.
            If 'ignore', existing keys will be renamed and extra keys will be
            ignored.

        Returns
        -------
        DataFrame or None
            DataFrame with the renamed axis labels or None if ``inplace=True``.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis and
            "errors='raise'".

        See Also
        --------
        DataFrame.rename_axis : Set the name of the axis.

        Examples
        --------
        ``DataFrame.rename`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Rename columns using a mapping:

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        >>> df.rename(columns={"A": "a", "B": "c"})
           a  c
        0  1  4
        1  2  5
        2  3  6

        Rename index using a mapping:

        >>> df.rename(index={0: "x", 1: "y", 2: "z"})
           A  B
        x  1  4
        y  2  5
        z  3  6

        Cast index labels to a different type:

        >>> df.index
        RangeIndex(start=0, stop=3, step=1)
        >>> df.rename(index=str).index
        Index(['0', '1', '2'], dtype='object')

        >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
        Traceback (most recent call last):
        KeyError: ['C'] not found in axis

        Using axis-style parameters:

        >>> df.rename(str.lower, axis='columns')
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename({1: 2, 2: 4}, axis='index')
           A  B
        0  1  4
        2  2  5
        4  3  6
        """
        # Shared implementation lives in NDFrame._rename; this wrapper only
        # pins down the DataFrame-specific signature and return type.
        return super()._rename(
            mapper=mapper,
            index=index,
            columns=columns,
            axis=axis,
            copy=copy,
            inplace=inplace,
            level=level,
            errors=errors,
        )
    # Typing overloads for `fillna`: `inplace=False` (default) returns a new
    # DataFrame, `inplace=True` returns None, a plain `bool` yields
    # `DataFrame | None`.
    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: Literal[False] = ...,
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> DataFrame:
        ...

    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: Literal[True],
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: bool = ...,
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> DataFrame | None:
        ...
    # error: Signature of "fillna" incompatible with supertype "NDFrame"
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"])
    @doc(NDFrame.fillna, **_shared_doc_kwargs)
    def fillna(  # type: ignore[override]
        self,
        value: Hashable | Mapping | Series | DataFrame = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> DataFrame | None:
        # Thin override: the docstring is generated from the shared template
        # by @doc, and all filling logic lives in NDFrame.fillna.
        return super().fillna(
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
    def pop(self, item: Hashable) -> Series:
        """
        Return item and drop from frame. Raise KeyError if not found.

        Parameters
        ----------
        item : label
            Label of column to be popped.

        Returns
        -------
        Series

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan)],
        ...                   columns=('name', 'class', 'max_speed'))
        >>> df
             name   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        >>> df.pop('class')
        0      bird
        1      bird
        2    mammal
        3    mammal
        Name: class, dtype: object

        >>> df
             name  max_speed
        0  falcon      389.0
        1  parrot       24.0
        2    lion       80.5
        3  monkey        NaN
        """
        # Delegates to NDFrame.pop, which removes the column in place and
        # returns it.
        return super().pop(item=item)
    # error: Signature of "replace" incompatible with supertype "NDFrame"
    # Typing overloads for `replace`: `inplace=False` (default) returns a new
    # DataFrame, `inplace=True` returns None.
    @overload  # type: ignore[override]
    def replace(
        self,
        to_replace=...,
        value=...,
        *,
        inplace: Literal[False] = ...,
        limit: int | None = ...,
        regex: bool = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> DataFrame:
        ...

    @overload
    def replace(
        self,
        to_replace=...,
        value=...,
        *,
        inplace: Literal[True],
        limit: int | None = ...,
        regex: bool = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> None:
        ...
    # error: Signature of "replace" incompatible with supertype "NDFrame"
    @deprecate_nonkeyword_arguments(
        version=None, allowed_args=["self", "to_replace", "value"]
    )
    @doc(NDFrame.replace, **_shared_doc_kwargs)
    def replace(  # type: ignore[override]
        self,
        to_replace=None,
        value=lib.no_default,
        inplace: bool = False,
        limit: int | None = None,
        regex: bool = False,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
    ) -> DataFrame | None:
        # Thin override: docstring generated by @doc from the shared
        # template; `value`/`method` use no_default sentinels so the parent
        # can distinguish "not passed" from an explicit None.
        return super().replace(
            to_replace=to_replace,
            value=value,
            inplace=inplace,
            limit=limit,
            regex=regex,
            method=method,
        )
5734 def _replace_columnwise(
5735 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
5736 ):
5737 """
5738 Dispatch to Series.replace column-wise.
5740 Parameters
5741 ----------
5742 mapping : dict
5743 of the form {col: (target, value)}
5744 inplace : bool
5745 regex : bool or same types as `to_replace` in DataFrame.replace
5747 Returns
5748 -------
5749 DataFrame or None
5750 """
5751 # Operate column-wise
5752 res = self if inplace else self.copy()
5753 ax = self.columns
5755 for i in range(len(ax)):
5756 if ax[i] in mapping:
5757 ser = self.iloc[:, i]
5759 target, value = mapping[ax[i]]
5760 newobj = ser.replace(target, value, regex=regex)
5762 res._iset_item(i, newobj)
5764 if inplace:
5765 return
5766 return res.__finalize__(self)
    @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
    def shift(
        self,
        periods: int = 1,
        freq: Frequency | None = None,
        axis: Axis = 0,
        fill_value: Hashable = lib.no_default,
    ) -> DataFrame:
        axis = self._get_axis_number(axis)

        ncols = len(self.columns)
        # Fast path 1: horizontal shift with no explicit fill_value -- build
        # the result by slicing off shifted-out columns and inserting all-NA
        # filler columns derived from an existing column (so each filler's
        # dtype is compatible with the frame).
        if (
            axis == 1
            and periods != 0
            and freq is None
            and fill_value is lib.no_default
            and ncols > 0
        ):
            # We will infer fill_value to match the closest column

            # Use a column that we know is valid for our column's dtype GH#38434
            label = self.columns[0]

            if periods > 0:
                result = self.iloc[:, :-periods]
                for col in range(min(ncols, abs(periods))):
                    # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
                    # Define filler inside loop so we get a copy
                    filler = self.iloc[:, 0].shift(len(self))
                    result.insert(0, label, filler, allow_duplicates=True)
            else:
                result = self.iloc[:, -periods:]
                for col in range(min(ncols, abs(periods))):
                    # Define filler inside loop so we get a copy
                    filler = self.iloc[:, -1].shift(len(self))
                    result.insert(
                        len(result.columns), label, filler, allow_duplicates=True
                    )

            result.columns = self.columns.copy()
            return result
        # Fast path 2: horizontal shift with an explicit fill_value -- when
        # the fill can't be kept in the single existing block (or there are
        # several blocks), shift column positions via a reindex of the
        # manager's column axis instead of block-wise shifting.
        elif (
            axis == 1
            and periods != 0
            and fill_value is not lib.no_default
            and ncols > 0
        ):
            arrays = self._mgr.arrays
            if len(arrays) > 1 or (
                # If we only have one block and we know that we can't
                # keep the same dtype (i.e. the _can_hold_element check)
                # then we can go through the reindex_indexer path
                # (and avoid casting logic in the Block method).
                # The exception to this (until 2.0) is datetimelike
                # dtypes with integers, which cast.
                not can_hold_element(arrays[0], fill_value)
                # TODO(2.0): remove special case for integer-with-datetimelike
                # once deprecation is enforced
                and not (
                    lib.is_integer(fill_value) and needs_i8_conversion(arrays[0].dtype)
                )
            ):
                # GH#35488 we need to watch out for multi-block cases
                # We only get here with fill_value not-lib.no_default
                nper = abs(periods)
                nper = min(nper, ncols)
                if periods > 0:
                    # -1 marks positions to be filled with fill_value
                    indexer = np.array(
                        [-1] * nper + list(range(ncols - periods)), dtype=np.intp
                    )
                else:
                    indexer = np.array(
                        list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
                    )
                mgr = self._mgr.reindex_indexer(
                    self.columns,
                    indexer,
                    axis=0,
                    fill_value=fill_value,
                    allow_dups=True,
                )
                res_df = self._constructor(mgr)
                return res_df.__finalize__(self, method="shift")

        # General case (axis=0, freq-based shifts, etc.) handled by parent.
        return super().shift(
            periods=periods, freq=freq, axis=axis, fill_value=fill_value
        )
    # Typing overloads for `set_index`: `inplace=False` (default) returns the
    # re-indexed DataFrame, `inplace=True` returns None.
    @overload
    def set_index(
        self,
        keys,
        *,
        drop: bool = ...,
        append: bool = ...,
        inplace: Literal[False] = ...,
        verify_integrity: bool = ...,
    ) -> DataFrame:
        ...

    @overload
    def set_index(
        self,
        keys,
        *,
        drop: bool = ...,
        append: bool = ...,
        inplace: Literal[True],
        verify_integrity: bool = ...,
    ) -> None:
        ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"])
    def set_index(
        self,
        keys,
        drop: bool = True,
        append: bool = False,
        inplace: bool = False,
        verify_integrity: bool = False,
    ) -> DataFrame | None:
        """
        Set the DataFrame index using existing columns.

        Set the DataFrame index (row labels) using one or more existing
        columns or arrays (of the correct length). The index can replace the
        existing index or expand on it.

        Parameters
        ----------
        keys : label or array-like or list of labels/arrays
            This parameter can be either a single column key, a single array of
            the same length as the calling DataFrame, or a list containing an
            arbitrary combination of column keys and arrays. Here, "array"
            encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
            instances of :class:`~collections.abc.Iterator`.
        drop : bool, default True
            Delete columns to be used as the new index.
        append : bool, default False
            Whether to append columns to existing index.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        verify_integrity : bool, default False
            Check the new index for duplicates. Otherwise defer the check until
            necessary. Setting to False will improve the performance of this
            method.

        Returns
        -------
        DataFrame or None
            Changed row labels or None if ``inplace=True``.

        See Also
        --------
        DataFrame.reset_index : Opposite of set_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
        ...                    'year': [2012, 2014, 2013, 2014],
        ...                    'sale': [55, 40, 84, 31]})
        >>> df
           month  year  sale
        0      1  2012    55
        1      4  2014    40
        2      7  2013    84
        3     10  2014    31

        Set the index to become the 'month' column:

        >>> df.set_index('month')
               year  sale
        month
        1      2012    55
        4      2014    40
        7      2013    84
        10     2014    31

        Create a MultiIndex using columns 'year' and 'month':

        >>> df.set_index(['year', 'month'])
                    sale
        year  month
        2012  1     55
        2014  4     40
        2013  7     84
        2014  10    31

        Create a MultiIndex using an Index and a column:

        >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
                 month  sale
           year
        1  2012  1      55
        2  2014  4      40
        3  2013  7      84
        4  2014  10     31

        Create a MultiIndex using two Series:

        >>> s = pd.Series([1, 2, 3, 4])
        >>> df.set_index([s, s**2])
              month  year  sale
        1 1       1  2012    55
        2 4       4  2014    40
        3 9       7  2013    84
        4 16     10  2014    31
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        self._check_inplace_and_allows_duplicate_labels(inplace)
        if not isinstance(keys, list):
            keys = [keys]

        err_msg = (
            'The parameter "keys" may be a column key, one-dimensional '
            "array, or a list containing only valid column keys and "
            "one-dimensional arrays."
        )

        # First pass: validate every entry before mutating anything, so a bad
        # key cannot leave the frame half-modified.
        missing: list[Hashable] = []
        for col in keys:
            if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
                # arrays are fine as long as they are one-dimensional
                # iterators get converted to list below
                if getattr(col, "ndim", 1) != 1:
                    raise ValueError(err_msg)
            else:
                # everything else gets tried as a key; see GH 24969
                try:
                    found = col in self.columns
                except TypeError as err:
                    raise TypeError(
                        f"{err_msg}. Received column of type {type(col)}"
                    ) from err
                else:
                    if not found:
                        missing.append(col)

        if missing:
            raise KeyError(f"None of {missing} are in the columns")

        if inplace:
            frame = self
        else:
            frame = self.copy()

        # Accumulate the level arrays and their names for the new index.
        arrays = []
        names: list[Hashable] = []
        if append:
            # Existing index levels come first when appending.
            names = list(self.index.names)
            if isinstance(self.index, MultiIndex):
                for i in range(self.index.nlevels):
                    arrays.append(self.index._get_level_values(i))
            else:
                arrays.append(self.index)

        to_remove: list[Hashable] = []
        for col in keys:
            if isinstance(col, MultiIndex):
                for n in range(col.nlevels):
                    arrays.append(col._get_level_values(n))
                names.extend(col.names)
            elif isinstance(col, (Index, Series)):
                # if Index then not MultiIndex (treated above)

                # error: Argument 1 to "append" of "list" has incompatible type
                # "Union[Index, Series]"; expected "Index"
                arrays.append(col)  # type:ignore[arg-type]
                names.append(col.name)
            elif isinstance(col, (list, np.ndarray)):
                # error: Argument 1 to "append" of "list" has incompatible type
                # "Union[List[Any], ndarray]"; expected "Index"
                arrays.append(col)  # type: ignore[arg-type]
                names.append(None)
            elif isinstance(col, abc.Iterator):
                # error: Argument 1 to "append" of "list" has incompatible type
                # "List[Any]"; expected "Index"
                arrays.append(list(col))  # type: ignore[arg-type]
                names.append(None)
            # from here, col can only be a column label
            else:
                arrays.append(frame[col]._values)
                names.append(col)
                if drop:
                    to_remove.append(col)

            if len(arrays[-1]) != len(self):
                # check newest element against length of calling frame, since
                # ensure_index_from_sequences would not raise for append=False.
                raise ValueError(
                    f"Length mismatch: Expected {len(self)} rows, "
                    f"received array of length {len(arrays[-1])}"
                )

        index = ensure_index_from_sequences(arrays, names)

        if verify_integrity and not index.is_unique:
            duplicates = index[index.duplicated()].unique()
            raise ValueError(f"Index has duplicate keys: {duplicates}")

        # use set to handle duplicate column names gracefully in case of drop
        for c in set(to_remove):
            del frame[c]

        # clear up memory usage
        index._cleanup()

        frame.index = index

        if not inplace:
            return frame
        return None
6083 @overload
6084 def reset_index(
6085 self,
6086 level: IndexLabel = ...,
6087 *,
6088 drop: bool = ...,
6089 inplace: Literal[False] = ...,
6090 col_level: Hashable = ...,
6091 col_fill: Hashable = ...,
6092 allow_duplicates: bool | lib.NoDefault = ...,
6093 names: Hashable | Sequence[Hashable] = None,
6094 ) -> DataFrame:
6095 ...
6097 @overload
6098 def reset_index(
6099 self,
6100 level: IndexLabel = ...,
6101 *,
6102 drop: bool = ...,
6103 inplace: Literal[True],
6104 col_level: Hashable = ...,
6105 col_fill: Hashable = ...,
6106 allow_duplicates: bool | lib.NoDefault = ...,
6107 names: Hashable | Sequence[Hashable] = None,
6108 ) -> None:
6109 ...
6111 @overload
6112 def reset_index(
6113 self,
6114 level: IndexLabel = ...,
6115 *,
6116 drop: bool = ...,
6117 inplace: bool = ...,
6118 col_level: Hashable = ...,
6119 col_fill: Hashable = ...,
6120 allow_duplicates: bool | lib.NoDefault = ...,
6121 names: Hashable | Sequence[Hashable] = None,
6122 ) -> DataFrame | None:
6123 ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"])
    def reset_index(
        self,
        level: IndexLabel = None,
        drop: bool = False,
        inplace: bool = False,
        col_level: Hashable = 0,
        col_fill: Hashable = "",
        allow_duplicates: bool | lib.NoDefault = lib.no_default,
        names: Hashable | Sequence[Hashable] = None,
    ) -> DataFrame | None:
        """
        Reset the index, or a level of it.

        Reset the index of the DataFrame, and use the default one instead.
        If the DataFrame has a MultiIndex, this method can remove one or more
        levels.

        Parameters
        ----------
        level : int, str, tuple, or list, default None
            Only remove the given levels from the index. Removes all levels by
            default.
        drop : bool, default False
            Do not try to insert index into dataframe columns. This resets
            the index to the default integer index.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        col_level : int or str, default 0
            If the columns have multiple levels, determines which level the
            labels are inserted into. By default it is inserted into the first
            level.
        col_fill : object, default ''
            If the columns have multiple levels, determines how the other
            levels are named. If None then the index name is repeated.
        allow_duplicates : bool, optional, default lib.no_default
            Allow duplicate column labels to be created.

            .. versionadded:: 1.5.0

        names : int, str or 1-dimensional list, default None
            Using the given string, rename the DataFrame column which contains the
            index data. If the DataFrame has a MultiIndex, this has to be a list or
            tuple with length equal to the number of levels.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame or None
            DataFrame with the new index or None if ``inplace=True``.

        See Also
        --------
        DataFrame.set_index : Opposite of reset_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 389.0),
        ...                    ('bird', 24.0),
        ...                    ('mammal', 80.5),
        ...                    ('mammal', np.nan)],
        ...                   index=['falcon', 'parrot', 'lion', 'monkey'],
        ...                   columns=('class', 'max_speed'))
        >>> df
                 class  max_speed
        falcon    bird      389.0
        parrot    bird       24.0
        lion    mammal       80.5
        monkey  mammal        NaN

        When we reset the index, the old index is added as a column, and a
        new sequential index is used:

        >>> df.reset_index()
            index   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        We can use the `drop` parameter to avoid the old index being added as
        a column:

        >>> df.reset_index(drop=True)
            class  max_speed
        0    bird      389.0
        1    bird       24.0
        2  mammal       80.5
        3  mammal        NaN

        You can also use `reset_index` with `MultiIndex`.

        >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
        ...                                    ('bird', 'parrot'),
        ...                                    ('mammal', 'lion'),
        ...                                    ('mammal', 'monkey')],
        ...                                   names=['class', 'name'])
        >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
        ...                                      ('species', 'type')])
        >>> df = pd.DataFrame([(389.0, 'fly'),
        ...                    ( 24.0, 'fly'),
        ...                    ( 80.5, 'run'),
        ...                    (np.nan, 'jump')],
        ...                   index=index,
        ...                   columns=columns)
        >>> df
                       speed species
                         max    type
        class  name
        bird   falcon  389.0     fly
               parrot   24.0     fly
        mammal lion     80.5     run
               monkey    NaN    jump

        Using the `names` parameter, choose a name for the index column:

        >>> df.reset_index(names=['classes', 'names'])
          classes   names  speed species
                             max    type
        0    bird  falcon  389.0     fly
        1    bird  parrot   24.0     fly
        2  mammal    lion   80.5     run
        3  mammal  monkey    NaN    jump

        If the index has multiple levels, we can reset a subset of them:

        >>> df.reset_index(level='class')
                 class  speed species
                          max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        If we are not dropping the index, by default, it is placed in the top
        level. We can place it in another level:

        >>> df.reset_index(level='class', col_level=1)
                        speed species
                 class    max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        When the index is inserted under another level, we can specify under
        which one with the parameter `col_fill`:

        >>> df.reset_index(level='class', col_level=1, col_fill='species')
                      species  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump

        If we specify a nonexistent level for `col_fill`, it is created:

        >>> df.reset_index(level='class', col_level=1, col_fill='genus')
                        genus  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        self._check_inplace_and_allows_duplicate_labels(inplace)
        if inplace:
            new_obj = self
        else:
            new_obj = self.copy(deep=None)
        if allow_duplicates is not lib.no_default:
            allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")

        new_index = default_index(len(new_obj))
        if level is not None:
            # Normalize `level` to a list of level numbers; only build a
            # partially-dropped index when some levels remain.
            if not isinstance(level, (tuple, list)):
                level = [level]
            level = [self.index._get_level_number(lev) for lev in level]
            if len(level) < self.index.nlevels:
                new_index = self.index.droplevel(level)

        if not drop:
            to_insert: Iterable[tuple[Any, Any | None]]

            default = "index" if "index" not in self else "level_0"
            names = self.index._get_default_index_names(names, default)

            if isinstance(self.index, MultiIndex):
                to_insert = zip(self.index.levels, self.index.codes)
            else:
                to_insert = ((self.index, None),)

            multi_col = isinstance(self.columns, MultiIndex)
            # Insert at position 0 in reverse level order so the levels end
            # up in their original order at the front of the frame.
            for i, (lev, lab) in reversed(list(enumerate(to_insert))):
                if level is not None and i not in level:
                    continue
                name = names[i]
                if multi_col:
                    # Pad the inserted column's name up to the full number of
                    # column levels using `col_fill`.
                    col_name = list(name) if isinstance(name, tuple) else [name]
                    if col_fill is None:
                        if len(col_name) not in (1, self.columns.nlevels):
                            raise ValueError(
                                "col_fill=None is incompatible "
                                f"with incomplete column name {name}"
                            )
                        col_fill = col_name[0]

                    lev_num = self.columns._get_level_number(col_level)
                    name_lst = [col_fill] * lev_num + col_name
                    missing = self.columns.nlevels - len(name_lst)
                    name_lst += [col_fill] * missing
                    name = tuple(name_lst)

                # to ndarray and maybe infer different dtype
                level_values = lev._values
                if level_values.dtype == np.object_:
                    level_values = lib.maybe_convert_objects(level_values)

                if lab is not None:
                    # if we have the codes, extract the values with a mask
                    level_values = algorithms.take(
                        level_values, lab, allow_fill=True, fill_value=lev._na_value
                    )

                new_obj.insert(
                    0,
                    name,
                    level_values,
                    allow_duplicates=allow_duplicates,
                )

        new_obj.index = new_index
        if not inplace:
            return new_obj

        return None
6371 # ----------------------------------------------------------------------
6372 # Reindex-based selection methods
6374 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6375 def isna(self) -> DataFrame:
6376 result = self._constructor(self._mgr.isna(func=isna))
6377 return result.__finalize__(self, method="isna")
    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
    def isnull(self) -> DataFrame:
        """
        DataFrame.isnull is an alias for DataFrame.isna.
        """
        # Pure alias; shares the NDFrame.isna docstring template via @doc.
        return self.isna()
    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
    def notna(self) -> DataFrame:
        # Element-wise logical inverse of isna().
        return ~self.isna()
6390 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6391 def notnull(self) -> DataFrame:
6392 """
6393 DataFrame.notnull is an alias for DataFrame.notna.
6394 """
6395 return ~self.isna()
    # Typing overloads: with inplace=False (the default) dropna returns a new
    # DataFrame; with inplace=True it mutates self and returns None.
    @overload
    def dropna(
        self,
        *,
        axis: Axis = ...,
        how: str | NoDefault = ...,
        thresh: int | NoDefault = ...,
        subset: IndexLabel = ...,
        inplace: Literal[False] = ...,
    ) -> DataFrame:
        ...

    @overload
    def dropna(
        self,
        *,
        axis: Axis = ...,
        how: str | NoDefault = ...,
        thresh: int | NoDefault = ...,
        subset: IndexLabel = ...,
        inplace: Literal[True],
    ) -> None:
        ...
6421 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
6422 def dropna(
6423 self,
6424 axis: Axis = 0,
6425 how: str | NoDefault = no_default,
6426 thresh: int | NoDefault = no_default,
6427 subset: IndexLabel = None,
6428 inplace: bool = False,
6429 ) -> DataFrame | None:
6430 """
6431 Remove missing values.
6433 See the :ref:`User Guide <missing_data>` for more on which values are
6434 considered missing, and how to work with missing data.
6436 Parameters
6437 ----------
6438 axis : {0 or 'index', 1 or 'columns'}, default 0
6439 Determine if rows or columns which contain missing values are
6440 removed.
6442 * 0, or 'index' : Drop rows which contain missing values.
6443 * 1, or 'columns' : Drop columns which contain missing value.
6445 .. versionchanged:: 1.0.0
6447 Pass tuple or list to drop on multiple axes.
6448 Only a single axis is allowed.
6450 how : {'any', 'all'}, default 'any'
6451 Determine if row or column is removed from DataFrame, when we have
6452 at least one NA or all NA.
6454 * 'any' : If any NA values are present, drop that row or column.
6455 * 'all' : If all values are NA, drop that row or column.
6457 thresh : int, optional
6458 Require that many non-NA values. Cannot be combined with how.
6459 subset : column label or sequence of labels, optional
6460 Labels along other axis to consider, e.g. if you are dropping rows
6461 these would be a list of columns to include.
6462 inplace : bool, default False
6463 Whether to modify the DataFrame rather than creating a new one.
6465 Returns
6466 -------
6467 DataFrame or None
6468 DataFrame with NA entries dropped from it or None if ``inplace=True``.
6470 See Also
6471 --------
6472 DataFrame.isna: Indicate missing values.
6473 DataFrame.notna : Indicate existing (non-missing) values.
6474 DataFrame.fillna : Replace missing values.
6475 Series.dropna : Drop missing values.
6476 Index.dropna : Drop missing indices.
6478 Examples
6479 --------
6480 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
6481 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
6482 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
6483 ... pd.NaT]})
6484 >>> df
6485 name toy born
6486 0 Alfred NaN NaT
6487 1 Batman Batmobile 1940-04-25
6488 2 Catwoman Bullwhip NaT
6490 Drop the rows where at least one element is missing.
6492 >>> df.dropna()
6493 name toy born
6494 1 Batman Batmobile 1940-04-25
6496 Drop the columns where at least one element is missing.
6498 >>> df.dropna(axis='columns')
6499 name
6500 0 Alfred
6501 1 Batman
6502 2 Catwoman
6504 Drop the rows where all elements are missing.
6506 >>> df.dropna(how='all')
6507 name toy born
6508 0 Alfred NaN NaT
6509 1 Batman Batmobile 1940-04-25
6510 2 Catwoman Bullwhip NaT
6512 Keep only the rows with at least 2 non-NA values.
6514 >>> df.dropna(thresh=2)
6515 name toy born
6516 1 Batman Batmobile 1940-04-25
6517 2 Catwoman Bullwhip NaT
6519 Define in which columns to look for missing values.
6521 >>> df.dropna(subset=['name', 'toy'])
6522 name toy born
6523 1 Batman Batmobile 1940-04-25
6524 2 Catwoman Bullwhip NaT
6526 Keep the DataFrame with valid entries in the same variable.
6528 >>> df.dropna(inplace=True)
6529 >>> df
6530 name toy born
6531 1 Batman Batmobile 1940-04-25
6532 """
6533 if (how is not no_default) and (thresh is not no_default):
6534 raise TypeError(
6535 "You cannot set both the how and thresh arguments at the same time."
6536 )
6538 if how is no_default:
6539 how = "any"
6541 inplace = validate_bool_kwarg(inplace, "inplace")
6542 if isinstance(axis, (tuple, list)):
6543 # GH20987
6544 raise TypeError("supplying multiple axes to axis is no longer supported.")
6546 axis = self._get_axis_number(axis)
6547 agg_axis = 1 - axis
6549 agg_obj = self
6550 if subset is not None:
6551 # subset needs to be list
6552 if not is_list_like(subset):
6553 subset = [subset]
6554 ax = self._get_axis(agg_axis)
6555 indices = ax.get_indexer_for(subset)
6556 check = indices == -1
6557 if check.any():
6558 raise KeyError(np.array(subset)[check].tolist())
6559 agg_obj = self.take(indices, axis=agg_axis)
6561 if thresh is not no_default:
6562 count = agg_obj.count(axis=agg_axis)
6563 mask = count >= thresh
6564 elif how == "any":
6565 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
6566 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
6567 elif how == "all":
6568 # faster equivalent to 'agg_obj.count(agg_axis) > 0'
6569 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
6570 else:
6571 raise ValueError(f"invalid how option: {how}")
6573 if np.all(mask):
6574 result = self.copy()
6575 else:
6576 result = self.loc(axis=axis)[mask]
6578 if not inplace:
6579 return result
6580 self._update_inplace(result)
6581 return None
6583 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"])
6584 def drop_duplicates(
6585 self,
6586 subset: Hashable | Sequence[Hashable] | None = None,
6587 keep: Literal["first", "last", False] = "first",
6588 inplace: bool = False,
6589 ignore_index: bool = False,
6590 ) -> DataFrame | None:
6591 """
6592 Return DataFrame with duplicate rows removed.
6594 Considering certain columns is optional. Indexes, including time indexes
6595 are ignored.
6597 Parameters
6598 ----------
6599 subset : column label or sequence of labels, optional
6600 Only consider certain columns for identifying duplicates, by
6601 default use all of the columns.
6602 keep : {'first', 'last', False}, default 'first'
6603 Determines which duplicates (if any) to keep.
6604 - ``first`` : Drop duplicates except for the first occurrence.
6605 - ``last`` : Drop duplicates except for the last occurrence.
6606 - False : Drop all duplicates.
6607 inplace : bool, default False
6608 Whether to modify the DataFrame rather than creating a new one.
6609 ignore_index : bool, default False
6610 If True, the resulting axis will be labeled 0, 1, …, n - 1.
6612 .. versionadded:: 1.0.0
6614 Returns
6615 -------
6616 DataFrame or None
6617 DataFrame with duplicates removed or None if ``inplace=True``.
6619 See Also
6620 --------
6621 DataFrame.value_counts: Count unique combinations of columns.
6623 Examples
6624 --------
6625 Consider dataset containing ramen rating.
6627 >>> df = pd.DataFrame({
6628 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6629 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6630 ... 'rating': [4, 4, 3.5, 15, 5]
6631 ... })
6632 >>> df
6633 brand style rating
6634 0 Yum Yum cup 4.0
6635 1 Yum Yum cup 4.0
6636 2 Indomie cup 3.5
6637 3 Indomie pack 15.0
6638 4 Indomie pack 5.0
6640 By default, it removes duplicate rows based on all columns.
6642 >>> df.drop_duplicates()
6643 brand style rating
6644 0 Yum Yum cup 4.0
6645 2 Indomie cup 3.5
6646 3 Indomie pack 15.0
6647 4 Indomie pack 5.0
6649 To remove duplicates on specific column(s), use ``subset``.
6651 >>> df.drop_duplicates(subset=['brand'])
6652 brand style rating
6653 0 Yum Yum cup 4.0
6654 2 Indomie cup 3.5
6656 To remove duplicates and keep last occurrences, use ``keep``.
6658 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
6659 brand style rating
6660 1 Yum Yum cup 4.0
6661 2 Indomie cup 3.5
6662 4 Indomie pack 5.0
6663 """
6664 if self.empty:
6665 return self.copy()
6667 inplace = validate_bool_kwarg(inplace, "inplace")
6668 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
6669 duplicated = self.duplicated(subset, keep=keep)
6671 result = self[-duplicated]
6672 if ignore_index:
6673 result.index = default_index(len(result))
6675 if inplace:
6676 self._update_inplace(result)
6677 return None
6678 else:
6679 return result
6681 def duplicated(
6682 self,
6683 subset: Hashable | Sequence[Hashable] | None = None,
6684 keep: Literal["first", "last", False] = "first",
6685 ) -> Series:
6686 """
6687 Return boolean Series denoting duplicate rows.
6689 Considering certain columns is optional.
6691 Parameters
6692 ----------
6693 subset : column label or sequence of labels, optional
6694 Only consider certain columns for identifying duplicates, by
6695 default use all of the columns.
6696 keep : {'first', 'last', False}, default 'first'
6697 Determines which duplicates (if any) to mark.
6699 - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
6700 - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
6701 - False : Mark all duplicates as ``True``.
6703 Returns
6704 -------
6705 Series
6706 Boolean series for each duplicated rows.
6708 See Also
6709 --------
6710 Index.duplicated : Equivalent method on index.
6711 Series.duplicated : Equivalent method on Series.
6712 Series.drop_duplicates : Remove duplicate values from Series.
6713 DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
6715 Examples
6716 --------
6717 Consider dataset containing ramen rating.
6719 >>> df = pd.DataFrame({
6720 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6721 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6722 ... 'rating': [4, 4, 3.5, 15, 5]
6723 ... })
6724 >>> df
6725 brand style rating
6726 0 Yum Yum cup 4.0
6727 1 Yum Yum cup 4.0
6728 2 Indomie cup 3.5
6729 3 Indomie pack 15.0
6730 4 Indomie pack 5.0
6732 By default, for each set of duplicated values, the first occurrence
6733 is set on False and all others on True.
6735 >>> df.duplicated()
6736 0 False
6737 1 True
6738 2 False
6739 3 False
6740 4 False
6741 dtype: bool
6743 By using 'last', the last occurrence of each set of duplicated values
6744 is set on False and all others on True.
6746 >>> df.duplicated(keep='last')
6747 0 True
6748 1 False
6749 2 False
6750 3 False
6751 4 False
6752 dtype: bool
6754 By setting ``keep`` on False, all duplicates are True.
6756 >>> df.duplicated(keep=False)
6757 0 True
6758 1 True
6759 2 False
6760 3 False
6761 4 False
6762 dtype: bool
6764 To find duplicates on specific column(s), use ``subset``.
6766 >>> df.duplicated(subset=['brand'])
6767 0 False
6768 1 True
6769 2 False
6770 3 True
6771 4 True
6772 dtype: bool
6773 """
6775 if self.empty:
6776 return self._constructor_sliced(dtype=bool)
6778 def f(vals) -> tuple[np.ndarray, int]:
6779 labels, shape = algorithms.factorize(vals, size_hint=len(self))
6780 return labels.astype("i8", copy=False), len(shape)
6782 if subset is None:
6783 # https://github.com/pandas-dev/pandas/issues/28770
6784 # Incompatible types in assignment (expression has type "Index", variable
6785 # has type "Sequence[Any]")
6786 subset = self.columns # type: ignore[assignment]
6787 elif (
6788 not np.iterable(subset)
6789 or isinstance(subset, str)
6790 or isinstance(subset, tuple)
6791 and subset in self.columns
6792 ):
6793 subset = (subset,)
6795 # needed for mypy since can't narrow types using np.iterable
6796 subset = cast(Sequence, subset)
6798 # Verify all columns in subset exist in the queried dataframe
6799 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
6800 # key that doesn't exist.
6801 diff = set(subset) - set(self.columns)
6802 if diff:
6803 raise KeyError(Index(diff))
6805 if len(subset) == 1 and self.columns.is_unique:
6806 # GH#45236 This is faster than get_group_index below
6807 result = self[subset[0]].duplicated(keep)
6808 result.name = None
6809 else:
6810 vals = (col.values for name, col in self.items() if name in subset)
6811 labels, shape = map(list, zip(*map(f, vals)))
6813 ids = get_group_index(
6814 labels,
6815 # error: Argument 1 to "tuple" has incompatible type "List[_T]";
6816 # expected "Iterable[int]"
6817 tuple(shape), # type: ignore[arg-type]
6818 sort=False,
6819 xnull=False,
6820 )
6821 result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
6822 return result.__finalize__(self, method="duplicated")
6824 # ----------------------------------------------------------------------
6825 # Sorting
6826 # error: Signature of "sort_values" incompatible with supertype "NDFrame"
    # Typing overloads: inplace=False (default) returns a new DataFrame,
    # inplace=True mutates self and returns None.
    @overload  # type: ignore[override]
    def sort_values(
        self,
        by: IndexLabel,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: Literal[False] = ...,
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool = ...,
        key: ValueKeyFunc = ...,
    ) -> DataFrame:
        ...

    @overload
    def sort_values(
        self,
        by: IndexLabel,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: Literal[True],
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool = ...,
        key: ValueKeyFunc = ...,
    ) -> None:
        ...
    # TODO: Just move the sort_values doc here.
    # error: Signature of "sort_values" incompatible with supertype "NDFrame"
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "by"])
    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.sort_values.__doc__)
    def sort_values(  # type: ignore[override]
        self,
        by: IndexLabel,
        axis: Axis = 0,
        ascending: bool | list[bool] | tuple[bool, ...] = True,
        inplace: bool = False,
        kind: str = "quicksort",
        na_position: str = "last",
        ignore_index: bool = False,
        key: ValueKeyFunc = None,
    ) -> DataFrame | None:
        # Docstring is inherited from NDFrame.sort_values via @Appender.
        inplace = validate_bool_kwarg(inplace, "inplace")
        axis = self._get_axis_number(axis)
        ascending = validate_ascending(ascending)
        # Normalize ``by`` to a list so the length checks below are uniform.
        if not isinstance(by, list):
            by = [by]
        # A list-like ``ascending`` must have one entry per sort key.
        # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
        # expected "Sized"
        if is_sequence(ascending) and (
            len(by) != len(ascending)  # type: ignore[arg-type]
        ):
            # error: Argument 1 to "len" has incompatible type "Union[bool,
            # List[bool]]"; expected "Sized"
            raise ValueError(
                f"Length of ascending ({len(ascending)})"  # type: ignore[arg-type]
                f" != length of by ({len(by)})"
            )
        if len(by) > 1:
            # Multi-key sort: collect each key's values and lexsort them.
            keys = [self._get_label_or_level_values(x, axis=axis) for x in by]

            # need to rewrap columns in Series to apply key function
            if key is not None:
                # error: List comprehension has incompatible type List[Series];
                # expected List[ndarray]
                keys = [
                    Series(k, name=name)  # type: ignore[misc]
                    for (k, name) in zip(keys, by)
                ]

            indexer = lexsort_indexer(
                keys, orders=ascending, na_position=na_position, key=key
            )
        elif len(by):
            # len(by) == 1
            by = by[0]
            k = self._get_label_or_level_values(by, axis=axis)

            # need to rewrap column in Series to apply key function
            if key is not None:
                # error: Incompatible types in assignment (expression has type
                # "Series", variable has type "ndarray")
                k = Series(k, name=by)  # type: ignore[assignment]

            if isinstance(ascending, (tuple, list)):
                ascending = ascending[0]

            indexer = nargsort(
                k, kind=kind, ascending=ascending, na_position=na_position, key=key
            )
        else:
            # Nothing to sort by: return an unsorted copy.
            return self.copy()

        # Reorder the underlying blocks; ``verify=False`` skips re-validating
        # the indexer we just computed.
        new_data = self._mgr.take(
            indexer, axis=self._get_block_manager_axis(axis), verify=False
        )

        if ignore_index:
            new_data.set_axis(
                self._get_block_manager_axis(axis), default_index(len(indexer))
            )

        result = self._constructor(new_data)
        if inplace:
            return self._update_inplace(result)
        else:
            return result.__finalize__(self, method="sort_values")
    # Typing overloads: the return type depends on ``inplace`` —
    # True -> None, False -> DataFrame, plain bool -> DataFrame | None.
    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: Literal[True],
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> None:
        ...

    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: Literal[False] = ...,
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> DataFrame:
        ...

    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: bool = ...,
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> DataFrame | None:
        ...
6989 # error: Signature of "sort_index" incompatible with supertype "NDFrame"
6990 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
6991 def sort_index( # type: ignore[override]
6992 self,
6993 axis: Axis = 0,
6994 level: IndexLabel = None,
6995 ascending: bool | Sequence[bool] = True,
6996 inplace: bool = False,
6997 kind: SortKind = "quicksort",
6998 na_position: NaPosition = "last",
6999 sort_remaining: bool = True,
7000 ignore_index: bool = False,
7001 key: IndexKeyFunc = None,
7002 ) -> DataFrame | None:
7003 """
7004 Sort object by labels (along an axis).
7006 Returns a new DataFrame sorted by label if `inplace` argument is
7007 ``False``, otherwise updates the original DataFrame and returns None.
7009 Parameters
7010 ----------
7011 axis : {0 or 'index', 1 or 'columns'}, default 0
7012 The axis along which to sort. The value 0 identifies the rows,
7013 and 1 identifies the columns.
7014 level : int or level name or list of ints or list of level names
7015 If not None, sort on values in specified index level(s).
7016 ascending : bool or list-like of bools, default True
7017 Sort ascending vs. descending. When the index is a MultiIndex the
7018 sort direction can be controlled for each level individually.
7019 inplace : bool, default False
7020 Whether to modify the DataFrame rather than creating a new one.
7021 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
7022 Choice of sorting algorithm. See also :func:`numpy.sort` for more
7023 information. `mergesort` and `stable` are the only stable algorithms. For
7024 DataFrames, this option is only applied when sorting on a single
7025 column or label.
7026 na_position : {'first', 'last'}, default 'last'
7027 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
7028 Not implemented for MultiIndex.
7029 sort_remaining : bool, default True
7030 If True and sorting by level and index is multilevel, sort by other
7031 levels too (in order) after sorting by specified level.
7032 ignore_index : bool, default False
7033 If True, the resulting axis will be labeled 0, 1, …, n - 1.
7035 .. versionadded:: 1.0.0
7037 key : callable, optional
7038 If not None, apply the key function to the index values
7039 before sorting. This is similar to the `key` argument in the
7040 builtin :meth:`sorted` function, with the notable difference that
7041 this `key` function should be *vectorized*. It should expect an
7042 ``Index`` and return an ``Index`` of the same shape. For MultiIndex
7043 inputs, the key is applied *per level*.
7045 .. versionadded:: 1.1.0
7047 Returns
7048 -------
7049 DataFrame or None
7050 The original DataFrame sorted by the labels or None if ``inplace=True``.
7052 See Also
7053 --------
7054 Series.sort_index : Sort Series by the index.
7055 DataFrame.sort_values : Sort DataFrame by the value.
7056 Series.sort_values : Sort Series by the value.
7058 Examples
7059 --------
7060 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
7061 ... columns=['A'])
7062 >>> df.sort_index()
7063 A
7064 1 4
7065 29 2
7066 100 1
7067 150 5
7068 234 3
7070 By default, it sorts in ascending order, to sort in descending order,
7071 use ``ascending=False``
7073 >>> df.sort_index(ascending=False)
7074 A
7075 234 3
7076 150 5
7077 100 1
7078 29 2
7079 1 4
7081 A key function can be specified which is applied to the index before
7082 sorting. For a ``MultiIndex`` this is applied to each level separately.
7084 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
7085 >>> df.sort_index(key=lambda x: x.str.lower())
7086 a
7087 A 1
7088 b 2
7089 C 3
7090 d 4
7091 """
7092 return super().sort_index(
7093 axis=axis,
7094 level=level,
7095 ascending=ascending,
7096 inplace=inplace,
7097 kind=kind,
7098 na_position=na_position,
7099 sort_remaining=sort_remaining,
7100 ignore_index=ignore_index,
7101 key=key,
7102 )
7104 def value_counts(
7105 self,
7106 subset: Sequence[Hashable] | None = None,
7107 normalize: bool = False,
7108 sort: bool = True,
7109 ascending: bool = False,
7110 dropna: bool = True,
7111 ) -> Series:
7112 """
7113 Return a Series containing counts of unique rows in the DataFrame.
7115 .. versionadded:: 1.1.0
7117 Parameters
7118 ----------
7119 subset : list-like, optional
7120 Columns to use when counting unique combinations.
7121 normalize : bool, default False
7122 Return proportions rather than frequencies.
7123 sort : bool, default True
7124 Sort by frequencies.
7125 ascending : bool, default False
7126 Sort in ascending order.
7127 dropna : bool, default True
7128 Don’t include counts of rows that contain NA values.
7130 .. versionadded:: 1.3.0
7132 Returns
7133 -------
7134 Series
7136 See Also
7137 --------
7138 Series.value_counts: Equivalent method on Series.
7140 Notes
7141 -----
7142 The returned Series will have a MultiIndex with one level per input
7143 column. By default, rows that contain any NA values are omitted from
7144 the result. By default, the resulting Series will be in descending
7145 order so that the first element is the most frequently-occurring row.
7147 Examples
7148 --------
7149 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
7150 ... 'num_wings': [2, 0, 0, 0]},
7151 ... index=['falcon', 'dog', 'cat', 'ant'])
7152 >>> df
7153 num_legs num_wings
7154 falcon 2 2
7155 dog 4 0
7156 cat 4 0
7157 ant 6 0
7159 >>> df.value_counts()
7160 num_legs num_wings
7161 4 0 2
7162 2 2 1
7163 6 0 1
7164 dtype: int64
7166 >>> df.value_counts(sort=False)
7167 num_legs num_wings
7168 2 2 1
7169 4 0 2
7170 6 0 1
7171 dtype: int64
7173 >>> df.value_counts(ascending=True)
7174 num_legs num_wings
7175 2 2 1
7176 6 0 1
7177 4 0 2
7178 dtype: int64
7180 >>> df.value_counts(normalize=True)
7181 num_legs num_wings
7182 4 0 0.50
7183 2 2 0.25
7184 6 0 0.25
7185 dtype: float64
7187 With `dropna` set to `False` we can also count rows with NA values.
7189 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
7190 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
7191 >>> df
7192 first_name middle_name
7193 0 John Smith
7194 1 Anne <NA>
7195 2 John <NA>
7196 3 Beth Louise
7198 >>> df.value_counts()
7199 first_name middle_name
7200 Beth Louise 1
7201 John Smith 1
7202 dtype: int64
7204 >>> df.value_counts(dropna=False)
7205 first_name middle_name
7206 Anne NaN 1
7207 Beth Louise 1
7208 John Smith 1
7209 NaN 1
7210 dtype: int64
7211 """
7212 if subset is None:
7213 subset = self.columns.tolist()
7215 counts = self.groupby(subset, dropna=dropna).grouper.size()
7217 if sort:
7218 counts = counts.sort_values(ascending=ascending)
7219 if normalize:
7220 counts /= counts.sum()
7222 # Force MultiIndex for single column
7223 if len(subset) == 1:
7224 counts.index = MultiIndex.from_arrays(
7225 [counts.index], names=[counts.index.name]
7226 )
7228 return counts
7230 def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
7231 """
7232 Return the first `n` rows ordered by `columns` in descending order.
7234 Return the first `n` rows with the largest values in `columns`, in
7235 descending order. The columns that are not specified are returned as
7236 well, but not used for ordering.
7238 This method is equivalent to
7239 ``df.sort_values(columns, ascending=False).head(n)``, but more
7240 performant.
7242 Parameters
7243 ----------
7244 n : int
7245 Number of rows to return.
7246 columns : label or list of labels
7247 Column label(s) to order by.
7248 keep : {'first', 'last', 'all'}, default 'first'
7249 Where there are duplicate values:
7251 - ``first`` : prioritize the first occurrence(s)
7252 - ``last`` : prioritize the last occurrence(s)
7253 - ``all`` : do not drop any duplicates, even it means
7254 selecting more than `n` items.
7256 Returns
7257 -------
7258 DataFrame
7259 The first `n` rows ordered by the given columns in descending
7260 order.
7262 See Also
7263 --------
7264 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
7265 ascending order.
7266 DataFrame.sort_values : Sort DataFrame by the values.
7267 DataFrame.head : Return the first `n` rows without re-ordering.
7269 Notes
7270 -----
7271 This function cannot be used with all column types. For example, when
7272 specifying columns with `object` or `category` dtypes, ``TypeError`` is
7273 raised.
7275 Examples
7276 --------
7277 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7278 ... 434000, 434000, 337000, 11300,
7279 ... 11300, 11300],
7280 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7281 ... 17036, 182, 38, 311],
7282 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7283 ... "IS", "NR", "TV", "AI"]},
7284 ... index=["Italy", "France", "Malta",
7285 ... "Maldives", "Brunei", "Iceland",
7286 ... "Nauru", "Tuvalu", "Anguilla"])
7287 >>> df
7288 population GDP alpha-2
7289 Italy 59000000 1937894 IT
7290 France 65000000 2583560 FR
7291 Malta 434000 12011 MT
7292 Maldives 434000 4520 MV
7293 Brunei 434000 12128 BN
7294 Iceland 337000 17036 IS
7295 Nauru 11300 182 NR
7296 Tuvalu 11300 38 TV
7297 Anguilla 11300 311 AI
7299 In the following example, we will use ``nlargest`` to select the three
7300 rows having the largest values in column "population".
7302 >>> df.nlargest(3, 'population')
7303 population GDP alpha-2
7304 France 65000000 2583560 FR
7305 Italy 59000000 1937894 IT
7306 Malta 434000 12011 MT
7308 When using ``keep='last'``, ties are resolved in reverse order:
7310 >>> df.nlargest(3, 'population', keep='last')
7311 population GDP alpha-2
7312 France 65000000 2583560 FR
7313 Italy 59000000 1937894 IT
7314 Brunei 434000 12128 BN
7316 When using ``keep='all'``, all duplicate items are maintained:
7318 >>> df.nlargest(3, 'population', keep='all')
7319 population GDP alpha-2
7320 France 65000000 2583560 FR
7321 Italy 59000000 1937894 IT
7322 Malta 434000 12011 MT
7323 Maldives 434000 4520 MV
7324 Brunei 434000 12128 BN
7326 To order by the largest values in column "population" and then "GDP",
7327 we can specify multiple columns like in the next example.
7329 >>> df.nlargest(3, ['population', 'GDP'])
7330 population GDP alpha-2
7331 France 65000000 2583560 FR
7332 Italy 59000000 1937894 IT
7333 Brunei 434000 12128 BN
7334 """
7335 return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
7337 def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
7338 """
7339 Return the first `n` rows ordered by `columns` in ascending order.
7341 Return the first `n` rows with the smallest values in `columns`, in
7342 ascending order. The columns that are not specified are returned as
7343 well, but not used for ordering.
7345 This method is equivalent to
7346 ``df.sort_values(columns, ascending=True).head(n)``, but more
7347 performant.
7349 Parameters
7350 ----------
7351 n : int
7352 Number of items to retrieve.
7353 columns : list or str
7354 Column name or names to order by.
7355 keep : {'first', 'last', 'all'}, default 'first'
7356 Where there are duplicate values:
7358 - ``first`` : take the first occurrence.
7359 - ``last`` : take the last occurrence.
7360 - ``all`` : do not drop any duplicates, even it means
7361 selecting more than `n` items.
7363 Returns
7364 -------
7365 DataFrame
7367 See Also
7368 --------
7369 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
7370 descending order.
7371 DataFrame.sort_values : Sort DataFrame by the values.
7372 DataFrame.head : Return the first `n` rows without re-ordering.
7374 Examples
7375 --------
7376 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7377 ... 434000, 434000, 337000, 337000,
7378 ... 11300, 11300],
7379 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7380 ... 17036, 182, 38, 311],
7381 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7382 ... "IS", "NR", "TV", "AI"]},
7383 ... index=["Italy", "France", "Malta",
7384 ... "Maldives", "Brunei", "Iceland",
7385 ... "Nauru", "Tuvalu", "Anguilla"])
7386 >>> df
7387 population GDP alpha-2
7388 Italy 59000000 1937894 IT
7389 France 65000000 2583560 FR
7390 Malta 434000 12011 MT
7391 Maldives 434000 4520 MV
7392 Brunei 434000 12128 BN
7393 Iceland 337000 17036 IS
7394 Nauru 337000 182 NR
7395 Tuvalu 11300 38 TV
7396 Anguilla 11300 311 AI
7398 In the following example, we will use ``nsmallest`` to select the
7399 three rows having the smallest values in column "population".
7401 >>> df.nsmallest(3, 'population')
7402 population GDP alpha-2
7403 Tuvalu 11300 38 TV
7404 Anguilla 11300 311 AI
7405 Iceland 337000 17036 IS
7407 When using ``keep='last'``, ties are resolved in reverse order:
7409 >>> df.nsmallest(3, 'population', keep='last')
7410 population GDP alpha-2
7411 Anguilla 11300 311 AI
7412 Tuvalu 11300 38 TV
7413 Nauru 337000 182 NR
7415 When using ``keep='all'``, all duplicate items are maintained:
7417 >>> df.nsmallest(3, 'population', keep='all')
7418 population GDP alpha-2
7419 Tuvalu 11300 38 TV
7420 Anguilla 11300 311 AI
7421 Iceland 337000 17036 IS
7422 Nauru 337000 182 NR
7424 To order by the smallest values in column "population" and then "GDP", we can
7425 specify multiple columns like in the next example.
7427 >>> df.nsmallest(3, ['population', 'GDP'])
7428 population GDP alpha-2
7429 Tuvalu 11300 38 TV
7430 Anguilla 11300 311 AI
7431 Nauru 337000 182 NR
7432 """
7433 return algorithms.SelectNFrame(
7434 self, n=n, keep=keep, columns=columns
7435 ).nsmallest()
    @doc(
        Series.swaplevel,
        klass=_shared_doc_kwargs["klass"],
        extra_params=dedent(
            """axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to swap levels on. 0 or 'index' for row-wise, 1 or
            'columns' for column-wise."""
        ),
        examples=dedent(
            """\
        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {"Grade": ["A", "B", "A", "C"]},
        ...     index=[
        ...         ["Final exam", "Final exam", "Coursework", "Coursework"],
        ...         ["History", "Geography", "History", "Geography"],
        ...         ["January", "February", "March", "April"],
        ...     ],
        ... )
        >>> df
                                          Grade
        Final exam  History     January       A
                    Geography   February      B
        Coursework  History     March         A
                    Geography   April         C

        In the following example, we will swap the levels of the indices.
        Here, we will swap the levels column-wise, but levels can be swapped row-wise
        in a similar manner. Note that column-wise is the default behaviour.
        By not supplying any arguments for i and j, we swap the last and second to
        last indices.

        >>> df.swaplevel()
                                          Grade
        Final exam  January     History       A
                    February    Geography     B
        Coursework  March       History       A
                    April       Geography     C

        By supplying one argument, we can choose which index to swap the last
        index with. We can for example swap the first index with the last one as
        follows.

        >>> df.swaplevel(0)
                                          Grade
        January     History     Final exam    A
        February    Geography   Final exam    B
        March       History     Coursework    A
        April       Geography   Coursework    C

        We can also define explicitly which indices we want to swap by supplying values
        for both i and j. Here, we for example swap the first and second indices.

        >>> df.swaplevel(0, 1)
                                          Grade
        History     Final exam  January       A
        Geography   Final exam  February      B
        History     Coursework  March         A
        Geography   Coursework  April         C"""
        ),
    )
    def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
        # Operate on a copy so the calling frame is never mutated.
        result = self.copy()

        axis = self._get_axis_number(axis)

        # Swapping levels only makes sense on a MultiIndex axis.
        if not isinstance(result._get_axis(axis), MultiIndex):  # pragma: no cover
            raise TypeError("Can only swap levels on a hierarchical axis.")

        if axis == 0:
            assert isinstance(result.index, MultiIndex)
            result.index = result.index.swaplevel(i, j)
        else:
            assert isinstance(result.columns, MultiIndex)
            result.columns = result.columns.swaplevel(i, j)
        return result
7515 def reorder_levels(self, order: Sequence[Axis], axis: Axis = 0) -> DataFrame:
7516 """
7517 Rearrange index levels using input order. May not drop or duplicate levels.
7519 Parameters
7520 ----------
7521 order : list of int or list of str
7522 List representing new level order. Reference level by number
7523 (position) or by key (label).
7524 axis : {0 or 'index', 1 or 'columns'}, default 0
7525 Where to reorder levels.
7527 Returns
7528 -------
7529 DataFrame
7531 Examples
7532 --------
7533 >>> data = {
7534 ... "class": ["Mammals", "Mammals", "Reptiles"],
7535 ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
7536 ... "species": ["Humans", "Dogs", "Snakes"],
7537 ... }
7538 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
7539 >>> df = df.set_index(["class", "diet"])
7540 >>> df
7541 species
7542 class diet
7543 Mammals Omnivore Humans
7544 Carnivore Dogs
7545 Reptiles Carnivore Snakes
7547 Let's reorder the levels of the index:
7549 >>> df.reorder_levels(["diet", "class"])
7550 species
7551 diet class
7552 Omnivore Mammals Humans
7553 Carnivore Mammals Dogs
7554 Reptiles Snakes
7555 """
7556 axis = self._get_axis_number(axis)
7557 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
7558 raise TypeError("Can only reorder levels on a hierarchical axis.")
7560 result = self.copy()
7562 if axis == 0:
7563 assert isinstance(result.index, MultiIndex)
7564 result.index = result.index.reorder_levels(order)
7565 else:
7566 assert isinstance(result.columns, MultiIndex)
7567 result.columns = result.columns.reorder_levels(order)
7568 return result
7570 # ----------------------------------------------------------------------
7571 # Arithmetic Methods
7573 def _cmp_method(self, other, op):
7574 axis = 1 # only relevant for Series other case
7576 self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
7578 # See GH#4537 for discussion of scalar op behavior
7579 new_data = self._dispatch_frame_op(other, op, axis=axis)
7580 return self._construct_result(new_data)
7582 def _arith_method(self, other, op):
7583 if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None):
7584 return ops.frame_arith_method_with_reindex(self, other, op)
7586 axis = 1 # only relevant for Series other case
7587 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
7589 self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)
7591 new_data = self._dispatch_frame_op(other, op, axis=axis)
7592 return self._construct_result(new_data)
7594 _logical_method = _arith_method
    def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
        """
        Evaluate the frame operation func(left, right) by evaluating
        column-by-column, dispatching to the Series implementation.

        Parameters
        ----------
        right : scalar, Series, or DataFrame
        func : arithmetic or comparison operator
        axis : {None, 0, 1}

        Returns
        -------
        DataFrame
        """
        # Get the appropriate array-op to apply to each column/block's values.
        array_op = ops.get_array_op(func)

        # Unwrap 0-dim ndarrays to their scalar so the scalar fast path below fires.
        right = lib.item_from_zerodim(right)
        if not is_list_like(right):
            # i.e. scalar, faster than checking np.ndim(right) == 0
            # errstate silences numpy warnings (e.g. divide-by-zero); pandas
            # handles those outcomes via NaN/inf semantics instead.
            with np.errstate(all="ignore"):
                bm = self._mgr.apply(array_op, right=right)
            return self._constructor(bm)

        elif isinstance(right, DataFrame):
            # Caller must have aligned the frames already.
            assert self.index.equals(right.index)
            assert self.columns.equals(right.columns)
            # TODO: The previous assertion `assert right._indexed_same(self)`
            #  fails in cases with empty columns reached via
            #  _frame_arith_method_with_reindex

            # TODO operate_blockwise expects a manager of the same type
            with np.errstate(all="ignore"):
                bm = self._mgr.operate_blockwise(
                    # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
                    # incompatible type "Union[ArrayManager, BlockManager]"; expected
                    # "ArrayManager"
                    # error: Argument 1 to "operate_blockwise" of "BlockManager" has
                    # incompatible type "Union[ArrayManager, BlockManager]"; expected
                    # "BlockManager"
                    right._mgr,  # type: ignore[arg-type]
                    array_op,
                )
            return self._constructor(bm)

        elif isinstance(right, Series) and axis == 1:
            # axis=1 means we want to operate row-by-row
            assert right.index.equals(self.columns)

            right = right._values
            # maybe_align_as_frame ensures we do not have an ndarray here
            assert not isinstance(right, np.ndarray)

            # Pair each column array with the matching element of ``right``.
            with np.errstate(all="ignore"):
                arrays = [
                    array_op(_left, _right)
                    for _left, _right in zip(self._iter_column_arrays(), right)
                ]

        elif isinstance(right, Series):
            assert right.index.equals(self.index)  # Handle other cases later
            right = right._values

            with np.errstate(all="ignore"):
                arrays = [array_op(left, right) for left in self._iter_column_arrays()]

        else:
            # Remaining cases have less-obvious dispatch rules
            raise NotImplementedError(right)

        # Series branches fall through to here: reassemble per-column results.
        return type(self)._from_arrays(
            arrays, self.columns, self.index, verify_integrity=False
        )
7671 def _combine_frame(self, other: DataFrame, func, fill_value=None):
7672 # at this point we have `self._indexed_same(other)`
7674 if fill_value is None:
7675 # since _arith_op may be called in a loop, avoid function call
7676 # overhead if possible by doing this check once
7677 _arith_op = func
7679 else:
7681 def _arith_op(left, right):
7682 # for the mixed_type case where we iterate over columns,
7683 # _arith_op(left, right) is equivalent to
7684 # left._binop(right, func, fill_value=fill_value)
7685 left, right = ops.fill_binop(left, right, fill_value)
7686 return func(left, right)
7688 new_data = self._dispatch_frame_op(other, _arith_op)
7689 return new_data
7691 def _construct_result(self, result) -> DataFrame:
7692 """
7693 Wrap the result of an arithmetic, comparison, or logical operation.
7695 Parameters
7696 ----------
7697 result : DataFrame
7699 Returns
7700 -------
7701 DataFrame
7702 """
7703 out = self._constructor(result, copy=False)
7704 # Pin columns instead of passing to constructor for compat with
7705 # non-unique columns case
7706 out.columns = self.columns
7707 out.index = self.index
7708 return out
7710 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
7711 # Naive implementation, room for optimization
7712 div = self // other
7713 mod = self - div * other
7714 return div, mod
7716 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
7717 # Naive implementation, room for optimization
7718 div = other // self
7719 mod = other - div * self
7720 return div, mod
7722 # ----------------------------------------------------------------------
7723 # Combination-Related
    @doc(
        _shared_docs["compare"],
        """
Returns
-------
DataFrame
    DataFrame that shows the differences stacked side by side.

    The resulting index will be a MultiIndex with 'self' and 'other'
    stacked alternately at the inner level.

Raises
------
ValueError
    When the two DataFrames don't have identical labels or shape.

See Also
--------
Series.compare : Compare with another Series and show differences.
DataFrame.equals : Test whether two objects contain the same elements.

Notes
-----
Matching NaNs will not appear as a difference.

Can only compare identically-labeled
(i.e. same shape, identical row and column labels) DataFrames

Examples
--------
>>> df = pd.DataFrame(
...     {{
...         "col1": ["a", "a", "b", "b", "a"],
...         "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
...     }},
...     columns=["col1", "col2", "col3"],
... )
>>> df
  col1  col2  col3
0    a   1.0   1.0
1    a   2.0   2.0
2    b   3.0   3.0
3    b   NaN   4.0
4    a   5.0   5.0

>>> df2 = df.copy()
>>> df2.loc[0, 'col1'] = 'c'
>>> df2.loc[2, 'col3'] = 4.0
>>> df2
  col1  col2  col3
0    c   1.0   1.0
1    a   2.0   2.0
2    b   3.0   4.0
3    b   NaN   4.0
4    a   5.0   5.0

Align the differences on columns

>>> df.compare(df2)
  col1       col3
  self other self other
0    a     c  NaN   NaN
2  NaN   NaN  3.0   4.0

Assign result_names

>>> df.compare(df2, result_names=("left", "right"))
  col1        col3
  left right  left right
0    a     c   NaN   NaN
2  NaN   NaN   3.0   4.0

Stack the differences on rows

>>> df.compare(df2, align_axis=0)
        col1  col3
0 self     a   NaN
  other    c   NaN
2 self   NaN   3.0
  other  NaN   4.0

Keep the equal values

>>> df.compare(df2, keep_equal=True)
  col1       col3
  self other self other
0    a     c  1.0   1.0
2    b     b  3.0   4.0

Keep all original rows and columns

>>> df.compare(df2, keep_shape=True)
  col1       col2       col3
  self other self other self other
0    a     c  NaN   NaN  NaN   NaN
1  NaN   NaN  NaN   NaN  NaN   NaN
2  NaN   NaN  NaN   NaN  3.0   4.0
3  NaN   NaN  NaN   NaN  NaN   NaN
4  NaN   NaN  NaN   NaN  NaN   NaN

Keep all original rows and columns and also all original values

>>> df.compare(df2, keep_shape=True, keep_equal=True)
  col1       col2       col3
  self other self other self other
0    a     c  1.0   1.0  1.0   1.0
1    a     a  2.0   2.0  2.0   2.0
2    b     b  3.0   3.0  3.0   4.0
3    b     b  NaN   NaN  4.0   4.0
4    a     a  5.0   5.0  5.0   5.0
""",
        klass=_shared_doc_kwargs["klass"],
    )
    def compare(
        self,
        other: DataFrame,
        align_axis: Axis = 1,
        keep_shape: bool = False,
        keep_equal: bool = False,
        result_names: Suffixes = ("self", "other"),
    ) -> DataFrame:
        # Thin wrapper: the shared implementation lives on the NDFrame base class.
        return super().compare(
            other=other,
            align_axis=align_axis,
            keep_shape=keep_shape,
            keep_equal=keep_equal,
            result_names=result_names,
        )
    def combine(
        self,
        other: DataFrame,
        func: Callable[[Series, Series], Series | Hashable],
        fill_value=None,
        overwrite: bool = True,
    ) -> DataFrame:
        """
        Perform column-wise combine with another DataFrame.

        Combines a DataFrame with `other` DataFrame using `func`
        to element-wise combine columns. The row and column indexes of the
        resulting DataFrame will be the union of the two.

        Parameters
        ----------
        other : DataFrame
            The DataFrame to merge column-wise.
        func : function
            Function that takes two series as inputs and return a Series or a
            scalar. Used to merge the two dataframes column by columns.
        fill_value : scalar value, default None
            The value to fill NaNs with prior to passing any column to the
            merge func.
        overwrite : bool, default True
            If True, columns in `self` that do not exist in `other` will be
            overwritten with NaNs.

        Returns
        -------
        DataFrame
            Combination of the provided DataFrames.

        See Also
        --------
        DataFrame.combine_first : Combine two DataFrame objects and default to
            non-null values in frame calling the method.

        Examples
        --------
        Combine using a simple function that chooses the smaller column.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
        >>> df1.combine(df2, take_smaller)
           A  B
        0  0  3
        1  0  3

        Example using a true element-wise combine function.

        >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine(df2, np.minimum)
           A  B
        0  1  2
        1  0  3

        Using `fill_value` fills Nones prior to passing the column to the
        merge function.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine(df2, take_smaller, fill_value=-5)
           A    B
        0  0 -5.0
        1  0  4.0

        However, if the same element in both dataframes is None, that None
        is preserved

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
        >>> df1.combine(df2, take_smaller, fill_value=-5)
           A    B
        0  0 -5.0
        1  0  3.0

        Example that demonstrates the use of `overwrite` and behavior when
        the axis differ between the dataframes.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
        >>> df1.combine(df2, take_smaller)
             A    B     C
        0  NaN  NaN   NaN
        1  NaN  3.0 -10.0
        2  NaN  3.0   1.0

        >>> df1.combine(df2, take_smaller, overwrite=False)
             A    B     C
        0  0.0  NaN   NaN
        1  0.0  3.0 -10.0
        2  NaN  3.0   1.0

        Demonstrating the preference of the passed in dataframe.

        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
        >>> df2.combine(df1, take_smaller)
             A    B   C
        0  0.0  NaN NaN
        1  0.0  3.0 NaN
        2  NaN  3.0 NaN

        >>> df2.combine(df1, take_smaller, overwrite=False)
             A    B    C
        0  0.0  NaN  NaN
        1  0.0  3.0  1.0
        2  NaN  3.0  1.0
        """
        other_idxlen = len(other.index)  # save for compare

        # Align both frames to the union of their row labels first.
        this, other = self.align(other, copy=False)
        new_index = this.index

        # Short-circuit: if either side contributes nothing, return a copy
        # of the other side unchanged.
        if other.empty and len(new_index) == len(self.index):
            return self.copy()

        if self.empty and len(other) == other_idxlen:
            return other.copy()

        # sorts if possible
        new_columns = this.columns.union(other.columns)
        do_fill = fill_value is not None
        result = {}
        for col in new_columns:
            series = this[col]
            otherSeries = other[col]

            this_dtype = series.dtype
            other_dtype = otherSeries.dtype

            this_mask = isna(series)
            other_mask = isna(otherSeries)

            # don't overwrite columns unnecessarily
            # DO propagate if this column is not in the intersection
            if not overwrite and other_mask.all():
                result[col] = this[col].copy()
                continue

            if do_fill:
                # Copy before filling so the aligned inputs are not mutated.
                series = series.copy()
                otherSeries = otherSeries.copy()
                series[this_mask] = fill_value
                otherSeries[other_mask] = fill_value

            if col not in self.columns:
                # If self DataFrame does not have col in other DataFrame,
                # try to promote series, which is all NaN, as other_dtype.
                new_dtype = other_dtype
                try:
                    series = series.astype(new_dtype, copy=False)
                except ValueError:
                    # e.g. new_dtype is integer types
                    pass
            else:
                # if we have different dtypes, possibly promote
                new_dtype = find_common_type([this_dtype, other_dtype])
                series = series.astype(new_dtype, copy=False)
                otherSeries = otherSeries.astype(new_dtype, copy=False)

            arr = func(series, otherSeries)
            if isinstance(new_dtype, np.dtype):
                # if new_dtype is an EA Dtype, then `func` is expected to return
                # the correct dtype without any additional casting
                # error: No overload variant of "maybe_downcast_to_dtype" matches
                # argument types "Union[Series, Hashable]", "dtype[Any]"
                arr = maybe_downcast_to_dtype(  # type: ignore[call-overload]
                    arr, new_dtype
                )

            result[col] = arr

        # convert_objects just in case
        return self._constructor(result, index=new_index, columns=new_columns)
    def combine_first(self, other: DataFrame) -> DataFrame:
        """
        Update null elements with value in the same location in `other`.

        Combine two DataFrame objects by filling null values in one DataFrame
        with non-null values from other DataFrame. The row and column indexes
        of the resulting DataFrame will be the union of the two. The resulting
        dataframe contains the 'first' dataframe values and overrides the
        second one values where both first.loc[index, col] and
        second.loc[index, col] are not missing values, upon calling
        first.combine_first(second).

        Parameters
        ----------
        other : DataFrame
            Provided DataFrame to use to fill null values.

        Returns
        -------
        DataFrame
            The result of combining the provided DataFrame with the other object.

        See Also
        --------
        DataFrame.combine : Perform series-wise operation on two DataFrames
            using a given function.

        Examples
        --------
        >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine_first(df2)
             A    B
        0  1.0  3.0
        1  0.0  4.0

        Null values still persist if the location of that null value
        does not exist in `other`

        >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
        >>> df1.combine_first(df2)
             A    B    C
        0  NaN  4.0  NaN
        1  0.0  3.0  1.0
        2  NaN  3.0  1.0
        """
        # Imported locally — presumably to avoid an import cycle; confirm
        # before moving to module level.
        import pandas.core.computation.expressions as expressions

        def combiner(x, y):
            # Mask of positions in `self`'s column that need filling.
            mask = extract_array(isna(x))

            x_values = extract_array(x, extract_numpy=True)
            y_values = extract_array(y, extract_numpy=True)

            # If the column y in other DataFrame is not in first DataFrame,
            # just return y_values.
            if y.name not in self.columns:
                return y_values

            return expressions.where(mask, y_values, x_values)

        combined = self.combine(other, combiner, overwrite=False)

        # Restore the narrowest common dtype for shared columns where
        # combine() widened them (e.g. int -> float via NaN alignment).
        dtypes = {
            col: find_common_type([self.dtypes[col], other.dtypes[col]])
            for col in self.columns.intersection(other.columns)
            if not is_dtype_equal(combined.dtypes[col], self.dtypes[col])
        }

        if dtypes:
            combined = combined.astype(dtypes)

        return combined
    def update(
        self,
        other,
        join: str = "left",
        overwrite: bool = True,
        filter_func=None,
        errors: str = "ignore",
    ) -> None:
        """
        Modify in place using non-NA values from another DataFrame.

        Aligns on indices. There is no return value.

        Parameters
        ----------
        other : DataFrame, or object coercible into a DataFrame
            Should have at least one matching index/column label
            with the original DataFrame. If a Series is passed,
            its name attribute must be set, and that will be
            used as the column name to align with the original DataFrame.
        join : {'left'}, default 'left'
            Only left join is implemented, keeping the index and columns of the
            original object.
        overwrite : bool, default True
            How to handle non-NA values for overlapping keys:

            * True: overwrite original DataFrame's values
              with values from `other`.
            * False: only update values that are NA in
              the original DataFrame.

        filter_func : callable(1d-array) -> bool 1d-array, optional
            Can choose to replace values other than NA. Return True for values
            that should be updated.
        errors : {'raise', 'ignore'}, default 'ignore'
            If 'raise', will raise a ValueError if the DataFrame and `other`
            both contain non-NA data in the same place.

        Returns
        -------
        None : method directly changes calling object

        Raises
        ------
        ValueError
            * When `errors='raise'` and there's overlapping non-NA data.
            * When `errors` is not either `'ignore'` or `'raise'`
        NotImplementedError
            * If `join != 'left'`

        See Also
        --------
        dict.update : Similar method for dictionaries.
        DataFrame.merge : For column(s)-on-column(s) operations.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2, 3],
        ...                    'B': [400, 500, 600]})
        >>> new_df = pd.DataFrame({'B': [4, 5, 6],
        ...                        'C': [7, 8, 9]})
        >>> df.update(new_df)
        >>> df
           A  B
        0  1  4
        1  2  5
        2  3  6

        The DataFrame's length does not increase as a result of the update,
        only values at matching index/column labels are updated.

        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
        >>> df.update(new_df)
        >>> df
           A  B
        0  a  d
        1  b  e
        2  c  f

        For Series, its name attribute must be set.

        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
        >>> df.update(new_column)
        >>> df
           A  B
        0  a  d
        1  b  y
        2  c  e
        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
        >>> df.update(new_df)
        >>> df
           A  B
        0  a  x
        1  b  d
        2  c  e

        If `other` contains NaNs the corresponding values are not updated
        in the original dataframe.

        >>> df = pd.DataFrame({'A': [1, 2, 3],
        ...                    'B': [400, 500, 600]})
        >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
        >>> df.update(new_df)
        >>> df
           A      B
        0  1    4.0
        1  2  500.0
        2  3    6.0
        """
        import pandas.core.computation.expressions as expressions

        # TODO: Support other joins
        if join != "left":  # pragma: no cover
            raise NotImplementedError("Only left join is supported")
        if errors not in ["ignore", "raise"]:
            raise ValueError("The parameter errors must be either 'ignore' or 'raise'")

        if not isinstance(other, DataFrame):
            other = DataFrame(other)

        # Align `other` onto self's labels; unmatched positions become NA and
        # are therefore excluded from the update by the masks below.
        other = other.reindex_like(self)

        for col in self.columns:
            this = self[col]._values
            that = other[col]._values
            if filter_func is not None:
                with np.errstate(all="ignore"):
                    # Keep (i.e. do NOT update) where filter says no or other is NA.
                    mask = ~filter_func(this) | isna(that)
            else:
                if errors == "raise":
                    mask_this = notna(that)
                    mask_that = notna(this)
                    if any(mask_this & mask_that):
                        raise ValueError("Data overlaps.")

                if overwrite:
                    mask = isna(that)
                else:
                    mask = notna(this)

            # don't overwrite columns unnecessarily
            if mask.all():
                continue

            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", "In a future version, `df.iloc")
                # True in ``mask`` keeps the existing value; False takes `other`'s.
                self.loc[:, col] = expressions.where(mask, this, that)
8262 # ----------------------------------------------------------------------
8263 # Data reshaping
    @Appender(
        """
Examples
--------
>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
...                               'Parrot', 'Parrot'],
...                    'Max Speed': [380., 370., 24., 26.]})
>>> df
   Animal  Max Speed
0  Falcon      380.0
1  Falcon      370.0
2  Parrot       24.0
3  Parrot       26.0
>>> df.groupby(['Animal']).mean()
        Max Speed
Animal
Falcon      375.0
Parrot       25.0

**Hierarchical Indexes**

We can groupby different levels of a hierarchical index
using the `level` parameter:

>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
...           ['Captive', 'Wild', 'Captive', 'Wild']]
>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
...                   index=index)
>>> df
                Max Speed
Animal Type
Falcon Captive      390.0
       Wild         350.0
Parrot Captive       30.0
       Wild          20.0
>>> df.groupby(level=0).mean()
        Max Speed
Animal
Falcon      370.0
Parrot       25.0
>>> df.groupby(level="Type").mean()
         Max Speed
Type
Captive      210.0
Wild         185.0

We can also choose to include NA in group keys or not by setting
`dropna` parameter, the default setting is `True`.

>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])

>>> df.groupby(by=["b"]).sum()
    a   c
b
1.0 2   3
2.0 2   5

>>> df.groupby(by=["b"], dropna=False).sum()
    a   c
b
1.0 2   3
2.0 2   5
NaN 1   4

>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])

>>> df.groupby(by="a").sum()
    b     c
a
a   13.0   13.0
b   12.3  123.0

>>> df.groupby(by="a", dropna=False).sum()
    b     c
a
a   13.0   13.0
b   12.3  123.0
NaN 12.3   33.0

When using ``.apply()``, use ``group_keys`` to include or exclude the group keys.
The ``group_keys`` argument defaults to ``True`` (include).

>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
...                               'Parrot', 'Parrot'],
...                    'Max Speed': [380., 370., 24., 26.]})
>>> df.groupby("Animal", group_keys=True).apply(lambda x: x)
          Animal  Max Speed
Animal
Falcon 0  Falcon      380.0
       1  Falcon      370.0
Parrot 2  Parrot       24.0
       3  Parrot       26.0

>>> df.groupby("Animal", group_keys=False).apply(lambda x: x)
   Animal  Max Speed
0  Falcon      380.0
1  Falcon      370.0
2  Parrot       24.0
3  Parrot       26.0
"""
    )
    @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
    def groupby(
        self,
        by=None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool | lib.NoDefault = no_default,
        squeeze: bool | lib.NoDefault = no_default,
        observed: bool = False,
        dropna: bool = True,
    ) -> DataFrameGroupBy:
        # Imported locally — presumably to avoid a circular import; confirm
        # before moving to module level.
        from pandas.core.groupby.generic import DataFrameGroupBy

        # `squeeze` is deprecated: warn only when the caller passed it
        # explicitly (the sentinel `no_default` means "not supplied").
        if squeeze is not no_default:
            warnings.warn(
                (
                    "The `squeeze` parameter is deprecated and "
                    "will be removed in a future version."
                ),
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        else:
            squeeze = False

        if level is None and by is None:
            raise TypeError("You have to supply one of 'by' and 'level'")
        axis = self._get_axis_number(axis)

        return DataFrameGroupBy(
            obj=self,
            keys=by,
            axis=axis,
            level=level,
            as_index=as_index,
            sort=sort,
            group_keys=group_keys,
            squeeze=squeeze,
            observed=observed,
            dropna=dropna,
        )
    # Shared docstring template; also consumed by pandas.pivot via @Appender.
    _shared_docs[
        "pivot"
    ] = """
        Return reshaped DataFrame organized by given index / column values.

        Reshape data (produce a "pivot" table) based on column values. Uses
        unique values from specified `index` / `columns` to form axes of the
        resulting DataFrame. This function does not support data
        aggregation, multiple values will result in a MultiIndex in the
        columns. See the :ref:`User Guide <reshaping>` for more on reshaping.

        Parameters
        ----------%s
        index : str or object or a list of str, optional
            Column to use to make new frame's index. If None, uses
            existing index.

            .. versionchanged:: 1.1.0
               Also accept list of index names.

        columns : str or object or a list of str
            Column to use to make new frame's columns.

            .. versionchanged:: 1.1.0
               Also accept list of columns names.

        values : str, object or a list of the previous, optional
            Column(s) to use for populating new frame's values. If not
            specified, all remaining columns will be used and the result will
            have hierarchically indexed columns.

        Returns
        -------
        DataFrame
            Returns reshaped DataFrame.

        Raises
        ------
        ValueError:
            When there are any `index`, `columns` combinations with multiple
            values. `DataFrame.pivot_table` when you need to aggregate.

        See Also
        --------
        DataFrame.pivot_table : Generalization of pivot that can handle
            duplicate values for one index/column pair.
        DataFrame.unstack : Pivot based on the index values instead of a
            column.
        wide_to_long : Wide panel to long format. Less flexible but more
            user-friendly than melt.

        Notes
        -----
        For finer-tuned control, see hierarchical indexing documentation along
        with the related stack/unstack methods.

        Reference :ref:`the user guide <reshaping.pivot>` for more examples.

        Examples
        --------
        >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
        ...                            'two'],
        ...                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
        ...                    'baz': [1, 2, 3, 4, 5, 6],
        ...                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        >>> df
            foo   bar  baz  zoo
        0   one   A    1    x
        1   one   B    2    y
        2   one   C    3    z
        3   two   A    4    q
        4   two   B    5    w
        5   two   C    6    t

        >>> df.pivot(index='foo', columns='bar', values='baz')
        bar  A   B   C
        foo
        one  1   2   3
        two  4   5   6

        >>> df.pivot(index='foo', columns='bar')['baz']
        bar  A   B   C
        foo
        one  1   2   3
        two  4   5   6

        >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
              baz       zoo
        bar   A  B  C   A  B  C
        foo
        one   1  2  3   x  y  z
        two   4  5  6   q  w  t

        You could also assign a list of column names or a list of index names.

        >>> df = pd.DataFrame({
        ...        "lev1": [1, 1, 1, 2, 2, 2],
        ...        "lev2": [1, 1, 2, 1, 1, 2],
        ...        "lev3": [1, 2, 1, 2, 1, 2],
        ...        "lev4": [1, 2, 3, 4, 5, 6],
        ...        "values": [0, 1, 2, 3, 4, 5]})
        >>> df
            lev1 lev2 lev3 lev4 values
        0   1    1    1    1    0
        1   1    1    2    2    1
        2   1    2    1    3    2
        3   2    1    2    4    3
        4   2    1    1    5    4
        5   2    2    2    6    5

        >>> df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")
        lev2    1         2
        lev3    1    2    1    2
        lev1
        1     0.0  1.0  2.0  NaN
        2     4.0  3.0  NaN  5.0

        >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")
              lev3    1    2
        lev1  lev2
           1     1  0.0  1.0
                 2  2.0  NaN
           2     1  4.0  3.0
                 2  NaN  5.0

        A ValueError is raised if there are any duplicates.

        >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
        ...                    "bar": ['A', 'A', 'B', 'C'],
        ...                    "baz": [1, 2, 3, 4]})
        >>> df
           foo bar  baz
        0  one   A    1
        1  one   A    2
        2  two   B    3
        3  two   C    4

        Notice that the first two rows are the same for our `index`
        and `columns` arguments.

        >>> df.pivot(index='foo', columns='bar', values='baz')
        Traceback (most recent call last):
           ...
        ValueError: Index contains duplicate entries, cannot reshape
        """

    @Substitution("")
    @Appender(_shared_docs["pivot"])
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
    def pivot(self, index=None, columns=None, values=None) -> DataFrame:
        # Imported locally — presumably to avoid a circular import; confirm
        # before moving to module level.
        from pandas.core.reshape.pivot import pivot

        # Thin wrapper: the implementation lives in pandas.core.reshape.pivot.
        return pivot(self, index=index, columns=columns, values=values)
    # Shared docstring template for DataFrame.pivot_table and the top-level
    # pandas.pivot_table function; the "%s" under Parameters is filled in by
    # @Substitution at decoration time.
    _shared_docs[
        "pivot_table"
    ] = """
    Create a spreadsheet-style pivot table as a DataFrame.

    The levels in the pivot table will be stored in MultiIndex objects
    (hierarchical indexes) on the index and columns of the result DataFrame.

    Parameters
    ----------%s
    values : column to aggregate, optional
    index : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any of the other types (except list).
        Keys to group by on the pivot table index. If an array is passed,
        it is being used as the same manner as column values.
    columns : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any of the other types (except list).
        Keys to group by on the pivot table column. If an array is passed,
        it is being used as the same manner as column values.
    aggfunc : function, list of functions, dict, default numpy.mean
        If list of functions passed, the resulting pivot table will have
        hierarchical columns whose top level are the function names
        (inferred from the function objects themselves)
        If dict is passed, the key is column to aggregate and value
        is function or list of functions.
    fill_value : scalar, default None
        Value to replace missing values with (in the resulting pivot table,
        after aggregation).
    margins : bool, default False
        Add all row / columns (e.g. for subtotal / grand totals).
    dropna : bool, default True
        Do not include columns whose entries are all NaN. If True,
        rows with a NaN value in any column will be omitted before
        computing margins.
    margins_name : str, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.
    observed : bool, default False
        This only applies if any of the groupers are Categoricals.
        If True: only show observed values for categorical groupers.
        If False: show all values for categorical groupers.

        .. versionchanged:: 0.25.0

    sort : bool, default True
        Specifies if the result should be sorted.

        .. versionadded:: 1.3.0

    Returns
    -------
    DataFrame
        An Excel style pivot table.

    See Also
    --------
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.melt: Unpivot a DataFrame from wide to long format,
        optionally leaving identifiers set.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.pivot>` for more examples.

    Examples
    --------
    >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
    ...                          "bar", "bar", "bar", "bar"],
    ...                    "B": ["one", "one", "one", "two", "two",
    ...                          "one", "one", "two", "two"],
    ...                    "C": ["small", "large", "large", "small",
    ...                          "small", "large", "small", "small",
    ...                          "large"],
    ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    ...                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
    >>> df
         A    B      C  D  E
    0  foo  one  small  1  2
    1  foo  one  large  2  4
    2  foo  one  large  2  5
    3  foo  two  small  3  5
    4  foo  two  small  3  6
    5  bar  one  large  4  6
    6  bar  one  small  5  8
    7  bar  two  small  6  9
    8  bar  two  large  7  9

    This first example aggregates values by taking the sum.

    >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum)
    >>> table
    C        large  small
    A   B
    bar one    4.0    5.0
        two    7.0    6.0
    foo one    4.0    1.0
        two    NaN    6.0

    We can also fill missing values using the `fill_value` parameter.

    >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum, fill_value=0)
    >>> table
    C        large  small
    A   B
    bar one      4      5
        two      7      6
    foo one      4      1
        two      0      6

    The next example aggregates by taking the mean across multiple columns.

    >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
    ...                        aggfunc={'D': np.mean,
    ...                                 'E': np.mean})
    >>> table
                    D         E
    A   C
    bar large  5.500000  7.500000
        small  5.500000  8.500000
    foo large  2.000000  4.500000
        small  2.333333  4.333333

    We can also calculate multiple types of aggregations for any given
    value column.

    >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
    ...                        aggfunc={'D': np.mean,
    ...                                 'E': [min, max, np.mean]})
    >>> table
                      D   E
                   mean max      mean  min
    A   C
    bar large  5.500000   9  7.500000    6
        small  5.500000   9  8.500000    8
    foo large  2.000000   5  4.500000    4
        small  2.333333   6  4.333333    2
    """
8711 @Substitution("")
8712 @Appender(_shared_docs["pivot_table"])
8713 def pivot_table(
8714 self,
8715 values=None,
8716 index=None,
8717 columns=None,
8718 aggfunc="mean",
8719 fill_value=None,
8720 margins=False,
8721 dropna=True,
8722 margins_name="All",
8723 observed=False,
8724 sort=True,
8725 ) -> DataFrame:
8726 from pandas.core.reshape.pivot import pivot_table
8728 return pivot_table(
8729 self,
8730 values=values,
8731 index=index,
8732 columns=columns,
8733 aggfunc=aggfunc,
8734 fill_value=fill_value,
8735 margins=margins,
8736 dropna=dropna,
8737 margins_name=margins_name,
8738 observed=observed,
8739 sort=sort,
8740 )
8742 def stack(self, level: Level = -1, dropna: bool = True):
8743 """
8744 Stack the prescribed level(s) from columns to index.
8746 Return a reshaped DataFrame or Series having a multi-level
8747 index with one or more new inner-most levels compared to the current
8748 DataFrame. The new inner-most levels are created by pivoting the
8749 columns of the current dataframe:
8751 - if the columns have a single level, the output is a Series;
8752 - if the columns have multiple levels, the new index
8753 level(s) is (are) taken from the prescribed level(s) and
8754 the output is a DataFrame.
8756 Parameters
8757 ----------
8758 level : int, str, list, default -1
8759 Level(s) to stack from the column axis onto the index
8760 axis, defined as one index or label, or a list of indices
8761 or labels.
8762 dropna : bool, default True
8763 Whether to drop rows in the resulting Frame/Series with
8764 missing values. Stacking a column level onto the index
8765 axis can create combinations of index and column values
8766 that are missing from the original dataframe. See Examples
8767 section.
8769 Returns
8770 -------
8771 DataFrame or Series
8772 Stacked dataframe or series.
8774 See Also
8775 --------
8776 DataFrame.unstack : Unstack prescribed level(s) from index axis
8777 onto column axis.
8778 DataFrame.pivot : Reshape dataframe from long format to wide
8779 format.
8780 DataFrame.pivot_table : Create a spreadsheet-style pivot table
8781 as a DataFrame.
8783 Notes
8784 -----
8785 The function is named by analogy with a collection of books
8786 being reorganized from being side by side on a horizontal
8787 position (the columns of the dataframe) to being stacked
8788 vertically on top of each other (in the index of the
8789 dataframe).
8791 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
8793 Examples
8794 --------
8795 **Single level columns**
8797 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
8798 ... index=['cat', 'dog'],
8799 ... columns=['weight', 'height'])
8801 Stacking a dataframe with a single level column axis returns a Series:
8803 >>> df_single_level_cols
8804 weight height
8805 cat 0 1
8806 dog 2 3
8807 >>> df_single_level_cols.stack()
8808 cat weight 0
8809 height 1
8810 dog weight 2
8811 height 3
8812 dtype: int64
8814 **Multi level columns: simple case**
8816 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
8817 ... ('weight', 'pounds')])
8818 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
8819 ... index=['cat', 'dog'],
8820 ... columns=multicol1)
8822 Stacking a dataframe with a multi-level column axis:
8824 >>> df_multi_level_cols1
8825 weight
8826 kg pounds
8827 cat 1 2
8828 dog 2 4
8829 >>> df_multi_level_cols1.stack()
8830 weight
8831 cat kg 1
8832 pounds 2
8833 dog kg 2
8834 pounds 4
8836 **Missing values**
8838 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
8839 ... ('height', 'm')])
8840 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
8841 ... index=['cat', 'dog'],
8842 ... columns=multicol2)
8844 It is common to have missing values when stacking a dataframe
8845 with multi-level columns, as the stacked dataframe typically
8846 has more values than the original dataframe. Missing values
8847 are filled with NaNs:
8849 >>> df_multi_level_cols2
8850 weight height
8851 kg m
8852 cat 1.0 2.0
8853 dog 3.0 4.0
8854 >>> df_multi_level_cols2.stack()
8855 height weight
8856 cat kg NaN 1.0
8857 m 2.0 NaN
8858 dog kg NaN 3.0
8859 m 4.0 NaN
8861 **Prescribing the level(s) to be stacked**
8863 The first parameter controls which level or levels are stacked:
8865 >>> df_multi_level_cols2.stack(0)
8866 kg m
8867 cat height NaN 2.0
8868 weight 1.0 NaN
8869 dog height NaN 4.0
8870 weight 3.0 NaN
8871 >>> df_multi_level_cols2.stack([0, 1])
8872 cat height m 2.0
8873 weight kg 1.0
8874 dog height m 4.0
8875 weight kg 3.0
8876 dtype: float64
8878 **Dropping missing values**
8880 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
8881 ... index=['cat', 'dog'],
8882 ... columns=multicol2)
8884 Note that rows where all values are missing are dropped by
8885 default but this behaviour can be controlled via the dropna
8886 keyword parameter:
8888 >>> df_multi_level_cols3
8889 weight height
8890 kg m
8891 cat NaN 1.0
8892 dog 2.0 3.0
8893 >>> df_multi_level_cols3.stack(dropna=False)
8894 height weight
8895 cat kg NaN NaN
8896 m 1.0 NaN
8897 dog kg NaN 2.0
8898 m 3.0 NaN
8899 >>> df_multi_level_cols3.stack(dropna=True)
8900 height weight
8901 cat m 1.0 NaN
8902 dog kg NaN 2.0
8903 m 3.0 NaN
8904 """
8905 from pandas.core.reshape.reshape import (
8906 stack,
8907 stack_multiple,
8908 )
8910 if isinstance(level, (tuple, list)):
8911 result = stack_multiple(self, level, dropna=dropna)
8912 else:
8913 result = stack(self, level, dropna=dropna)
8915 return result.__finalize__(self, method="stack")
8917 def explode(
8918 self,
8919 column: IndexLabel,
8920 ignore_index: bool = False,
8921 ) -> DataFrame:
8922 """
8923 Transform each element of a list-like to a row, replicating index values.
8925 .. versionadded:: 0.25.0
8927 Parameters
8928 ----------
8929 column : IndexLabel
8930 Column(s) to explode.
8931 For multiple columns, specify a non-empty list with each element
8932 be str or tuple, and all specified columns their list-like data
8933 on same row of the frame must have matching length.
8935 .. versionadded:: 1.3.0
8936 Multi-column explode
8938 ignore_index : bool, default False
8939 If True, the resulting index will be labeled 0, 1, …, n - 1.
8941 .. versionadded:: 1.1.0
8943 Returns
8944 -------
8945 DataFrame
8946 Exploded lists to rows of the subset columns;
8947 index will be duplicated for these rows.
8949 Raises
8950 ------
8951 ValueError :
8952 * If columns of the frame are not unique.
8953 * If specified columns to explode is empty list.
8954 * If specified columns to explode have not matching count of
8955 elements rowwise in the frame.
8957 See Also
8958 --------
8959 DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
8960 index labels.
8961 DataFrame.melt : Unpivot a DataFrame from wide format to long format.
8962 Series.explode : Explode a DataFrame from list-like columns to long format.
8964 Notes
8965 -----
8966 This routine will explode list-likes including lists, tuples, sets,
8967 Series, and np.ndarray. The result dtype of the subset rows will
8968 be object. Scalars will be returned unchanged, and empty list-likes will
8969 result in a np.nan for that row. In addition, the ordering of rows in the
8970 output will be non-deterministic when exploding sets.
8972 Reference :ref:`the user guide <reshaping.explode>` for more examples.
8974 Examples
8975 --------
8976 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
8977 ... 'B': 1,
8978 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
8979 >>> df
8980 A B C
8981 0 [0, 1, 2] 1 [a, b, c]
8982 1 foo 1 NaN
8983 2 [] 1 []
8984 3 [3, 4] 1 [d, e]
8986 Single-column explode.
8988 >>> df.explode('A')
8989 A B C
8990 0 0 1 [a, b, c]
8991 0 1 1 [a, b, c]
8992 0 2 1 [a, b, c]
8993 1 foo 1 NaN
8994 2 NaN 1 []
8995 3 3 1 [d, e]
8996 3 4 1 [d, e]
8998 Multi-column explode.
9000 >>> df.explode(list('AC'))
9001 A B C
9002 0 0 1 a
9003 0 1 1 b
9004 0 2 1 c
9005 1 foo 1 NaN
9006 2 NaN 1 NaN
9007 3 3 1 d
9008 3 4 1 e
9009 """
9010 if not self.columns.is_unique:
9011 raise ValueError("columns must be unique")
9013 columns: list[Hashable]
9014 if is_scalar(column) or isinstance(column, tuple):
9015 columns = [column]
9016 elif isinstance(column, list) and all(
9017 is_scalar(c) or isinstance(c, tuple) for c in column
9018 ):
9019 if not column:
9020 raise ValueError("column must be nonempty")
9021 if len(column) > len(set(column)):
9022 raise ValueError("column must be unique")
9023 columns = column
9024 else:
9025 raise ValueError("column must be a scalar, tuple, or list thereof")
9027 df = self.reset_index(drop=True)
9028 if len(columns) == 1:
9029 result = df[columns[0]].explode()
9030 else:
9031 mylen = lambda x: len(x) if is_list_like(x) else -1
9032 counts0 = self[columns[0]].apply(mylen)
9033 for c in columns[1:]:
9034 if not all(counts0 == self[c].apply(mylen)):
9035 raise ValueError("columns must have matching element counts")
9036 result = DataFrame({c: df[c].explode() for c in columns})
9037 result = df.drop(columns, axis=1).join(result)
9038 if ignore_index:
9039 result.index = default_index(len(result))
9040 else:
9041 result.index = self.index.take(result.index)
9042 result = result.reindex(columns=self.columns, copy=False)
9044 return result.__finalize__(self, method="explode")
9046 def unstack(self, level: Level = -1, fill_value=None):
9047 """
9048 Pivot a level of the (necessarily hierarchical) index labels.
9050 Returns a DataFrame having a new level of column labels whose inner-most level
9051 consists of the pivoted index labels.
9053 If the index is not a MultiIndex, the output will be a Series
9054 (the analogue of stack when the columns are not a MultiIndex).
9056 Parameters
9057 ----------
9058 level : int, str, or list of these, default -1 (last level)
9059 Level(s) of index to unstack, can pass level name.
9060 fill_value : int, str or dict
9061 Replace NaN with this value if the unstack produces missing values.
9063 Returns
9064 -------
9065 Series or DataFrame
9067 See Also
9068 --------
9069 DataFrame.pivot : Pivot a table based on column values.
9070 DataFrame.stack : Pivot a level of the column labels (inverse operation
9071 from `unstack`).
9073 Notes
9074 -----
9075 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
9077 Examples
9078 --------
9079 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
9080 ... ('two', 'a'), ('two', 'b')])
9081 >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
9082 >>> s
9083 one a 1.0
9084 b 2.0
9085 two a 3.0
9086 b 4.0
9087 dtype: float64
9089 >>> s.unstack(level=-1)
9090 a b
9091 one 1.0 2.0
9092 two 3.0 4.0
9094 >>> s.unstack(level=0)
9095 one two
9096 a 1.0 3.0
9097 b 2.0 4.0
9099 >>> df = s.unstack(level=0)
9100 >>> df.unstack()
9101 one a 1.0
9102 b 2.0
9103 two a 3.0
9104 b 4.0
9105 dtype: float64
9106 """
9107 from pandas.core.reshape.reshape import unstack
9109 result = unstack(self, level, fill_value)
9111 return result.__finalize__(self, method="unstack")
9113 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
9114 def melt(
9115 self,
9116 id_vars=None,
9117 value_vars=None,
9118 var_name=None,
9119 value_name="value",
9120 col_level: Level = None,
9121 ignore_index: bool = True,
9122 ) -> DataFrame:
9124 return melt(
9125 self,
9126 id_vars=id_vars,
9127 value_vars=value_vars,
9128 var_name=var_name,
9129 value_name=value_name,
9130 col_level=col_level,
9131 ignore_index=ignore_index,
9132 ).__finalize__(self, method="melt")
9134 # ----------------------------------------------------------------------
9135 # Time series-related
9137 @doc(
9138 Series.diff,
9139 klass="DataFrame",
9140 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
9141 "Take difference over rows (0) or columns (1).\n",
9142 other_klass="Series",
9143 examples=dedent(
9144 """
9145 Difference with previous row
9147 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
9148 ... 'b': [1, 1, 2, 3, 5, 8],
9149 ... 'c': [1, 4, 9, 16, 25, 36]})
9150 >>> df
9151 a b c
9152 0 1 1 1
9153 1 2 1 4
9154 2 3 2 9
9155 3 4 3 16
9156 4 5 5 25
9157 5 6 8 36
9159 >>> df.diff()
9160 a b c
9161 0 NaN NaN NaN
9162 1 1.0 0.0 3.0
9163 2 1.0 1.0 5.0
9164 3 1.0 1.0 7.0
9165 4 1.0 2.0 9.0
9166 5 1.0 3.0 11.0
9168 Difference with previous column
9170 >>> df.diff(axis=1)
9171 a b c
9172 0 NaN 0 0
9173 1 NaN -1 3
9174 2 NaN -1 7
9175 3 NaN -1 13
9176 4 NaN 0 20
9177 5 NaN 2 28
9179 Difference with 3rd previous row
9181 >>> df.diff(periods=3)
9182 a b c
9183 0 NaN NaN NaN
9184 1 NaN NaN NaN
9185 2 NaN NaN NaN
9186 3 3.0 2.0 15.0
9187 4 3.0 4.0 21.0
9188 5 3.0 6.0 27.0
9190 Difference with following row
9192 >>> df.diff(periods=-1)
9193 a b c
9194 0 -1.0 0.0 -3.0
9195 1 -1.0 -1.0 -5.0
9196 2 -1.0 -1.0 -7.0
9197 3 -1.0 -2.0 -9.0
9198 4 -1.0 -3.0 -11.0
9199 5 NaN NaN NaN
9201 Overflow in input dtype
9203 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
9204 >>> df.diff()
9205 a
9206 0 NaN
9207 1 255.0"""
9208 ),
9209 )
9210 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
9211 if not lib.is_integer(periods):
9212 if not (
9213 is_float(periods)
9214 # error: "int" has no attribute "is_integer"
9215 and periods.is_integer() # type: ignore[attr-defined]
9216 ):
9217 raise ValueError("periods must be an integer")
9218 periods = int(periods)
9220 axis = self._get_axis_number(axis)
9221 if axis == 1 and periods != 0:
9222 return self - self.shift(periods, axis=axis)
9224 new_data = self._mgr.diff(n=periods, axis=axis)
9225 return self._constructor(new_data).__finalize__(self, "diff")
9227 # ----------------------------------------------------------------------
9228 # Function application
9230 def _gotitem(
9231 self,
9232 key: IndexLabel,
9233 ndim: int,
9234 subset: DataFrame | Series | None = None,
9235 ) -> DataFrame | Series:
9236 """
9237 Sub-classes to define. Return a sliced object.
9239 Parameters
9240 ----------
9241 key : string / list of selections
9242 ndim : {1, 2}
9243 requested ndim of result
9244 subset : object, default None
9245 subset to act on
9246 """
9247 if subset is None:
9248 subset = self
9249 elif subset.ndim == 1: # is Series
9250 return subset
9252 # TODO: _shallow_copy(subset)?
9253 return subset[key]
    # Docstring fragments injected into DataFrame.aggregate via @doc below.
    _agg_summary_and_see_also_doc = dedent(
        """
    The aggregation operations are always performed over an axis, either the
    index (default) or the column axis. This behavior is different from
    `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
    `var`), where the default is to compute the aggregation of the flattened
    array, e.g., ``numpy.mean(arr_2d)`` as opposed to
    ``numpy.mean(arr_2d, axis=0)``.

    `agg` is an alias for `aggregate`. Use the alias.

    See Also
    --------
    DataFrame.apply : Perform any type of operations.
    DataFrame.transform : Perform transformation type operations.
    core.groupby.GroupBy : Perform operations over groups.
    core.resample.Resampler : Perform operations over resampled bins.
    core.window.Rolling : Perform operations over rolling window.
    core.window.Expanding : Perform operations over expanding window.
    core.window.ExponentialMovingWindow : Perform operation over exponential weighted
        window.
    """
    )

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame([[1, 2, 3],
    ...                    [4, 5, 6],
    ...                    [7, 8, 9],
    ...                    [np.nan, np.nan, np.nan]],
    ...                   columns=['A', 'B', 'C'])

    Aggregate these functions over the rows.

    >>> df.agg(['sum', 'min'])
            A     B     C
    sum  12.0  15.0  18.0
    min   1.0   2.0   3.0

    Different aggregations per column.

    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
            A    B
    sum  12.0  NaN
    min   1.0  2.0
    max   NaN  8.0

    Aggregate different functions over the columns and rename the index of the resulting
    DataFrame.

    >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
         A    B    C
    x  7.0  NaN  NaN
    y  NaN  2.0  NaN
    z  NaN  NaN  6.0

    Aggregate over the columns.

    >>> df.agg("mean", axis="columns")
    0    2.0
    1    5.0
    2    8.0
    3    NaN
    dtype: float64
    """
    )
9324 @doc(
9325 _shared_docs["aggregate"],
9326 klass=_shared_doc_kwargs["klass"],
9327 axis=_shared_doc_kwargs["axis"],
9328 see_also=_agg_summary_and_see_also_doc,
9329 examples=_agg_examples_doc,
9330 )
9331 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
9332 from pandas.core.apply import frame_apply
9334 axis = self._get_axis_number(axis)
9336 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
9338 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
9339 result = op.agg()
9341 if relabeling:
9342 # This is to keep the order to columns occurrence unchanged, and also
9343 # keep the order of new columns occurrence unchanged
9345 # For the return values of reconstruct_func, if relabeling is
9346 # False, columns and order will be None.
9347 assert columns is not None
9348 assert order is not None
9350 result_in_dict = relabel_result(result, func, columns, order)
9351 result = DataFrame(result_in_dict, index=columns)
9353 return result
9355 agg = aggregate
    # error: Signature of "any" incompatible with supertype "NDFrame" [override]
    @overload  # type: ignore[override]
    def any(
        self,
        *,
        axis: Axis = ...,
        bool_only: bool | None = ...,
        skipna: bool = ...,
        level: None = ...,
        **kwargs,
    ) -> Series:
        # Overload: without a `level`, the reduction collapses to a Series.
        ...

    @overload
    def any(
        self,
        *,
        axis: Axis = ...,
        bool_only: bool | None = ...,
        skipna: bool = ...,
        level: Level,
        **kwargs,
    ) -> DataFrame | Series:
        # Overload: with a `level`, the result may keep a frame shape.
        ...

    @doc(NDFrame.any, **_shared_doc_kwargs)
    def any(
        self,
        axis: Axis = 0,
        bool_only: bool | None = None,
        skipna: bool = True,
        level: Level = None,
        **kwargs,
    ) -> DataFrame | Series:
        # Body is intentionally `...`: this stub only narrows the signature
        # and attaches the shared docstring via @doc.
        # NOTE(review): the runtime implementation is presumably installed
        # on the class elsewhere (outside this view) — confirm where `any`
        # is actually bound before relying on this stub.
        ...
9393 @doc(
9394 _shared_docs["transform"],
9395 klass=_shared_doc_kwargs["klass"],
9396 axis=_shared_doc_kwargs["axis"],
9397 )
9398 def transform(
9399 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
9400 ) -> DataFrame:
9401 from pandas.core.apply import frame_apply
9403 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
9404 result = op.transform()
9405 assert isinstance(result, DataFrame)
9406 return result
9408 def apply(
9409 self,
9410 func: AggFuncType,
9411 axis: Axis = 0,
9412 raw: bool = False,
9413 result_type: Literal["expand", "reduce", "broadcast"] | None = None,
9414 args=(),
9415 **kwargs,
9416 ):
9417 """
9418 Apply a function along an axis of the DataFrame.
9420 Objects passed to the function are Series objects whose index is
9421 either the DataFrame's index (``axis=0``) or the DataFrame's columns
9422 (``axis=1``). By default (``result_type=None``), the final return type
9423 is inferred from the return type of the applied function. Otherwise,
9424 it depends on the `result_type` argument.
9426 Parameters
9427 ----------
9428 func : function
9429 Function to apply to each column or row.
9430 axis : {0 or 'index', 1 or 'columns'}, default 0
9431 Axis along which the function is applied:
9433 * 0 or 'index': apply function to each column.
9434 * 1 or 'columns': apply function to each row.
9436 raw : bool, default False
9437 Determines if row or column is passed as a Series or ndarray object:
9439 * ``False`` : passes each row or column as a Series to the
9440 function.
9441 * ``True`` : the passed function will receive ndarray objects
9442 instead.
9443 If you are just applying a NumPy reduction function this will
9444 achieve much better performance.
9446 result_type : {'expand', 'reduce', 'broadcast', None}, default None
9447 These only act when ``axis=1`` (columns):
9449 * 'expand' : list-like results will be turned into columns.
9450 * 'reduce' : returns a Series if possible rather than expanding
9451 list-like results. This is the opposite of 'expand'.
9452 * 'broadcast' : results will be broadcast to the original shape
9453 of the DataFrame, the original index and columns will be
9454 retained.
9456 The default behaviour (None) depends on the return value of the
9457 applied function: list-like results will be returned as a Series
9458 of those. However if the apply function returns a Series these
9459 are expanded to columns.
9460 args : tuple
9461 Positional arguments to pass to `func` in addition to the
9462 array/series.
9463 **kwargs
9464 Additional keyword arguments to pass as keywords arguments to
9465 `func`.
9467 Returns
9468 -------
9469 Series or DataFrame
9470 Result of applying ``func`` along the given axis of the
9471 DataFrame.
9473 See Also
9474 --------
9475 DataFrame.applymap: For elementwise operations.
9476 DataFrame.aggregate: Only perform aggregating type operations.
9477 DataFrame.transform: Only perform transforming type operations.
9479 Notes
9480 -----
9481 Functions that mutate the passed object can produce unexpected
9482 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
9483 for more details.
9485 Examples
9486 --------
9487 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
9488 >>> df
9489 A B
9490 0 4 9
9491 1 4 9
9492 2 4 9
9494 Using a numpy universal function (in this case the same as
9495 ``np.sqrt(df)``):
9497 >>> df.apply(np.sqrt)
9498 A B
9499 0 2.0 3.0
9500 1 2.0 3.0
9501 2 2.0 3.0
9503 Using a reducing function on either axis
9505 >>> df.apply(np.sum, axis=0)
9506 A 12
9507 B 27
9508 dtype: int64
9510 >>> df.apply(np.sum, axis=1)
9511 0 13
9512 1 13
9513 2 13
9514 dtype: int64
9516 Returning a list-like will result in a Series
9518 >>> df.apply(lambda x: [1, 2], axis=1)
9519 0 [1, 2]
9520 1 [1, 2]
9521 2 [1, 2]
9522 dtype: object
9524 Passing ``result_type='expand'`` will expand list-like results
9525 to columns of a Dataframe
9527 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
9528 0 1
9529 0 1 2
9530 1 1 2
9531 2 1 2
9533 Returning a Series inside the function is similar to passing
9534 ``result_type='expand'``. The resulting column names
9535 will be the Series index.
9537 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
9538 foo bar
9539 0 1 2
9540 1 1 2
9541 2 1 2
9543 Passing ``result_type='broadcast'`` will ensure the same shape
9544 result, whether list-like or scalar is returned by the function,
9545 and broadcast it along the axis. The resulting column names will
9546 be the originals.
9548 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
9549 A B
9550 0 1 2
9551 1 1 2
9552 2 1 2
9553 """
9554 from pandas.core.apply import frame_apply
9556 op = frame_apply(
9557 self,
9558 func=func,
9559 axis=axis,
9560 raw=raw,
9561 result_type=result_type,
9562 args=args,
9563 kwargs=kwargs,
9564 )
9565 return op.apply().__finalize__(self, method="apply")
9567 def applymap(
9568 self, func: PythonFuncType, na_action: str | None = None, **kwargs
9569 ) -> DataFrame:
9570 """
9571 Apply a function to a Dataframe elementwise.
9573 This method applies a function that accepts and returns a scalar
9574 to every element of a DataFrame.
9576 Parameters
9577 ----------
9578 func : callable
9579 Python function, returns a single value from a single value.
9580 na_action : {None, 'ignore'}, default None
9581 If ‘ignore’, propagate NaN values, without passing them to func.
9583 .. versionadded:: 1.2
9585 **kwargs
9586 Additional keyword arguments to pass as keywords arguments to
9587 `func`.
9589 .. versionadded:: 1.3.0
9591 Returns
9592 -------
9593 DataFrame
9594 Transformed DataFrame.
9596 See Also
9597 --------
9598 DataFrame.apply : Apply a function along input axis of DataFrame.
9600 Examples
9601 --------
9602 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
9603 >>> df
9604 0 1
9605 0 1.000 2.120
9606 1 3.356 4.567
9608 >>> df.applymap(lambda x: len(str(x)))
9609 0 1
9610 0 3 4
9611 1 5 5
9613 Like Series.map, NA values can be ignored:
9615 >>> df_copy = df.copy()
9616 >>> df_copy.iloc[0, 0] = pd.NA
9617 >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
9618 0 1
9619 0 NaN 4
9620 1 5.0 5
9622 Note that a vectorized version of `func` often exists, which will
9623 be much faster. You could square each number elementwise.
9625 >>> df.applymap(lambda x: x**2)
9626 0 1
9627 0 1.000000 4.494400
9628 1 11.262736 20.857489
9630 But it's better to avoid applymap in that case.
9632 >>> df ** 2
9633 0 1
9634 0 1.000000 4.494400
9635 1 11.262736 20.857489
9636 """
9637 if na_action not in {"ignore", None}:
9638 raise ValueError(
9639 f"na_action must be 'ignore' or None. Got {repr(na_action)}"
9640 )
9641 ignore_na = na_action == "ignore"
9642 func = functools.partial(func, **kwargs)
9644 # if we have a dtype == 'M8[ns]', provide boxed values
9645 def infer(x):
9646 if x.empty:
9647 return lib.map_infer(x, func, ignore_na=ignore_na)
9648 return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)
9650 return self.apply(infer).__finalize__(self, "applymap")
9652 # ----------------------------------------------------------------------
9653 # Merging / joining methods
9655 def append(
9656 self,
9657 other,
9658 ignore_index: bool = False,
9659 verify_integrity: bool = False,
9660 sort: bool = False,
9661 ) -> DataFrame:
9662 """
9663 Append rows of `other` to the end of caller, returning a new object.
9665 .. deprecated:: 1.4.0
9666 Use :func:`concat` instead. For further details see
9667 :ref:`whatsnew_140.deprecations.frame_series_append`
9669 Columns in `other` that are not in the caller are added as new columns.
9671 Parameters
9672 ----------
9673 other : DataFrame or Series/dict-like object, or list of these
9674 The data to append.
9675 ignore_index : bool, default False
9676 If True, the resulting axis will be labeled 0, 1, …, n - 1.
9677 verify_integrity : bool, default False
9678 If True, raise ValueError on creating index with duplicates.
9679 sort : bool, default False
9680 Sort columns if the columns of `self` and `other` are not aligned.
9682 .. versionchanged:: 1.0.0
9684 Changed to not sort by default.
9686 Returns
9687 -------
9688 DataFrame
9689 A new DataFrame consisting of the rows of caller and the rows of `other`.
9691 See Also
9692 --------
9693 concat : General function to concatenate DataFrame or Series objects.
9695 Notes
9696 -----
9697 If a list of dict/series is passed and the keys are all contained in
9698 the DataFrame's index, the order of the columns in the resulting
9699 DataFrame will be unchanged.
9701 Iteratively appending rows to a DataFrame can be more computationally
9702 intensive than a single concatenate. A better solution is to append
9703 those rows to a list and then concatenate the list with the original
9704 DataFrame all at once.
9706 Examples
9707 --------
9708 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), index=['x', 'y'])
9709 >>> df
9710 A B
9711 x 1 2
9712 y 3 4
9713 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), index=['x', 'y'])
9714 >>> df.append(df2)
9715 A B
9716 x 1 2
9717 y 3 4
9718 x 5 6
9719 y 7 8
9721 With `ignore_index` set to True:
9723 >>> df.append(df2, ignore_index=True)
9724 A B
9725 0 1 2
9726 1 3 4
9727 2 5 6
9728 3 7 8
9730 The following, while not recommended methods for generating DataFrames,
9731 show two ways to generate a DataFrame from multiple data sources.
9733 Less efficient:
9735 >>> df = pd.DataFrame(columns=['A'])
9736 >>> for i in range(5):
9737 ... df = df.append({'A': i}, ignore_index=True)
9738 >>> df
9739 A
9740 0 0
9741 1 1
9742 2 2
9743 3 3
9744 4 4
9746 More efficient:
9748 >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
9749 ... ignore_index=True)
9750 A
9751 0 0
9752 1 1
9753 2 2
9754 3 3
9755 4 4
9756 """
9757 warnings.warn(
9758 "The frame.append method is deprecated "
9759 "and will be removed from pandas in a future version. "
9760 "Use pandas.concat instead.",
9761 FutureWarning,
9762 stacklevel=find_stack_level(),
9763 )
9765 return self._append(other, ignore_index, verify_integrity, sort)
9767 def _append(
9768 self,
9769 other,
9770 ignore_index: bool = False,
9771 verify_integrity: bool = False,
9772 sort: bool = False,
9773 ) -> DataFrame:
9774 if isinstance(other, (Series, dict)):
9775 if isinstance(other, dict):
9776 if not ignore_index:
9777 raise TypeError("Can only append a dict if ignore_index=True")
9778 other = Series(other)
9779 if other.name is None and not ignore_index:
9780 raise TypeError(
9781 "Can only append a Series if ignore_index=True "
9782 "or if the Series has a name"
9783 )
9785 index = Index([other.name], name=self.index.name)
9786 row_df = other.to_frame().T
9787 # infer_objects is needed for
9788 # test_append_empty_frame_to_series_with_dateutil_tz
9789 other = row_df.infer_objects().rename_axis(index.names, copy=False)
9790 elif isinstance(other, list):
9791 if not other:
9792 pass
9793 elif not isinstance(other[0], DataFrame):
9794 other = DataFrame(other)
9795 if self.index.name is not None and not ignore_index:
9796 other.index.name = self.index.name
9798 from pandas.core.reshape.concat import concat
9800 if isinstance(other, (list, tuple)):
9801 to_concat = [self, *other]
9802 else:
9803 to_concat = [self, other]
9805 result = concat(
9806 to_concat,
9807 ignore_index=ignore_index,
9808 verify_integrity=verify_integrity,
9809 sort=sort,
9810 )
9811 return result.__finalize__(self, method="append")
9813 def join(
9814 self,
9815 other: DataFrame | Series | list[DataFrame | Series],
9816 on: IndexLabel | None = None,
9817 how: str = "left",
9818 lsuffix: str = "",
9819 rsuffix: str = "",
9820 sort: bool = False,
9821 validate: str | None = None,
9822 ) -> DataFrame:
9823 """
9824 Join columns of another DataFrame.
9826 Join columns with `other` DataFrame either on index or on a key
9827 column. Efficiently join multiple DataFrame objects by index at once by
9828 passing a list.
9830 Parameters
9831 ----------
9832 other : DataFrame, Series, or a list containing any combination of them
9833 Index should be similar to one of the columns in this one. If a
9834 Series is passed, its name attribute must be set, and that will be
9835 used as the column name in the resulting joined DataFrame.
9836 on : str, list of str, or array-like, optional
9837 Column or index level name(s) in the caller to join on the index
9838 in `other`, otherwise joins index-on-index. If multiple
9839 values given, the `other` DataFrame must have a MultiIndex. Can
9840 pass an array as the join key if it is not already contained in
9841 the calling DataFrame. Like an Excel VLOOKUP operation.
9842 how : {'left', 'right', 'outer', 'inner'}, default 'left'
9843 How to handle the operation of the two objects.
9845 * left: use calling frame's index (or column if on is specified)
9846 * right: use `other`'s index.
9847 * outer: form union of calling frame's index (or column if on is
9848 specified) with `other`'s index, and sort it.
9849 lexicographically.
9850 * inner: form intersection of calling frame's index (or column if
9851 on is specified) with `other`'s index, preserving the order
9852 of the calling's one.
9853 * cross: creates the cartesian product from both frames, preserves the order
9854 of the left keys.
9856 .. versionadded:: 1.2.0
9858 lsuffix : str, default ''
9859 Suffix to use from left frame's overlapping columns.
9860 rsuffix : str, default ''
9861 Suffix to use from right frame's overlapping columns.
9862 sort : bool, default False
9863 Order result DataFrame lexicographically by the join key. If False,
9864 the order of the join key depends on the join type (how keyword).
9865 validate : str, optional
9866 If specified, checks if join is of specified type.
9867 * "one_to_one" or "1:1": check if join keys are unique in both left
9868 and right datasets.
9869 * "one_to_many" or "1:m": check if join keys are unique in left dataset.
9870 * "many_to_one" or "m:1": check if join keys are unique in right dataset.
9871 * "many_to_many" or "m:m": allowed, but does not result in checks.
9872 .. versionadded:: 1.5.0
9874 Returns
9875 -------
9876 DataFrame
9877 A dataframe containing columns from both the caller and `other`.
9879 See Also
9880 --------
9881 DataFrame.merge : For column(s)-on-column(s) operations.
9883 Notes
9884 -----
9885 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
9886 passing a list of `DataFrame` objects.
9888 Support for specifying index levels as the `on` parameter was added
9889 in version 0.23.0.
9891 Examples
9892 --------
9893 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
9894 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
9896 >>> df
9897 key A
9898 0 K0 A0
9899 1 K1 A1
9900 2 K2 A2
9901 3 K3 A3
9902 4 K4 A4
9903 5 K5 A5
9905 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
9906 ... 'B': ['B0', 'B1', 'B2']})
9908 >>> other
9909 key B
9910 0 K0 B0
9911 1 K1 B1
9912 2 K2 B2
9914 Join DataFrames using their indexes.
9916 >>> df.join(other, lsuffix='_caller', rsuffix='_other')
9917 key_caller A key_other B
9918 0 K0 A0 K0 B0
9919 1 K1 A1 K1 B1
9920 2 K2 A2 K2 B2
9921 3 K3 A3 NaN NaN
9922 4 K4 A4 NaN NaN
9923 5 K5 A5 NaN NaN
9925 If we want to join using the key columns, we need to set key to be
9926 the index in both `df` and `other`. The joined DataFrame will have
9927 key as its index.
9929 >>> df.set_index('key').join(other.set_index('key'))
9930 A B
9931 key
9932 K0 A0 B0
9933 K1 A1 B1
9934 K2 A2 B2
9935 K3 A3 NaN
9936 K4 A4 NaN
9937 K5 A5 NaN
9939 Another option to join using the key columns is to use the `on`
9940 parameter. DataFrame.join always uses `other`'s index but we can use
9941 any column in `df`. This method preserves the original DataFrame's
9942 index in the result.
9944 >>> df.join(other.set_index('key'), on='key')
9945 key A B
9946 0 K0 A0 B0
9947 1 K1 A1 B1
9948 2 K2 A2 B2
9949 3 K3 A3 NaN
9950 4 K4 A4 NaN
9951 5 K5 A5 NaN
9953 Using non-unique key values shows how they are matched.
9955 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
9956 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
9958 >>> df
9959 key A
9960 0 K0 A0
9961 1 K1 A1
9962 2 K1 A2
9963 3 K3 A3
9964 4 K0 A4
9965 5 K1 A5
9967 >>> df.join(other.set_index('key'), on='key', validate='m:1')
9968 key A B
9969 0 K0 A0 B0
9970 1 K1 A1 B1
9971 2 K1 A2 B1
9972 3 K3 A3 NaN
9973 4 K0 A4 B0
9974 5 K1 A5 B1
9975 """
9976 return self._join_compat(
9977 other,
9978 on=on,
9979 how=how,
9980 lsuffix=lsuffix,
9981 rsuffix=rsuffix,
9982 sort=sort,
9983 validate=validate,
9984 )
9986 def _join_compat(
9987 self,
9988 other: DataFrame | Series | Iterable[DataFrame | Series],
9989 on: IndexLabel | None = None,
9990 how: str = "left",
9991 lsuffix: str = "",
9992 rsuffix: str = "",
9993 sort: bool = False,
9994 validate: str | None = None,
9995 ):
9996 from pandas.core.reshape.concat import concat
9997 from pandas.core.reshape.merge import merge
9999 if isinstance(other, Series):
10000 if other.name is None:
10001 raise ValueError("Other Series must have a name")
10002 other = DataFrame({other.name: other})
10004 if isinstance(other, DataFrame):
10005 if how == "cross":
10006 return merge(
10007 self,
10008 other,
10009 how=how,
10010 on=on,
10011 suffixes=(lsuffix, rsuffix),
10012 sort=sort,
10013 validate=validate,
10014 )
10015 return merge(
10016 self,
10017 other,
10018 left_on=on,
10019 how=how,
10020 left_index=on is None,
10021 right_index=True,
10022 suffixes=(lsuffix, rsuffix),
10023 sort=sort,
10024 validate=validate,
10025 )
10026 else:
10027 if on is not None:
10028 raise ValueError(
10029 "Joining multiple DataFrames only supported for joining on index"
10030 )
10032 if rsuffix or lsuffix:
10033 raise ValueError(
10034 "Suffixes not supported when joining multiple DataFrames"
10035 )
10037 # Mypy thinks the RHS is a
10038 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
10039 # the LHS is an "Iterable[DataFrame]", but in reality both types are
10040 # "Iterable[Union[DataFrame, Series]]" due to the if statements
10041 frames = [cast("DataFrame | Series", self)] + list(other)
10043 can_concat = all(df.index.is_unique for df in frames)
10045 # join indexes only using concat
10046 if can_concat:
10047 if how == "left":
10048 res = concat(
10049 frames, axis=1, join="outer", verify_integrity=True, sort=sort
10050 )
10051 return res.reindex(self.index, copy=False)
10052 else:
10053 return concat(
10054 frames, axis=1, join=how, verify_integrity=True, sort=sort
10055 )
10057 joined = frames[0]
10059 for frame in frames[1:]:
10060 joined = merge(
10061 joined,
10062 frame,
10063 how=how,
10064 left_index=True,
10065 right_index=True,
10066 validate=validate,
10067 )
10069 return joined
10071 @Substitution("")
10072 @Appender(_merge_doc, indents=2)
10073 def merge(
10074 self,
10075 right: DataFrame | Series,
10076 how: str = "inner",
10077 on: IndexLabel | None = None,
10078 left_on: IndexLabel | None = None,
10079 right_on: IndexLabel | None = None,
10080 left_index: bool = False,
10081 right_index: bool = False,
10082 sort: bool = False,
10083 suffixes: Suffixes = ("_x", "_y"),
10084 copy: bool = True,
10085 indicator: bool = False,
10086 validate: str | None = None,
10087 ) -> DataFrame:
10088 from pandas.core.reshape.merge import merge
10090 return merge(
10091 self,
10092 right,
10093 how=how,
10094 on=on,
10095 left_on=left_on,
10096 right_on=right_on,
10097 left_index=left_index,
10098 right_index=right_index,
10099 sort=sort,
10100 suffixes=suffixes,
10101 copy=copy,
10102 indicator=indicator,
10103 validate=validate,
10104 )
10106 def round(
10107 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
10108 ) -> DataFrame:
10109 """
10110 Round a DataFrame to a variable number of decimal places.
10112 Parameters
10113 ----------
10114 decimals : int, dict, Series
10115 Number of decimal places to round each column to. If an int is
10116 given, round each column to the same number of places.
10117 Otherwise dict and Series round to variable numbers of places.
10118 Column names should be in the keys if `decimals` is a
10119 dict-like, or in the index if `decimals` is a Series. Any
10120 columns not included in `decimals` will be left as is. Elements
10121 of `decimals` which are not columns of the input will be
10122 ignored.
10123 *args
10124 Additional keywords have no effect but might be accepted for
10125 compatibility with numpy.
10126 **kwargs
10127 Additional keywords have no effect but might be accepted for
10128 compatibility with numpy.
10130 Returns
10131 -------
10132 DataFrame
10133 A DataFrame with the affected columns rounded to the specified
10134 number of decimal places.
10136 See Also
10137 --------
10138 numpy.around : Round a numpy array to the given number of decimals.
10139 Series.round : Round a Series to the given number of decimals.
10141 Examples
10142 --------
10143 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
10144 ... columns=['dogs', 'cats'])
10145 >>> df
10146 dogs cats
10147 0 0.21 0.32
10148 1 0.01 0.67
10149 2 0.66 0.03
10150 3 0.21 0.18
10152 By providing an integer each column is rounded to the same number
10153 of decimal places
10155 >>> df.round(1)
10156 dogs cats
10157 0 0.2 0.3
10158 1 0.0 0.7
10159 2 0.7 0.0
10160 3 0.2 0.2
10162 With a dict, the number of places for specific columns can be
10163 specified with the column names as key and the number of decimal
10164 places as value
10166 >>> df.round({'dogs': 1, 'cats': 0})
10167 dogs cats
10168 0 0.2 0.0
10169 1 0.0 1.0
10170 2 0.7 0.0
10171 3 0.2 0.0
10173 Using a Series, the number of places for specific columns can be
10174 specified with the column names as index and the number of
10175 decimal places as value
10177 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
10178 >>> df.round(decimals)
10179 dogs cats
10180 0 0.2 0.0
10181 1 0.0 1.0
10182 2 0.7 0.0
10183 3 0.2 0.0
10184 """
10185 from pandas.core.reshape.concat import concat
10187 def _dict_round(df: DataFrame, decimals):
10188 for col, vals in df.items():
10189 try:
10190 yield _series_round(vals, decimals[col])
10191 except KeyError:
10192 yield vals
10194 def _series_round(ser: Series, decimals: int):
10195 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
10196 return ser.round(decimals)
10197 return ser
10199 nv.validate_round(args, kwargs)
10201 if isinstance(decimals, (dict, Series)):
10202 if isinstance(decimals, Series) and not decimals.index.is_unique:
10203 raise ValueError("Index of decimals must be unique")
10204 if is_dict_like(decimals) and not all(
10205 is_integer(value) for _, value in decimals.items()
10206 ):
10207 raise TypeError("Values in decimals must be integers")
10208 new_cols = list(_dict_round(self, decimals))
10209 elif is_integer(decimals):
10210 # Dispatch to Series.round
10211 new_cols = [_series_round(v, decimals) for _, v in self.items()]
10212 else:
10213 raise TypeError("decimals must be an integer, a dict-like or a Series")
10215 if len(new_cols) > 0:
10216 return self._constructor(
10217 concat(new_cols, axis=1), index=self.index, columns=self.columns
10218 ).__finalize__(self, method="round")
10219 else:
10220 return self
10222 # ----------------------------------------------------------------------
10223 # Statistical methods, etc.
10225 def corr(
10226 self,
10227 method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
10228 min_periods: int = 1,
10229 numeric_only: bool | lib.NoDefault = lib.no_default,
10230 ) -> DataFrame:
10231 """
10232 Compute pairwise correlation of columns, excluding NA/null values.
10234 Parameters
10235 ----------
10236 method : {'pearson', 'kendall', 'spearman'} or callable
10237 Method of correlation:
10239 * pearson : standard correlation coefficient
10240 * kendall : Kendall Tau correlation coefficient
10241 * spearman : Spearman rank correlation
10242 * callable: callable with input two 1d ndarrays
10243 and returning a float. Note that the returned matrix from corr
10244 will have 1 along the diagonals and will be symmetric
10245 regardless of the callable's behavior.
10246 min_periods : int, optional
10247 Minimum number of observations required per pair of columns
10248 to have a valid result. Currently only available for Pearson
10249 and Spearman correlation.
10250 numeric_only : bool, default True
10251 Include only `float`, `int` or `boolean` data.
10253 .. versionadded:: 1.5.0
10255 .. deprecated:: 1.5.0
10256 The default value of ``numeric_only`` will be ``False`` in a future
10257 version of pandas.
10259 Returns
10260 -------
10261 DataFrame
10262 Correlation matrix.
10264 See Also
10265 --------
10266 DataFrame.corrwith : Compute pairwise correlation with another
10267 DataFrame or Series.
10268 Series.corr : Compute the correlation between two Series.
10270 Notes
10271 -----
10272 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
10274 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
10275 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
10276 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
10278 Examples
10279 --------
10280 >>> def histogram_intersection(a, b):
10281 ... v = np.minimum(a, b).sum().round(decimals=1)
10282 ... return v
10283 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
10284 ... columns=['dogs', 'cats'])
10285 >>> df.corr(method=histogram_intersection)
10286 dogs cats
10287 dogs 1.0 0.3
10288 cats 0.3 1.0
10290 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
10291 ... columns=['dogs', 'cats'])
10292 >>> df.corr(min_periods=3)
10293 dogs cats
10294 dogs 1.0 NaN
10295 cats NaN 1.0
10296 """ # noqa:E501
10297 numeric_only_bool = com.resolve_numeric_only(numeric_only)
10298 data = self._get_numeric_data() if numeric_only_bool else self
10299 if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
10300 com.deprecate_numeric_only_default(type(self), "corr")
10302 cols = data.columns
10303 idx = cols.copy()
10304 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
10306 if method == "pearson":
10307 correl = libalgos.nancorr(mat, minp=min_periods)
10308 elif method == "spearman":
10309 correl = libalgos.nancorr_spearman(mat, minp=min_periods)
10310 elif method == "kendall" or callable(method):
10311 if min_periods is None:
10312 min_periods = 1
10313 mat = mat.T
10314 corrf = nanops.get_corr_func(method)
10315 K = len(cols)
10316 correl = np.empty((K, K), dtype=float)
10317 mask = np.isfinite(mat)
10318 for i, ac in enumerate(mat):
10319 for j, bc in enumerate(mat):
10320 if i > j:
10321 continue
10323 valid = mask[i] & mask[j]
10324 if valid.sum() < min_periods:
10325 c = np.nan
10326 elif i == j:
10327 c = 1.0
10328 elif not valid.all():
10329 c = corrf(ac[valid], bc[valid])
10330 else:
10331 c = corrf(ac, bc)
10332 correl[i, j] = c
10333 correl[j, i] = c
10334 else:
10335 raise ValueError(
10336 "method must be either 'pearson', "
10337 "'spearman', 'kendall', or a callable, "
10338 f"'{method}' was supplied"
10339 )
10341 return self._constructor(correl, index=idx, columns=cols)
10343 def cov(
10344 self,
10345 min_periods: int | None = None,
10346 ddof: int | None = 1,
10347 numeric_only: bool | lib.NoDefault = lib.no_default,
10348 ) -> DataFrame:
10349 """
10350 Compute pairwise covariance of columns, excluding NA/null values.
10352 Compute the pairwise covariance among the series of a DataFrame.
10353 The returned data frame is the `covariance matrix
10354 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
10355 of the DataFrame.
10357 Both NA and null values are automatically excluded from the
10358 calculation. (See the note below about bias from missing values.)
10359 A threshold can be set for the minimum number of
10360 observations for each value created. Comparisons with observations
10361 below this threshold will be returned as ``NaN``.
10363 This method is generally used for the analysis of time series data to
10364 understand the relationship between different measures
10365 across time.
10367 Parameters
10368 ----------
10369 min_periods : int, optional
10370 Minimum number of observations required per pair of columns
10371 to have a valid result.
10373 ddof : int, default 1
10374 Delta degrees of freedom. The divisor used in calculations
10375 is ``N - ddof``, where ``N`` represents the number of elements.
10377 .. versionadded:: 1.1.0
10379 numeric_only : bool, default True
10380 Include only `float`, `int` or `boolean` data.
10382 .. versionadded:: 1.5.0
10384 .. deprecated:: 1.5.0
10385 The default value of ``numeric_only`` will be ``False`` in a future
10386 version of pandas.
10388 Returns
10389 -------
10390 DataFrame
10391 The covariance matrix of the series of the DataFrame.
10393 See Also
10394 --------
10395 Series.cov : Compute covariance with another Series.
10396 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
10397 covariance.
10398 core.window.expanding.Expanding.cov : Expanding sample covariance.
10399 core.window.rolling.Rolling.cov : Rolling sample covariance.
10401 Notes
10402 -----
10403 Returns the covariance matrix of the DataFrame's time series.
10404 The covariance is normalized by N-ddof.
10406 For DataFrames that have Series that are missing data (assuming that
10407 data is `missing at random
10408 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
10409 the returned covariance matrix will be an unbiased estimate
10410 of the variance and covariance between the member Series.
10412 However, for many applications this estimate may not be acceptable
10413 because the estimate covariance matrix is not guaranteed to be positive
10414 semi-definite. This could lead to estimate correlations having
10415 absolute values which are greater than one, and/or a non-invertible
10416 covariance matrix. See `Estimation of covariance matrices
10417 <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
10418 matrices>`__ for more details.
10420 Examples
10421 --------
10422 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
10423 ... columns=['dogs', 'cats'])
10424 >>> df.cov()
10425 dogs cats
10426 dogs 0.666667 -1.000000
10427 cats -1.000000 1.666667
10429 >>> np.random.seed(42)
10430 >>> df = pd.DataFrame(np.random.randn(1000, 5),
10431 ... columns=['a', 'b', 'c', 'd', 'e'])
10432 >>> df.cov()
10433 a b c d e
10434 a 0.998438 -0.020161 0.059277 -0.008943 0.014144
10435 b -0.020161 1.059352 -0.008543 -0.024738 0.009826
10436 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
10437 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
10438 e 0.014144 0.009826 -0.000271 -0.013692 0.977795
10440 **Minimum number of periods**
10442 This method also supports an optional ``min_periods`` keyword
10443 that specifies the required minimum number of non-NA observations for
10444 each column pair in order to have a valid result:
10446 >>> np.random.seed(42)
10447 >>> df = pd.DataFrame(np.random.randn(20, 3),
10448 ... columns=['a', 'b', 'c'])
10449 >>> df.loc[df.index[:5], 'a'] = np.nan
10450 >>> df.loc[df.index[5:10], 'b'] = np.nan
10451 >>> df.cov(min_periods=12)
10452 a b c
10453 a 0.316741 NaN -0.150812
10454 b NaN 1.248003 0.191417
10455 c -0.150812 0.191417 0.895202
10456 """
10457 numeric_only_bool = com.resolve_numeric_only(numeric_only)
10458 data = self._get_numeric_data() if numeric_only_bool else self
10459 if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
10460 com.deprecate_numeric_only_default(type(self), "cov")
10462 cols = data.columns
10463 idx = cols.copy()
10464 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
10466 if notna(mat).all():
10467 if min_periods is not None and min_periods > len(mat):
10468 base_cov = np.empty((mat.shape[1], mat.shape[1]))
10469 base_cov.fill(np.nan)
10470 else:
10471 base_cov = np.cov(mat.T, ddof=ddof)
10472 base_cov = base_cov.reshape((len(cols), len(cols)))
10473 else:
10474 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
10476 return self._constructor(base_cov, index=idx, columns=cols)
    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis = 0,
        drop: bool = False,
        method: Literal["pearson", "kendall", "spearman"]
        | Callable[[np.ndarray, np.ndarray], float] = "pearson",
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ) -> Series:
        """
        Compute pairwise correlation.

        Pairwise correlation is computed between rows or columns of
        DataFrame with rows or columns of Series or DataFrame. DataFrames
        are first aligned along both axes before computing the
        correlations.

        Parameters
        ----------
        other : DataFrame, Series
            Object with which to compute correlations.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
            column-wise.
        drop : bool, default False
            Drop missing indices from result.
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
                and returning a float.

        numeric_only : bool, default True
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. deprecated:: 1.5.0
                The default value of ``numeric_only`` will be ``False`` in a future
                version of pandas.

        Returns
        -------
        Series
            Pairwise correlations.

        See Also
        --------
        DataFrame.corr : Compute pairwise correlation of columns.

        Examples
        --------
        >>> index = ["a", "b", "c", "d", "e"]
        >>> columns = ["one", "two", "three", "four"]
        >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
        >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
        >>> df1.corrwith(df2)
        one      1.0
        two      1.0
        three    1.0
        four     1.0
        dtype: float64

        >>> df2.corrwith(df1, axis=1)
        a    1.0
        b    1.0
        c    1.0
        d    1.0
        e    NaN
        dtype: float64
        """  # noqa:E501
        axis = self._get_axis_number(axis)
        numeric_only_bool = com.resolve_numeric_only(numeric_only)
        this = self._get_numeric_data() if numeric_only_bool else self
        # Warn when the legacy numeric_only default silently dropped
        # non-numeric columns from the computation.
        if numeric_only is lib.no_default and len(this.columns) < len(self.columns):
            com.deprecate_numeric_only_default(type(self), "corrwith")

        if isinstance(other, Series):
            # A single Series: correlate it against every row/column of self.
            return this.apply(lambda x: other.corr(x, method=method), axis=axis)

        if numeric_only_bool:
            other = other._get_numeric_data()
        # Only labels present in both frames take part in the correlation.
        left, right = this.align(other, join="inner", copy=False)

        if axis == 1:
            # Row-wise correlation: transpose so columns are the pair axis.
            left = left.T
            right = right.T

        if method == "pearson":
            # mask missing values: adding ``0 * other`` propagates each
            # side's NaNs to the other, so both operands share one support
            left = left + right * 0
            right = right + left * 0

            # demeaned data
            ldem = left - left.mean(numeric_only=numeric_only_bool)
            rdem = right - right.mean(numeric_only=numeric_only_bool)

            num = (ldem * rdem).sum()
            # Pearson denominator: (n - 1) * std(left) * std(right)
            dom = (
                (left.count() - 1)
                * left.std(numeric_only=numeric_only_bool)
                * right.std(numeric_only=numeric_only_bool)
            )

            correl = num / dom

        elif method in ["kendall", "spearman"] or callable(method):

            def c(x):
                # Pairwise correlation of one aligned column pair.
                return nanops.nancorr(x[0], x[1], method=method)

            correl = self._constructor_sliced(
                map(c, zip(left.values.T, right.values.T)), index=left.columns
            )

        else:
            raise ValueError(
                f"Invalid method {method} was passed, "
                "valid methods are: 'pearson', 'kendall', "
                "'spearman', or callable"
            )

        if not drop:
            # Find non-matching labels along the given axis
            # and append missing correlations (GH 22375)
            raxis = 1 if axis == 0 else 0
            result_index = this._get_axis(raxis).union(other._get_axis(raxis))
            idx_diff = result_index.difference(correl.index)

            if len(idx_diff) > 0:
                correl = correl._append(
                    Series([np.nan] * len(idx_diff), index=idx_diff)
                )

        return correl
10617 # ----------------------------------------------------------------------
10618 # ndarray-like stats methods
10620 def count(self, axis: Axis = 0, level: Level = None, numeric_only: bool = False):
10621 """
10622 Count non-NA cells for each column or row.
10624 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
10625 on `pandas.options.mode.use_inf_as_na`) are considered NA.
10627 Parameters
10628 ----------
10629 axis : {0 or 'index', 1 or 'columns'}, default 0
10630 If 0 or 'index' counts are generated for each column.
10631 If 1 or 'columns' counts are generated for each row.
10632 level : int or str, optional
10633 If the axis is a `MultiIndex` (hierarchical), count along a
10634 particular `level`, collapsing into a `DataFrame`.
10635 A `str` specifies the level name.
10636 numeric_only : bool, default False
10637 Include only `float`, `int` or `boolean` data.
10639 Returns
10640 -------
10641 Series or DataFrame
10642 For each column/row the number of non-NA/null entries.
10643 If `level` is specified returns a `DataFrame`.
10645 See Also
10646 --------
10647 Series.count: Number of non-NA elements in a Series.
10648 DataFrame.value_counts: Count unique combinations of columns.
10649 DataFrame.shape: Number of DataFrame rows and columns (including NA
10650 elements).
10651 DataFrame.isna: Boolean same-sized DataFrame showing places of NA
10652 elements.
10654 Examples
10655 --------
10656 Constructing DataFrame from a dictionary:
10658 >>> df = pd.DataFrame({"Person":
10659 ... ["John", "Myla", "Lewis", "John", "Myla"],
10660 ... "Age": [24., np.nan, 21., 33, 26],
10661 ... "Single": [False, True, True, True, False]})
10662 >>> df
10663 Person Age Single
10664 0 John 24.0 False
10665 1 Myla NaN True
10666 2 Lewis 21.0 True
10667 3 John 33.0 True
10668 4 Myla 26.0 False
10670 Notice the uncounted NA values:
10672 >>> df.count()
10673 Person 5
10674 Age 4
10675 Single 5
10676 dtype: int64
10678 Counts for each **row**:
10680 >>> df.count(axis='columns')
10681 0 3
10682 1 2
10683 2 3
10684 3 3
10685 4 3
10686 dtype: int64
10687 """
10688 axis = self._get_axis_number(axis)
10689 if level is not None:
10690 warnings.warn(
10691 "Using the level keyword in DataFrame and Series aggregations is "
10692 "deprecated and will be removed in a future version. Use groupby "
10693 "instead. df.count(level=1) should use df.groupby(level=1).count().",
10694 FutureWarning,
10695 stacklevel=find_stack_level(),
10696 )
10697 res = self._count_level(level, axis=axis, numeric_only=numeric_only)
10698 return res.__finalize__(self, method="count")
10700 if numeric_only:
10701 frame = self._get_numeric_data()
10702 else:
10703 frame = self
10705 # GH #423
10706 if len(frame._get_axis(axis)) == 0:
10707 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
10708 else:
10709 if frame._is_mixed_type or frame._mgr.any_extension_types:
10710 # the or any_extension_types is really only hit for single-
10711 # column frames with an extension array
10712 result = notna(frame).sum(axis=axis)
10713 else:
10714 # GH13407
10715 series_counts = notna(frame).sum(axis=axis)
10716 counts = series_counts.values
10717 result = self._constructor_sliced(
10718 counts, index=frame._get_agg_axis(axis)
10719 )
10721 return result.astype("int64").__finalize__(self, method="count")
    def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False):
        """
        Count non-NA cells grouped by one level of a hierarchical axis.

        Backs the deprecated ``level=`` path of :meth:`DataFrame.count`.

        Parameters
        ----------
        level : int or str
            Level (position or name) of the MultiIndex axis to collapse on.
        axis : int, default 0
            0 counts within columns, 1 within rows; the counted axis must
            be a MultiIndex.
        numeric_only : bool, default False
            Restrict the count to numeric columns first.

        Returns
        -------
        DataFrame
            Counts with one row (or column, for ``axis=1``) per level value.

        Raises
        ------
        TypeError
            If the counted axis is not hierarchical.
        """
        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        count_axis = frame._get_axis(axis)
        agg_axis = frame._get_agg_axis(axis)

        if not isinstance(count_axis, MultiIndex):
            raise TypeError(
                f"Can only count levels on hierarchical {self._get_axis_name(axis)}."
            )

        # Mask NaNs: Mask rows or columns where the index level is NaN, and all
        # values in the DataFrame that are NaN
        if frame._is_mixed_type:
            # Since we have mixed types, calling notna(frame.values) might
            # upcast everything to object
            values_mask = notna(frame).values
        else:
            # But use the speedup when we have homogeneous dtypes
            values_mask = notna(frame.values)

        # Positions whose level value is itself NaN contribute nothing.
        index_mask = notna(count_axis.get_level_values(level=level))
        if axis == 1:
            mask = index_mask & values_mask
        else:
            # Broadcast the per-row index mask across all columns.
            mask = index_mask.reshape(-1, 1) & values_mask

        if isinstance(level, int):
            level_number = level
        else:
            # Resolve a level name to its position.
            level_number = count_axis._get_level_number(level)

        level_name = count_axis._names[level_number]
        level_index = count_axis.levels[level_number]._rename(name=level_name)
        level_codes = ensure_platform_int(count_axis.codes[level_number])
        # C-level aggregation of valid cells per level code along ``axis``.
        counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis)

        if axis == 1:
            result = self._constructor(counts, index=agg_axis, columns=level_index)
        else:
            result = self._constructor(counts, index=level_index, columns=agg_axis)

        return result
    def _reduce(
        self,
        op,
        name: str,
        *,
        axis: Axis = 0,
        skipna: bool = True,
        numeric_only: bool | None = None,
        filter_type=None,
        **kwds,
    ):
        """
        Common engine behind DataFrame reductions (sum, mean, any, ...).

        Parameters
        ----------
        op : callable
            Array-level reduction; invoked as
            ``op(values, axis=..., skipna=..., **kwds)``.
        name : str
            Reduction name (e.g. "sum", "mean"); used for ExtensionArray
            dispatch and in deprecation messages.
        axis : Axis, default 0
        skipna : bool, default True
        numeric_only : bool or None, default None
            ``None`` enables the deprecated "drop nuisance columns" fallback.
        filter_type : {None, "bool"}, default None
            ``"bool"`` restricts to boolean data (any/all); ``None`` selects
            numeric data when filtering is needed.
        **kwds
            Forwarded to ``op``.
        """
        assert filter_type is None or filter_type == "bool", filter_type
        out_dtype = "bool" if filter_type == "bool" else None

        if numeric_only is None and name in ["mean", "median"]:
            # Deprecation path: mean/median currently silently drop
            # datetime columns when numeric_only is unspecified.
            own_dtypes = [arr.dtype for arr in self._mgr.arrays]

            dtype_is_dt = np.array(
                [is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
                dtype=bool,
            )
            if dtype_is_dt.any():
                warnings.warn(
                    "DataFrame.mean and DataFrame.median with numeric_only=None "
                    "will include datetime64 and datetime64tz columns in a "
                    "future version.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
                # Non-copy equivalent to
                #  dt64_cols = self.dtypes.apply(is_datetime64_any_dtype)
                #  cols = self.columns[~dt64_cols]
                #  self = self[cols]
                predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
                mgr = self._mgr._get_data_subset(predicate)
                self = type(self)(mgr)

        # TODO: Make other agg func handle axis=None properly GH#21597
        axis = self._get_axis_number(axis)
        labels = self._get_agg_axis(axis)
        assert axis in [0, 1]

        def func(values: np.ndarray):
            # We only use this in the case that operates on self.values
            return op(values, axis=axis, skipna=skipna, **kwds)

        def blk_func(values, axis=1):
            # Per-block reduction: ExtensionArrays dispatch to their own
            # _reduce; plain ndarrays go through ``op`` directly.
            if isinstance(values, ExtensionArray):
                if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
                    self._mgr, ArrayManager
                ):
                    return values._reduce(name, axis=1, skipna=skipna, **kwds)
                return values._reduce(name, skipna=skipna, **kwds)
            else:
                return op(values, axis=axis, skipna=skipna, **kwds)

        def _get_data() -> DataFrame:
            # Select only the columns the (filtered) reduction applies to.
            if filter_type is None:
                data = self._get_numeric_data()
            else:
                # GH#25101, GH#24434
                assert filter_type == "bool"
                data = self._get_bool_data()
            return data

        numeric_only_bool = com.resolve_numeric_only(numeric_only)
        if numeric_only is not None or axis == 0:
            # For numeric_only non-None and axis non-None, we know
            # which blocks to use and no try/except is needed.
            # For numeric_only=None only the case with axis==0 and no object
            # dtypes are unambiguous can be handled with BlockManager.reduce
            # Case with EAs see GH#35881
            df = self
            if numeric_only_bool:
                df = _get_data()
            if axis == 1:
                # Reduce along rows by transposing and reducing along columns.
                df = df.T
                axis = 0

            # With numeric_only=None, failing blocks are silently dropped
            # (deprecated behavior; warned about below).
            ignore_failures = numeric_only is None

            # After possibly _get_data and transposing, we are now in the
            # simple case where we can use BlockManager.reduce
            res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures)
            out = df._constructor(res).iloc[0]
            if out_dtype is not None:
                out = out.astype(out_dtype)
            if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
                # Even if we are object dtype, follow numpy and return
                # float64, see test_apply_funcs_over_empty
                out = out.astype(np.float64)

            if numeric_only is None and out.shape[0] != df.shape[1]:
                # columns have been dropped GH#41480
                com.deprecate_numeric_only_default(
                    type(self), name, deprecate_none=True
                )

            return out

        # Legacy fallback: numeric_only=None with axis=1 — try the reduction
        # on the full 2D ndarray, and only on TypeError fall back to the
        # filtered (numeric/bool) subset, with a deprecation warning.
        assert numeric_only is None

        data = self
        values = data.values

        try:
            result = func(values)

        except TypeError:
            # e.g. in nanops trying to convert strs to float

            data = _get_data()
            labels = data._get_agg_axis(axis)

            values = data.values
            with np.errstate(all="ignore"):
                result = func(values)

            # columns have been dropped GH#41480
            arg_name = "numeric_only"
            if name in ["all", "any"]:
                arg_name = "bool_only"
            warnings.warn(
                "Dropping of nuisance columns in DataFrame reductions "
                f"(with '{arg_name}=None') is deprecated; in a future "
                "version this will raise TypeError. Select only valid "
                "columns before calling the reduction.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        if hasattr(result, "dtype"):
            # Post-process dtype: bool filters coerce back to bool when no
            # NAs remain; object results try a float64 coercion.
            if filter_type == "bool" and notna(result).all():
                result = result.astype(np.bool_)
            elif filter_type is None and is_object_dtype(result.dtype):
                try:
                    result = result.astype(np.float64)
                except (ValueError, TypeError):
                    # try to coerce to the original dtypes item by item if we can
                    pass

        result = self._constructor_sliced(result, index=labels)
        return result
10914 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
10915 """
10916 Special case for _reduce to try to avoid a potentially-expensive transpose.
10918 Apply the reduction block-wise along axis=1 and then reduce the resulting
10919 1D arrays.
10920 """
10921 if name == "all":
10922 result = np.ones(len(self), dtype=bool)
10923 ufunc = np.logical_and
10924 elif name == "any":
10925 result = np.zeros(len(self), dtype=bool)
10926 # error: Incompatible types in assignment
10927 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
10928 # Literal[20], Literal[False]]", variable has type
10929 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
10930 # Literal[True]]")
10931 ufunc = np.logical_or # type: ignore[assignment]
10932 else:
10933 raise NotImplementedError(name)
10935 for arr in self._mgr.arrays:
10936 middle = func(arr, axis=0, skipna=skipna)
10937 result = ufunc(result, middle)
10939 res_ser = self._constructor_sliced(result, index=self.index)
10940 return res_ser
10942 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
10943 """
10944 Count number of distinct elements in specified axis.
10946 Return Series with number of distinct elements. Can ignore NaN
10947 values.
10949 Parameters
10950 ----------
10951 axis : {0 or 'index', 1 or 'columns'}, default 0
10952 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
10953 column-wise.
10954 dropna : bool, default True
10955 Don't include NaN in the counts.
10957 Returns
10958 -------
10959 Series
10961 See Also
10962 --------
10963 Series.nunique: Method nunique for Series.
10964 DataFrame.count: Count non-NA cells for each column or row.
10966 Examples
10967 --------
10968 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
10969 >>> df.nunique()
10970 A 3
10971 B 2
10972 dtype: int64
10974 >>> df.nunique(axis=1)
10975 0 1
10976 1 2
10977 2 2
10978 dtype: int64
10979 """
10980 return self.apply(Series.nunique, axis=axis, dropna=dropna)
10982 @doc(_shared_docs["idxmin"], numeric_only_default="False")
10983 def idxmin(
10984 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
10985 ) -> Series:
10986 axis = self._get_axis_number(axis)
10987 if numeric_only:
10988 data = self._get_numeric_data()
10989 else:
10990 data = self
10992 res = data._reduce(
10993 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
10994 )
10995 indices = res._values
10997 # indices will always be np.ndarray since axis is not None and
10998 # values is a 2d array for DataFrame
10999 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
11000 assert isinstance(indices, np.ndarray) # for mypy
11002 index = data._get_axis(axis)
11003 result = [index[i] if i >= 0 else np.nan for i in indices]
11004 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
11005 return final_result.__finalize__(self, method="idxmin")
11007 @doc(_shared_docs["idxmax"], numeric_only_default="False")
11008 def idxmax(
11009 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
11010 ) -> Series:
11012 axis = self._get_axis_number(axis)
11013 if numeric_only:
11014 data = self._get_numeric_data()
11015 else:
11016 data = self
11018 res = data._reduce(
11019 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
11020 )
11021 indices = res._values
11023 # indices will always be np.ndarray since axis is not None and
11024 # values is a 2d array for DataFrame
11025 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
11026 assert isinstance(indices, np.ndarray) # for mypy
11028 index = data._get_axis(axis)
11029 result = [index[i] if i >= 0 else np.nan for i in indices]
11030 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
11031 return final_result.__finalize__(self, method="idxmax")
11033 def _get_agg_axis(self, axis_num: int) -> Index:
11034 """
11035 Let's be explicit about this.
11036 """
11037 if axis_num == 0:
11038 return self.columns
11039 elif axis_num == 1:
11040 return self.index
11041 else:
11042 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
11044 def mode(
11045 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
11046 ) -> DataFrame:
11047 """
11048 Get the mode(s) of each element along the selected axis.
11050 The mode of a set of values is the value that appears most often.
11051 It can be multiple values.
11053 Parameters
11054 ----------
11055 axis : {0 or 'index', 1 or 'columns'}, default 0
11056 The axis to iterate over while searching for the mode:
11058 * 0 or 'index' : get mode of each column
11059 * 1 or 'columns' : get mode of each row.
11061 numeric_only : bool, default False
11062 If True, only apply to numeric columns.
11063 dropna : bool, default True
11064 Don't consider counts of NaN/NaT.
11066 Returns
11067 -------
11068 DataFrame
11069 The modes of each column or row.
11071 See Also
11072 --------
11073 Series.mode : Return the highest frequency value in a Series.
11074 Series.value_counts : Return the counts of values in a Series.
11076 Examples
11077 --------
11078 >>> df = pd.DataFrame([('bird', 2, 2),
11079 ... ('mammal', 4, np.nan),
11080 ... ('arthropod', 8, 0),
11081 ... ('bird', 2, np.nan)],
11082 ... index=('falcon', 'horse', 'spider', 'ostrich'),
11083 ... columns=('species', 'legs', 'wings'))
11084 >>> df
11085 species legs wings
11086 falcon bird 2 2.0
11087 horse mammal 4 NaN
11088 spider arthropod 8 0.0
11089 ostrich bird 2 NaN
11091 By default, missing values are not considered, and the mode of wings
11092 are both 0 and 2. Because the resulting DataFrame has two rows,
11093 the second row of ``species`` and ``legs`` contains ``NaN``.
11095 >>> df.mode()
11096 species legs wings
11097 0 bird 2.0 0.0
11098 1 NaN NaN 2.0
11100 Setting ``dropna=False`` ``NaN`` values are considered and they can be
11101 the mode (like for wings).
11103 >>> df.mode(dropna=False)
11104 species legs wings
11105 0 bird 2 NaN
11107 Setting ``numeric_only=True``, only the mode of numeric columns is
11108 computed, and columns of other types are ignored.
11110 >>> df.mode(numeric_only=True)
11111 legs wings
11112 0 2.0 0.0
11113 1 NaN 2.0
11115 To compute the mode over columns and not rows, use the axis parameter:
11117 >>> df.mode(axis='columns', numeric_only=True)
11118 0 1
11119 falcon 2.0 NaN
11120 horse 4.0 NaN
11121 spider 0.0 8.0
11122 ostrich 2.0 NaN
11123 """
11124 data = self if not numeric_only else self._get_numeric_data()
11126 def f(s):
11127 return s.mode(dropna=dropna)
11129 data = data.apply(f, axis=axis)
11130 # Ensure index is type stable (should always use int index)
11131 if data.empty:
11132 data.index = default_index(0)
11134 return data
    # Typing overloads for ``quantile``: a scalar ``q`` yields a Series,
    # a listlike ``q`` yields a Series or DataFrame.  The runtime
    # implementation follows immediately below.
    @overload
    def quantile(
        self,
        q: float = ...,
        axis: Axis = ...,
        numeric_only: bool | lib.NoDefault = ...,
        interpolation: QuantileInterpolation = ...,
    ) -> Series:
        ...

    @overload
    def quantile(
        self,
        q: AnyArrayLike | Sequence[float],
        axis: Axis = ...,
        numeric_only: bool | lib.NoDefault = ...,
        interpolation: QuantileInterpolation = ...,
    ) -> Series | DataFrame:
        ...

    @overload
    def quantile(
        self,
        q: float | AnyArrayLike | Sequence[float] = ...,
        axis: Axis = ...,
        numeric_only: bool | lib.NoDefault = ...,
        interpolation: QuantileInterpolation = ...,
    ) -> Series | DataFrame:
        ...
    def quantile(
        self,
        q: float | AnyArrayLike | Sequence[float] = 0.5,
        axis: Axis = 0,
        numeric_only: bool | lib.NoDefault = no_default,
        interpolation: QuantileInterpolation = "linear",
        method: Literal["single", "table"] = "single",
    ) -> Series | DataFrame:
        """
        Return values at the given quantile over requested axis.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value between 0 <= q <= 1, the quantile(s) to compute.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        numeric_only : bool, default True
            If False, the quantile of datetime and timedelta data will be
            computed as well.

            .. deprecated:: 1.5.0
                The default value of ``numeric_only`` will be ``False`` in a future
                version of pandas.

        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

            * linear: `i + (j - i) * fraction`, where `fraction` is the
              fractional part of the index surrounded by `i` and `j`.
            * lower: `i`.
            * higher: `j`.
            * nearest: `i` or `j` whichever is nearest.
            * midpoint: (`i` + `j`) / 2.
        method : {'single', 'table'}, default 'single'
            Whether to compute quantiles per-column ('single') or over all columns
            ('table'). When 'table', the only allowed interpolation methods are
            'nearest', 'lower', and 'higher'.

        Returns
        -------
        Series or DataFrame

            If ``q`` is an array, a DataFrame will be returned where the
              index is ``q``, the columns are the columns of self, and the
              values are the quantiles.
            If ``q`` is a float, a Series will be returned where the
              index is the columns of self and the values are the quantiles.

        See Also
        --------
        core.window.rolling.Rolling.quantile: Rolling quantile.
        numpy.percentile: Numpy function to compute the percentile.

        Examples
        --------
        >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
        ...                   columns=['a', 'b'])
        >>> df.quantile(.1)
        a    1.3
        b    3.7
        Name: 0.1, dtype: float64
        >>> df.quantile([.1, .5])
               a     b
        0.1  1.3   3.7
        0.5  2.5  55.0

        Specifying `method='table'` will compute the quantile over all columns.

        >>> df.quantile(.1, method="table", interpolation="nearest")
        a    1
        b    1
        Name: 0.1, dtype: int64
        >>> df.quantile([.1, .5], method="table", interpolation="nearest")
             a    b
        0.1  1    1
        0.5  3  100

        Specifying `numeric_only=False` will also compute the quantile of
        datetime and timedelta data.

        >>> df = pd.DataFrame({'A': [1, 2],
        ...                    'B': [pd.Timestamp('2010'),
        ...                          pd.Timestamp('2011')],
        ...                    'C': [pd.Timedelta('1 days'),
        ...                          pd.Timedelta('2 days')]})
        >>> df.quantile(0.5, numeric_only=False)
        A                    1.5
        B    2010-07-02 12:00:00
        C        1 days 12:00:00
        Name: 0.5, dtype: object
        """
        validate_percentile(q)
        axis = self._get_axis_number(axis)
        # Warn when the deprecated numeric_only default would actually
        # change the result (i.e. non-numeric columns are present).
        any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes)
        if numeric_only is no_default and any_not_numeric:
            com.deprecate_numeric_only_default(type(self), "quantile")
        numeric_only = com.resolve_numeric_only(numeric_only)

        if not is_list_like(q):
            # BlockManager.quantile expects listlike, so we wrap and unwrap here
            # error: List item 0 has incompatible type "Union[float, Union[Union[
            # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
            # expected "float"
            res_df = self.quantile(  # type: ignore[call-overload]
                [q],
                axis=axis,
                numeric_only=numeric_only,
                interpolation=interpolation,
                method=method,
            )
            if method == "single":
                res = res_df.iloc[0]
            else:
                # cannot directly iloc over sparse arrays
                res = res_df.T.iloc[:, 0]
            if axis == 1 and len(self) == 0:
                # GH#41544 try to get an appropriate dtype
                dtype = find_common_type(list(self.dtypes))
                if needs_i8_conversion(dtype):
                    return res.astype(dtype)
            return res

        # Listlike q: result is a DataFrame indexed by q.
        q = Index(q, dtype=np.float64)
        data = self._get_numeric_data() if numeric_only else self

        if axis == 1:
            # Row-wise quantiles are computed on the transpose.
            data = data.T

        if len(data.columns) == 0:
            # GH#23925 _get_numeric_data may have dropped all columns
            cols = Index([], name=self.columns.name)

            dtype = np.float64
            if axis == 1:
                # GH#41544 try to get an appropriate dtype
                cdtype = find_common_type(list(self.dtypes))
                if needs_i8_conversion(cdtype):
                    dtype = cdtype

            res = self._constructor([], index=q, columns=cols, dtype=dtype)
            return res.__finalize__(self, method="quantile")

        valid_method = {"single", "table"}
        if method not in valid_method:
            raise ValueError(
                f"Invalid method: {method}. Method must be in {valid_method}."
            )
        if method == "single":
            # Per-column quantiles, computed block-wise by the manager.
            # error: Argument "qs" to "quantile" of "BlockManager" has incompatible type
            # "Index"; expected "Float64Index"
            res = data._mgr.quantile(
                qs=q, axis=1, interpolation=interpolation  # type: ignore[arg-type]
            )
        elif method == "table":
            # Whole-row quantiles: sort rows lexicographically by all
            # columns and pick the row at each quantile position.
            valid_interpolation = {"nearest", "lower", "higher"}
            if interpolation not in valid_interpolation:
                raise ValueError(
                    f"Invalid interpolation: {interpolation}. "
                    f"Interpolation must be in {valid_interpolation}"
                )
            # handle degenerate case
            if len(data) == 0:
                if data.ndim == 2:
                    dtype = find_common_type(list(self.dtypes))
                else:
                    dtype = self.dtype
                return self._constructor([], index=q, columns=data.columns, dtype=dtype)

            # Positions (row indices) corresponding to each requested quantile.
            q_idx = np.quantile(  # type: ignore[call-overload]
                np.arange(len(data)), q, **{np_percentile_argname: interpolation}
            )

            by = data.columns
            if len(by) > 1:
                keys = [data._get_label_or_level_values(x) for x in by]
                indexer = lexsort_indexer(keys)
            else:
                by = by[0]
                k = data._get_label_or_level_values(by)  # type: ignore[arg-type]
                indexer = nargsort(k)

            res = data._mgr.take(indexer[q_idx], verify=False)
            res.axes[1] = q

        result = self._constructor(res)
        return result.__finalize__(self, method="quantile")
11355 @doc(NDFrame.asfreq, **_shared_doc_kwargs)
11356 def asfreq(
11357 self,
11358 freq: Frequency,
11359 method: FillnaOptions | None = None,
11360 how: str | None = None,
11361 normalize: bool = False,
11362 fill_value: Hashable = None,
11363 ) -> DataFrame:
11364 return super().asfreq(
11365 freq=freq,
11366 method=method,
11367 how=how,
11368 normalize=normalize,
11369 fill_value=fill_value,
11370 )
11372 @doc(NDFrame.resample, **_shared_doc_kwargs)
11373 def resample(
11374 self,
11375 rule,
11376 axis: Axis = 0,
11377 closed: str | None = None,
11378 label: str | None = None,
11379 convention: str = "start",
11380 kind: str | None = None,
11381 loffset=None,
11382 base: int | None = None,
11383 on: Level = None,
11384 level: Level = None,
11385 origin: str | TimestampConvertibleTypes = "start_day",
11386 offset: TimedeltaConvertibleTypes | None = None,
11387 group_keys: bool | lib.NoDefault = no_default,
11388 ) -> Resampler:
11389 return super().resample(
11390 rule=rule,
11391 axis=axis,
11392 closed=closed,
11393 label=label,
11394 convention=convention,
11395 kind=kind,
11396 loffset=loffset,
11397 base=base,
11398 on=on,
11399 level=level,
11400 origin=origin,
11401 offset=offset,
11402 group_keys=group_keys,
11403 )
11405 def to_timestamp(
11406 self,
11407 freq: Frequency | None = None,
11408 how: str = "start",
11409 axis: Axis = 0,
11410 copy: bool = True,
11411 ) -> DataFrame:
11412 """
11413 Cast to DatetimeIndex of timestamps, at *beginning* of period.
11415 Parameters
11416 ----------
11417 freq : str, default frequency of PeriodIndex
11418 Desired frequency.
11419 how : {'s', 'e', 'start', 'end'}
11420 Convention for converting period to timestamp; start of period
11421 vs. end.
11422 axis : {0 or 'index', 1 or 'columns'}, default 0
11423 The axis to convert (the index by default).
11424 copy : bool, default True
11425 If False then underlying input data is not copied.
11427 Returns
11428 -------
11429 DataFrame with DatetimeIndex
11430 """
11431 new_obj = self.copy(deep=copy)
11433 axis_name = self._get_axis_name(axis)
11434 old_ax = getattr(self, axis_name)
11435 if not isinstance(old_ax, PeriodIndex):
11436 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
11438 new_ax = old_ax.to_timestamp(freq=freq, how=how)
11440 setattr(new_obj, axis_name, new_ax)
11441 return new_obj
11443 def to_period(
11444 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool = True
11445 ) -> DataFrame:
11446 """
11447 Convert DataFrame from DatetimeIndex to PeriodIndex.
11449 Convert DataFrame from DatetimeIndex to PeriodIndex with desired
11450 frequency (inferred from index if not passed).
11452 Parameters
11453 ----------
11454 freq : str, default
11455 Frequency of the PeriodIndex.
11456 axis : {0 or 'index', 1 or 'columns'}, default 0
11457 The axis to convert (the index by default).
11458 copy : bool, default True
11459 If False then underlying input data is not copied.
11461 Returns
11462 -------
11463 DataFrame with PeriodIndex
11465 Examples
11466 --------
11467 >>> idx = pd.to_datetime(
11468 ... [
11469 ... "2001-03-31 00:00:00",
11470 ... "2002-05-31 00:00:00",
11471 ... "2003-08-31 00:00:00",
11472 ... ]
11473 ... )
11475 >>> idx
11476 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
11477 dtype='datetime64[ns]', freq=None)
11479 >>> idx.to_period("M")
11480 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
11482 For the yearly frequency
11484 >>> idx.to_period("Y")
11485 PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]')
11486 """
11487 new_obj = self.copy(deep=copy)
11489 axis_name = self._get_axis_name(axis)
11490 old_ax = getattr(self, axis_name)
11491 if not isinstance(old_ax, DatetimeIndex):
11492 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
11494 new_ax = old_ax.to_period(freq=freq)
11496 setattr(new_obj, axis_name, new_ax)
11497 return new_obj
11499 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
11500 """
11501 Whether each element in the DataFrame is contained in values.
11503 Parameters
11504 ----------
11505 values : iterable, Series, DataFrame or dict
11506 The result will only be true at a location if all the
11507 labels match. If `values` is a Series, that's the index. If
11508 `values` is a dict, the keys must be the column names,
11509 which must match. If `values` is a DataFrame,
11510 then both the index and column labels must match.
11512 Returns
11513 -------
11514 DataFrame
11515 DataFrame of booleans showing whether each element in the DataFrame
11516 is contained in values.
11518 See Also
11519 --------
11520 DataFrame.eq: Equality test for DataFrame.
11521 Series.isin: Equivalent method on Series.
11522 Series.str.contains: Test if pattern or regex is contained within a
11523 string of a Series or Index.
11525 Examples
11526 --------
11527 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
11528 ... index=['falcon', 'dog'])
11529 >>> df
11530 num_legs num_wings
11531 falcon 2 2
11532 dog 4 0
11534 When ``values`` is a list check whether every value in the DataFrame
11535 is present in the list (which animals have 0 or 2 legs or wings)
11537 >>> df.isin([0, 2])
11538 num_legs num_wings
11539 falcon True True
11540 dog False True
11542 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
11544 >>> ~df.isin([0, 2])
11545 num_legs num_wings
11546 falcon False False
11547 dog True False
11549 When ``values`` is a dict, we can pass values to check for each
11550 column separately:
11552 >>> df.isin({'num_wings': [0, 3]})
11553 num_legs num_wings
11554 falcon False False
11555 dog False True
11557 When ``values`` is a Series or DataFrame the index and column must
11558 match. Note that 'falcon' does not match based on the number of legs
11559 in other.
11561 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
11562 ... index=['spider', 'falcon'])
11563 >>> df.isin(other)
11564 num_legs num_wings
11565 falcon False True
11566 dog False False
11567 """
11568 if isinstance(values, dict):
11569 from pandas.core.reshape.concat import concat
11571 values = collections.defaultdict(list, values)
11572 result = concat(
11573 (
11574 self.iloc[:, [i]].isin(values[col])
11575 for i, col in enumerate(self.columns)
11576 ),
11577 axis=1,
11578 )
11579 elif isinstance(values, Series):
11580 if not values.index.is_unique:
11581 raise ValueError("cannot compute isin with a duplicate axis.")
11582 result = self.eq(values.reindex_like(self), axis="index")
11583 elif isinstance(values, DataFrame):
11584 if not (values.columns.is_unique and values.index.is_unique):
11585 raise ValueError("cannot compute isin with a duplicate axis.")
11586 result = self.eq(values.reindex_like(self))
11587 else:
11588 if not is_list_like(values):
11589 raise TypeError(
11590 "only list-like or dict-like objects are allowed "
11591 "to be passed to DataFrame.isin(), "
11592 f"you passed a '{type(values).__name__}'"
11593 )
11594 # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any],
11595 # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray,
11596 # ndarray[Any, Any]], Index, Series]"
11597 result = self._constructor(
11598 algorithms.isin(
11599 self.values.ravel(), values # type: ignore[arg-type]
11600 ).reshape(self.shape),
11601 self.index,
11602 self.columns,
11603 )
11604 return result.__finalize__(self, method="isin")
    # ----------------------------------------------------------------------
    # Add index and columns
    _AXIS_ORDERS = ["index", "columns"]
    # Extend the base mapping so that 1 / "columns" resolve to axis 1.
    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
        **NDFrame._AXIS_TO_AXIS_NUMBER,
        1: 1,
        "columns": 1,
    }
    _AXIS_LEN = len(_AXIS_ORDERS)
    _info_axis_number = 1
    _info_axis_name = "columns"

    # NOTE(review): the AxisProperty axis numbers here (index -> 1,
    # columns -> 0) appear to refer to the internal manager layout rather
    # than the user-facing axis numbering — confirm against
    # pandas._libs.properties.AxisProperty before changing.
    index = properties.AxisProperty(
        axis=1, doc="The index (row labels) of the DataFrame."
    )
    columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.")

    @property
    def _AXIS_NUMBERS(self) -> dict[str, int]:
        """.. deprecated:: 1.1.0"""
        # Parent property accessed and discarded — presumably to emit the
        # base-class deprecation warning; verify in NDFrame.
        super()._AXIS_NUMBERS
        return {"index": 0, "columns": 1}

    @property
    def _AXIS_NAMES(self) -> dict[int, str]:
        """.. deprecated:: 1.1.0"""
        # Same side-effect-only parent access as _AXIS_NUMBERS above.
        super()._AXIS_NAMES
        return {0: "index", 1: "columns"}

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
    hist = pandas.plotting.hist_frame
    boxplot = pandas.plotting.boxplot_frame
    sparse = CachedAccessor("sparse", SparseFrameAccessor)
11642 # ----------------------------------------------------------------------
11643 # Internal Interface Methods
11645 def _to_dict_of_blocks(self, copy: bool = True):
11646 """
11647 Return a dict of dtype -> Constructor Types that
11648 each is a homogeneous dtype.
11650 Internal ONLY - only works for BlockManager
11651 """
11652 mgr = self._mgr
11653 # convert to BlockManager if needed -> this way support ArrayManager as well
11654 mgr = mgr_to_mgr(mgr, "block")
11655 mgr = cast(BlockManager, mgr)
11656 return {
11657 k: self._constructor(v).__finalize__(self)
11658 for k, v, in mgr.to_dict(copy=copy).items()
11659 }
    @property
    def values(self) -> np.ndarray:
        """
        Return a Numpy representation of the DataFrame.

        .. warning::

           We recommend using :meth:`DataFrame.to_numpy` instead.

        Only the values in the DataFrame will be returned, the axes labels
        will be removed.

        Returns
        -------
        numpy.ndarray
            The values of the DataFrame.

        See Also
        --------
        DataFrame.to_numpy : Recommended alternative to this method.
        DataFrame.index : Retrieve the index labels.
        DataFrame.columns : Retrieving the column names.

        Notes
        -----
        The dtype will be a lower-common-denominator dtype (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen. Use this
        with care if you are not dealing with the blocks.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32.  If dtypes are int32 and uint8, dtype will be upcast to
        int32. By :func:`numpy.find_common_type` convention, mixing int64
        and uint64 will result in a float64 dtype.

        Examples
        --------
        A DataFrame where all columns are the same type (e.g., int64) results
        in an array of the same type.

        >>> df = pd.DataFrame({'age':    [ 3,  29],
        ...                    'height': [94, 170],
        ...                    'weight': [31, 115]})
        >>> df
           age  height  weight
        0    3      94      31
        1   29     170     115
        >>> df.dtypes
        age       int64
        height    int64
        weight    int64
        dtype: object
        >>> df.values
        array([[  3,  94,  31],
               [ 29, 170, 115]])

        A DataFrame with mixed type columns(e.g., str/object, int64, float32)
        results in an ndarray of the broadest type that accommodates these
        mixed types (e.g., object).

        >>> df2 = pd.DataFrame([('parrot',   24.0, 'second'),
        ...                     ('lion',     80.5, 1),
        ...                     ('monkey', np.nan, None)],
        ...                    columns=('name', 'max_speed', 'rank'))
        >>> df2.dtypes
        name          object
        max_speed    float64
        rank          object
        dtype: object
        >>> df2.values
        array([['parrot', 24.0, 'second'],
               ['lion', 80.5, 1],
               ['monkey', nan, None]], dtype=object)
        """
        # Consolidate blocks in place before materializing the 2D array.
        self._consolidate_inplace()
        return self._mgr.as_array()
    # Typing overloads for ``ffill``: inplace=True returns None, otherwise a
    # DataFrame.  The runtime implementation follows below.
    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame | None:
        ...
11771 # error: Signature of "ffill" incompatible with supertype "NDFrame"
11772 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
11773 def ffill( # type: ignore[override]
11774 self,
11775 axis: None | Axis = None,
11776 inplace: bool = False,
11777 limit: None | int = None,
11778 downcast: dict | None = None,
11779 ) -> DataFrame | None:
11780 return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
    # Typing overloads for ``bfill``: inplace=True returns None, otherwise a
    # DataFrame.  The runtime implementation follows below.
    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast=...,
    ) -> None:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame | None:
        ...
11815 # error: Signature of "bfill" incompatible with supertype "NDFrame"
11816 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
11817 def bfill( # type: ignore[override]
11818 self,
11819 axis: None | Axis = None,
11820 inplace: bool = False,
11821 limit: None | int = None,
11822 downcast=None,
11823 ) -> DataFrame | None:
11824 return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
    @deprecate_nonkeyword_arguments(
        version=None, allowed_args=["self", "lower", "upper"]
    )
    def clip(
        self: DataFrame,
        lower: float | None = None,
        upper: float | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        *args,
        **kwargs,
    ) -> DataFrame | None:
        # Thin delegation to NDFrame.clip.  Arguments are forwarded
        # positionally because the parent also accepts *args/**kwargs
        # (presumably for numpy-signature compatibility — verify in
        # NDFrame.clip before reshaping this call).
        return super().clip(lower, upper, axis, inplace, *args, **kwargs)
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"])
    def interpolate(
        self: DataFrame,
        method: str = "linear",
        axis: Axis = 0,
        limit: int | None = None,
        inplace: bool = False,
        limit_direction: str | None = None,
        limit_area: str | None = None,
        downcast: str | None = None,
        **kwargs,
    ) -> DataFrame | None:
        # Thin delegation to NDFrame.interpolate; arguments are forwarded
        # positionally in the parent's parameter order.
        return super().interpolate(
            method,
            axis,
            limit,
            inplace,
            limit_direction,
            limit_area,
            downcast,
            **kwargs,
        )
    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> DataFrame:
        # Typing overload: inplace=False (the default) returns a new DataFrame.
        ...
    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> None:
        # Typing overload: inplace=True mutates self and returns None.
        ...
    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> DataFrame | None:
        # Typing overload: inplace not statically known -> DataFrame | None.
        ...
11905 # error: Signature of "where" incompatible with supertype "NDFrame"
11906 @deprecate_kwarg(old_arg_name="errors", new_arg_name=None)
11907 @deprecate_nonkeyword_arguments(
11908 version=None, allowed_args=["self", "cond", "other"]
11909 )
11910 def where( # type: ignore[override]
11911 self,
11912 cond,
11913 other=lib.no_default,
11914 inplace: bool = False,
11915 axis: Axis | None = None,
11916 level: Level = None,
11917 errors: IgnoreRaise | lib.NoDefault = "raise",
11918 try_cast: bool | lib.NoDefault = lib.no_default,
11919 ) -> DataFrame | None:
11920 return super().where(
11921 cond,
11922 other,
11923 inplace=inplace,
11924 axis=axis,
11925 level=level,
11926 try_cast=try_cast,
11927 )
    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> DataFrame:
        # Typing overload: inplace=False (the default) returns a new DataFrame.
        ...
    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> None:
        # Typing overload: inplace=True mutates self and returns None.
        ...
    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> DataFrame | None:
        # Typing overload: inplace not statically known -> DataFrame | None.
        ...
11971 # error: Signature of "mask" incompatible with supertype "NDFrame"
11972 @deprecate_kwarg(old_arg_name="errors", new_arg_name=None)
11973 @deprecate_nonkeyword_arguments(
11974 version=None, allowed_args=["self", "cond", "other"]
11975 )
11976 def mask( # type: ignore[override]
11977 self,
11978 cond,
11979 other=np.nan,
11980 inplace: bool = False,
11981 axis: Axis | None = None,
11982 level: Level = None,
11983 errors: IgnoreRaise | lib.NoDefault = "raise",
11984 try_cast: bool | lib.NoDefault = lib.no_default,
11985 ) -> DataFrame | None:
11986 return super().mask(
11987 cond,
11988 other,
11989 inplace=inplace,
11990 axis=axis,
11991 level=level,
11992 try_cast=try_cast,
11993 )
# Import-time class patching:
# NOTE(review): _add_numeric_operations is an NDFrame hook defined elsewhere;
# presumably it installs the numeric reduction methods (sum, mean, ...) on
# DataFrame — confirm in generic.py.
DataFrame._add_numeric_operations()

# Install the flex arithmetic methods (add, sub, mul, ... variants) on
# DataFrame via the shared ops machinery.
ops.add_flex_arithmetic_methods(DataFrame)
12001def _from_nested_dict(data) -> collections.defaultdict:
12002 new_data: collections.defaultdict = collections.defaultdict(dict)
12003 for index, s in data.items():
12004 for col, v in s.items():
12005 new_data[col][index] = v
12006 return new_data
12009def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
12010 # reindex if necessary
12012 if value.index.equals(index) or not len(index):
12013 return value._values.copy()
12015 # GH#4107
12016 try:
12017 reindexed_value = value.reindex(index)._values
12018 except ValueError as err:
12019 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
12020 if not value.index.is_unique:
12021 # duplicate axis
12022 raise err
12024 raise TypeError(
12025 "incompatible index of inserted column with frame index"
12026 ) from err
12027 return reindexed_value