Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/frame.py: 14%
2312 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2DataFrame
3---------
4An efficient 2D container for potentially mixed-type time series or other
5labeled data series.
7Similar to its R counterpart, data.frame, except providing automatic data
8alignment and a host of useful data manipulation methods having to do with the
9labeling information
10"""
11from __future__ import annotations
13import collections
14from collections import abc
15import datetime
16import functools
17from io import StringIO
18import itertools
19from textwrap import dedent
20from typing import (
21 TYPE_CHECKING,
22 Any,
23 Callable,
24 Hashable,
25 Iterable,
26 Iterator,
27 Literal,
28 Mapping,
29 Sequence,
30 cast,
31 overload,
32)
33import warnings
35import numpy as np
36import numpy.ma as ma
38from pandas._config import get_option
40from pandas._libs import (
41 algos as libalgos,
42 lib,
43 properties,
44)
45from pandas._libs.hashtable import duplicated
46from pandas._libs.lib import (
47 NoDefault,
48 no_default,
49)
50from pandas._typing import (
51 AggFuncType,
52 AnyArrayLike,
53 ArrayLike,
54 Axes,
55 Axis,
56 ColspaceArgType,
57 CompressionOptions,
58 Dtype,
59 DtypeObj,
60 FilePath,
61 FillnaOptions,
62 FloatFormatType,
63 FormattersType,
64 Frequency,
65 IgnoreRaise,
66 IndexKeyFunc,
67 IndexLabel,
68 Level,
69 NaPosition,
70 PythonFuncType,
71 QuantileInterpolation,
72 ReadBuffer,
73 Renamer,
74 Scalar,
75 SortKind,
76 StorageOptions,
77 Suffixes,
78 TimedeltaConvertibleTypes,
79 TimestampConvertibleTypes,
80 ValueKeyFunc,
81 WriteBuffer,
82 npt,
83)
84from pandas.compat._optional import import_optional_dependency
85from pandas.compat.numpy import (
86 function as nv,
87 np_percentile_argname,
88)
89from pandas.errors import InvalidIndexError
90from pandas.util._decorators import (
91 Appender,
92 Substitution,
93 deprecate_kwarg,
94 deprecate_nonkeyword_arguments,
95 doc,
96 rewrite_axis_style_signature,
97)
98from pandas.util._exceptions import find_stack_level
99from pandas.util._validators import (
100 validate_ascending,
101 validate_axis_style_args,
102 validate_bool_kwarg,
103 validate_percentile,
104)
106from pandas.core.dtypes.cast import (
107 can_hold_element,
108 construct_1d_arraylike_from_scalar,
109 construct_2d_arraylike_from_scalar,
110 find_common_type,
111 infer_dtype_from_scalar,
112 invalidate_string_dtypes,
113 maybe_box_native,
114 maybe_downcast_to_dtype,
115)
116from pandas.core.dtypes.common import (
117 ensure_platform_int,
118 infer_dtype_from_object,
119 is_1d_only_ea_dtype,
120 is_bool_dtype,
121 is_dataclass,
122 is_datetime64_any_dtype,
123 is_dict_like,
124 is_dtype_equal,
125 is_extension_array_dtype,
126 is_float,
127 is_float_dtype,
128 is_hashable,
129 is_integer,
130 is_integer_dtype,
131 is_iterator,
132 is_list_like,
133 is_numeric_dtype,
134 is_object_dtype,
135 is_scalar,
136 is_sequence,
137 needs_i8_conversion,
138 pandas_dtype,
139)
140from pandas.core.dtypes.dtypes import ExtensionDtype
141from pandas.core.dtypes.missing import (
142 isna,
143 notna,
144)
146from pandas.core import (
147 algorithms,
148 common as com,
149 nanops,
150 ops,
151)
152from pandas.core.accessor import CachedAccessor
153from pandas.core.apply import (
154 reconstruct_func,
155 relabel_result,
156)
157from pandas.core.array_algos.take import take_2d_multi
158from pandas.core.arraylike import OpsMixin
159from pandas.core.arrays import (
160 DatetimeArray,
161 ExtensionArray,
162 PeriodArray,
163 TimedeltaArray,
164)
165from pandas.core.arrays.sparse import SparseFrameAccessor
166from pandas.core.construction import (
167 extract_array,
168 sanitize_array,
169 sanitize_masked_array,
170)
171from pandas.core.generic import NDFrame
172from pandas.core.indexers import check_key_length
173from pandas.core.indexes.api import (
174 DatetimeIndex,
175 Index,
176 PeriodIndex,
177 default_index,
178 ensure_index,
179 ensure_index_from_sequences,
180)
181from pandas.core.indexes.multi import (
182 MultiIndex,
183 maybe_droplevels,
184)
185from pandas.core.indexing import (
186 check_bool_indexer,
187 check_deprecated_indexers,
188 convert_to_index_sliceable,
189)
190from pandas.core.internals import (
191 ArrayManager,
192 BlockManager,
193)
194from pandas.core.internals.construction import (
195 arrays_to_mgr,
196 dataclasses_to_dicts,
197 dict_to_mgr,
198 mgr_to_mgr,
199 ndarray_to_mgr,
200 nested_data_to_arrays,
201 rec_array_to_mgr,
202 reorder_arrays,
203 to_arrays,
204 treat_as_nested,
205)
206from pandas.core.reshape.melt import melt
207from pandas.core.series import Series
208from pandas.core.shared_docs import _shared_docs
209from pandas.core.sorting import (
210 get_group_index,
211 lexsort_indexer,
212 nargsort,
213)
215from pandas.io.common import get_handle
216from pandas.io.formats import (
217 console,
218 format as fmt,
219)
220from pandas.io.formats.info import (
221 INFO_DOCSTRING,
222 DataFrameInfo,
223 frame_sub_kwargs,
224)
225import pandas.plotting
227if TYPE_CHECKING: 227 ↛ 229line 227 didn't jump to line 229, because the condition on line 227 was never true
229 from pandas.core.groupby.generic import DataFrameGroupBy
230 from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
231 from pandas.core.internals import SingleDataManager
232 from pandas.core.resample import Resampler
234 from pandas.io.formats.style import Styler
# ---------------------------------------------------------------------
# Docstring templates

# Substitution values injected into the docstrings of many DataFrame methods
# via the @doc/@Substitution decorators.
_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "inplace": """
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by.

    - if `axis` is 0 or `'index'` then `by` may contain index
      levels and/or column labels.
    - if `axis` is 1 or `'columns'` then `by` may contain column
      levels and/or index labels.""",
    "optional_labels": """labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    "optional_axis": """axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}

# Shared description of the ``numeric_only`` parameter.
_numeric_only_doc = """numeric_only : bool or None, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data
"""

# Full docstring for DataFrame.merge / pd.merge; ``%s`` receives the
# left-frame parameter description when used for the module-level function.
_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

A named Series object is treated as a DataFrame with a single named column.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

.. warning::

    If both key columns contain rows where the key is a null value, those
    rows will be matched against each other. This is different from usual SQL
    join behaviour and can lead to unexpected results.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

      .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0
Support for merging named Series objects was added in version 0.24.0

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1
      a  b
0   foo  1
1   bar  2
>>> df2
      a  c
0   foo  3
1   baz  4

>>> df1.merge(df2, how='inner', on='a')
      a  b  c
0   foo  1  3

>>> df1.merge(df2, how='left', on='a')
      a  b  c
0   foo  1  3.0
1   bar  2  NaN

>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
>>> df2 = pd.DataFrame({'right': [7, 8]})
>>> df1
    left
0   foo
1   bar
>>> df2
    right
0   7
1   8

>>> df1.merge(df2, how='cross')
   left  right
0   foo      7
1   foo      8
2   bar      7
3   bar      8
"""
470# -----------------------------------------------------------------------
471# DataFrame class
class DataFrame(NDFrame, OpsMixin):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, dataclass or list-like objects. If
        data is a dict, column order follows insertion-order. If a dict contains Series
        which have an index defined, it is aligned by its index.

        .. versionchanged:: 0.25.0
           If data is a list of dicts, column order follows insertion-order.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided.
    columns : Index or array-like
        Column labels to use for resulting frame when data does not have them,
        defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
        will perform column selection instead.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool or None, default None
        Copy data from inputs.
        For dict data, the default of None behaves like ``copy=True``. For DataFrame
        or 2d ndarray input, the default of None behaves like ``copy=False``.
        If data is a dict containing one or more Series (possibly of different dtypes),
        ``copy=False`` will ensure that these inputs are not copied.

        .. versionchanged:: 1.3.0

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_table : Read general delimited file into DataFrame.
    read_clipboard : Read text from clipboard into DataFrame.

    Notes
    -----
    Please reference the :ref:`User Guide <basics.dataframe>` for more information.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from a dictionary including Series:

    >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
       col1  col2
    0     0   NaN
    1     1   NaN
    2     2   2.0
    3     3   3.0

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9

    Constructing DataFrame from a numpy ndarray that has labeled columns:

    >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
    ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
    >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
    ...
    >>> df3
       c  a
    0  3  1
    1  6  4
    2  9  7

    Constructing DataFrame from dataclass:

    >>> from dataclasses import make_dataclass
    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
       x  y
    0  0  0
    1  0  3
    2  2  3
    """
    # DataFrame-specific internal attribute names, in addition to those
    # inherited from NDFrame.
    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
    _typ = "dataframe"
    # Types handled by the OpsMixin/array-ufunc machinery.
    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
    # Accessor namespaces registered on the class (e.g. ``df.sparse``).
    _accessors: set[str] = {"sparse"}
    _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
    _mgr: BlockManager | ArrayManager  # the internal data manager

    @property
    def _constructor(self) -> Callable[..., DataFrame]:
        # Class used to build new 2D results from operations on this object.
        return DataFrame

    # Class used when an operation reduces to 1D (a single row or column).
    _constructor_sliced: Callable[..., Series] = Series
605 # ----------------------------------------------------------------------
606 # Constructors
    def __init__(
        self,
        data=None,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
        copy: bool | None = None,
    ) -> None:
        # Dispatch on the type of ``data`` (manager, DataFrame, dict, masked
        # array, ndarray/Series/Index/EA, generic list-like, or scalar) and
        # convert it into an internal manager (``mgr``) for NDFrame.__init__.
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            # Share (or later copy) the other frame's internal manager.
            data = data._mgr

        if isinstance(data, (BlockManager, ArrayManager)):
            # first check if a Manager is passed without any other arguments
            # -> use fastpath (without checking Manager type)
            if index is None and columns is None and dtype is None and not copy:
                # GH#33357 fastpath
                NDFrame.__init__(self, data)
                return

        manager = get_option("mode.data_manager")

        # GH47215: sets are unordered, so they are not valid axis input.
        if index is not None and isinstance(index, set):
            raise ValueError("index cannot be a set")
        if columns is not None and isinstance(columns, set):
            raise ValueError("columns cannot be a set")

        if copy is None:
            if isinstance(data, dict):
                # retain pre-GH#38939 default behavior
                copy = True
            elif (
                manager == "array"
                and isinstance(data, (np.ndarray, ExtensionArray))
                and data.ndim == 2
            ):
                # INFO(ArrayManager) by default copy the 2D input array to get
                # contiguous 1D arrays
                copy = True
            else:
                copy = False

        if isinstance(data, (BlockManager, ArrayManager)):
            mgr = self._init_mgr(
                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
            )

        elif isinstance(data, dict):
            # GH#38939 de facto copy defaults to False only in non-dict cases
            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
                warnings.warn(
                    "Support for MaskedRecords is deprecated and will be "
                    "removed in a future version. Pass "
                    "{name: data[name] for name in data.dtype.names} instead.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

            # a masked array
            else:
                # Masked entries become NaN/NaT before normal ndarray handling.
                data = sanitize_masked_array(data)
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
            if data.dtype.names:
                # i.e. numpy structured array
                data = cast(np.ndarray, data)
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
            elif getattr(data, "name", None) is not None:
                # i.e. Series/Index with non-None name
                mgr = dict_to_mgr(
                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
                    # attribute "name"
                    {data.name: data},  # type: ignore[union-attr]
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                )
            else:
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        # For data is list-like, or Iterable (will consume into list)
        elif is_list_like(data):
            if not isinstance(data, (abc.Sequence, ExtensionArray)):
                if hasattr(data, "__array__"):
                    # GH#44616 big perf improvement for e.g. pytorch tensor
                    data = np.asarray(data)
                else:
                    data = list(data)
            if len(data) > 0:
                if is_dataclass(data[0]):
                    data = dataclasses_to_dicts(data)
                if not isinstance(data, np.ndarray) and treat_as_nested(data):
                    # exclude ndarray as we may have cast it a few lines above
                    if columns is not None:
                        columns = ensure_index(columns)
                    arrays, columns, index = nested_data_to_arrays(
                        # error: Argument 3 to "nested_data_to_arrays" has incompatible
                        # type "Optional[Collection[Any]]"; expected "Optional[Index]"
                        data,
                        columns,
                        index,  # type: ignore[arg-type]
                        dtype,
                    )
                    mgr = arrays_to_mgr(
                        arrays,
                        columns,
                        index,
                        dtype=dtype,
                        typ=manager,
                    )
                else:
                    mgr = ndarray_to_mgr(
                        data,
                        index,
                        columns,
                        dtype=dtype,
                        copy=copy,
                        typ=manager,
                    )
            else:
                # Empty list-like -> empty frame (possibly with given axes).
                mgr = dict_to_mgr(
                    {},
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                )
        # For data is scalar
        else:
            # A scalar can only be broadcast when both axes are provided.
            if index is None or columns is None:
                raise ValueError("DataFrame constructor not properly called!")

            index = ensure_index(index)
            columns = ensure_index(columns)

            if not dtype:
                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

            # For data is a scalar extension dtype
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): special case not needed with 2D EAs

                values = [
                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
                    for _ in range(len(columns))
                ]
                mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
            else:
                arr2d = construct_2d_arraylike_from_scalar(
                    data,
                    len(index),
                    len(columns),
                    dtype,
                    copy,
                )

                mgr = ndarray_to_mgr(
                    arr2d,
                    index,
                    columns,
                    dtype=arr2d.dtype,
                    copy=False,
                    typ=manager,
                )

        # ensure correct Manager type according to settings
        mgr = mgr_to_mgr(mgr, typ=manager)

        NDFrame.__init__(self, mgr)
820 # ----------------------------------------------------------------------
821 def __dataframe__(
822 self, nan_as_null: bool = False, allow_copy: bool = True
823 ) -> DataFrameXchg:
824 """
825 Return the dataframe interchange object implementing the interchange protocol.
827 Parameters
828 ----------
829 nan_as_null : bool, default False
830 Whether to tell the DataFrame to overwrite null values in the data
831 with ``NaN`` (or ``NaT``).
832 allow_copy : bool, default True
833 Whether to allow memory copying when exporting. If set to False
834 it would cause non-zero-copy exports to fail.
836 Returns
837 -------
838 DataFrame interchange object
839 The object which consuming library can use to ingress the dataframe.
841 Notes
842 -----
843 Details on the interchange protocol:
844 https://data-apis.org/dataframe-protocol/latest/index.html
846 `nan_as_null` currently has no effect; once support for nullable extension
847 dtypes is added, this value should be propagated to columns.
848 """
850 from pandas.core.interchange.dataframe import PandasDataFrameXchg
852 return PandasDataFrameXchg(self, nan_as_null, allow_copy)
854 # ----------------------------------------------------------------------
    @property
    def axes(self) -> list[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        # Row labels first, column labels second.
        return [self.index, self.columns]
873 @property
874 def shape(self) -> tuple[int, int]:
875 """
876 Return a tuple representing the dimensionality of the DataFrame.
878 See Also
879 --------
880 ndarray.shape : Tuple of array dimensions.
882 Examples
883 --------
884 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
885 >>> df.shape
886 (2, 2)
888 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
889 ... 'col3': [5, 6]})
890 >>> df.shape
891 (2, 3)
892 """
893 return len(self.index), len(self.columns)
895 @property
896 def _is_homogeneous_type(self) -> bool:
897 """
898 Whether all the columns in a DataFrame have the same type.
900 Returns
901 -------
902 bool
904 See Also
905 --------
906 Index._is_homogeneous_type : Whether the object has a single
907 dtype.
908 MultiIndex._is_homogeneous_type : Whether all the levels of a
909 MultiIndex have the same dtype.
911 Examples
912 --------
913 >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
914 True
915 >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
916 False
918 Items with the same type but different sizes are considered
919 different types.
921 >>> DataFrame({
922 ... "A": np.array([1, 2], dtype=np.int32),
923 ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
924 False
925 """
926 if isinstance(self._mgr, ArrayManager):
927 return len({arr.dtype for arr in self._mgr.arrays}) == 1
928 if self._mgr.any_extension_types:
929 return len({block.dtype for block in self._mgr.blocks}) == 1
930 else:
931 return not self._is_mixed_type
933 @property
934 def _can_fast_transpose(self) -> bool:
935 """
936 Can we transpose this DataFrame without creating any new array objects.
937 """
938 if isinstance(self._mgr, ArrayManager):
939 return False
940 blocks = self._mgr.blocks
941 if len(blocks) != 1:
942 return False
944 dtype = blocks[0].dtype
945 # TODO(EA2D) special case would be unnecessary with 2D EAs
946 return not is_1d_only_ea_dtype(dtype)
    # error: Return type "Union[ndarray, DatetimeArray, TimedeltaArray]" of
    # "_values" incompatible with return type "ndarray" in supertype "NDFrame"
    @property
    def _values(  # type: ignore[override]
        self,
    ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
        """
        Analogue to ._values that may return a 2D ExtensionArray.

        Falls back to ``self.values`` (which may copy/upcast) unless the data
        is backed by exactly one 2D block/array, in which case the underlying
        storage is returned directly (transposed for BlockManager layout).
        """
        self._consolidate_inplace()

        mgr = self._mgr

        if isinstance(mgr, ArrayManager):
            if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
                # has no attribute "reshape"
                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
            return self.values

        blocks = mgr.blocks
        if len(blocks) != 1:
            return self.values

        arr = blocks[0].values
        if arr.ndim == 1:
            # non-2D ExtensionArray
            return self.values

        # more generally, whatever we allow in NDArrayBackedExtensionBlock
        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
        return arr.T
981 # ----------------------------------------------------------------------
982 # Rendering Methods
984 def _repr_fits_vertical_(self) -> bool:
985 """
986 Check length against max_rows.
987 """
988 max_rows = get_option("display.max_rows")
989 return len(self) <= max_rows
    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or width is None or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:  # unlimited rows
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width
1044 def _info_repr(self) -> bool:
1045 """
1046 True if the repr should show the info view.
1047 """
1048 info_repr_option = get_option("display.large_repr") == "info"
1049 return info_repr_option and not (
1050 self._repr_fits_horizontal_() and self._repr_fits_vertical_()
1051 )
1053 def __repr__(self) -> str:
1054 """
1055 Return a string representation for a particular DataFrame.
1056 """
1057 if self._info_repr():
1058 buf = StringIO()
1059 self.info(buf=buf)
1060 return buf.getvalue()
1062 repr_params = fmt.get_dataframe_repr_params()
1063 return self.to_string(**repr_params)
1065 def _repr_html_(self) -> str | None:
1066 """
1067 Return a html representation for a particular DataFrame.
1069 Mainly for IPython notebook.
1070 """
1071 if self._info_repr():
1072 buf = StringIO()
1073 self.info(buf=buf)
1074 # need to escape the <class>, should be the first line.
1075 val = buf.getvalue().replace("<", r"<", 1)
1076 val = val.replace(">", r">", 1)
1077 return "<pre>" + val + "</pre>"
1079 if get_option("display.notebook_repr_html"):
1080 max_rows = get_option("display.max_rows")
1081 min_rows = get_option("display.min_rows")
1082 max_cols = get_option("display.max_columns")
1083 show_dimensions = get_option("display.show_dimensions")
1085 formatter = fmt.DataFrameFormatter(
1086 self,
1087 columns=None,
1088 col_space=None,
1089 na_rep="NaN",
1090 formatters=None,
1091 float_format=None,
1092 sparsify=None,
1093 justify=None,
1094 index_names=True,
1095 header=True,
1096 index=True,
1097 bold_rows=True,
1098 escape=True,
1099 max_rows=max_rows,
1100 min_rows=min_rows,
1101 max_cols=max_cols,
1102 show_dimensions=show_dimensions,
1103 decimal=".",
1104 )
1105 return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
1106 else:
1107 return None
    # typing overload: with the default ``buf=None`` the rendered table is
    # returned as a ``str``.
    @overload
    def to_string(
        self,
        buf: None = ...,
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> str:
        ...
    # Overload: when ``buf`` is a path or writable buffer the output is
    # written there and the method returns ``None``.
    @overload
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> None:
        ...
1159 @Substitution(
1160 header_type="bool or sequence of str",
1161 header="Write out the column names. If a list of strings "
1162 "is given, it is assumed to be aliases for the "
1163 "column names",
1164 col_space_type="int, list or dict of int",
1165 col_space="The minimum width of each column. If a list of ints is given "
1166 "every integers corresponds with one column. If a dict is given, the key "
1167 "references the column, while the value defines the space to use.",
1168 )
1169 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
1170 def to_string(
1171 self,
1172 buf: FilePath | WriteBuffer[str] | None = None,
1173 columns: Sequence[str] | None = None,
1174 col_space: int | list[int] | dict[Hashable, int] | None = None,
1175 header: bool | Sequence[str] = True,
1176 index: bool = True,
1177 na_rep: str = "NaN",
1178 formatters: fmt.FormattersType | None = None,
1179 float_format: fmt.FloatFormatType | None = None,
1180 sparsify: bool | None = None,
1181 index_names: bool = True,
1182 justify: str | None = None,
1183 max_rows: int | None = None,
1184 max_cols: int | None = None,
1185 show_dimensions: bool = False,
1186 decimal: str = ".",
1187 line_width: int | None = None,
1188 min_rows: int | None = None,
1189 max_colwidth: int | None = None,
1190 encoding: str | None = None,
1191 ) -> str | None:
1192 """
1193 Render a DataFrame to a console-friendly tabular output.
1194 %(shared_params)s
1195 line_width : int, optional
1196 Width to wrap a line in characters.
1197 min_rows : int, optional
1198 The number of rows to display in the console in a truncated repr
1199 (when number of rows is above `max_rows`).
1200 max_colwidth : int, optional
1201 Max width to truncate each column in characters. By default, no limit.
1203 .. versionadded:: 1.0.0
1204 encoding : str, default "utf-8"
1205 Set character encoding.
1207 .. versionadded:: 1.0
1208 %(returns)s
1209 See Also
1210 --------
1211 to_html : Convert DataFrame to HTML.
1213 Examples
1214 --------
1215 >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
1216 >>> df = pd.DataFrame(d)
1217 >>> print(df.to_string())
1218 col1 col2
1219 0 1 4
1220 1 2 5
1221 2 3 6
1222 """
1223 from pandas import option_context
1225 with option_context("display.max_colwidth", max_colwidth):
1226 formatter = fmt.DataFrameFormatter(
1227 self,
1228 columns=columns,
1229 col_space=col_space,
1230 na_rep=na_rep,
1231 formatters=formatters,
1232 float_format=float_format,
1233 sparsify=sparsify,
1234 justify=justify,
1235 index_names=index_names,
1236 header=header,
1237 index=index,
1238 min_rows=min_rows,
1239 max_rows=max_rows,
1240 max_cols=max_cols,
1241 show_dimensions=show_dimensions,
1242 decimal=decimal,
1243 )
1244 return fmt.DataFrameRenderer(formatter).to_string(
1245 buf=buf,
1246 encoding=encoding,
1247 line_width=line_width,
1248 )
1250 # ----------------------------------------------------------------------
1252 @property
1253 def style(self) -> Styler:
1254 """
1255 Returns a Styler object.
1257 Contains methods for building a styled HTML representation of the DataFrame.
1259 See Also
1260 --------
1261 io.formats.style.Styler : Helps style a DataFrame or Series according to the
1262 data with HTML and CSS.
1263 """
1264 from pandas.io.formats.style import Styler
1266 return Styler(self)
    # Shared docstring for ``DataFrame.items``; attached to the method below
    # via the @Appender decorator so it can be reused elsewhere.
    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                   'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
            species   population
    panda    bear      1864
    polar    bear      22000
    koala    marsupial 80000
    >>> for label, content in df.items():
    ...     print(f'label: {label}')
    ...     print(f'content: {content}', sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """
1318 @Appender(_shared_docs["items"])
1319 def items(self) -> Iterable[tuple[Hashable, Series]]:
1320 if self.columns.is_unique and hasattr(self, "_item_cache"):
1321 for k in self.columns:
1322 yield k, self._get_item_cache(k)
1323 else:
1324 for i, k in enumerate(self.columns):
1325 yield k, self._ixs(i, axis=1)
1327 _shared_docs[
1328 "iteritems"
1329 ] = r"""
1330 Iterate over (column name, Series) pairs.
1332 .. deprecated:: 1.5.0
1333 iteritems is deprecated and will be removed in a future version.
1334 Use .items instead.
1336 Iterates over the DataFrame columns, returning a tuple with
1337 the column name and the content as a Series.
1339 Yields
1340 ------
1341 label : object
1342 The column names for the DataFrame being iterated over.
1343 content : Series
1344 The column entries belonging to each label, as a Series.
1346 See Also
1347 --------
1348 DataFrame.iter : Recommended alternative.
1349 DataFrame.iterrows : Iterate over DataFrame rows as
1350 (index, Series) pairs.
1351 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
1352 of the values.
1353 """
1355 @Appender(_shared_docs["iteritems"])
1356 def iteritems(self) -> Iterable[tuple[Hashable, Series]]:
1357 warnings.warn(
1358 "iteritems is deprecated and will be removed in a future version. "
1359 "Use .items instead.",
1360 FutureWarning,
1361 stacklevel=find_stack_level(),
1362 )
1363 yield from self.items()
1365 def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
1366 """
1367 Iterate over DataFrame rows as (index, Series) pairs.
1369 Yields
1370 ------
1371 index : label or tuple of label
1372 The index of the row. A tuple for a `MultiIndex`.
1373 data : Series
1374 The data of the row as a Series.
1376 See Also
1377 --------
1378 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
1379 DataFrame.items : Iterate over (column name, Series) pairs.
1381 Notes
1382 -----
1383 1. Because ``iterrows`` returns a Series for each row,
1384 it does **not** preserve dtypes across the rows (dtypes are
1385 preserved across columns for DataFrames). For example,
1387 >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
1388 >>> row = next(df.iterrows())[1]
1389 >>> row
1390 int 1.0
1391 float 1.5
1392 Name: 0, dtype: float64
1393 >>> print(row['int'].dtype)
1394 float64
1395 >>> print(df['int'].dtype)
1396 int64
1398 To preserve dtypes while iterating over the rows, it is better
1399 to use :meth:`itertuples` which returns namedtuples of the values
1400 and which is generally faster than ``iterrows``.
1402 2. You should **never modify** something you are iterating over.
1403 This is not guaranteed to work in all cases. Depending on the
1404 data types, the iterator returns a copy and not a view, and writing
1405 to it will have no effect.
1406 """
1407 columns = self.columns
1408 klass = self._constructor_sliced
1409 for k, v in zip(self.index, self.values):
1410 s = klass(v, index=columns, name=k).__finalize__(self)
1411 yield k, s
1413 def itertuples(
1414 self, index: bool = True, name: str | None = "Pandas"
1415 ) -> Iterable[tuple[Any, ...]]:
1416 """
1417 Iterate over DataFrame rows as namedtuples.
1419 Parameters
1420 ----------
1421 index : bool, default True
1422 If True, return the index as the first element of the tuple.
1423 name : str or None, default "Pandas"
1424 The name of the returned namedtuples or None to return regular
1425 tuples.
1427 Returns
1428 -------
1429 iterator
1430 An object to iterate over namedtuples for each row in the
1431 DataFrame with the first field possibly being the index and
1432 following fields being the column values.
1434 See Also
1435 --------
1436 DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
1437 pairs.
1438 DataFrame.items : Iterate over (column name, Series) pairs.
1440 Notes
1441 -----
1442 The column names will be renamed to positional names if they are
1443 invalid Python identifiers, repeated, or start with an underscore.
1445 Examples
1446 --------
1447 >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
1448 ... index=['dog', 'hawk'])
1449 >>> df
1450 num_legs num_wings
1451 dog 4 0
1452 hawk 2 2
1453 >>> for row in df.itertuples():
1454 ... print(row)
1455 ...
1456 Pandas(Index='dog', num_legs=4, num_wings=0)
1457 Pandas(Index='hawk', num_legs=2, num_wings=2)
1459 By setting the `index` parameter to False we can remove the index
1460 as the first element of the tuple:
1462 >>> for row in df.itertuples(index=False):
1463 ... print(row)
1464 ...
1465 Pandas(num_legs=4, num_wings=0)
1466 Pandas(num_legs=2, num_wings=2)
1468 With the `name` parameter set we set a custom name for the yielded
1469 namedtuples:
1471 >>> for row in df.itertuples(name='Animal'):
1472 ... print(row)
1473 ...
1474 Animal(Index='dog', num_legs=4, num_wings=0)
1475 Animal(Index='hawk', num_legs=2, num_wings=2)
1476 """
1477 arrays = []
1478 fields = list(self.columns)
1479 if index:
1480 arrays.append(self.index)
1481 fields.insert(0, "Index")
1483 # use integer indexing because of possible duplicate column names
1484 arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
1486 if name is not None:
1487 # https://github.com/python/mypy/issues/9046
1488 # error: namedtuple() expects a string literal as the first argument
1489 itertuple = collections.namedtuple( # type: ignore[misc]
1490 name, fields, rename=True
1491 )
1492 return map(itertuple._make, zip(*arrays))
1494 # fallback to regular tuples
1495 return zip(*arrays)
1497 def __len__(self) -> int:
1498 """
1499 Returns length of info axis, but here we use the index.
1500 """
1501 return len(self.index)
    # Overloads: ``dot`` narrows its return type on the type of ``other`` —
    # a Series operand yields a Series, everything else yields a DataFrame.
    @overload
    def dot(self, other: Series) -> Series:
        ...

    @overload
    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
        ...
1511 def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
1512 """
1513 Compute the matrix multiplication between the DataFrame and other.
1515 This method computes the matrix product between the DataFrame and the
1516 values of an other Series, DataFrame or a numpy array.
1518 It can also be called using ``self @ other`` in Python >= 3.5.
1520 Parameters
1521 ----------
1522 other : Series, DataFrame or array-like
1523 The other object to compute the matrix product with.
1525 Returns
1526 -------
1527 Series or DataFrame
1528 If other is a Series, return the matrix product between self and
1529 other as a Series. If other is a DataFrame or a numpy.array, return
1530 the matrix product of self and other in a DataFrame of a np.array.
1532 See Also
1533 --------
1534 Series.dot: Similar method for Series.
1536 Notes
1537 -----
1538 The dimensions of DataFrame and other must be compatible in order to
1539 compute the matrix multiplication. In addition, the column names of
1540 DataFrame and the index of other must contain the same values, as they
1541 will be aligned prior to the multiplication.
1543 The dot method for Series computes the inner product, instead of the
1544 matrix product here.
1546 Examples
1547 --------
1548 Here we multiply a DataFrame with a Series.
1550 >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
1551 >>> s = pd.Series([1, 1, 2, 1])
1552 >>> df.dot(s)
1553 0 -4
1554 1 5
1555 dtype: int64
1557 Here we multiply a DataFrame with another DataFrame.
1559 >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
1560 >>> df.dot(other)
1561 0 1
1562 0 1 4
1563 1 2 2
1565 Note that the dot method give the same result as @
1567 >>> df @ other
1568 0 1
1569 0 1 4
1570 1 2 2
1572 The dot method works also if other is an np.array.
1574 >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
1575 >>> df.dot(arr)
1576 0 1
1577 0 1 4
1578 1 2 2
1580 Note how shuffling of the objects does not change the result.
1582 >>> s2 = s.reindex([1, 0, 2, 3])
1583 >>> df.dot(s2)
1584 0 -4
1585 1 5
1586 dtype: int64
1587 """
1588 if isinstance(other, (Series, DataFrame)):
1589 common = self.columns.union(other.index)
1590 if len(common) > len(self.columns) or len(common) > len(other.index):
1591 raise ValueError("matrices are not aligned")
1593 left = self.reindex(columns=common, copy=False)
1594 right = other.reindex(index=common, copy=False)
1595 lvals = left.values
1596 rvals = right._values
1597 else:
1598 left = self
1599 lvals = self.values
1600 rvals = np.asarray(other)
1601 if lvals.shape[1] != rvals.shape[0]:
1602 raise ValueError(
1603 f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
1604 )
1606 if isinstance(other, DataFrame):
1607 return self._constructor(
1608 np.dot(lvals, rvals), index=left.index, columns=other.columns
1609 )
1610 elif isinstance(other, Series):
1611 return self._constructor_sliced(np.dot(lvals, rvals), index=left.index)
1612 elif isinstance(rvals, (np.ndarray, Index)):
1613 result = np.dot(lvals, rvals)
1614 if result.ndim == 2:
1615 return self._constructor(result, index=left.index)
1616 else:
1617 return self._constructor_sliced(result, index=left.index)
1618 else: # pragma: no cover
1619 raise TypeError(f"unsupported type: {type(other)}")
    # Overloads: mirror ``dot`` — a Series operand yields a Series.
    @overload
    def __matmul__(self, other: Series) -> Series:
        ...

    @overload
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        ...
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        # ``df @ other`` delegates to DataFrame.dot; alignment rules and
        # result-type dispatch are documented there.
        return self.dot(other)
1635 def __rmatmul__(self, other) -> DataFrame:
1636 """
1637 Matrix multiplication using binary `@` operator in Python>=3.5.
1638 """
1639 try:
1640 return self.T.dot(np.transpose(other)).T
1641 except ValueError as err:
1642 if "shape mismatch" not in str(err):
1643 raise
1644 # GH#21581 give exception message for original shapes
1645 msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
1646 raise ValueError(msg) from err
1648 # ----------------------------------------------------------------------
1649 # IO methods (to / from other formats)
1651 @classmethod
1652 def from_dict(
1653 cls,
1654 data: dict,
1655 orient: str = "columns",
1656 dtype: Dtype | None = None,
1657 columns: Axes | None = None,
1658 ) -> DataFrame:
1659 """
1660 Construct DataFrame from dict of array-like or dicts.
1662 Creates DataFrame object from dictionary by columns or by index
1663 allowing dtype specification.
1665 Parameters
1666 ----------
1667 data : dict
1668 Of the form {field : array-like} or {field : dict}.
1669 orient : {'columns', 'index', 'tight'}, default 'columns'
1670 The "orientation" of the data. If the keys of the passed dict
1671 should be the columns of the resulting DataFrame, pass 'columns'
1672 (default). Otherwise if the keys should be rows, pass 'index'.
1673 If 'tight', assume a dict with keys ['index', 'columns', 'data',
1674 'index_names', 'column_names'].
1676 .. versionadded:: 1.4.0
1677 'tight' as an allowed value for the ``orient`` argument
1679 dtype : dtype, default None
1680 Data type to force, otherwise infer.
1681 columns : list, default None
1682 Column labels to use when ``orient='index'``. Raises a ValueError
1683 if used with ``orient='columns'`` or ``orient='tight'``.
1685 Returns
1686 -------
1687 DataFrame
1689 See Also
1690 --------
1691 DataFrame.from_records : DataFrame from structured ndarray, sequence
1692 of tuples or dicts, or DataFrame.
1693 DataFrame : DataFrame object creation using constructor.
1694 DataFrame.to_dict : Convert the DataFrame to a dictionary.
1696 Examples
1697 --------
1698 By default the keys of the dict become the DataFrame columns:
1700 >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
1701 >>> pd.DataFrame.from_dict(data)
1702 col_1 col_2
1703 0 3 a
1704 1 2 b
1705 2 1 c
1706 3 0 d
1708 Specify ``orient='index'`` to create the DataFrame using dictionary
1709 keys as rows:
1711 >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
1712 >>> pd.DataFrame.from_dict(data, orient='index')
1713 0 1 2 3
1714 row_1 3 2 1 0
1715 row_2 a b c d
1717 When using the 'index' orientation, the column names can be
1718 specified manually:
1720 >>> pd.DataFrame.from_dict(data, orient='index',
1721 ... columns=['A', 'B', 'C', 'D'])
1722 A B C D
1723 row_1 3 2 1 0
1724 row_2 a b c d
1726 Specify ``orient='tight'`` to create the DataFrame using a 'tight'
1727 format:
1729 >>> data = {'index': [('a', 'b'), ('a', 'c')],
1730 ... 'columns': [('x', 1), ('y', 2)],
1731 ... 'data': [[1, 3], [2, 4]],
1732 ... 'index_names': ['n1', 'n2'],
1733 ... 'column_names': ['z1', 'z2']}
1734 >>> pd.DataFrame.from_dict(data, orient='tight')
1735 z1 x y
1736 z2 1 2
1737 n1 n2
1738 a b 1 3
1739 c 2 4
1740 """
1741 index = None
1742 orient = orient.lower()
1743 if orient == "index":
1744 if len(data) > 0:
1745 # TODO speed up Series case
1746 if isinstance(list(data.values())[0], (Series, dict)):
1747 data = _from_nested_dict(data)
1748 else:
1749 index = list(data.keys())
1750 # error: Incompatible types in assignment (expression has type
1751 # "List[Any]", variable has type "Dict[Any, Any]")
1752 data = list(data.values()) # type: ignore[assignment]
1753 elif orient == "columns" or orient == "tight":
1754 if columns is not None:
1755 raise ValueError(f"cannot use columns parameter with orient='{orient}'")
1756 else: # pragma: no cover
1757 raise ValueError(
1758 f"Expected 'index', 'columns' or 'tight' for orient parameter. "
1759 f"Got '{orient}' instead"
1760 )
1762 if orient != "tight":
1763 return cls(data, index=index, columns=columns, dtype=dtype)
1764 else:
1765 realdata = data["data"]
1767 def create_index(indexlist, namelist):
1768 index: Index
1769 if len(namelist) > 1:
1770 index = MultiIndex.from_tuples(indexlist, names=namelist)
1771 else:
1772 index = Index(indexlist, name=namelist[0])
1773 return index
1775 index = create_index(data["index"], data["index_names"])
1776 columns = create_index(data["columns"], data["column_names"])
1777 return cls(realdata, index=index, columns=columns, dtype=dtype)
1779 def to_numpy(
1780 self,
1781 dtype: npt.DTypeLike | None = None,
1782 copy: bool = False,
1783 na_value: object = lib.no_default,
1784 ) -> np.ndarray:
1785 """
1786 Convert the DataFrame to a NumPy array.
1788 By default, the dtype of the returned array will be the common NumPy
1789 dtype of all types in the DataFrame. For example, if the dtypes are
1790 ``float16`` and ``float32``, the results dtype will be ``float32``.
1791 This may require copying data and coercing values, which may be
1792 expensive.
1794 Parameters
1795 ----------
1796 dtype : str or numpy.dtype, optional
1797 The dtype to pass to :meth:`numpy.asarray`.
1798 copy : bool, default False
1799 Whether to ensure that the returned value is not a view on
1800 another array. Note that ``copy=False`` does not *ensure* that
1801 ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
1802 a copy is made, even if not strictly necessary.
1803 na_value : Any, optional
1804 The value to use for missing values. The default value depends
1805 on `dtype` and the dtypes of the DataFrame columns.
1807 .. versionadded:: 1.1.0
1809 Returns
1810 -------
1811 numpy.ndarray
1813 See Also
1814 --------
1815 Series.to_numpy : Similar method for Series.
1817 Examples
1818 --------
1819 >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
1820 array([[1, 3],
1821 [2, 4]])
1823 With heterogeneous data, the lowest common type will have to
1824 be used.
1826 >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
1827 >>> df.to_numpy()
1828 array([[1. , 3. ],
1829 [2. , 4.5]])
1831 For a mix of numeric and non-numeric types, the output array will
1832 have object dtype.
1834 >>> df['C'] = pd.date_range('2000', periods=2)
1835 >>> df.to_numpy()
1836 array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
1837 [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
1838 """
1839 self._consolidate_inplace()
1840 if dtype is not None:
1841 dtype = np.dtype(dtype)
1842 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
1843 if result.dtype is not dtype:
1844 result = np.array(result, dtype=dtype, copy=False)
1846 return result
1848 @overload
1849 def to_dict(
1850 self,
1851 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
1852 into: type[dict] = ...,
1853 ) -> dict:
1854 ...
1856 @overload
1857 def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
1858 ...
1860 def to_dict(
1861 self,
1862 orient: Literal[
1863 "dict", "list", "series", "split", "tight", "records", "index"
1864 ] = "dict",
1865 into: type[dict] = dict,
1866 ) -> dict | list[dict]:
1867 """
1868 Convert the DataFrame to a dictionary.
1870 The type of the key-value pairs can be customized with the parameters
1871 (see below).
1873 Parameters
1874 ----------
1875 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
1876 Determines the type of the values of the dictionary.
1878 - 'dict' (default) : dict like {column -> {index -> value}}
1879 - 'list' : dict like {column -> [values]}
1880 - 'series' : dict like {column -> Series(values)}
1881 - 'split' : dict like
1882 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
1883 - 'tight' : dict like
1884 {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
1885 'index_names' -> [index.names], 'column_names' -> [column.names]}
1886 - 'records' : list like
1887 [{column -> value}, ... , {column -> value}]
1888 - 'index' : dict like {index -> {column -> value}}
1890 Abbreviations are allowed. `s` indicates `series` and `sp`
1891 indicates `split`.
1893 .. versionadded:: 1.4.0
1894 'tight' as an allowed value for the ``orient`` argument
1896 into : class, default dict
1897 The collections.abc.Mapping subclass used for all Mappings
1898 in the return value. Can be the actual class or an empty
1899 instance of the mapping type you want. If you want a
1900 collections.defaultdict, you must pass it initialized.
1902 Returns
1903 -------
1904 dict, list or collections.abc.Mapping
1905 Return a collections.abc.Mapping object representing the DataFrame.
1906 The resulting transformation depends on the `orient` parameter.
1908 See Also
1909 --------
1910 DataFrame.from_dict: Create a DataFrame from a dictionary.
1911 DataFrame.to_json: Convert a DataFrame to JSON format.
1913 Examples
1914 --------
1915 >>> df = pd.DataFrame({'col1': [1, 2],
1916 ... 'col2': [0.5, 0.75]},
1917 ... index=['row1', 'row2'])
1918 >>> df
1919 col1 col2
1920 row1 1 0.50
1921 row2 2 0.75
1922 >>> df.to_dict()
1923 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
1925 You can specify the return orientation.
1927 >>> df.to_dict('series')
1928 {'col1': row1 1
1929 row2 2
1930 Name: col1, dtype: int64,
1931 'col2': row1 0.50
1932 row2 0.75
1933 Name: col2, dtype: float64}
1935 >>> df.to_dict('split')
1936 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1937 'data': [[1, 0.5], [2, 0.75]]}
1939 >>> df.to_dict('records')
1940 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
1942 >>> df.to_dict('index')
1943 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
1945 >>> df.to_dict('tight')
1946 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1947 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
1949 You can also specify the mapping type.
1951 >>> from collections import OrderedDict, defaultdict
1952 >>> df.to_dict(into=OrderedDict)
1953 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
1954 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
1956 If you want a `defaultdict`, you need to initialize it:
1958 >>> dd = defaultdict(list)
1959 >>> df.to_dict('records', into=dd)
1960 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
1961 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
1962 """
1963 if not self.columns.is_unique:
1964 warnings.warn(
1965 "DataFrame columns are not unique, some columns will be omitted.",
1966 UserWarning,
1967 stacklevel=find_stack_level(),
1968 )
1969 # GH16122
1970 into_c = com.standardize_mapping(into)
1972 # error: Incompatible types in assignment (expression has type "str",
1973 # variable has type "Literal['dict', 'list', 'series', 'split', 'tight',
1974 # 'records', 'index']")
1975 orient = orient.lower() # type: ignore[assignment]
1976 # GH32515
1977 if orient.startswith(("d", "l", "s", "r", "i")) and orient not in {
1978 "dict",
1979 "list",
1980 "series",
1981 "split",
1982 "records",
1983 "index",
1984 }:
1985 warnings.warn(
1986 "Using short name for 'orient' is deprecated. Only the "
1987 "options: ('dict', list, 'series', 'split', 'records', 'index') "
1988 "will be used in a future version. Use one of the above "
1989 "to silence this warning.",
1990 FutureWarning,
1991 stacklevel=find_stack_level(),
1992 )
1994 if orient.startswith("d"):
1995 orient = "dict"
1996 elif orient.startswith("l"):
1997 orient = "list"
1998 elif orient.startswith("sp"):
1999 orient = "split"
2000 elif orient.startswith("s"):
2001 orient = "series"
2002 elif orient.startswith("r"):
2003 orient = "records"
2004 elif orient.startswith("i"):
2005 orient = "index"
2007 if orient == "dict":
2008 return into_c((k, v.to_dict(into)) for k, v in self.items())
2010 elif orient == "list":
2011 return into_c(
2012 (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
2013 )
2015 elif orient == "split":
2016 return into_c(
2017 (
2018 ("index", self.index.tolist()),
2019 ("columns", self.columns.tolist()),
2020 (
2021 "data",
2022 [
2023 list(map(maybe_box_native, t))
2024 for t in self.itertuples(index=False, name=None)
2025 ],
2026 ),
2027 )
2028 )
2030 elif orient == "tight":
2031 return into_c(
2032 (
2033 ("index", self.index.tolist()),
2034 ("columns", self.columns.tolist()),
2035 (
2036 "data",
2037 [
2038 list(map(maybe_box_native, t))
2039 for t in self.itertuples(index=False, name=None)
2040 ],
2041 ),
2042 ("index_names", list(self.index.names)),
2043 ("column_names", list(self.columns.names)),
2044 )
2045 )
2047 elif orient == "series":
2048 return into_c((k, v) for k, v in self.items())
2050 elif orient == "records":
2051 columns = self.columns.tolist()
2052 rows = (
2053 dict(zip(columns, row))
2054 for row in self.itertuples(index=False, name=None)
2055 )
2056 return [
2057 into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
2058 ]
2060 elif orient == "index":
2061 if not self.index.is_unique:
2062 raise ValueError("DataFrame index must be unique for orient='index'.")
2063 return into_c(
2064 (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
2065 for t in self.itertuples(name=None)
2066 )
2068 else:
2069 raise ValueError(f"orient '{orient}' not understood")
2071 def to_gbq(
2072 self,
2073 destination_table: str,
2074 project_id: str | None = None,
2075 chunksize: int | None = None,
2076 reauth: bool = False,
2077 if_exists: str = "fail",
2078 auth_local_webserver: bool = True,
2079 table_schema: list[dict[str, str]] | None = None,
2080 location: str | None = None,
2081 progress_bar: bool = True,
2082 credentials=None,
2083 ) -> None:
2084 """
2085 Write a DataFrame to a Google BigQuery table.
2087 This function requires the `pandas-gbq package
2088 <https://pandas-gbq.readthedocs.io>`__.
2090 See the `How to authenticate with Google BigQuery
2091 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
2092 guide for authentication instructions.
2094 Parameters
2095 ----------
2096 destination_table : str
2097 Name of table to be written, in the form ``dataset.tablename``.
2098 project_id : str, optional
2099 Google BigQuery Account project ID. Optional when available from
2100 the environment.
2101 chunksize : int, optional
2102 Number of rows to be inserted in each chunk from the dataframe.
2103 Set to ``None`` to load the whole dataframe at once.
2104 reauth : bool, default False
2105 Force Google BigQuery to re-authenticate the user. This is useful
2106 if multiple accounts are used.
2107 if_exists : str, default 'fail'
2108 Behavior when the destination table exists. Value can be one of:
2110 ``'fail'``
2111 If table exists raise pandas_gbq.gbq.TableCreationError.
2112 ``'replace'``
2113 If table exists, drop it, recreate it, and insert data.
2114 ``'append'``
2115 If table exists, insert data. Create if does not exist.
2116 auth_local_webserver : bool, default True
2117 Use the `local webserver flow`_ instead of the `console flow`_
2118 when getting user credentials.
2120 .. _local webserver flow:
2121 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
2122 .. _console flow:
2123 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
2125 *New in version 0.2.0 of pandas-gbq*.
2127 .. versionchanged:: 1.5.0
2128 Default value is changed to ``True``. Google has deprecated the
2129 ``auth_local_webserver = False`` `"out of band" (copy-paste)
2130 flow
2131 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
2132 table_schema : list of dicts, optional
2133 List of BigQuery table fields to which according DataFrame
2134 columns conform to, e.g. ``[{'name': 'col1', 'type':
2135 'STRING'},...]``. If schema is not provided, it will be
2136 generated according to dtypes of DataFrame columns. See
2137 BigQuery API documentation on available names of a field.
2139 *New in version 0.3.1 of pandas-gbq*.
2140 location : str, optional
2141 Location where the load job should run. See the `BigQuery locations
2142 documentation
2143 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
2144 list of available locations. The location must match that of the
2145 target dataset.
2147 *New in version 0.5.0 of pandas-gbq*.
2148 progress_bar : bool, default True
2149 Use the library `tqdm` to show the progress bar for the upload,
2150 chunk by chunk.
2152 *New in version 0.5.0 of pandas-gbq*.
2153 credentials : google.auth.credentials.Credentials, optional
2154 Credentials for accessing Google APIs. Use this parameter to
2155 override default credentials, such as to use Compute Engine
2156 :class:`google.auth.compute_engine.Credentials` or Service
2157 Account :class:`google.oauth2.service_account.Credentials`
2158 directly.
2160 *New in version 0.8.0 of pandas-gbq*.
2162 See Also
2163 --------
2164 pandas_gbq.to_gbq : This function in the pandas-gbq library.
2165 read_gbq : Read a DataFrame from Google BigQuery.
2166 """
2167 from pandas.io import gbq
2169 gbq.to_gbq(
2170 self,
2171 destination_table,
2172 project_id=project_id,
2173 chunksize=chunksize,
2174 reauth=reauth,
2175 if_exists=if_exists,
2176 auth_local_webserver=auth_local_webserver,
2177 table_schema=table_schema,
2178 location=location,
2179 progress_bar=progress_bar,
2180 credentials=credentials,
2181 )
    @classmethod
    def from_records(
        cls,
        data,
        index=None,
        exclude=None,
        columns=None,
        coerce_float: bool = False,
        nrows: int | None = None,
    ) -> DataFrame:
        """
        Convert structured or record ndarray to DataFrame.

        Creates a DataFrame object from a structured ndarray, sequence of
        tuples or dicts, or DataFrame.

        Parameters
        ----------
        data : structured ndarray, sequence of tuples or dicts, or DataFrame
            Structured input data.
        index : str, list of fields, array-like
            Field of array to use as the index, alternately a specific set of
            input labels to use.
        exclude : sequence, default None
            Columns or fields to exclude.
        columns : sequence, default None
            Column names to use. If the passed data do not have names
            associated with them, this argument provides names for the
            columns. Otherwise this argument indicates the order of the columns
            in the result (any names not found in the data will become all-NA
            columns).
        coerce_float : bool, default False
            Attempt to convert values of non-string, non-numeric objects (like
            decimal.Decimal) to floating point, useful for SQL result sets.
        nrows : int, default None
            Number of rows to read if data is an iterator.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_dict : DataFrame from dict of array-like or dicts.
        DataFrame : DataFrame object creation using constructor.

        Examples
        --------
        Data can be provided as a structured ndarray:

        >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
        ...                 dtype=[('col_1', 'i4'), ('col_2', 'U1')])
        >>> pd.DataFrame.from_records(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Data can be provided as a list of dicts:

        >>> data = [{'col_1': 3, 'col_2': 'a'},
        ...         {'col_1': 2, 'col_2': 'b'},
        ...         {'col_1': 1, 'col_2': 'c'},
        ...         {'col_1': 0, 'col_2': 'd'}]
        >>> pd.DataFrame.from_records(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Data can be provided as a list of tuples with corresponding columns:

        >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
        >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d
        """
        result_index = None

        # Make a copy of the input columns so we can modify it
        if columns is not None:
            columns = ensure_index(columns)

        def maybe_reorder(
            arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
        ) -> tuple[list[ArrayLike], Index, Index | None]:
            """
            If our desired 'columns' do not match the data's pre-existing 'arr_columns',
            we re-order our arrays. This is like a pre-emptive (cheap) reindex.
            """
            if len(arrays):
                length = len(arrays[0])
            else:
                length = 0

            result_index = None
            if len(arrays) == 0 and index is None and length == 0:
                # for backward compat use an object Index instead of RangeIndex
                result_index = Index([])

            arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
            return arrays, arr_columns, result_index

        # Materialize an iterator of rows up front (honoring ``nrows``) so the
        # rest of the function can treat ``data`` as a concrete sequence.
        if is_iterator(data):
            if nrows == 0:
                return cls()

            try:
                first_row = next(data)
            except StopIteration:
                # Empty iterator: build an empty frame with the requested labels.
                return cls(index=index, columns=columns)

            dtype = None
            if hasattr(first_row, "dtype") and first_row.dtype.names:
                # Rows of a structured ndarray carry their (named-field) dtype.
                dtype = first_row.dtype

            values = [first_row]

            if nrows is None:
                values += data
            else:
                # One row was already consumed above, so take nrows - 1 more.
                values.extend(itertools.islice(data, nrows - 1))

            if dtype is not None:
                # Reassemble structured rows into a structured ndarray.
                data = np.array(values, dtype=dtype)
            else:
                data = values

        # dict of column -> values: keep only the requested columns (or all,
        # sorted, when none were requested).
        if isinstance(data, dict):
            if columns is None:
                columns = arr_columns = ensure_index(sorted(data))
                arrays = [data[k] for k in columns]
            else:
                arrays = []
                arr_columns_list = []
                for k, v in data.items():
                    if k in columns:
                        arr_columns_list.append(k)
                        arrays.append(v)

                arr_columns = Index(arr_columns_list)
                arrays, arr_columns, result_index = maybe_reorder(
                    arrays, arr_columns, columns, index
                )

        elif isinstance(data, (np.ndarray, DataFrame)):
            # to_arrays resolves the column labels itself for these inputs.
            arrays, columns = to_arrays(data, columns)
            arr_columns = columns
        else:
            # Sequence of tuples/dicts/records.
            arrays, arr_columns = to_arrays(data, columns)
            if coerce_float:
                for i, arr in enumerate(arrays):
                    if arr.dtype == object:
                        # error: Argument 1 to "maybe_convert_objects" has
                        # incompatible type "Union[ExtensionArray, ndarray]";
                        # expected "ndarray"
                        arrays[i] = lib.maybe_convert_objects(
                            arr,  # type: ignore[arg-type]
                            try_float=True,
                        )

            arr_columns = ensure_index(arr_columns)
            if columns is None:
                columns = arr_columns
            else:
                arrays, arr_columns, result_index = maybe_reorder(
                    arrays, arr_columns, columns, index
                )

        # Normalize ``exclude`` to a set for membership tests below.
        if exclude is None:
            exclude = set()
        else:
            exclude = set(exclude)

        if index is not None:
            if isinstance(index, str) or not hasattr(index, "__iter__"):
                # Single field name: promote that column to the index and
                # drop it from the data columns.
                i = columns.get_loc(index)
                exclude.add(index)
                if len(arrays) > 0:
                    result_index = Index(arrays[i], name=index)
                else:
                    result_index = Index([], name=index)
            else:
                # List of field names: build a (Multi)Index from those columns.
                try:
                    index_data = [arrays[arr_columns.get_loc(field)] for field in index]
                except (KeyError, TypeError):
                    # raised by get_loc, see GH#29258
                    # ``index`` is not made of field names; use it as labels.
                    result_index = index
                else:
                    result_index = ensure_index_from_sequences(index_data, names=index)
                    exclude.update(index)

        # NOTE(review): ``any(exclude)`` is falsy when every excluded label is
        # itself falsy (e.g. 0 or ""), in which case such labels would not be
        # dropped — confirm whether ``len(exclude)`` was intended here.
        if any(exclude):
            arr_exclude = [x for x in exclude if x in arr_columns]
            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]

            columns = columns.drop(exclude)

        # Assemble the column arrays into the configured internal manager.
        manager = get_option("mode.data_manager")
        mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)

        return cls(mgr)
2392 def to_records(
2393 self, index: bool = True, column_dtypes=None, index_dtypes=None
2394 ) -> np.recarray:
2395 """
2396 Convert DataFrame to a NumPy record array.
2398 Index will be included as the first field of the record array if
2399 requested.
2401 Parameters
2402 ----------
2403 index : bool, default True
2404 Include index in resulting record array, stored in 'index'
2405 field or using the index label, if set.
2406 column_dtypes : str, type, dict, default None
2407 If a string or type, the data type to store all columns. If
2408 a dictionary, a mapping of column names and indices (zero-indexed)
2409 to specific data types.
2410 index_dtypes : str, type, dict, default None
2411 If a string or type, the data type to store all index levels. If
2412 a dictionary, a mapping of index level names and indices
2413 (zero-indexed) to specific data types.
2415 This mapping is applied only if `index=True`.
2417 Returns
2418 -------
2419 numpy.recarray
2420 NumPy ndarray with the DataFrame labels as fields and each row
2421 of the DataFrame as entries.
2423 See Also
2424 --------
2425 DataFrame.from_records: Convert structured or record ndarray
2426 to DataFrame.
2427 numpy.recarray: An ndarray that allows field access using
2428 attributes, analogous to typed columns in a
2429 spreadsheet.
2431 Examples
2432 --------
2433 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
2434 ... index=['a', 'b'])
2435 >>> df
2436 A B
2437 a 1 0.50
2438 b 2 0.75
2439 >>> df.to_records()
2440 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2441 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
2443 If the DataFrame index has no label then the recarray field name
2444 is set to 'index'. If the index has a label then this is used as the
2445 field name:
2447 >>> df.index = df.index.rename("I")
2448 >>> df.to_records()
2449 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2450 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
2452 The index can be excluded from the record array:
2454 >>> df.to_records(index=False)
2455 rec.array([(1, 0.5 ), (2, 0.75)],
2456 dtype=[('A', '<i8'), ('B', '<f8')])
2458 Data types can be specified for the columns:
2460 >>> df.to_records(column_dtypes={"A": "int32"})
2461 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2462 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
2464 As well as for the index:
2466 >>> df.to_records(index_dtypes="<S2")
2467 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2468 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
2470 >>> index_dtypes = f"<S{df.index.str.len().max()}"
2471 >>> df.to_records(index_dtypes=index_dtypes)
2472 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2473 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
2474 """
2475 if index:
2476 ix_vals = [
2477 np.asarray(self.index.get_level_values(i))
2478 for i in range(self.index.nlevels)
2479 ]
2481 arrays = ix_vals + [
2482 np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
2483 ]
2485 index_names = list(self.index.names)
2487 if isinstance(self.index, MultiIndex):
2488 index_names = com.fill_missing_names(index_names)
2489 elif index_names[0] is None:
2490 index_names = ["index"]
2492 names = [str(name) for name in itertools.chain(index_names, self.columns)]
2493 else:
2494 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
2495 names = [str(c) for c in self.columns]
2496 index_names = []
2498 index_len = len(index_names)
2499 formats = []
2501 for i, v in enumerate(arrays):
2502 index_int = i
2504 # When the names and arrays are collected, we
2505 # first collect those in the DataFrame's index,
2506 # followed by those in its columns.
2507 #
2508 # Thus, the total length of the array is:
2509 # len(index_names) + len(DataFrame.columns).
2510 #
2511 # This check allows us to see whether we are
2512 # handling a name / array in the index or column.
2513 if index_int < index_len:
2514 dtype_mapping = index_dtypes
2515 name = index_names[index_int]
2516 else:
2517 index_int -= index_len
2518 dtype_mapping = column_dtypes
2519 name = self.columns[index_int]
2521 # We have a dictionary, so we get the data type
2522 # associated with the index or column (which can
2523 # be denoted by its name in the DataFrame or its
2524 # position in DataFrame's array of indices or
2525 # columns, whichever is applicable.
2526 if is_dict_like(dtype_mapping):
2527 if name in dtype_mapping:
2528 dtype_mapping = dtype_mapping[name]
2529 elif index_int in dtype_mapping:
2530 dtype_mapping = dtype_mapping[index_int]
2531 else:
2532 dtype_mapping = None
2534 # If no mapping can be found, use the array's
2535 # dtype attribute for formatting.
2536 #
2537 # A valid dtype must either be a type or
2538 # string naming a type.
2539 if dtype_mapping is None:
2540 formats.append(v.dtype)
2541 elif isinstance(dtype_mapping, (type, np.dtype, str)):
2542 # error: Argument 1 to "append" of "list" has incompatible
2543 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
2544 formats.append(dtype_mapping) # type: ignore[arg-type]
2545 else:
2546 element = "row" if i < index_len else "column"
2547 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
2548 raise ValueError(msg)
2550 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
2552 @classmethod
2553 def _from_arrays(
2554 cls,
2555 arrays,
2556 columns,
2557 index,
2558 dtype: Dtype | None = None,
2559 verify_integrity: bool = True,
2560 ) -> DataFrame:
2561 """
2562 Create DataFrame from a list of arrays corresponding to the columns.
2564 Parameters
2565 ----------
2566 arrays : list-like of arrays
2567 Each array in the list corresponds to one column, in order.
2568 columns : list-like, Index
2569 The column names for the resulting DataFrame.
2570 index : list-like, Index
2571 The rows labels for the resulting DataFrame.
2572 dtype : dtype, optional
2573 Optional dtype to enforce for all arrays.
2574 verify_integrity : bool, default True
2575 Validate and homogenize all input. If set to False, it is assumed
2576 that all elements of `arrays` are actual arrays how they will be
2577 stored in a block (numpy ndarray or ExtensionArray), have the same
2578 length as and are aligned with the index, and that `columns` and
2579 `index` are ensured to be an Index object.
2581 Returns
2582 -------
2583 DataFrame
2584 """
2585 if dtype is not None:
2586 dtype = pandas_dtype(dtype)
2588 manager = get_option("mode.data_manager")
2589 columns = ensure_index(columns)
2590 if len(columns) != len(arrays):
2591 raise ValueError("len(columns) must match len(arrays)")
2592 mgr = arrays_to_mgr(
2593 arrays,
2594 columns,
2595 index,
2596 dtype=dtype,
2597 verify_integrity=verify_integrity,
2598 typ=manager,
2599 )
2600 return cls(mgr)
2602 @doc(
2603 storage_options=_shared_docs["storage_options"],
2604 compression_options=_shared_docs["compression_options"] % "path",
2605 )
2606 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2607 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "path"])
2608 def to_stata(
2609 self,
2610 path: FilePath | WriteBuffer[bytes],
2611 convert_dates: dict[Hashable, str] | None = None,
2612 write_index: bool = True,
2613 byteorder: str | None = None,
2614 time_stamp: datetime.datetime | None = None,
2615 data_label: str | None = None,
2616 variable_labels: dict[Hashable, str] | None = None,
2617 version: int | None = 114,
2618 convert_strl: Sequence[Hashable] | None = None,
2619 compression: CompressionOptions = "infer",
2620 storage_options: StorageOptions = None,
2621 *,
2622 value_labels: dict[Hashable, dict[float, str]] | None = None,
2623 ) -> None:
2624 """
2625 Export DataFrame object to Stata dta format.
2627 Writes the DataFrame to a Stata dataset file.
2628 "dta" files contain a Stata dataset.
2630 Parameters
2631 ----------
2632 path : str, path object, or buffer
2633 String, path object (implementing ``os.PathLike[str]``), or file-like
2634 object implementing a binary ``write()`` function.
2636 .. versionchanged:: 1.0.0
2638 Previously this was "fname"
2640 convert_dates : dict
2641 Dictionary mapping columns containing datetime types to stata
2642 internal format to use when writing the dates. Options are 'tc',
2643 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
2644 or a name. Datetime columns that do not have a conversion type
2645 specified will be converted to 'tc'. Raises NotImplementedError if
2646 a datetime column has timezone information.
2647 write_index : bool
2648 Write the index to Stata dataset.
2649 byteorder : str
2650 Can be ">", "<", "little", or "big". default is `sys.byteorder`.
2651 time_stamp : datetime
2652 A datetime to use as file creation date. Default is the current
2653 time.
2654 data_label : str, optional
2655 A label for the data set. Must be 80 characters or smaller.
2656 variable_labels : dict
2657 Dictionary containing columns as keys and variable labels as
2658 values. Each label must be 80 characters or smaller.
2659 version : {{114, 117, 118, 119, None}}, default 114
2660 Version to use in the output dta file. Set to None to let pandas
2661 decide between 118 or 119 formats depending on the number of
2662 columns in the frame. Version 114 can be read by Stata 10 and
2663 later. Version 117 can be read by Stata 13 or later. Version 118
2664 is supported in Stata 14 and later. Version 119 is supported in
2665 Stata 15 and later. Version 114 limits string variables to 244
2666 characters or fewer while versions 117 and later allow strings
2667 with lengths up to 2,000,000 characters. Versions 118 and 119
2668 support Unicode characters, and version 119 supports more than
2669 32,767 variables.
2671 Version 119 should usually only be used when the number of
2672 variables exceeds the capacity of dta format 118. Exporting
2673 smaller datasets in format 119 may have unintended consequences,
2674 and, as of November 2020, Stata SE cannot read version 119 files.
2676 .. versionchanged:: 1.0.0
2678 Added support for formats 118 and 119.
2680 convert_strl : list, optional
2681 List of column names to convert to string columns to Stata StrL
2682 format. Only available if version is 117. Storing strings in the
2683 StrL format can produce smaller dta files if strings have more than
2684 8 characters and values are repeated.
2685 {compression_options}
2687 .. versionadded:: 1.1.0
2689 .. versionchanged:: 1.4.0 Zstandard support.
2691 {storage_options}
2693 .. versionadded:: 1.2.0
2695 value_labels : dict of dicts
2696 Dictionary containing columns as keys and dictionaries of column value
2697 to labels as values. Labels for a single variable must be 32,000
2698 characters or smaller.
2700 .. versionadded:: 1.4.0
2702 Raises
2703 ------
2704 NotImplementedError
2705 * If datetimes contain timezone information
2706 * Column dtype is not representable in Stata
2707 ValueError
2708 * Columns listed in convert_dates are neither datetime64[ns]
2709 or datetime.datetime
2710 * Column listed in convert_dates is not in DataFrame
2711 * Categorical label contains more than 32,000 characters
2713 See Also
2714 --------
2715 read_stata : Import Stata data files.
2716 io.stata.StataWriter : Low-level writer for Stata data files.
2717 io.stata.StataWriter117 : Low-level writer for version 117 files.
2719 Examples
2720 --------
2721 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
2722 ... 'parrot'],
2723 ... 'speed': [350, 18, 361, 15]}})
2724 >>> df.to_stata('animals.dta') # doctest: +SKIP
2725 """
2726 if version not in (114, 117, 118, 119, None):
2727 raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
2728 if version == 114:
2729 if convert_strl is not None:
2730 raise ValueError("strl is not supported in format 114")
2731 from pandas.io.stata import StataWriter as statawriter
2732 elif version == 117:
2733 # mypy: Name 'statawriter' already defined (possibly by an import)
2734 from pandas.io.stata import ( # type: ignore[no-redef]
2735 StataWriter117 as statawriter,
2736 )
2737 else: # versions 118 and 119
2738 # mypy: Name 'statawriter' already defined (possibly by an import)
2739 from pandas.io.stata import ( # type: ignore[no-redef]
2740 StataWriterUTF8 as statawriter,
2741 )
2743 kwargs: dict[str, Any] = {}
2744 if version is None or version >= 117:
2745 # strl conversion is only supported >= 117
2746 kwargs["convert_strl"] = convert_strl
2747 if version is None or version >= 118:
2748 # Specifying the version is only supported for UTF8 (118 or 119)
2749 kwargs["version"] = version
2751 writer = statawriter(
2752 path,
2753 self,
2754 convert_dates=convert_dates,
2755 byteorder=byteorder,
2756 time_stamp=time_stamp,
2757 data_label=data_label,
2758 write_index=write_index,
2759 variable_labels=variable_labels,
2760 compression=compression,
2761 storage_options=storage_options,
2762 value_labels=value_labels,
2763 **kwargs,
2764 )
2765 writer.write_file()
2767 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2768 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
2769 """
2770 Write a DataFrame to the binary Feather format.
2772 Parameters
2773 ----------
2774 path : str, path object, file-like object
2775 String, path object (implementing ``os.PathLike[str]``), or file-like
2776 object implementing a binary ``write()`` function. If a string or a path,
2777 it will be used as Root Directory path when writing a partitioned dataset.
2778 **kwargs :
2779 Additional keywords passed to :func:`pyarrow.feather.write_feather`.
2780 Starting with pyarrow 0.17, this includes the `compression`,
2781 `compression_level`, `chunksize` and `version` keywords.
2783 .. versionadded:: 1.1.0
2785 Notes
2786 -----
2787 This function writes the dataframe as a `feather file
2788 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
2789 index. For saving the DataFrame with your custom index use a method that
2790 supports custom indices e.g. `to_parquet`.
2791 """
2792 from pandas.io.feather_format import to_feather
2794 to_feather(self, path, **kwargs)
2796 @doc(
2797 Series.to_markdown,
2798 klass=_shared_doc_kwargs["klass"],
2799 storage_options=_shared_docs["storage_options"],
2800 examples="""Examples
2801 --------
2802 >>> df = pd.DataFrame(
2803 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
2804 ... )
2805 >>> print(df.to_markdown())
2806 | | animal_1 | animal_2 |
2807 |---:|:-----------|:-----------|
2808 | 0 | elk | dog |
2809 | 1 | pig | quetzal |
2811 Output markdown with a tabulate option.
2813 >>> print(df.to_markdown(tablefmt="grid"))
2814 +----+------------+------------+
2815 | | animal_1 | animal_2 |
2816 +====+============+============+
2817 | 0 | elk | dog |
2818 +----+------------+------------+
2819 | 1 | pig | quetzal |
2820 +----+------------+------------+""",
2821 )
2822 def to_markdown(
2823 self,
2824 buf: FilePath | WriteBuffer[str] | None = None,
2825 mode: str = "wt",
2826 index: bool = True,
2827 storage_options: StorageOptions = None,
2828 **kwargs,
2829 ) -> str | None:
2830 if "showindex" in kwargs:
2831 warnings.warn(
2832 "'showindex' is deprecated. Only 'index' will be used "
2833 "in a future version. Use 'index' to silence this warning.",
2834 FutureWarning,
2835 stacklevel=find_stack_level(),
2836 )
2838 kwargs.setdefault("headers", "keys")
2839 kwargs.setdefault("tablefmt", "pipe")
2840 kwargs.setdefault("showindex", index)
2841 tabulate = import_optional_dependency("tabulate")
2842 result = tabulate.tabulate(self, **kwargs)
2843 if buf is None:
2844 return result
2846 with get_handle(buf, mode, storage_options=storage_options) as handles:
2847 handles.handle.write(result)
2848 return None
2850 @overload
2851 def to_parquet(
2852 self,
2853 path: None = ...,
2854 engine: str = ...,
2855 compression: str | None = ...,
2856 index: bool | None = ...,
2857 partition_cols: list[str] | None = ...,
2858 storage_options: StorageOptions = ...,
2859 **kwargs,
2860 ) -> bytes:
2861 ...
2863 @overload
2864 def to_parquet(
2865 self,
2866 path: FilePath | WriteBuffer[bytes],
2867 engine: str = ...,
2868 compression: str | None = ...,
2869 index: bool | None = ...,
2870 partition_cols: list[str] | None = ...,
2871 storage_options: StorageOptions = ...,
2872 **kwargs,
2873 ) -> None:
2874 ...
2876 @doc(storage_options=_shared_docs["storage_options"])
2877 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2878 def to_parquet(
2879 self,
2880 path: FilePath | WriteBuffer[bytes] | None = None,
2881 engine: str = "auto",
2882 compression: str | None = "snappy",
2883 index: bool | None = None,
2884 partition_cols: list[str] | None = None,
2885 storage_options: StorageOptions = None,
2886 **kwargs,
2887 ) -> bytes | None:
2888 """
2889 Write a DataFrame to the binary parquet format.
2891 This function writes the dataframe as a `parquet file
2892 <https://parquet.apache.org/>`_. You can choose different parquet
2893 backends, and have the option of compression. See
2894 :ref:`the user guide <io.parquet>` for more details.
2896 Parameters
2897 ----------
2898 path : str, path object, file-like object, or None, default None
2899 String, path object (implementing ``os.PathLike[str]``), or file-like
2900 object implementing a binary ``write()`` function. If None, the result is
2901 returned as bytes. If a string or path, it will be used as Root Directory
2902 path when writing a partitioned dataset.
2904 .. versionchanged:: 1.2.0
2906 Previously this was "fname"
2908 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
2909 Parquet library to use. If 'auto', then the option
2910 ``io.parquet.engine`` is used. The default ``io.parquet.engine``
2911 behavior is to try 'pyarrow', falling back to 'fastparquet' if
2912 'pyarrow' is unavailable.
2913 compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
2914 Name of the compression to use. Use ``None`` for no compression.
2915 index : bool, default None
2916 If ``True``, include the dataframe's index(es) in the file output.
2917 If ``False``, they will not be written to the file.
2918 If ``None``, similar to ``True`` the dataframe's index(es)
2919 will be saved. However, instead of being saved as values,
2920 the RangeIndex will be stored as a range in the metadata so it
2921 doesn't require much space and is faster. Other indexes will
2922 be included as columns in the file output.
2923 partition_cols : list, optional, default None
2924 Column names by which to partition the dataset.
2925 Columns are partitioned in the order they are given.
2926 Must be None if path is not a string.
2927 {storage_options}
2929 .. versionadded:: 1.2.0
2931 **kwargs
2932 Additional arguments passed to the parquet library. See
2933 :ref:`pandas io <io.parquet>` for more details.
2935 Returns
2936 -------
2937 bytes if no path argument is provided else None
2939 See Also
2940 --------
2941 read_parquet : Read a parquet file.
2942 DataFrame.to_orc : Write an orc file.
2943 DataFrame.to_csv : Write a csv file.
2944 DataFrame.to_sql : Write to a sql table.
2945 DataFrame.to_hdf : Write to hdf.
2947 Notes
2948 -----
2949 This function requires either the `fastparquet
2950 <https://pypi.org/project/fastparquet>`_ or `pyarrow
2951 <https://arrow.apache.org/docs/python/>`_ library.
2953 Examples
2954 --------
2955 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
2956 >>> df.to_parquet('df.parquet.gzip',
2957 ... compression='gzip') # doctest: +SKIP
2958 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
2959 col1 col2
2960 0 1 3
2961 1 2 4
2963 If you want to get a buffer to the parquet content you can use a io.BytesIO
2964 object, as long as you don't use partition_cols, which creates multiple files.
2966 >>> import io
2967 >>> f = io.BytesIO()
2968 >>> df.to_parquet(f)
2969 >>> f.seek(0)
2970 0
2971 >>> content = f.read()
2972 """
2973 from pandas.io.parquet import to_parquet
2975 return to_parquet(
2976 self,
2977 path,
2978 engine,
2979 compression=compression,
2980 index=index,
2981 partition_cols=partition_cols,
2982 storage_options=storage_options,
2983 **kwargs,
2984 )
2986 def to_orc(
2987 self,
2988 path: FilePath | WriteBuffer[bytes] | None = None,
2989 *,
2990 engine: Literal["pyarrow"] = "pyarrow",
2991 index: bool | None = None,
2992 engine_kwargs: dict[str, Any] | None = None,
2993 ) -> bytes | None:
2994 """
2995 Write a DataFrame to the ORC format.
2997 .. versionadded:: 1.5.0
2999 Parameters
3000 ----------
3001 path : str, file-like object or None, default None
3002 If a string, it will be used as Root Directory path
3003 when writing a partitioned dataset. By file-like object,
3004 we refer to objects with a write() method, such as a file handle
3005 (e.g. via builtin open function). If path is None,
3006 a bytes object is returned.
3007 engine : str, default 'pyarrow'
3008 ORC library to use. Pyarrow must be >= 7.0.0.
3009 index : bool, optional
3010 If ``True``, include the dataframe's index(es) in the file output.
3011 If ``False``, they will not be written to the file.
3012 If ``None``, similar to ``infer`` the dataframe's index(es)
3013 will be saved. However, instead of being saved as values,
3014 the RangeIndex will be stored as a range in the metadata so it
3015 doesn't require much space and is faster. Other indexes will
3016 be included as columns in the file output.
3017 engine_kwargs : dict[str, Any] or None, default None
3018 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
3020 Returns
3021 -------
3022 bytes if no path argument is provided else None
3024 Raises
3025 ------
3026 NotImplementedError
3027 Dtype of one or more columns is category, unsigned integers, interval,
3028 period or sparse.
3029 ValueError
3030 engine is not pyarrow.
3032 See Also
3033 --------
3034 read_orc : Read a ORC file.
3035 DataFrame.to_parquet : Write a parquet file.
3036 DataFrame.to_csv : Write a csv file.
3037 DataFrame.to_sql : Write to a sql table.
3038 DataFrame.to_hdf : Write to hdf.
3040 Notes
3041 -----
3042 * Before using this function you should read the :ref:`user guide about
3043 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
3044 * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
3045 library.
3046 * For supported dtypes please refer to `supported ORC features in Arrow
3047 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
3048 * Currently timezones in datetime columns are not preserved when a
3049 dataframe is converted into ORC files.
3051 Examples
3052 --------
3053 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
3054 >>> df.to_orc('df.orc') # doctest: +SKIP
3055 >>> pd.read_orc('df.orc') # doctest: +SKIP
3056 col1 col2
3057 0 1 4
3058 1 2 3
3060 If you want to get a buffer to the orc content you can write it to io.BytesIO
3061 >>> import io
3062 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
3063 >>> b.seek(0) # doctest: +SKIP
3064 0
3065 >>> content = b.read() # doctest: +SKIP
3066 """
3067 from pandas.io.orc import to_orc
3069 return to_orc(
3070 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
3071 )
    # Overload: a concrete ``buf`` is supplied, so the rendered HTML is
    # written to it and the method returns ``None``.
    @overload
    def to_html(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[Level] | None = ...,
        col_space: ColspaceArgType | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: FormattersType | None = ...,
        float_format: FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool | str = ...,
        decimal: str = ...,
        bold_rows: bool = ...,
        classes: str | list | tuple | None = ...,
        escape: bool = ...,
        notebook: bool = ...,
        border: int | bool | None = ...,
        table_id: str | None = ...,
        render_links: bool = ...,
        encoding: str | None = ...,
    ) -> None:
        ...
    # Overload: ``buf`` is omitted (None), so the rendered HTML is returned
    # as a ``str``.
    @overload
    def to_html(
        self,
        buf: None = ...,
        columns: Sequence[Level] | None = ...,
        col_space: ColspaceArgType | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: FormattersType | None = ...,
        float_format: FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool | str = ...,
        decimal: str = ...,
        bold_rows: bool = ...,
        classes: str | list | tuple | None = ...,
        escape: bool = ...,
        notebook: bool = ...,
        border: int | bool | None = ...,
        table_id: str | None = ...,
        render_links: bool = ...,
        encoding: str | None = ...,
    ) -> str:
        ...
@Substitution(
    header_type="bool",
    header="Whether to print column labels, default True",
    col_space_type="str or int, list or dict of int or str",
    col_space="The minimum width of each column in CSS length "
    "units. An int is assumed to be px units.\n\n"
    " .. versionadded:: 0.25.0\n"
    " Ability to use str",
)
@Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
def to_html(
    self,
    buf: FilePath | WriteBuffer[str] | None = None,
    columns: Sequence[Level] | None = None,
    col_space: ColspaceArgType | None = None,
    header: bool | Sequence[str] = True,
    index: bool = True,
    na_rep: str = "NaN",
    formatters: FormattersType | None = None,
    float_format: FloatFormatType | None = None,
    sparsify: bool | None = None,
    index_names: bool = True,
    justify: str | None = None,
    max_rows: int | None = None,
    max_cols: int | None = None,
    show_dimensions: bool | str = False,
    decimal: str = ".",
    bold_rows: bool = True,
    classes: str | list | tuple | None = None,
    escape: bool = True,
    notebook: bool = False,
    border: int | bool | None = None,
    table_id: str | None = None,
    render_links: bool = False,
    encoding: str | None = None,
) -> str | None:
    """
    Render a DataFrame as an HTML table.
    %(shared_params)s
    bold_rows : bool, default True
        Make the row labels bold in the output.
    classes : str or list or tuple, default None
        CSS class(es) to apply to the resulting html table.
    escape : bool, default True
        Convert the characters <, >, and & to HTML-safe sequences.
    notebook : {True, False}, default False
        Whether the generated HTML is for IPython Notebook.
    border : int
        A ``border=border`` attribute is included in the opening
        `<table>` tag. Default ``pd.options.display.html.border``.
    table_id : str, optional
        A css id is included in the opening `<table>` tag if specified.
    render_links : bool, default False
        Convert URLs to HTML links.
    encoding : str, default "utf-8"
        Set character encoding.

        .. versionadded:: 1.0
    %(returns)s
    See Also
    --------
    to_string : Convert DataFrame to a string.
    """
    # Reject unknown justify values up front; ``None`` means "use default".
    if not (justify is None or justify in fmt._VALID_JUSTIFY_PARAMETERS):
        raise ValueError("Invalid value for justify parameter")

    # The formatter owns everything about how cell values are rendered;
    # the HTML-only options are handed to the renderer below.
    html_formatter = fmt.DataFrameFormatter(
        self,
        columns=columns,
        col_space=col_space,
        na_rep=na_rep,
        header=header,
        index=index,
        formatters=formatters,
        float_format=float_format,
        bold_rows=bold_rows,
        sparsify=sparsify,
        justify=justify,
        index_names=index_names,
        escape=escape,
        decimal=decimal,
        max_rows=max_rows,
        max_cols=max_cols,
        show_dimensions=show_dimensions,
    )
    # TODO: a generic formatter would belong in DataFrameFormatter
    renderer = fmt.DataFrameRenderer(html_formatter)
    return renderer.to_html(
        buf=buf,
        classes=classes,
        notebook=notebook,
        border=border,
        encoding=encoding,
        table_id=table_id,
        render_links=render_links,
    )
@doc(
    storage_options=_shared_docs["storage_options"],
    compression_options=_shared_docs["compression_options"] % "path_or_buffer",
)
def to_xml(
    self,
    path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
    index: bool = True,
    root_name: str | None = "data",
    row_name: str | None = "row",
    na_rep: str | None = None,
    attr_cols: list[str] | None = None,
    elem_cols: list[str] | None = None,
    namespaces: dict[str | None, str] | None = None,
    prefix: str | None = None,
    encoding: str = "utf-8",
    xml_declaration: bool | None = True,
    pretty_print: bool | None = True,
    parser: str | None = "lxml",
    stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> str | None:
    """
    Render a DataFrame to an XML document.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, file-like object, or None, default None
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``write()`` function. If None, the result is returned
        as a string.
    index : bool, default True
        Whether to include index in XML document.
    root_name : str, default 'data'
        The name of root element in XML document.
    row_name : str, default 'row'
        The name of row element in XML document.
    na_rep : str, optional
        Missing data representation.
    attr_cols : list-like, optional
        List of columns to write as attributes in row element.
        Hierarchical columns will be flattened with underscore
        delimiting the different levels.
    elem_cols : list-like, optional
        List of columns to write as children in row element. By default,
        all columns output as children of row element. Hierarchical
        columns will be flattened with underscore delimiting the
        different levels.
    namespaces : dict, optional
        All namespaces to be defined in root element. Keys of dict
        should be prefix names and values of dict corresponding URIs.
        Default namespaces should be given empty string key. For
        example, ::

            namespaces = {{"": "https://example.com"}}

    prefix : str, optional
        Namespace prefix to be used for every element and/or attribute
        in document. This should be one of the keys in ``namespaces``
        dict.
    encoding : str, default 'utf-8'
        Encoding of the resulting document.
    xml_declaration : bool, default True
        Whether to include the XML declaration at start of document.
    pretty_print : bool, default True
        Whether output should be pretty printed with indentation and
        line breaks.
    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for building of tree. Only 'lxml' and
        'etree' are supported. With 'lxml', the ability to use XSLT
        stylesheet is supported.
    stylesheet : str, path object or file-like object, optional
        A URL, file-like object, or a raw string containing an XSLT
        script used to transform the raw XML output. Script should use
        layout of elements and attributes from original output. This
        argument requires ``lxml`` to be installed. Only XSLT 1.0
        scripts and not later versions is currently supported.
    {compression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    Returns
    -------
    None or str
        If ``io`` is None, returns the resulting XML format as a
        string. Otherwise returns None.

    See Also
    --------
    to_json : Convert the pandas object to a JSON string.
    to_html : Convert DataFrame to a html.

    Examples
    --------
    >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
    ...                    'degrees': [360, 360, 180],
    ...                    'sides': [4, np.nan, 3]}})

    >>> df.to_xml()  # doctest: +SKIP
    <?xml version='1.0' encoding='utf-8'?>
    <data>
      <row>
        <index>0</index>
        <shape>square</shape>
        <degrees>360</degrees>
        <sides>4.0</sides>
      </row>
      <row>
        <index>1</index>
        <shape>circle</shape>
        <degrees>360</degrees>
        <sides/>
      </row>
      <row>
        <index>2</index>
        <shape>triangle</shape>
        <degrees>180</degrees>
        <sides>3.0</sides>
      </row>
    </data>

    >>> df.to_xml(attr_cols=[
    ...           'index', 'shape', 'degrees', 'sides'
    ...           ])  # doctest: +SKIP
    <?xml version='1.0' encoding='utf-8'?>
    <data>
      <row index="0" shape="square" degrees="360" sides="4.0"/>
      <row index="1" shape="circle" degrees="360"/>
      <row index="2" shape="triangle" degrees="180" sides="3.0"/>
    </data>

    >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
    ...           prefix="doc")  # doctest: +SKIP
    <?xml version='1.0' encoding='utf-8'?>
    <doc:data xmlns:doc="https://example.com">
      <doc:row>
        <doc:index>0</doc:index>
        <doc:shape>square</doc:shape>
        <doc:degrees>360</doc:degrees>
        <doc:sides>4.0</doc:sides>
      </doc:row>
      <doc:row>
        <doc:index>1</doc:index>
        <doc:shape>circle</doc:shape>
        <doc:degrees>360</doc:degrees>
        <doc:sides/>
      </doc:row>
      <doc:row>
        <doc:index>2</doc:index>
        <doc:shape>triangle</doc:shape>
        <doc:degrees>180</doc:degrees>
        <doc:sides>3.0</doc:sides>
      </doc:row>
    </doc:data>
    """
    # Local import keeps the XML formatters (and their optional lxml
    # dependency) off the module import path until actually needed.
    from pandas.io.formats.xml import (
        EtreeXMLFormatter,
        LxmlXMLFormatter,
    )

    # errors="ignore" -> returns None instead of raising when lxml is absent.
    lxml = import_optional_dependency("lxml.etree", errors="ignore")

    TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]

    # Choose the formatter implementation from the requested parser;
    # "lxml" is only usable when the optional dependency imported above.
    if parser == "lxml":
        if lxml is not None:
            TreeBuilder = LxmlXMLFormatter
        else:
            raise ImportError(
                "lxml not found, please install or use the etree parser."
            )

    elif parser == "etree":
        TreeBuilder = EtreeXMLFormatter

    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    xml_formatter = TreeBuilder(
        self,
        path_or_buffer=path_or_buffer,
        index=index,
        root_name=root_name,
        row_name=row_name,
        na_rep=na_rep,
        attr_cols=attr_cols,
        elem_cols=elem_cols,
        namespaces=namespaces,
        prefix=prefix,
        encoding=encoding,
        xml_declaration=xml_declaration,
        pretty_print=pretty_print,
        stylesheet=stylesheet,
        compression=compression,
        storage_options=storage_options,
    )

    # Writes to path_or_buffer when given, otherwise returns the XML string.
    return xml_formatter.write_output()
# ----------------------------------------------------------------------
@doc(INFO_DOCSTRING, **frame_sub_kwargs)
def info(
    self,
    verbose: bool | None = None,
    buf: WriteBuffer[str] | None = None,
    max_cols: int | None = None,
    memory_usage: bool | str | None = None,
    show_counts: bool | None = None,
    null_counts: bool | None = None,
) -> None:
    # ``null_counts`` is the deprecated spelling of ``show_counts``:
    # supplying both is ambiguous and rejected, supplying only the old
    # one warns and is folded into the new parameter.
    if null_counts is not None:
        if show_counts is not None:
            raise ValueError("null_counts used with show_counts. Use show_counts.")
        warnings.warn(
            "null_counts is deprecated. Use show_counts instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        show_counts = null_counts

    # Delegate the actual summary construction and printing.
    DataFrameInfo(data=self, memory_usage=memory_usage).render(
        buf=buf,
        max_cols=max_cols,
        verbose=verbose,
        show_counts=show_counts,
    )
def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
    """
    Return the memory usage of each column in bytes.

    The memory usage can optionally include the contribution of
    the index and elements of `object` dtype.

    This value is displayed in `DataFrame.info` by default. This can be
    suppressed by setting ``pandas.options.display.memory_usage`` to False.

    Parameters
    ----------
    index : bool, default True
        Specifies whether to include the memory usage of the DataFrame's
        index in returned Series. If ``index=True``, the memory usage of
        the index is the first item in the output.
    deep : bool, default False
        If True, introspect the data deeply by interrogating
        `object` dtypes for system-level memory consumption, and include
        it in the returned values.

    Returns
    -------
    Series
        A Series whose index is the original column names and whose values
        is the memory usage of each column in bytes.

    See Also
    --------
    numpy.ndarray.nbytes : Total bytes consumed by the elements of an
        ndarray.
    Series.memory_usage : Bytes consumed by a Series.
    Categorical : Memory-efficient array for string values with
        many repeated values.
    DataFrame.info : Concise summary of a DataFrame.

    Notes
    -----
    See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
    details.

    Examples
    --------
    >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
    >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
    ...              for t in dtypes])
    >>> df = pd.DataFrame(data)
    >>> df.head()
       int64  float64  complex128  object  bool
    0      1      1.0    1.0+0.0j       1  True
    1      1      1.0    1.0+0.0j       1  True
    2      1      1.0    1.0+0.0j       1  True
    3      1      1.0    1.0+0.0j       1  True
    4      1      1.0    1.0+0.0j       1  True

    >>> df.memory_usage()
    Index           128
    int64         40000
    float64       40000
    complex128    80000
    object        40000
    bool           5000
    dtype: int64

    >>> df.memory_usage(index=False)
    int64         40000
    float64       40000
    complex128    80000
    object        40000
    bool           5000
    dtype: int64

    The memory footprint of `object` dtype columns is ignored by default:

    >>> df.memory_usage(deep=True)
    Index            128
    int64          40000
    float64        40000
    complex128     80000
    object        180000
    bool            5000
    dtype: int64

    Use a Categorical for efficient storage of an object-dtype column with
    many repeated values.

    >>> df['object'].astype('category').memory_usage(deep=True)
    5244
    """
    # Per-column usage; the column label from items() is unused here
    # (the labels are supplied separately via index=self.columns), so
    # bind it to ``_`` rather than a dead variable.
    result = self._constructor_sliced(
        [c.memory_usage(index=False, deep=deep) for _, c in self.items()],
        index=self.columns,
    )
    if index:
        # Prepend the index's own footprint under the label "Index" so it
        # is the first item of the returned Series.
        index_memory_usage = self._constructor_sliced(
            self.index.memory_usage(deep=deep), index=["Index"]
        )
        result = index_memory_usage._append(result)
    return result
def transpose(self, *args, copy: bool = False) -> DataFrame:
    """
    Transpose index and columns.

    Reflect the DataFrame over its main diagonal by writing rows as columns
    and vice-versa. The property :attr:`.T` is an accessor to the method
    :meth:`transpose`.

    Parameters
    ----------
    *args : tuple, optional
        Accepted for compatibility with NumPy.
    copy : bool, default False
        Whether to copy the data after transposing, even for DataFrames
        with a single dtype.

        Note that a copy is always required for mixed dtype DataFrames,
        or for DataFrames with any extension types.

    Returns
    -------
    DataFrame
        The transposed DataFrame.

    See Also
    --------
    numpy.transpose : Permute the dimensions of a given array.

    Notes
    -----
    Transposing a DataFrame with mixed dtypes will result in a homogeneous
    DataFrame with the `object` dtype. In such a case, a copy of the data
    is always made.

    Examples
    --------
    **Square DataFrame with homogeneous dtype**

    >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df1 = pd.DataFrame(data=d1)
    >>> df1
       col1  col2
    0     1     3
    1     2     4

    >>> df1_transposed = df1.T  # or df1.transpose()
    >>> df1_transposed
          0  1
    col1  1  2
    col2  3  4

    When the dtype is homogeneous in the original DataFrame, we get a
    transposed DataFrame with the same dtype:

    >>> df1.dtypes
    col1    int64
    col2    int64
    dtype: object
    >>> df1_transposed.dtypes
    0    int64
    1    int64
    dtype: object

    **Non-square DataFrame with mixed dtypes**

    >>> d2 = {'name': ['Alice', 'Bob'],
    ...       'score': [9.5, 8],
    ...       'employed': [False, True],
    ...       'kids': [0, 0]}
    >>> df2 = pd.DataFrame(data=d2)
    >>> df2
        name  score  employed  kids
    0  Alice    9.5     False     0
    1    Bob    8.0      True     0

    >>> df2_transposed = df2.T  # or df2.transpose()
    >>> df2_transposed
                  0     1
    name      Alice   Bob
    score       9.5   8.0
    employed  False  True
    kids          0     0

    When the DataFrame has mixed dtypes, we get a transposed DataFrame with
    the `object` dtype:

    >>> df2.dtypes
    name         object
    score       float64
    employed       bool
    kids          int64
    dtype: object
    >>> df2_transposed.dtypes
    0    object
    1    object
    dtype: object
    """
    nv.validate_transpose(args, {})
    # construct the args

    dtypes = list(self.dtypes)

    # Three paths, checked in order of decreasing speed:
    # 1) single-block fast path, 2) homogeneous extension-array path that
    # preserves the EA dtype, 3) generic object fallback via .values.
    if self._can_fast_transpose:
        # Note: tests pass without this, but this improves perf quite a bit.
        new_vals = self._values.T
        if copy:
            new_vals = new_vals.copy()

        result = self._constructor(new_vals, index=self.columns, columns=self.index)

    elif (
        self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0])
    ):
        # We have EAs with the same dtype. We can preserve that dtype in transpose.
        dtype = dtypes[0]
        arr_type = dtype.construct_array_type()
        values = self.values

        # Each row of the 2D values becomes one EA column of the result.
        new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
        result = type(self)._from_arrays(
            new_values, index=self.columns, columns=self.index
        )

    else:
        new_arr = self.values.T
        if copy:
            new_arr = new_arr.copy()
        result = self._constructor(new_arr, index=self.columns, columns=self.index)

    return result.__finalize__(self, method="transpose")
@property
def T(self) -> DataFrame:
    # Property accessor for :meth:`transpose` with default arguments.
    return self.transpose()
# ----------------------------------------------------------------------
# Indexing Methods

def _ixs(self, i: int, axis: int = 0) -> Series:
    """
    Positionally retrieve the i-th row (axis=0) or column (axis=1) as a Series.

    Parameters
    ----------
    i : int
        Position along the given axis.
    axis : int
        0 for a row, anything else for a column.

    Returns
    -------
    Series
    """
    # irow
    if axis == 0:
        new_mgr = self._mgr.fast_xs(i)

        # if we are a copy, mark as such
        # (a base-less ndarray means fast_xs had to materialize a copy)
        copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
        result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
            self
        )
        result._set_is_copy(self, copy=copy)
        return result

    # icol
    else:
        label = self.columns[i]

        col_mgr = self._mgr.iget(i)
        result = self._box_col_values(col_mgr, i)

        # this is a cached value, mark it so
        result._set_as_cached(label, self)
        return result
def _get_column_array(self, i: int) -> ArrayLike:
    """
    Get the values of the i'th column (ndarray or ExtensionArray, as stored
    in the Block)

    Warning! The returned array is a view but doesn't handle Copy-on-Write,
    so this should be used with caution (for read-only purposes).
    """
    # Thin positional delegation to the block manager.
    return self._mgr.iget_values(i)
3745 def _iter_column_arrays(self) -> Iterator[ArrayLike]:
3746 """
3747 Iterate over the arrays of all columns in order.
3748 This returns the values as stored in the Block (ndarray or ExtensionArray).
3750 Warning! The returned array is a view but doesn't handle Copy-on-Write,
3751 so this should be used with caution (for read-only purposes).
3752 """
3753 for i in range(len(self.columns)):
3754 yield self._get_column_array(i)
def __getitem__(self, key):
    # Dispatch order matters: hashable column fast-path, then row slicers,
    # then boolean masks, then generic single-key / key-collection lookup.
    check_deprecated_indexers(key)
    key = lib.item_from_zerodim(key)
    key = com.apply_if_callable(key, self)

    if is_hashable(key) and not is_iterator(key):
        # is_iterator to exclude generator e.g. test_getitem_listlike
        # shortcut if the key is in columns
        is_mi = isinstance(self.columns, MultiIndex)
        # GH#45316 Return view if key is not duplicated
        # Only use drop_duplicates with duplicates for performance
        if not is_mi and (
            self.columns.is_unique
            and key in self.columns
            or key in self.columns.drop_duplicates(keep=False)
        ):
            return self._get_item_cache(key)

        elif is_mi and self.columns.is_unique and key in self.columns:
            return self._getitem_multilevel(key)

    # Do we have a slicer (on rows)?
    indexer = convert_to_index_sliceable(self, key)
    if indexer is not None:
        if isinstance(indexer, np.ndarray):
            # Try to collapse the positions into a real slice first.
            indexer = lib.maybe_indices_to_slice(
                indexer.astype(np.intp, copy=False), len(self)
            )
            if isinstance(indexer, np.ndarray):
                # GH#43223 If we can not convert, use take
                return self.take(indexer, axis=0)
        # either we have a slice or we have a string that can be converted
        # to a slice for partial-string date indexing
        return self._slice(indexer, axis=0)

    # Do we have a (boolean) DataFrame?
    if isinstance(key, DataFrame):
        return self.where(key)

    # Do we have a (boolean) 1d indexer?
    if com.is_bool_indexer(key):
        return self._getitem_bool_array(key)

    # We are left with two options: a single key, and a collection of keys,
    # We interpret tuples as collections only for non-MultiIndex
    is_single_key = isinstance(key, tuple) or not is_list_like(key)

    if is_single_key:
        if self.columns.nlevels > 1:
            return self._getitem_multilevel(key)
        indexer = self.columns.get_loc(key)
        if is_integer(indexer):
            indexer = [indexer]
    else:
        if is_iterator(key):
            key = list(key)
        indexer = self.columns._get_indexer_strict(key, "columns")[1]

    # take() does not accept boolean indexers
    if getattr(indexer, "dtype", None) == bool:
        indexer = np.where(indexer)[0]

    data = self._take_with_is_copy(indexer, axis=1)

    if is_single_key:
        # What does looking for a single key in a non-unique index return?
        # The behavior is inconsistent. It returns a Series, except when
        # - the key itself is repeated (test on data.shape, #9519), or
        # - we have a MultiIndex on columns (test on self.columns, #21309)
        if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
            # GH#26490 using data[key] can cause RecursionError
            return data._get_item_cache(key)

    return data
def _getitem_bool_array(self, key):
    # A boolean key selects rows.  A Series key whose index differs from
    # ours is reindexed (with a warning) to mirror __setitem__, which has
    # always reindexed; keeping get/set consistent is the deliberate choice.
    # also raises Exception if object array with NA values
    if isinstance(key, Series) and not key.index.equals(self.index):
        warnings.warn(
            "Boolean Series key will be reindexed to match DataFrame index.",
            UserWarning,
            stacklevel=find_stack_level(),
        )
    elif len(key) != len(self.index):
        raise ValueError(
            f"Item wrong length {len(key)} instead of {len(self.index)}."
        )

    # check_bool_indexer raises if a Series key cannot be reindexed to
    # match the DataFrame's rows.
    key = check_bool_indexer(self.index, key)
    row_positions = key.nonzero()[0]
    return self._take_with_is_copy(row_positions, axis=0)
def _getitem_multilevel(self, key):
    # self.columns is a MultiIndex
    loc = self.columns.get_loc(key)
    if isinstance(loc, (slice, np.ndarray)):
        # Partial key: the result keeps multiple columns, with the matched
        # level(s) dropped from the column labels.
        new_columns = self.columns[loc]
        result_columns = maybe_droplevels(new_columns, key)
        if self._is_mixed_type:
            result = self.reindex(columns=new_columns)
            result.columns = result_columns
        else:
            new_values = self.values[:, loc]
            result = self._constructor(
                new_values, index=self.index, columns=result_columns
            )
            result = result.__finalize__(self)

        # If there is only one column being returned, and its name is
        # either an empty string, or a tuple with an empty string as its
        # first element, then treat the empty string as a placeholder
        # and return the column as if the user had provided that empty
        # string in the key. If the result is a Series, exclude the
        # implied empty string from its name.
        if len(result.columns) == 1:
            top = result.columns[0]
            if isinstance(top, tuple):
                top = top[0]
            if top == "":
                result = result[""]
                if isinstance(result, Series):
                    result = self._constructor_sliced(
                        result, index=self.index, name=key
                    )

        result._set_is_copy(self)
        return result
    else:
        # loc is neither a slice nor ndarray, so must be an int
        return self._ixs(loc, axis=1)
def _get_value(self, index, col, takeable: bool = False) -> Scalar:
    """
    Quickly retrieve single value at passed column and index.

    Parameters
    ----------
    index : row label
    col : column label
    takeable : interpret the index/col as indexers, default False

    Returns
    -------
    scalar

    Notes
    -----
    Assumes that both `self.index._index_as_unique` and
    `self.columns._index_as_unique`; Caller is responsible for checking.
    """
    if takeable:
        # Positional fast path: both index and col are integer positions.
        series = self._ixs(col, axis=1)
        return series._values[index]

    series = self._get_item_cache(col)
    engine = self.index._engine

    if not isinstance(self.index, MultiIndex):
        # CategoricalIndex: Trying to use the engine fastpath may give incorrect
        # results if our categories are integers that dont match our codes
        # IntervalIndex: IntervalTree has no get_loc
        row = self.index.get_loc(index)
        return series._values[row]

    # For MultiIndex going through engine effectively restricts us to
    # same-length tuples; see test_get_set_value_no_partial_indexing
    loc = engine.get_loc(index)
    return series._values[loc]
def isetitem(self, loc, value) -> None:
    """
    Set the given value in the column with position 'loc'.

    This is a positional analogue to __setitem__.

    Parameters
    ----------
    loc : int or sequence of ints
    value : scalar or arraylike

    Notes
    -----
    Unlike `frame.iloc[:, i] = value`, `frame.isetitem(loc, value)` will
    _never_ try to set the values in place, but will always insert a new
    array.

    In cases where `frame.columns` is unique, this is equivalent to
    `frame[frame.columns[i]] = value`.
    """
    # Sanitize to a proper array, then replace (not mutate) the column(s).
    arraylike = self._sanitize_column(value)
    self._iset_item_mgr(loc, arraylike, inplace=False)
def __setitem__(self, key, value):
    # Dispatch in priority order: row slicers, 2D boolean masks,
    # array-like column keys, DataFrame values, duplicated-column
    # broadcast, and finally plain single-column assignment.
    key = com.apply_if_callable(key, self)

    # see if we can slice the rows
    indexer = convert_to_index_sliceable(self, key)
    if indexer is not None:
        # either we have a slice or we have a string that can be converted
        # to a slice for partial-string date indexing
        return self._setitem_slice(indexer, value)

    if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
        self._setitem_frame(key, value)
    elif isinstance(key, (Series, np.ndarray, list, Index)):
        self._setitem_array(key, value)
    elif isinstance(value, DataFrame):
        self._set_item_frame_value(key, value)
    elif (
        is_list_like(value)
        and not self.columns.is_unique
        # chained comparison: more than one matching column AND the value
        # length equals the number of matches
        and 1 < len(self.columns.get_indexer_for([key])) == len(value)
    ):
        # Column to set is duplicated
        self._setitem_array([key], value)
    else:
        # set column
        self._set_item(key, value)
def _setitem_slice(self, key: slice, value):
    # Positional row assignment for df[slice] = value.
    # NB: we can't just use self.loc[key] = value because that
    # operates on labels and we need to operate positional for
    # backwards-compat, xref GH#31469
    self._check_setitem_copy()
    self.iloc[key] = value
def _setitem_array(self, key, value):
    # List-like key: either a boolean row mask or a collection of column
    # labels; also raises Exception if object array with NA values
    if com.is_bool_indexer(key):
        # bool indexer is indexing along rows
        if len(key) != len(self.index):
            raise ValueError(
                f"Item wrong length {len(key)} instead of {len(self.index)}!"
            )
        key = check_bool_indexer(self.index, key)
        indexer = key.nonzero()[0]
        self._check_setitem_copy()
        if isinstance(value, DataFrame):
            # GH#39931 reindex since iloc does not align
            value = value.reindex(self.index.take(indexer))
        self.iloc[indexer] = value

    else:
        # Note: unlike self.iloc[:, indexer] = value, this will
        # never try to overwrite values inplace

        if isinstance(value, DataFrame):
            # Column-by-column copy, pairing our keys with value's columns.
            check_key_length(self.columns, key, value)
            for k1, k2 in zip(key, value.columns):
                self[k1] = value[k2]

        elif not is_list_like(value):
            # Scalar broadcast to every listed column.
            for col in key:
                self[col] = value

        elif isinstance(value, np.ndarray) and value.ndim == 2:
            self._iset_not_inplace(key, value)

        elif np.ndim(value) > 1:
            # list of lists
            value = DataFrame(value).values
            return self._setitem_array(key, value)

        else:
            self._iset_not_inplace(key, value)
def _iset_not_inplace(self, key, value):
    # GH#39510 when setting with df[key] = obj with a list-like key and
    # list-like value, we iterate over those listlikes and set columns
    # one at a time. This is different from dispatching to
    # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
    # data inplace, whereas this will insert new arrays.

    def igetitem(obj, i: int):
        # Positionally extract the i-th "column" of the value.
        # Note: we catch DataFrame obj before getting here, but
        # hypothetically would return obj.iloc[:, i]
        if isinstance(obj, np.ndarray):
            return obj[..., i]
        else:
            return obj[i]

    if self.columns.is_unique:
        if np.shape(value)[-1] != len(key):
            raise ValueError("Columns must be same length as key")

        for i, col in enumerate(key):
            self[col] = igetitem(value, i)

    else:
        # Duplicate column labels: resolve each key to its positions first.
        ilocs = self.columns.get_indexer_non_unique(key)[0]
        if (ilocs < 0).any():
            # key entries not in self.columns
            raise NotImplementedError

        if np.shape(value)[-1] != len(ilocs):
            raise ValueError("Columns must be same length as key")

        assert np.ndim(value) <= 2

        orig_columns = self.columns

        # Using self.iloc[:, i] = ... may set values inplace, which
        # by convention we do not do in __setitem__
        # Temporarily relabel columns positionally so `self[iloc] = ...`
        # targets exactly one column; always restore the real labels.
        try:
            self.columns = Index(range(len(self.columns)))
            for i, iloc in enumerate(ilocs):
                self[iloc] = igetitem(value, i)
        finally:
            self.columns = orig_columns
def _setitem_frame(self, key, value):
    # support boolean setting with DataFrame input, e.g.
    # df[df > df2] = 0
    if isinstance(key, np.ndarray):
        if key.shape != self.shape:
            raise ValueError("Array conditional must be same shape as self")
        # Wrap the raw mask in a DataFrame aligned to our axes.
        key = self._constructor(key, **self._construct_axes_dict())

    if key.size and not is_bool_dtype(key.values):
        raise TypeError(
            "Must pass DataFrame or 2-d ndarray with boolean values only"
        )

    self._check_inplace_setting(value)
    self._check_setitem_copy()
    # Assign where the mask holds: equivalent to masking with ~key.
    self._where(-key, value, inplace=True)
def _set_item_frame_value(self, key, value: DataFrame) -> None:
    # Assign a DataFrame-valued right-hand side into the column(s) `key`.
    self._ensure_valid_index(value)

    # align columns
    if key in self.columns:
        loc = self.columns.get_loc(key)
        cols = self.columns[loc]
        len_cols = 1 if is_scalar(cols) else len(cols)
        if len_cols != len(value.columns):
            raise ValueError("Columns must be same length as key")

        # align right-hand-side columns if self.columns
        # is multi-index and self[key] is a sub-frame
        if isinstance(self.columns, MultiIndex) and isinstance(
            loc, (slice, Series, np.ndarray, Index)
        ):
            cols_droplevel = maybe_droplevels(cols, key)
            if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
                value = value.reindex(cols_droplevel, axis=1)

            for col, col_droplevel in zip(cols, cols_droplevel):
                self[col] = value[col_droplevel]
            return

        if is_scalar(cols):
            # Single existing column: take value's only column.
            self[cols] = value[value.columns[0]]
            return

        # now align rows
        arraylike = _reindex_for_setitem(value, self.index)
        self._set_item_mgr(key, arraylike)
        return

    # New column: only a single-column DataFrame can be assigned.
    if len(value.columns) != 1:
        raise ValueError(
            "Cannot set a DataFrame with multiple columns to the single "
            f"column {key}"
        )

    self[key] = value[value.columns[0]]
def _iset_item_mgr(
    self, loc: int | slice | np.ndarray, value, inplace: bool = False
) -> None:
    # Positionally set column(s) directly on the manager and drop any
    # cached column Series that may now be stale.
    # when called from _set_item_mgr loc can be anything returned from get_loc
    self._mgr.iset(loc, value, inplace=inplace)
    self._clear_item_cache()
def _set_item_mgr(self, key, value: ArrayLike) -> None:
    # Label-based set on the manager: replace the column if the label
    # exists, otherwise append it at the end.
    try:
        loc = self._info_axis.get_loc(key)
    except KeyError:
        # This item wasn't present, just insert at end
        self._mgr.insert(len(self._info_axis), key, value)
    else:
        self._iset_item_mgr(loc, value)

    # check if we are modifying a copy
    # try to set first as we want an invalid
    # value exception to occur first
    if len(self):
        self._check_setitem_copy()
def _iset_item(self, loc: int, value) -> None:
    # Sanitize then positionally set a single column, allowing in-place
    # writes on the existing arrays (contrast with isetitem/inplace=False).
    arraylike = self._sanitize_column(value)
    self._iset_item_mgr(loc, arraylike, inplace=True)

    # check if we are modifying a copy
    # try to set first as we want an invalid
    # value exception to occur first
    if len(self):
        self._check_setitem_copy()
def _set_item(self, key, value) -> None:
    """
    Add series to DataFrame in specified column.

    If series is a numpy-array (not a Series/TimeSeries), it must be the
    same length as the DataFrames index or an error will be thrown.

    Series/TimeSeries will be conformed to the DataFrames index to
    ensure homogeneity.
    """
    value = self._sanitize_column(value)

    if (
        key in self.columns
        and value.ndim == 1
        and not is_extension_array_dtype(value)
    ):
        # broadcast across multiple columns if necessary
        if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
            existing_piece = self[key]
            if isinstance(existing_piece, DataFrame):
                # Tile the 1D value into one column per duplicate label.
                value = np.tile(value, (len(existing_piece.columns), 1)).T

    self._set_item_mgr(key, value)
    def _set_value(
        self, index: IndexLabel, col, value: Scalar, takeable: bool = False
    ) -> None:
        """
        Put single value at passed column and index.

        Parameters
        ----------
        index : Label
            row label
        col : Label
            column label
        value : scalar
        takeable : bool, default False
            Sets whether or not index/col interpreted as indexers
        """
        try:
            if takeable:
                # Positional path: ``index``/``col`` are already integer locations.
                icol = col
                iindex = cast(int, index)
            else:
                icol = self.columns.get_loc(col)
                iindex = self.index.get_loc(index)
            self._mgr.column_setitem(icol, iindex, value)
            # The cached Series for this column is now stale.
            self._clear_item_cache()

        except (KeyError, TypeError, ValueError):
            # get_loc might raise a KeyError for missing labels (falling back
            # to (i)loc will do expansion of the index)
            # column_setitem will do validation that may raise TypeError or ValueError
            # set using a non-recursive method & reset the cache
            if takeable:
                self.iloc[index, col] = value
            else:
                self.loc[index, col] = value
            self._item_cache.pop(col, None)

        except InvalidIndexError as ii_err:
            # GH48729: Seems like you are trying to assign a value to a
            # row when only scalar options are permitted
            raise InvalidIndexError(
                f"You can only assign a scalar value not a {type(value)}"
            ) from ii_err
    def _ensure_valid_index(self, value) -> None:
        """
        Ensure that if we don't have an index, that we can create one from the
        passed value.
        """
        # GH5632, make sure that we are a Series convertible
        if not len(self.index) and is_list_like(value) and len(value):
            if not isinstance(value, DataFrame):
                try:
                    value = Series(value)
                except (ValueError, NotImplementedError, TypeError) as err:
                    raise ValueError(
                        "Cannot set a frame with no defined index "
                        "and a value that cannot be converted to a Series"
                    ) from err

            # GH31368 preserve name of index
            index_copy = value.index.copy()
            if self.index.name is not None:
                index_copy.name = self.index.name

            # NOTE: manager ``axis=1`` corresponds to the frame's rows here
            # (manager axes are transposed relative to the DataFrame).
            self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
4254 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
4255 """
4256 Provide boxed values for a column.
4257 """
4258 # Lookup in columns so that if e.g. a str datetime was passed
4259 # we attach the Timestamp object as the name.
4260 name = self.columns[loc]
4261 klass = self._constructor_sliced
4262 # We get index=self.index bc values is a SingleDataManager
4263 return klass(values, name=name, fastpath=True).__finalize__(self)
4265 # ----------------------------------------------------------------------
4266 # Lookup Caching
    def _clear_item_cache(self) -> None:
        # Drop every cached column Series; callers invalidate the cache after
        # any mutation that could make the cached views stale.
        self._item_cache.clear()
    def _get_item_cache(self, item: Hashable) -> Series:
        """Return the cached item, item represents a label indexer."""
        cache = self._item_cache
        res = cache.get(item)
        if res is None:
            # All places that call _get_item_cache have unique columns,
            # pending resolution of GH#33047
            loc = self.columns.get_loc(item)
            res = self._ixs(loc, axis=1)

            cache[item] = res

        # for a chain
        # Propagate the parent's copy flag so chained-assignment warnings
        # still fire on the cached Series.
        res._is_copy = self._is_copy
        return res
    def _reset_cacher(self) -> None:
        """Reset any cacher reference; intentionally a no-op for DataFrame."""
        # no-op for DataFrame
        pass
    def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
        """
        The object has called back to us saying maybe it has changed.
        """
        loc = self._info_axis.get_loc(item)
        arraylike = value._values

        old = self._ixs(loc, axis=1)
        if old._values is value._values and inplace:
            # GH#46149 avoid making unnecessary copies/block-splitting
            # (identity check: the stored array IS the caller's array).
            return

        self._mgr.iset(loc, arraylike, inplace=inplace)
4306 # ----------------------------------------------------------------------
4307 # Unsorted
    # Typing overloads: ``query`` returns None when inplace=True, otherwise a
    # DataFrame; the final overload covers a runtime-determined bool.
    @overload
    def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
        ...

    @overload
    def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
        ...

    @overload
    def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
        ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "expr"])
    def query(self, expr: str, inplace: bool = False, **kwargs) -> DataFrame | None:
        """
        Query the columns of a DataFrame with a boolean expression.

        Parameters
        ----------
        expr : str
            The query string to evaluate.

            You can refer to variables
            in the environment by prefixing them with an '@' character like
            ``@a + b``.

            You can refer to column names that are not valid Python variable names
            by surrounding them in backticks. Thus, column names containing spaces
            or punctuations (besides underscores) or starting with digits must be
            surrounded by backticks. (For example, a column named "Area (cm^2)" would
            be referenced as ```Area (cm^2)```). Column names which are Python keywords
            (like "list", "for", "import", etc) cannot be used.

            For example, if one of your columns is called ``a a`` and you want
            to sum it with ``b``, your query should be ```a a` + b``.

            .. versionadded:: 0.25.0
                Backtick quoting introduced.

            .. versionadded:: 1.0.0
                Expanding functionality of backtick quoting for more than only spaces.

        inplace : bool
            Whether to modify the DataFrame rather than creating a new one.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by :meth:`DataFrame.query`.

        Returns
        -------
        DataFrame or None
            DataFrame resulting from the provided query expression or
            None if ``inplace=True``.

        See Also
        --------
        eval : Evaluate a string describing operations on
            DataFrame columns.
        DataFrame.eval : Evaluate a string describing operations on
            DataFrame columns.

        Notes
        -----
        The result of the evaluation of this expression is first passed to
        :attr:`DataFrame.loc` and if that fails because of a
        multidimensional key (e.g., a DataFrame) then the result will be passed
        to :meth:`DataFrame.__getitem__`.

        This method uses the top-level :func:`eval` function to
        evaluate the passed query.

        The :meth:`~pandas.DataFrame.query` method uses a slightly
        modified Python syntax by default. For example, the ``&`` and ``|``
        (bitwise) operators have the precedence of their boolean cousins,
        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
        however the semantics are different.

        You can change the semantics of the expression by passing the keyword
        argument ``parser='python'``. This enforces the same semantics as
        evaluation in Python space. Likewise, you can pass ``engine='python'``
        to evaluate an expression using Python itself as a backend. This is not
        recommended as it is inefficient compared to using ``numexpr`` as the
        engine.

        The :attr:`DataFrame.index` and
        :attr:`DataFrame.columns` attributes of the
        :class:`~pandas.DataFrame` instance are placed in the query namespace
        by default, which allows you to treat both the index and columns of the
        frame as a column in the frame.
        The identifier ``index`` is used for the frame index; you can also
        use the name of the index to identify it in a query. Please note that
        Python keywords may not be used as identifiers.

        For further details and examples see the ``query`` documentation in
        :ref:`indexing <indexing.query>`.

        *Backtick quoted variables*

        Backtick quoted variables are parsed as literal Python code and
        are converted internally to a Python valid identifier.
        This can lead to the following problems.

        During parsing a number of disallowed characters inside the backtick
        quoted string are replaced by strings that are allowed as a Python identifier.
        These characters include all operators in Python, the space character, the
        question mark, the exclamation mark, the dollar sign, and the euro sign.
        For other characters that fall outside the ASCII range (U+0001..U+007F)
        and those that are not further specified in PEP 3131,
        the query parser will raise an error.
        This excludes whitespace different than the space character,
        but also the hashtag (as it is used for comments) and the backtick
        itself (backtick can also not be escaped).

        In a special case, quotes that make a pair around a backtick can
        confuse the parser.
        For example, ```it's` > `that's``` will raise an error,
        as it forms a quoted string (``'s > `that'``) with a backtick inside.

        See also the Python documentation about lexical analysis
        (https://docs.python.org/3/reference/lexical_analysis.html)
        in combination with the source code in :mod:`pandas.core.computation.parsing`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6),
        ...                    'B': range(10, 0, -2),
        ...                    'C C': range(10, 5, -1)})
        >>> df
           A   B  C C
        0  1  10   10
        1  2   8    9
        2  3   6    8
        3  4   4    7
        4  5   2    6
        >>> df.query('A > B')
           A  B  C C
        4  5  2    6

        The previous expression is equivalent to

        >>> df[df.A > df.B]
           A  B  C C
        4  5  2    6

        For columns with spaces in their name, you can use backtick quoting.

        >>> df.query('B == `C C`')
           A   B  C C
        0  1  10   10

        The previous expression is equivalent to

        >>> df[df.B == df['C C']]
           A   B  C C
        0  1  10   10
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        if not isinstance(expr, str):
            msg = f"expr must be a string to be evaluated, {type(expr)} given"
            raise ValueError(msg)
        # Bump the stack level so that eval resolves @-prefixed variables in
        # the *caller's* frame (two frames up: this method plus self.eval).
        kwargs["level"] = kwargs.pop("level", 0) + 2
        kwargs["target"] = None
        res = self.eval(expr, **kwargs)

        try:
            result = self.loc[res]
        except ValueError:
            # when res is multi-dimensional loc raises, but this is sometimes a
            # valid query
            result = self[res]

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result
    # Typing overloads: ``eval`` returns None when inplace=True, otherwise the
    # evaluation result (ndarray, scalar, or pandas object).
    @overload
    def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
        ...

    @overload
    def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
        ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "expr"])
    def eval(self, expr: str, inplace: bool = False, **kwargs) -> Any | None:
        """
        Evaluate a string describing operations on DataFrame columns.

        Operates on columns only, not specific rows or elements. This allows
        `eval` to run arbitrary code, which can make you vulnerable to code
        injection if you pass user input to this function.

        Parameters
        ----------
        expr : str
            The expression string to evaluate.
        inplace : bool, default False
            If the expression contains an assignment, whether to perform the
            operation inplace and mutate the existing DataFrame. Otherwise,
            a new DataFrame is returned.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by
            :meth:`~pandas.DataFrame.query`.

        Returns
        -------
        ndarray, scalar, pandas object, or None
            The result of the evaluation or None if ``inplace=True``.

        See Also
        --------
        DataFrame.query : Evaluates a boolean expression to query the columns
            of a frame.
        DataFrame.assign : Can evaluate an expression or function to create new
            values for a column.
        eval : Evaluate a Python expression as a string using various
            backends.

        Notes
        -----
        For more details see the API documentation for :func:`~eval`.
        For detailed examples see :ref:`enhancing performance with eval
        <enhancingperf.eval>`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
        >>> df
           A   B
        0  1  10
        1  2   8
        2  3   6
        3  4   4
        4  5   2
        >>> df.eval('A + B')
        0    11
        1    10
        2     9
        3     8
        4     7
        dtype: int64

        Assignment is allowed though by default the original DataFrame is not
        modified.

        >>> df.eval('C = A + B')
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7
        >>> df
           A   B
        0  1  10
        1  2   8
        2  3   6
        3  4   4
        4  5   2

        Use ``inplace=True`` to modify the original DataFrame.

        >>> df.eval('C = A + B', inplace=True)
        >>> df
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7

        Multiple columns can be assigned to using multi-line expressions:

        >>> df.eval(
        ...     '''
        ... C = A + B
        ... D = A - B
        ... '''
        ... )
           A   B   C  D
        0  1  10  11 -9
        1  2   8  10 -6
        2  3   6   9 -3
        3  4   4   8  0
        4  5   2   7  3
        """
        from pandas.core.computation.eval import eval as _eval

        inplace = validate_bool_kwarg(inplace, "inplace")
        # Bump the stack level so @-variable resolution sees the caller's frame.
        kwargs["level"] = kwargs.pop("level", 0) + 2
        index_resolvers = self._get_index_resolvers()
        column_resolvers = self._get_cleaned_column_resolvers()
        # Column resolvers come first so column names shadow index names.
        resolvers = column_resolvers, index_resolvers
        if "target" not in kwargs:
            kwargs["target"] = self
        kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers

        return _eval(expr, inplace=inplace, **kwargs)
    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
        """
        Return a subset of the DataFrame's columns based on the column dtypes.

        Parameters
        ----------
        include, exclude : scalar or list-like
            A selection of dtypes or strings to be included/excluded. At least
            one of these parameters must be supplied.

        Returns
        -------
        DataFrame
            The subset of the frame including the dtypes in ``include`` and
            excluding the dtypes in ``exclude``.

        Raises
        ------
        ValueError
            * If both of ``include`` and ``exclude`` are empty
            * If ``include`` and ``exclude`` have overlapping elements
            * If any kind of string dtype is passed in.

        See Also
        --------
        DataFrame.dtypes: Return Series with the data type of each column.

        Notes
        -----
        * To select all *numeric* types, use ``np.number`` or ``'number'``
        * To select strings you must use the ``object`` dtype, but note that
          this will return *all* object dtype columns
        * See the `numpy dtype hierarchy
          <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
        * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
          ``'datetime64'``
        * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
          ``'timedelta64'``
        * To select Pandas categorical dtypes, use ``'category'``
        * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
          0.20.0) or ``'datetime64[ns, tz]'``

        Examples
        --------
        >>> df = pd.DataFrame({'a': [1, 2] * 3,
        ...                    'b': [True, False] * 3,
        ...                    'c': [1.0, 2.0] * 3})
        >>> df
                a      b  c
        0       1   True  1.0
        1       2  False  2.0
        2       1   True  1.0
        3       2  False  2.0
        4       1   True  1.0
        5       2  False  2.0

        >>> df.select_dtypes(include='bool')
           b
        0  True
        1  False
        2  True
        3  False
        4  True
        5  False

        >>> df.select_dtypes(include=['float64'])
           c
        0  1.0
        1  2.0
        2  1.0
        3  2.0
        4  1.0
        5  2.0

        >>> df.select_dtypes(exclude=['int64'])
               b    c
        0   True  1.0
        1  False  2.0
        2   True  1.0
        3  False  2.0
        4   True  1.0
        5  False  2.0
        """
        # Normalize scalar arguments to tuples so both become iterables.
        if not is_list_like(include):
            include = (include,) if include is not None else ()
        if not is_list_like(exclude):
            exclude = (exclude,) if exclude is not None else ()

        selection = (frozenset(include), frozenset(exclude))

        if not any(selection):
            raise ValueError("at least one of include or exclude must be nonempty")

        # convert the myriad valid dtypes object to a single representation
        def check_int_infer_dtype(dtypes):
            converted_dtypes: list[type] = []
            for dtype in dtypes:
                # Numpy maps int to different types (int32, in64) on Windows and Linux
                # see https://github.com/numpy/numpy/issues/9464
                if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
                    converted_dtypes.append(np.int32)
                    converted_dtypes.append(np.int64)
                elif dtype == "float" or dtype is float:
                    # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
                    converted_dtypes.extend([np.float64, np.float32])
                else:
                    converted_dtypes.append(infer_dtype_from_object(dtype))
            return frozenset(converted_dtypes)

        include = check_int_infer_dtype(include)
        exclude = check_int_infer_dtype(exclude)

        for dtypes in (include, exclude):
            invalidate_string_dtypes(dtypes)

        # can't both include AND exclude!
        if not include.isdisjoint(exclude):
            raise ValueError(f"include and exclude overlap on {(include & exclude)}")

        def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
            # GH 46870: BooleanDtype._is_numeric == True but should be excluded
            return issubclass(dtype.type, tuple(dtypes_set)) or (
                np.number in dtypes_set
                and getattr(dtype, "_is_numeric", False)
                and not is_bool_dtype(dtype)
            )

        def predicate(arr: ArrayLike) -> bool:
            # True if the column should be kept: matches include (when given)
            # and does not match exclude (when given).
            dtype = arr.dtype
            if include:
                if not dtype_predicate(dtype, include):
                    return False

            if exclude:
                if dtype_predicate(dtype, exclude):
                    return False

            return True

        mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
        return type(self)(mgr).__finalize__(self)
4753 def insert(
4754 self,
4755 loc: int,
4756 column: Hashable,
4757 value: Scalar | AnyArrayLike,
4758 allow_duplicates: bool | lib.NoDefault = lib.no_default,
4759 ) -> None:
4760 """
4761 Insert column into DataFrame at specified location.
4763 Raises a ValueError if `column` is already contained in the DataFrame,
4764 unless `allow_duplicates` is set to True.
4766 Parameters
4767 ----------
4768 loc : int
4769 Insertion index. Must verify 0 <= loc <= len(columns).
4770 column : str, number, or hashable object
4771 Label of the inserted column.
4772 value : Scalar, Series, or array-like
4773 allow_duplicates : bool, optional, default lib.no_default
4775 See Also
4776 --------
4777 Index.insert : Insert new item by index.
4779 Examples
4780 --------
4781 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
4782 >>> df
4783 col1 col2
4784 0 1 3
4785 1 2 4
4786 >>> df.insert(1, "newcol", [99, 99])
4787 >>> df
4788 col1 newcol col2
4789 0 1 99 3
4790 1 2 99 4
4791 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
4792 >>> df
4793 col1 col1 newcol col2
4794 0 100 1 99 3
4795 1 100 2 99 4
4797 Notice that pandas uses index alignment in case of `value` from type `Series`:
4799 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
4800 >>> df
4801 col0 col1 col1 newcol col2
4802 0 NaN 100 1 99 3
4803 1 5.0 100 2 99 4
4804 """
4805 if allow_duplicates is lib.no_default:
4806 allow_duplicates = False
4807 if allow_duplicates and not self.flags.allows_duplicate_labels:
4808 raise ValueError(
4809 "Cannot specify 'allow_duplicates=True' when "
4810 "'self.flags.allows_duplicate_labels' is False."
4811 )
4812 if not allow_duplicates and column in self.columns:
4813 # Should this be a different kind of error??
4814 raise ValueError(f"cannot insert {column}, already exists")
4815 if not isinstance(loc, int):
4816 raise TypeError("loc must be int")
4818 value = self._sanitize_column(value)
4819 self._mgr.insert(loc, column, value)
    def assign(self, **kwargs) -> DataFrame:
        r"""
        Assign new columns to a DataFrame.

        Returns a new object with all original columns in addition to new ones.
        Existing columns that are re-assigned will be overwritten.

        Parameters
        ----------
        **kwargs : dict of {str: callable or Series}
            The column names are keywords. If the values are
            callable, they are computed on the DataFrame and
            assigned to the new columns. The callable must not
            change input DataFrame (though pandas doesn't check it).
            If the values are not callable, (e.g. a Series, scalar, or array),
            they are simply assigned.

        Returns
        -------
        DataFrame
            A new DataFrame with the new columns in addition to
            all the existing columns.

        Notes
        -----
        Assigning multiple columns within the same ``assign`` is possible.
        Later items in '\*\*kwargs' may refer to newly created or modified
        columns in 'df'; items are computed and assigned into 'df' in order.

        Examples
        --------
        >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
        ...                   index=['Portland', 'Berkeley'])
        >>> df
                  temp_c
        Portland    17.0
        Berkeley    25.0

        Where the value is a callable, evaluated on `df`:

        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        Alternatively, the same behavior can be achieved by directly
        referencing an existing Series or sequence:

        >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        You can create multiple columns within the same assign where one
        of the columns depends on another one defined within the same assign:

        >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
        ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
                  temp_c  temp_f  temp_k
        Portland    17.0    62.6  290.15
        Berkeley    25.0    77.0  298.15
        """
        data = self.copy()

        # Apply left-to-right so later kwargs can see columns created by
        # earlier ones (kwargs preserve insertion order on Python 3.7+).
        for k, v in kwargs.items():
            data[k] = com.apply_if_callable(v, data)
        return data
    def _sanitize_column(self, value) -> ArrayLike:
        """
        Ensures new columns (which go into the BlockManager as new blocks) are
        always copied and converted into an array.

        Parameters
        ----------
        value : scalar, Series, or array-like

        Returns
        -------
        numpy.ndarray or ExtensionArray
        """
        self._ensure_valid_index(value)

        # We can get there through isetitem with a DataFrame
        # or through loc single_block_path
        if isinstance(value, DataFrame):
            return _reindex_for_setitem(value, self.index)
        elif is_dict_like(value):
            # dict-likes are aligned by key via an intermediate Series.
            return _reindex_for_setitem(Series(value), self.index)

        if is_list_like(value):
            com.require_length_match(value, self.index)
        # Scalars also reach here; sanitize_array converts against self.index.
        return sanitize_array(value, self.index, copy=True, allow_2d=True)
4915 @property
4916 def _series(self):
4917 return {
4918 item: Series(
4919 self._mgr.iget(idx), index=self.index, name=item, fastpath=True
4920 )
4921 for idx, item in enumerate(self.columns)
4922 }
    def lookup(
        self, row_labels: Sequence[IndexLabel], col_labels: Sequence[IndexLabel]
    ) -> np.ndarray:
        """
        Label-based "fancy indexing" function for DataFrame.

        .. deprecated:: 1.2.0
            DataFrame.lookup is deprecated,
            use pandas.factorize and NumPy indexing instead.
            For further details see
            :ref:`Looking up values by index/column labels <indexing.lookup>`.

        Given equal-length arrays of row and column labels, return an
        array of the values corresponding to each (row, col) pair.

        Parameters
        ----------
        row_labels : sequence
            The row labels to use for lookup.
        col_labels : sequence
            The column labels to use for lookup.

        Returns
        -------
        numpy.ndarray
            The found values.
        """
        # NOTE(review): this warning suggests melt/loc while the docstring's
        # deprecation note recommends factorize + NumPy indexing — the two
        # recommendations should probably agree.
        msg = (
            "The 'lookup' method is deprecated and will be "
            "removed in a future version. "
            "You can use DataFrame.melt and DataFrame.loc "
            "as a substitute."
        )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())

        n = len(row_labels)
        if n != len(col_labels):
            raise ValueError("Row labels must have same size as column labels")
        if not (self.index.is_unique and self.columns.is_unique):
            # GH#33041
            raise ValueError("DataFrame.lookup requires unique index and columns")

        thresh = 1000
        if not self._is_mixed_type or n > thresh:
            # Fast path: flatten (row, col) positions into indices of the
            # ravelled values array.
            values = self.values
            ridx = self.index.get_indexer(row_labels)
            cidx = self.columns.get_indexer(col_labels)
            if (ridx == -1).any():
                raise KeyError("One or more row labels was not found")
            if (cidx == -1).any():
                raise KeyError("One or more column labels was not found")
            flat_index = ridx * len(self.columns) + cidx
            result = values.flat[flat_index]
        else:
            # Mixed dtypes with few lookups: fetch element by element to
            # avoid materializing an expensive object-dtype ``.values``.
            result = np.empty(n, dtype="O")
            for i, (r, c) in enumerate(zip(row_labels, col_labels)):
                result[i] = self._get_value(r, c)

        if is_object_dtype(result):
            result = lib.maybe_convert_objects(result)

        return result
4987 # ----------------------------------------------------------------------
4988 # Reindexing and alignment
4990 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
4991 frame = self
4993 columns = axes["columns"]
4994 if columns is not None:
4995 frame = frame._reindex_columns(
4996 columns, method, copy, level, fill_value, limit, tolerance
4997 )
4999 index = axes["index"]
5000 if index is not None:
5001 frame = frame._reindex_index(
5002 index, method, copy, level, fill_value, limit, tolerance
5003 )
5005 return frame
5007 def _reindex_index(
5008 self,
5009 new_index,
5010 method,
5011 copy: bool,
5012 level: Level,
5013 fill_value=np.nan,
5014 limit=None,
5015 tolerance=None,
5016 ):
5017 new_index, indexer = self.index.reindex(
5018 new_index, method=method, level=level, limit=limit, tolerance=tolerance
5019 )
5020 return self._reindex_with_indexers(
5021 {0: [new_index, indexer]},
5022 copy=copy,
5023 fill_value=fill_value,
5024 allow_dups=False,
5025 )
5027 def _reindex_columns(
5028 self,
5029 new_columns,
5030 method,
5031 copy: bool,
5032 level: Level,
5033 fill_value=None,
5034 limit=None,
5035 tolerance=None,
5036 ):
5037 new_columns, indexer = self.columns.reindex(
5038 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
5039 )
5040 return self._reindex_with_indexers(
5041 {1: [new_columns, indexer]},
5042 copy=copy,
5043 fill_value=fill_value,
5044 allow_dups=False,
5045 )
    def _reindex_multi(
        self, axes: dict[str, Index], copy: bool, fill_value
    ) -> DataFrame:
        """
        We are guaranteed non-Nones in the axes.
        """

        new_index, row_indexer = self.index.reindex(axes["index"])
        new_columns, col_indexer = self.columns.reindex(axes["columns"])

        if row_indexer is not None and col_indexer is not None:
            # Fastpath. By doing two 'take's at once we avoid making an
            # unnecessary copy.
            # We only get here with `not self._is_mixed_type`, which (almost)
            # ensures that self.values is cheap. It may be worth making this
            # condition more specific.
            indexer = row_indexer, col_indexer
            new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
            return self._constructor(new_values, index=new_index, columns=new_columns)
        else:
            # A None indexer means that axis is already in the right order;
            # fall back to the generic per-axis reindexing path.
            return self._reindex_with_indexers(
                {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
                copy=copy,
                fill_value=fill_value,
            )
    @doc(NDFrame.align, **_shared_doc_kwargs)
    def align(
        self,
        other: DataFrame,
        join: Literal["outer", "inner", "left", "right"] = "outer",
        axis: Axis | None = None,
        level: Level = None,
        copy: bool = True,
        fill_value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
        fill_axis: Axis = 0,
        broadcast_axis: Axis | None = None,
    ) -> DataFrame:
        # Pure delegation to NDFrame.align; the @doc decorator attaches the
        # shared docstring. This override only narrows the type signature.
        return super().align(
            other,
            join=join,
            axis=axis,
            level=level,
            copy=copy,
            fill_value=fill_value,
            method=method,
            limit=limit,
            fill_axis=fill_axis,
            broadcast_axis=broadcast_axis,
        )
    # Typing overloads: ``set_axis`` returns None when inplace=True, otherwise
    # a DataFrame; the final overload covers a runtime-determined bool.
    @overload
    def set_axis(
        self,
        labels,
        *,
        axis: Axis = ...,
        inplace: Literal[False] | lib.NoDefault = ...,
        copy: bool | lib.NoDefault = ...,
    ) -> DataFrame:
        ...

    @overload
    def set_axis(
        self,
        labels,
        *,
        axis: Axis = ...,
        inplace: Literal[True],
        copy: bool | lib.NoDefault = ...,
    ) -> None:
        ...

    @overload
    def set_axis(
        self,
        labels,
        *,
        axis: Axis = ...,
        inplace: bool | lib.NoDefault = ...,
        copy: bool | lib.NoDefault = ...,
    ) -> DataFrame | None:
        ...
    # error: Signature of "set_axis" incompatible with supertype "NDFrame"
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
    @Appender(
        """
        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        Change the row labels.

        >>> df.set_axis(['a', 'b', 'c'], axis='index')
           A  B
        a  1  4
        b  2  5
        c  3  6

        Change the column labels.

        >>> df.set_axis(['I', 'II'], axis='columns')
           I  II
        0  1   4
        1  2   5
        2  3   6

        Now, update the labels without copying the underlying data.

        >>> df.set_axis(['i', 'ii'], axis='columns', copy=False)
           i  ii
        0  1   4
        1  2   5
        2  3   6
        """
    )
    @Substitution(
        **_shared_doc_kwargs,
        extended_summary_sub=" column or",
        axis_description_sub=", and 1 identifies the columns",
        see_also_sub=" or columns",
    )
    @Appender(NDFrame.set_axis.__doc__)
    def set_axis(
        self,
        labels,
        axis: Axis = 0,
        inplace: bool | lib.NoDefault = lib.no_default,
        *,
        copy: bool | lib.NoDefault = lib.no_default,
    ):
        # Delegation: the docstring is assembled from NDFrame.set_axis plus
        # the examples appended above.
        return super().set_axis(labels, axis=axis, inplace=inplace, copy=copy)
    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.reindex.__doc__)
    @rewrite_axis_style_signature(
        "labels",
        [
            ("method", None),
            ("copy", None),
            ("level", None),
            ("fill_value", np.nan),
            ("limit", None),
            ("tolerance", None),
        ],
    )
    def reindex(self, *args, **kwargs) -> DataFrame:
        # Translate positional/axis-style arguments into per-axis entries.
        axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex")
        kwargs.update(axes)
        # Pop these, since the values are in `kwargs` under different names
        kwargs.pop("axis", None)
        kwargs.pop("labels", None)
        return super().reindex(**kwargs)
    # Typing overloads: ``drop`` returns None when inplace=True, otherwise a
    # DataFrame; the final overload covers a runtime-determined bool.
    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: Literal[True],
        errors: IgnoreRaise = ...,
    ) -> None:
        ...

    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: Literal[False] = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame:
        ...

    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: bool = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame | None:
        ...
    # error: Signature of "drop" incompatible with supertype "NDFrame"
    # github.com/python/mypy/issues/12387
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
    def drop(  # type: ignore[override]
        self,
        labels: IndexLabel = None,
        axis: Axis = 0,
        index: IndexLabel = None,
        columns: IndexLabel = None,
        level: Level = None,
        inplace: bool = False,
        errors: IgnoreRaise = "raise",
    ) -> DataFrame | None:
        """
        Drop specified labels from rows or columns.

        Remove rows or columns by specifying label names and corresponding
        axis, or by specifying directly index or column names. When using a
        multi-index, labels on different levels can be removed by specifying
        the level. See the :ref:`user guide <advanced.shown_levels>`
        for more information about the now unused levels.

        Parameters
        ----------
        labels : single label or list-like
            Index or column labels to drop. A tuple will be used as a single
            label and not treated as a list-like.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Whether to drop labels from the index (0 or 'index') or
            columns (1 or 'columns').
        index : single label or list-like
            Alternative to specifying axis (``labels, axis=0``
            is equivalent to ``index=labels``).
        columns : single label or list-like
            Alternative to specifying axis (``labels, axis=1``
            is equivalent to ``columns=labels``).
        level : int or level name, optional
            For MultiIndex, level from which the labels will be removed.
        inplace : bool, default False
            If False, return a copy. Otherwise, do operation
            inplace and return None.
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and only existing labels are
            dropped.

        Returns
        -------
        DataFrame or None
            DataFrame without the removed index or column labels or
            None if ``inplace=True``.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis.

        See Also
        --------
        DataFrame.loc : Label-location based indexer for selection by label.
        DataFrame.dropna : Return DataFrame with labels on given axis omitted
            where (all or any) data are missing.
        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
            removed, optionally only considering certain columns.
        Series.drop : Return Series with specified index labels removed.

        Examples
        --------
        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
        ...                   columns=['A', 'B', 'C', 'D'])
        >>> df
           A  B   C   D
        0  0  1   2   3
        1  4  5   6   7
        2  8  9  10  11

        Drop columns

        >>> df.drop(['B', 'C'], axis=1)
           A   D
        0  0   3
        1  4   7
        2  8  11

        >>> df.drop(columns=['B', 'C'])
           A   D
        0  0   3
        1  4   7
        2  8  11

        Drop a row by index

        >>> df.drop([0, 1])
           A  B   C   D
        2  8  9  10  11

        Drop columns and/or rows of MultiIndex DataFrame

        >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
        ...                              ['speed', 'weight', 'length']],
        ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
        >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
        ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
        ...                         [250, 150], [1.5, 0.8], [320, 250],
        ...                         [1, 0.8], [0.3, 0.2]])
        >>> df
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
                length  1.5     1.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
                length  1.5     0.8
        falcon  speed   320.0   250.0
                weight  1.0     0.8
                length  0.3     0.2

        Drop a specific index combination from the MultiIndex
        DataFrame, i.e., drop the combination ``'falcon'`` and
        ``'weight'``, which deletes only the corresponding row

        >>> df.drop(index=('falcon', 'weight'))
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
                length  1.5     1.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
                length  1.5     0.8
        falcon  speed   320.0   250.0
                length  0.3     0.2

        >>> df.drop(index='cow', columns='small')
                        big
        lama    speed   45.0
                weight  200.0
                length  1.5
        falcon  speed   320.0
                weight  1.0
                length  0.3

        >>> df.drop(index='length', level=1)
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
        falcon  speed   320.0   250.0
                weight  1.0     0.8
        """
        # Heavy lifting (axis resolution, label lookup, error handling) lives
        # in NDFrame.drop; this override only provides the DataFrame-specific
        # signature and docstring.
        return super().drop(
            labels=labels,
            axis=axis,
            index=index,
            columns=columns,
            level=level,
            inplace=inplace,
            errors=errors,
        )
    # Typing overloads for `rename`: `inplace=True` returns None,
    # `inplace=False` (the default) returns a DataFrame, a plain `bool`
    # yields `DataFrame | None`.
    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: Literal[True],
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> None:
        ...

    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: Literal[False] = ...,
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame:
        ...

    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: bool = ...,
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame | None:
        ...
    def rename(
        self,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool | None = None,
        inplace: bool = False,
        level: Level = None,
        errors: IgnoreRaise = "ignore",
    ) -> DataFrame | None:
        """
        Alter axes labels.

        Function / dict values must be unique (1-to-1). Labels not contained in
        a dict / Series will be left as-is. Extra labels listed don't throw an
        error.

        See the :ref:`user guide <basics.rename>` for more.

        Parameters
        ----------
        mapper : dict-like or function
            Dict-like or function transformations to apply to
            that axis' values. Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index`` and
            ``columns``.
        index : dict-like or function
            Alternative to specifying axis (``mapper, axis=0``
            is equivalent to ``index=mapper``).
        columns : dict-like or function
            Alternative to specifying axis (``mapper, axis=1``
            is equivalent to ``columns=mapper``).
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis to target with ``mapper``. Can be either the axis name
            ('index', 'columns') or number (0, 1). The default is 'index'.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
            If True then value of copy is ignored.
        level : int or level name, default None
            In case of a MultiIndex, only rename labels in the specified
            level.
        errors : {'ignore', 'raise'}, default 'ignore'
            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
            or `columns` contains labels that are not present in the Index
            being transformed.
            If 'ignore', existing keys will be renamed and extra keys will be
            ignored.

        Returns
        -------
        DataFrame or None
            DataFrame with the renamed axis labels or None if ``inplace=True``.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis and
            "errors='raise'".

        See Also
        --------
        DataFrame.rename_axis : Set the name of the axis.

        Examples
        --------
        ``DataFrame.rename`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Rename columns using a mapping:

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        >>> df.rename(columns={"A": "a", "B": "c"})
           a  c
        0  1  4
        1  2  5
        2  3  6

        Rename index using a mapping:

        >>> df.rename(index={0: "x", 1: "y", 2: "z"})
           A  B
        x  1  4
        y  2  5
        z  3  6

        Cast index labels to a different type:

        >>> df.index
        RangeIndex(start=0, stop=3, step=1)
        >>> df.rename(index=str).index
        Index(['0', '1', '2'], dtype='object')

        >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
        Traceback (most recent call last):
        KeyError: ['C'] not found in axis

        Using axis-style parameters:

        >>> df.rename(str.lower, axis='columns')
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename({1: 2, 2: 4}, axis='index')
           A  B
        0  1  4
        2  2  5
        4  3  6
        """
        # Shared implementation lives in NDFrame._rename; this wrapper only
        # pins down the DataFrame-specific signature and return type.
        return super()._rename(
            mapper=mapper,
            index=index,
            columns=columns,
            axis=axis,
            copy=copy,
            inplace=inplace,
            level=level,
            errors=errors,
        )
    # Typing overloads for `fillna`: `inplace=False` (default) returns a new
    # DataFrame, `inplace=True` returns None, a plain `bool` yields
    # `DataFrame | None`.
    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: Literal[False] = ...,
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> DataFrame:
        ...

    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: Literal[True],
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: bool = ...,
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> DataFrame | None:
        ...
    # error: Signature of "fillna" incompatible with supertype "NDFrame"
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"])
    @doc(NDFrame.fillna, **_shared_doc_kwargs)
    def fillna(  # type: ignore[override]
        self,
        value: Hashable | Mapping | Series | DataFrame = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> DataFrame | None:
        # Thin override: the docstring is generated from the shared template
        # by @doc, and all filling logic lives in NDFrame.fillna.
        return super().fillna(
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
    def pop(self, item: Hashable) -> Series:
        """
        Return item and drop from frame. Raise KeyError if not found.

        Parameters
        ----------
        item : label
            Label of column to be popped.

        Returns
        -------
        Series

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan)],
        ...                   columns=('name', 'class', 'max_speed'))
        >>> df
             name   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        >>> df.pop('class')
        0      bird
        1      bird
        2    mammal
        3    mammal
        Name: class, dtype: object

        >>> df
             name  max_speed
        0  falcon      389.0
        1  parrot       24.0
        2    lion       80.5
        3  monkey        NaN
        """
        # Delegates to NDFrame.pop, which removes the column in place and
        # returns it.
        return super().pop(item=item)
    # error: Signature of "replace" incompatible with supertype "NDFrame"
    # Typing overloads for `replace`: `inplace=False` (default) returns a new
    # DataFrame, `inplace=True` returns None.
    @overload  # type: ignore[override]
    def replace(
        self,
        to_replace=...,
        value=...,
        *,
        inplace: Literal[False] = ...,
        limit: int | None = ...,
        regex: bool = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> DataFrame:
        ...

    @overload
    def replace(
        self,
        to_replace=...,
        value=...,
        *,
        inplace: Literal[True],
        limit: int | None = ...,
        regex: bool = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> None:
        ...
    # error: Signature of "replace" incompatible with supertype "NDFrame"
    @deprecate_nonkeyword_arguments(
        version=None, allowed_args=["self", "to_replace", "value"]
    )
    @doc(NDFrame.replace, **_shared_doc_kwargs)
    def replace(  # type: ignore[override]
        self,
        to_replace=None,
        value=lib.no_default,
        inplace: bool = False,
        limit: int | None = None,
        regex: bool = False,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
    ) -> DataFrame | None:
        # Thin override: docstring generated by @doc from the shared
        # template; `value`/`method` use no_default sentinels so the parent
        # can distinguish "not passed" from an explicit None.
        return super().replace(
            to_replace=to_replace,
            value=value,
            inplace=inplace,
            limit=limit,
            regex=regex,
            method=method,
        )
5734 def _replace_columnwise(
5735 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
5736 ):
5737 """
5738 Dispatch to Series.replace column-wise.
5740 Parameters
5741 ----------
5742 mapping : dict
5743 of the form {col: (target, value)}
5744 inplace : bool
5745 regex : bool or same types as `to_replace` in DataFrame.replace
5747 Returns
5748 -------
5749 DataFrame or None
5750 """
5751 # Operate column-wise
5752 res = self if inplace else self.copy()
5753 ax = self.columns
5755 for i in range(len(ax)):
5756 if ax[i] in mapping:
5757 ser = self.iloc[:, i]
5759 target, value = mapping[ax[i]]
5760 newobj = ser.replace(target, value, regex=regex)
5762 res._iset_item(i, newobj)
5764 if inplace:
5765 return
5766 return res.__finalize__(self)
    @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
    def shift(
        self,
        periods: int = 1,
        freq: Frequency | None = None,
        axis: Axis = 0,
        fill_value: Hashable = lib.no_default,
    ) -> DataFrame:
        axis = self._get_axis_number(axis)

        ncols = len(self.columns)
        # Fast path 1: horizontal shift with no explicit fill_value -- build
        # the result by slicing off shifted-out columns and inserting all-NA
        # filler columns derived from an existing column (so each filler's
        # dtype is compatible with the frame).
        if (
            axis == 1
            and periods != 0
            and freq is None
            and fill_value is lib.no_default
            and ncols > 0
        ):
            # We will infer fill_value to match the closest column

            # Use a column that we know is valid for our column's dtype GH#38434
            label = self.columns[0]

            if periods > 0:
                result = self.iloc[:, :-periods]
                for col in range(min(ncols, abs(periods))):
                    # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
                    # Define filler inside loop so we get a copy
                    filler = self.iloc[:, 0].shift(len(self))
                    result.insert(0, label, filler, allow_duplicates=True)
            else:
                result = self.iloc[:, -periods:]
                for col in range(min(ncols, abs(periods))):
                    # Define filler inside loop so we get a copy
                    filler = self.iloc[:, -1].shift(len(self))
                    result.insert(
                        len(result.columns), label, filler, allow_duplicates=True
                    )

            result.columns = self.columns.copy()
            return result
        # Fast path 2: horizontal shift with an explicit fill_value -- when
        # the fill can't be kept in the single existing block (or there are
        # several blocks), shift column positions via a reindex of the
        # manager's column axis instead of block-wise shifting.
        elif (
            axis == 1
            and periods != 0
            and fill_value is not lib.no_default
            and ncols > 0
        ):
            arrays = self._mgr.arrays
            if len(arrays) > 1 or (
                # If we only have one block and we know that we can't
                # keep the same dtype (i.e. the _can_hold_element check)
                # then we can go through the reindex_indexer path
                # (and avoid casting logic in the Block method).
                # The exception to this (until 2.0) is datetimelike
                # dtypes with integers, which cast.
                not can_hold_element(arrays[0], fill_value)
                # TODO(2.0): remove special case for integer-with-datetimelike
                # once deprecation is enforced
                and not (
                    lib.is_integer(fill_value) and needs_i8_conversion(arrays[0].dtype)
                )
            ):
                # GH#35488 we need to watch out for multi-block cases
                # We only get here with fill_value not-lib.no_default
                nper = abs(periods)
                nper = min(nper, ncols)
                if periods > 0:
                    # -1 marks positions to be filled with fill_value
                    indexer = np.array(
                        [-1] * nper + list(range(ncols - periods)), dtype=np.intp
                    )
                else:
                    indexer = np.array(
                        list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
                    )
                mgr = self._mgr.reindex_indexer(
                    self.columns,
                    indexer,
                    axis=0,
                    fill_value=fill_value,
                    allow_dups=True,
                )
                res_df = self._constructor(mgr)
                return res_df.__finalize__(self, method="shift")

        # General case (axis=0, freq-based shifts, etc.) handled by parent.
        return super().shift(
            periods=periods, freq=freq, axis=axis, fill_value=fill_value
        )
    # Typing overloads for `set_index`: `inplace=False` (default) returns the
    # re-indexed DataFrame, `inplace=True` returns None.
    @overload
    def set_index(
        self,
        keys,
        *,
        drop: bool = ...,
        append: bool = ...,
        inplace: Literal[False] = ...,
        verify_integrity: bool = ...,
    ) -> DataFrame:
        ...

    @overload
    def set_index(
        self,
        keys,
        *,
        drop: bool = ...,
        append: bool = ...,
        inplace: Literal[True],
        verify_integrity: bool = ...,
    ) -> None:
        ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"])
    def set_index(
        self,
        keys,
        drop: bool = True,
        append: bool = False,
        inplace: bool = False,
        verify_integrity: bool = False,
    ) -> DataFrame | None:
        """
        Set the DataFrame index using existing columns.

        Set the DataFrame index (row labels) using one or more existing
        columns or arrays (of the correct length). The index can replace the
        existing index or expand on it.

        Parameters
        ----------
        keys : label or array-like or list of labels/arrays
            This parameter can be either a single column key, a single array of
            the same length as the calling DataFrame, or a list containing an
            arbitrary combination of column keys and arrays. Here, "array"
            encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
            instances of :class:`~collections.abc.Iterator`.
        drop : bool, default True
            Delete columns to be used as the new index.
        append : bool, default False
            Whether to append columns to existing index.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        verify_integrity : bool, default False
            Check the new index for duplicates. Otherwise defer the check until
            necessary. Setting to False will improve the performance of this
            method.

        Returns
        -------
        DataFrame or None
            Changed row labels or None if ``inplace=True``.

        See Also
        --------
        DataFrame.reset_index : Opposite of set_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
        ...                    'year': [2012, 2014, 2013, 2014],
        ...                    'sale': [55, 40, 84, 31]})
        >>> df
           month  year  sale
        0      1  2012    55
        1      4  2014    40
        2      7  2013    84
        3     10  2014    31

        Set the index to become the 'month' column:

        >>> df.set_index('month')
               year  sale
        month
        1      2012    55
        4      2014    40
        7      2013    84
        10     2014    31

        Create a MultiIndex using columns 'year' and 'month':

        >>> df.set_index(['year', 'month'])
                    sale
        year  month
        2012  1     55
        2014  4     40
        2013  7     84
        2014  10    31

        Create a MultiIndex using an Index and a column:

        >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
                 month  sale
           year
        1  2012  1      55
        2  2014  4      40
        3  2013  7      84
        4  2014  10     31

        Create a MultiIndex using two Series:

        >>> s = pd.Series([1, 2, 3, 4])
        >>> df.set_index([s, s**2])
              month  year  sale
        1 1       1  2012    55
        2 4       4  2014    40
        3 9       7  2013    84
        4 16     10  2014    31
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        self._check_inplace_and_allows_duplicate_labels(inplace)
        if not isinstance(keys, list):
            keys = [keys]

        err_msg = (
            'The parameter "keys" may be a column key, one-dimensional '
            "array, or a list containing only valid column keys and "
            "one-dimensional arrays."
        )

        # First pass: validate every entry before mutating anything, so a bad
        # key cannot leave the frame half-modified.
        missing: list[Hashable] = []
        for col in keys:
            if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
                # arrays are fine as long as they are one-dimensional
                # iterators get converted to list below
                if getattr(col, "ndim", 1) != 1:
                    raise ValueError(err_msg)
            else:
                # everything else gets tried as a key; see GH 24969
                try:
                    found = col in self.columns
                except TypeError as err:
                    raise TypeError(
                        f"{err_msg}. Received column of type {type(col)}"
                    ) from err
                else:
                    if not found:
                        missing.append(col)

        if missing:
            raise KeyError(f"None of {missing} are in the columns")

        if inplace:
            frame = self
        else:
            frame = self.copy()

        # Accumulate the level arrays and their names for the new index.
        arrays = []
        names: list[Hashable] = []
        if append:
            # Existing index levels come first when appending.
            names = list(self.index.names)
            if isinstance(self.index, MultiIndex):
                for i in range(self.index.nlevels):
                    arrays.append(self.index._get_level_values(i))
            else:
                arrays.append(self.index)

        to_remove: list[Hashable] = []
        for col in keys:
            if isinstance(col, MultiIndex):
                for n in range(col.nlevels):
                    arrays.append(col._get_level_values(n))
                names.extend(col.names)
            elif isinstance(col, (Index, Series)):
                # if Index then not MultiIndex (treated above)

                # error: Argument 1 to "append" of "list" has incompatible type
                # "Union[Index, Series]"; expected "Index"
                arrays.append(col)  # type:ignore[arg-type]
                names.append(col.name)
            elif isinstance(col, (list, np.ndarray)):
                # error: Argument 1 to "append" of "list" has incompatible type
                # "Union[List[Any], ndarray]"; expected "Index"
                arrays.append(col)  # type: ignore[arg-type]
                names.append(None)
            elif isinstance(col, abc.Iterator):
                # error: Argument 1 to "append" of "list" has incompatible type
                # "List[Any]"; expected "Index"
                arrays.append(list(col))  # type: ignore[arg-type]
                names.append(None)
            # from here, col can only be a column label
            else:
                arrays.append(frame[col]._values)
                names.append(col)
                if drop:
                    to_remove.append(col)

            if len(arrays[-1]) != len(self):
                # check newest element against length of calling frame, since
                # ensure_index_from_sequences would not raise for append=False.
                raise ValueError(
                    f"Length mismatch: Expected {len(self)} rows, "
                    f"received array of length {len(arrays[-1])}"
                )

        index = ensure_index_from_sequences(arrays, names)

        if verify_integrity and not index.is_unique:
            duplicates = index[index.duplicated()].unique()
            raise ValueError(f"Index has duplicate keys: {duplicates}")

        # use set to handle duplicate column names gracefully in case of drop
        for c in set(to_remove):
            del frame[c]

        # clear up memory usage
        index._cleanup()

        frame.index = index

        if not inplace:
            return frame
        return None
6083 @overload
6084 def reset_index(
6085 self,
6086 level: IndexLabel = ...,
6087 *,
6088 drop: bool = ...,
6089 inplace: Literal[False] = ...,
6090 col_level: Hashable = ...,
6091 col_fill: Hashable = ...,
6092 allow_duplicates: bool | lib.NoDefault = ...,
6093 names: Hashable | Sequence[Hashable] = None,
6094 ) -> DataFrame:
6095 ...
6097 @overload
6098 def reset_index(
6099 self,
6100 level: IndexLabel = ...,
6101 *,
6102 drop: bool = ...,
6103 inplace: Literal[True],
6104 col_level: Hashable = ...,
6105 col_fill: Hashable = ...,
6106 allow_duplicates: bool | lib.NoDefault = ...,
6107 names: Hashable | Sequence[Hashable] = None,
6108 ) -> None:
6109 ...
6111 @overload
6112 def reset_index(
6113 self,
6114 level: IndexLabel = ...,
6115 *,
6116 drop: bool = ...,
6117 inplace: bool = ...,
6118 col_level: Hashable = ...,
6119 col_fill: Hashable = ...,
6120 allow_duplicates: bool | lib.NoDefault = ...,
6121 names: Hashable | Sequence[Hashable] = None,
6122 ) -> DataFrame | None:
6123 ...
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"])
    def reset_index(
        self,
        level: IndexLabel = None,
        drop: bool = False,
        inplace: bool = False,
        col_level: Hashable = 0,
        col_fill: Hashable = "",
        allow_duplicates: bool | lib.NoDefault = lib.no_default,
        names: Hashable | Sequence[Hashable] = None,
    ) -> DataFrame | None:
        """
        Reset the index, or a level of it.

        Reset the index of the DataFrame, and use the default one instead.
        If the DataFrame has a MultiIndex, this method can remove one or more
        levels.

        Parameters
        ----------
        level : int, str, tuple, or list, default None
            Only remove the given levels from the index. Removes all levels by
            default.
        drop : bool, default False
            Do not try to insert index into dataframe columns. This resets
            the index to the default integer index.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        col_level : int or str, default 0
            If the columns have multiple levels, determines which level the
            labels are inserted into. By default it is inserted into the first
            level.
        col_fill : object, default ''
            If the columns have multiple levels, determines how the other
            levels are named. If None then the index name is repeated.
        allow_duplicates : bool, optional, default lib.no_default
            Allow duplicate column labels to be created.

            .. versionadded:: 1.5.0

        names : int, str or 1-dimensional list, default None
            Using the given string, rename the DataFrame column which contains the
            index data. If the DataFrame has a MultiIndex, this has to be a list or
            tuple with length equal to the number of levels.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame or None
            DataFrame with the new index or None if ``inplace=True``.

        See Also
        --------
        DataFrame.set_index : Opposite of reset_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 389.0),
        ...                    ('bird', 24.0),
        ...                    ('mammal', 80.5),
        ...                    ('mammal', np.nan)],
        ...                   index=['falcon', 'parrot', 'lion', 'monkey'],
        ...                   columns=('class', 'max_speed'))
        >>> df
                 class  max_speed
        falcon    bird      389.0
        parrot    bird       24.0
        lion    mammal       80.5
        monkey  mammal        NaN

        When we reset the index, the old index is added as a column, and a
        new sequential index is used:

        >>> df.reset_index()
            index   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        We can use the `drop` parameter to avoid the old index being added as
        a column:

        >>> df.reset_index(drop=True)
            class  max_speed
        0    bird      389.0
        1    bird       24.0
        2  mammal       80.5
        3  mammal        NaN

        You can also use `reset_index` with `MultiIndex`.

        >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
        ...                                    ('bird', 'parrot'),
        ...                                    ('mammal', 'lion'),
        ...                                    ('mammal', 'monkey')],
        ...                                   names=['class', 'name'])
        >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
        ...                                      ('species', 'type')])
        >>> df = pd.DataFrame([(389.0, 'fly'),
        ...                    ( 24.0, 'fly'),
        ...                    ( 80.5, 'run'),
        ...                    (np.nan, 'jump')],
        ...                   index=index,
        ...                   columns=columns)
        >>> df
                       speed species
                         max    type
        class  name
        bird   falcon  389.0     fly
               parrot   24.0     fly
        mammal lion     80.5     run
               monkey    NaN    jump

        Using the `names` parameter, choose a name for the index column:

        >>> df.reset_index(names=['classes', 'names'])
          classes   names  speed species
                             max    type
        0    bird  falcon  389.0     fly
        1    bird  parrot   24.0     fly
        2  mammal    lion   80.5     run
        3  mammal  monkey    NaN    jump

        If the index has multiple levels, we can reset a subset of them:

        >>> df.reset_index(level='class')
                 class  speed species
                          max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        If we are not dropping the index, by default, it is placed in the top
        level. We can place it in another level:

        >>> df.reset_index(level='class', col_level=1)
                        speed species
                 class    max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        When the index is inserted under another level, we can specify under
        which one with the parameter `col_fill`:

        >>> df.reset_index(level='class', col_level=1, col_fill='species')
                      species  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump

        If we specify a nonexistent level for `col_fill`, it is created:

        >>> df.reset_index(level='class', col_level=1, col_fill='genus')
                        genus  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        self._check_inplace_and_allows_duplicate_labels(inplace)
        if inplace:
            new_obj = self
        else:
            new_obj = self.copy(deep=None)
        if allow_duplicates is not lib.no_default:
            allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")

        new_index = default_index(len(new_obj))
        if level is not None:
            # Normalize `level` to a list of level numbers; only build a
            # partially-dropped index when some levels remain.
            if not isinstance(level, (tuple, list)):
                level = [level]
            level = [self.index._get_level_number(lev) for lev in level]
            if len(level) < self.index.nlevels:
                new_index = self.index.droplevel(level)

        if not drop:
            to_insert: Iterable[tuple[Any, Any | None]]

            default = "index" if "index" not in self else "level_0"
            names = self.index._get_default_index_names(names, default)

            if isinstance(self.index, MultiIndex):
                to_insert = zip(self.index.levels, self.index.codes)
            else:
                to_insert = ((self.index, None),)

            multi_col = isinstance(self.columns, MultiIndex)
            # Insert at position 0 in reverse level order so the levels end
            # up in their original order at the front of the frame.
            for i, (lev, lab) in reversed(list(enumerate(to_insert))):
                if level is not None and i not in level:
                    continue
                name = names[i]
                if multi_col:
                    # Pad the inserted column's name up to the full number of
                    # column levels using `col_fill`.
                    col_name = list(name) if isinstance(name, tuple) else [name]
                    if col_fill is None:
                        if len(col_name) not in (1, self.columns.nlevels):
                            raise ValueError(
                                "col_fill=None is incompatible "
                                f"with incomplete column name {name}"
                            )
                        col_fill = col_name[0]

                    lev_num = self.columns._get_level_number(col_level)
                    name_lst = [col_fill] * lev_num + col_name
                    missing = self.columns.nlevels - len(name_lst)
                    name_lst += [col_fill] * missing
                    name = tuple(name_lst)

                # to ndarray and maybe infer different dtype
                level_values = lev._values
                if level_values.dtype == np.object_:
                    level_values = lib.maybe_convert_objects(level_values)

                if lab is not None:
                    # if we have the codes, extract the values with a mask
                    level_values = algorithms.take(
                        level_values, lab, allow_fill=True, fill_value=lev._na_value
                    )

                new_obj.insert(
                    0,
                    name,
                    level_values,
                    allow_duplicates=allow_duplicates,
                )

        new_obj.index = new_index
        if not inplace:
            return new_obj

        return None
6371 # ----------------------------------------------------------------------
6372 # Reindex-based selection methods
6374 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6375 def isna(self) -> DataFrame:
6376 result = self._constructor(self._mgr.isna(func=isna))
6377 return result.__finalize__(self, method="isna")
    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
    def isnull(self) -> DataFrame:
        """
        DataFrame.isnull is an alias for DataFrame.isna.
        """
        # Pure alias; shares the NDFrame.isna docstring template via @doc.
        return self.isna()
    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
    def notna(self) -> DataFrame:
        # Element-wise logical inverse of isna().
        return ~self.isna()
6390 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6391 def notnull(self) -> DataFrame:
6392 """
6393 DataFrame.notnull is an alias for DataFrame.notna.
6394 """
6395 return ~self.isna()
    # Typing overloads: with inplace=False (the default) dropna returns a new
    # DataFrame; with inplace=True it mutates self and returns None.
    @overload
    def dropna(
        self,
        *,
        axis: Axis = ...,
        how: str | NoDefault = ...,
        thresh: int | NoDefault = ...,
        subset: IndexLabel = ...,
        inplace: Literal[False] = ...,
    ) -> DataFrame:
        ...

    @overload
    def dropna(
        self,
        *,
        axis: Axis = ...,
        how: str | NoDefault = ...,
        thresh: int | NoDefault = ...,
        subset: IndexLabel = ...,
        inplace: Literal[True],
    ) -> None:
        ...
6421 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
6422 def dropna(
6423 self,
6424 axis: Axis = 0,
6425 how: str | NoDefault = no_default,
6426 thresh: int | NoDefault = no_default,
6427 subset: IndexLabel = None,
6428 inplace: bool = False,
6429 ) -> DataFrame | None:
6430 """
6431 Remove missing values.
6433 See the :ref:`User Guide <missing_data>` for more on which values are
6434 considered missing, and how to work with missing data.
6436 Parameters
6437 ----------
6438 axis : {0 or 'index', 1 or 'columns'}, default 0
6439 Determine if rows or columns which contain missing values are
6440 removed.
6442 * 0, or 'index' : Drop rows which contain missing values.
6443 * 1, or 'columns' : Drop columns which contain missing value.
6445 .. versionchanged:: 1.0.0
6447 Pass tuple or list to drop on multiple axes.
6448 Only a single axis is allowed.
6450 how : {'any', 'all'}, default 'any'
6451 Determine if row or column is removed from DataFrame, when we have
6452 at least one NA or all NA.
6454 * 'any' : If any NA values are present, drop that row or column.
6455 * 'all' : If all values are NA, drop that row or column.
6457 thresh : int, optional
6458 Require that many non-NA values. Cannot be combined with how.
6459 subset : column label or sequence of labels, optional
6460 Labels along other axis to consider, e.g. if you are dropping rows
6461 these would be a list of columns to include.
6462 inplace : bool, default False
6463 Whether to modify the DataFrame rather than creating a new one.
6465 Returns
6466 -------
6467 DataFrame or None
6468 DataFrame with NA entries dropped from it or None if ``inplace=True``.
6470 See Also
6471 --------
6472 DataFrame.isna: Indicate missing values.
6473 DataFrame.notna : Indicate existing (non-missing) values.
6474 DataFrame.fillna : Replace missing values.
6475 Series.dropna : Drop missing values.
6476 Index.dropna : Drop missing indices.
6478 Examples
6479 --------
6480 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
6481 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
6482 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
6483 ... pd.NaT]})
6484 >>> df
6485 name toy born
6486 0 Alfred NaN NaT
6487 1 Batman Batmobile 1940-04-25
6488 2 Catwoman Bullwhip NaT
6490 Drop the rows where at least one element is missing.
6492 >>> df.dropna()
6493 name toy born
6494 1 Batman Batmobile 1940-04-25
6496 Drop the columns where at least one element is missing.
6498 >>> df.dropna(axis='columns')
6499 name
6500 0 Alfred
6501 1 Batman
6502 2 Catwoman
6504 Drop the rows where all elements are missing.
6506 >>> df.dropna(how='all')
6507 name toy born
6508 0 Alfred NaN NaT
6509 1 Batman Batmobile 1940-04-25
6510 2 Catwoman Bullwhip NaT
6512 Keep only the rows with at least 2 non-NA values.
6514 >>> df.dropna(thresh=2)
6515 name toy born
6516 1 Batman Batmobile 1940-04-25
6517 2 Catwoman Bullwhip NaT
6519 Define in which columns to look for missing values.
6521 >>> df.dropna(subset=['name', 'toy'])
6522 name toy born
6523 1 Batman Batmobile 1940-04-25
6524 2 Catwoman Bullwhip NaT
6526 Keep the DataFrame with valid entries in the same variable.
6528 >>> df.dropna(inplace=True)
6529 >>> df
6530 name toy born
6531 1 Batman Batmobile 1940-04-25
6532 """
6533 if (how is not no_default) and (thresh is not no_default):
6534 raise TypeError(
6535 "You cannot set both the how and thresh arguments at the same time."
6536 )
6538 if how is no_default:
6539 how = "any"
6541 inplace = validate_bool_kwarg(inplace, "inplace")
6542 if isinstance(axis, (tuple, list)):
6543 # GH20987
6544 raise TypeError("supplying multiple axes to axis is no longer supported.")
6546 axis = self._get_axis_number(axis)
6547 agg_axis = 1 - axis
6549 agg_obj = self
6550 if subset is not None:
6551 # subset needs to be list
6552 if not is_list_like(subset):
6553 subset = [subset]
6554 ax = self._get_axis(agg_axis)
6555 indices = ax.get_indexer_for(subset)
6556 check = indices == -1
6557 if check.any():
6558 raise KeyError(np.array(subset)[check].tolist())
6559 agg_obj = self.take(indices, axis=agg_axis)
6561 if thresh is not no_default:
6562 count = agg_obj.count(axis=agg_axis)
6563 mask = count >= thresh
6564 elif how == "any":
6565 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
6566 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
6567 elif how == "all":
6568 # faster equivalent to 'agg_obj.count(agg_axis) > 0'
6569 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
6570 else:
6571 raise ValueError(f"invalid how option: {how}")
6573 if np.all(mask):
6574 result = self.copy()
6575 else:
6576 result = self.loc(axis=axis)[mask]
6578 if not inplace:
6579 return result
6580 self._update_inplace(result)
6581 return None
6583 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"])
6584 def drop_duplicates(
6585 self,
6586 subset: Hashable | Sequence[Hashable] | None = None,
6587 keep: Literal["first", "last", False] = "first",
6588 inplace: bool = False,
6589 ignore_index: bool = False,
6590 ) -> DataFrame | None:
6591 """
6592 Return DataFrame with duplicate rows removed.
6594 Considering certain columns is optional. Indexes, including time indexes
6595 are ignored.
6597 Parameters
6598 ----------
6599 subset : column label or sequence of labels, optional
6600 Only consider certain columns for identifying duplicates, by
6601 default use all of the columns.
6602 keep : {'first', 'last', False}, default 'first'
6603 Determines which duplicates (if any) to keep.
6604 - ``first`` : Drop duplicates except for the first occurrence.
6605 - ``last`` : Drop duplicates except for the last occurrence.
6606 - False : Drop all duplicates.
6607 inplace : bool, default False
6608 Whether to modify the DataFrame rather than creating a new one.
6609 ignore_index : bool, default False
6610 If True, the resulting axis will be labeled 0, 1, …, n - 1.
6612 .. versionadded:: 1.0.0
6614 Returns
6615 -------
6616 DataFrame or None
6617 DataFrame with duplicates removed or None if ``inplace=True``.
6619 See Also
6620 --------
6621 DataFrame.value_counts: Count unique combinations of columns.
6623 Examples
6624 --------
6625 Consider dataset containing ramen rating.
6627 >>> df = pd.DataFrame({
6628 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6629 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6630 ... 'rating': [4, 4, 3.5, 15, 5]
6631 ... })
6632 >>> df
6633 brand style rating
6634 0 Yum Yum cup 4.0
6635 1 Yum Yum cup 4.0
6636 2 Indomie cup 3.5
6637 3 Indomie pack 15.0
6638 4 Indomie pack 5.0
6640 By default, it removes duplicate rows based on all columns.
6642 >>> df.drop_duplicates()
6643 brand style rating
6644 0 Yum Yum cup 4.0
6645 2 Indomie cup 3.5
6646 3 Indomie pack 15.0
6647 4 Indomie pack 5.0
6649 To remove duplicates on specific column(s), use ``subset``.
6651 >>> df.drop_duplicates(subset=['brand'])
6652 brand style rating
6653 0 Yum Yum cup 4.0
6654 2 Indomie cup 3.5
6656 To remove duplicates and keep last occurrences, use ``keep``.
6658 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
6659 brand style rating
6660 1 Yum Yum cup 4.0
6661 2 Indomie cup 3.5
6662 4 Indomie pack 5.0
6663 """
6664 if self.empty:
6665 return self.copy()
6667 inplace = validate_bool_kwarg(inplace, "inplace")
6668 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
6669 duplicated = self.duplicated(subset, keep=keep)
6671 result = self[-duplicated]
6672 if ignore_index:
6673 result.index = default_index(len(result))
6675 if inplace:
6676 self._update_inplace(result)
6677 return None
6678 else:
6679 return result
6681 def duplicated(
6682 self,
6683 subset: Hashable | Sequence[Hashable] | None = None,
6684 keep: Literal["first", "last", False] = "first",
6685 ) -> Series:
6686 """
6687 Return boolean Series denoting duplicate rows.
6689 Considering certain columns is optional.
6691 Parameters
6692 ----------
6693 subset : column label or sequence of labels, optional
6694 Only consider certain columns for identifying duplicates, by
6695 default use all of the columns.
6696 keep : {'first', 'last', False}, default 'first'
6697 Determines which duplicates (if any) to mark.
6699 - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
6700 - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
6701 - False : Mark all duplicates as ``True``.
6703 Returns
6704 -------
6705 Series
6706 Boolean series for each duplicated rows.
6708 See Also
6709 --------
6710 Index.duplicated : Equivalent method on index.
6711 Series.duplicated : Equivalent method on Series.
6712 Series.drop_duplicates : Remove duplicate values from Series.
6713 DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
6715 Examples
6716 --------
6717 Consider dataset containing ramen rating.
6719 >>> df = pd.DataFrame({
6720 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6721 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6722 ... 'rating': [4, 4, 3.5, 15, 5]
6723 ... })
6724 >>> df
6725 brand style rating
6726 0 Yum Yum cup 4.0
6727 1 Yum Yum cup 4.0
6728 2 Indomie cup 3.5
6729 3 Indomie pack 15.0
6730 4 Indomie pack 5.0
6732 By default, for each set of duplicated values, the first occurrence
6733 is set on False and all others on True.
6735 >>> df.duplicated()
6736 0 False
6737 1 True
6738 2 False
6739 3 False
6740 4 False
6741 dtype: bool
6743 By using 'last', the last occurrence of each set of duplicated values
6744 is set on False and all others on True.
6746 >>> df.duplicated(keep='last')
6747 0 True
6748 1 False
6749 2 False
6750 3 False
6751 4 False
6752 dtype: bool
6754 By setting ``keep`` on False, all duplicates are True.
6756 >>> df.duplicated(keep=False)
6757 0 True
6758 1 True
6759 2 False
6760 3 False
6761 4 False
6762 dtype: bool
6764 To find duplicates on specific column(s), use ``subset``.
6766 >>> df.duplicated(subset=['brand'])
6767 0 False
6768 1 True
6769 2 False
6770 3 True
6771 4 True
6772 dtype: bool
6773 """
6775 if self.empty:
6776 return self._constructor_sliced(dtype=bool)
6778 def f(vals) -> tuple[np.ndarray, int]:
6779 labels, shape = algorithms.factorize(vals, size_hint=len(self))
6780 return labels.astype("i8", copy=False), len(shape)
6782 if subset is None:
6783 # https://github.com/pandas-dev/pandas/issues/28770
6784 # Incompatible types in assignment (expression has type "Index", variable
6785 # has type "Sequence[Any]")
6786 subset = self.columns # type: ignore[assignment]
6787 elif (
6788 not np.iterable(subset)
6789 or isinstance(subset, str)
6790 or isinstance(subset, tuple)
6791 and subset in self.columns
6792 ):
6793 subset = (subset,)
6795 # needed for mypy since can't narrow types using np.iterable
6796 subset = cast(Sequence, subset)
6798 # Verify all columns in subset exist in the queried dataframe
6799 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
6800 # key that doesn't exist.
6801 diff = set(subset) - set(self.columns)
6802 if diff:
6803 raise KeyError(Index(diff))
6805 if len(subset) == 1 and self.columns.is_unique:
6806 # GH#45236 This is faster than get_group_index below
6807 result = self[subset[0]].duplicated(keep)
6808 result.name = None
6809 else:
6810 vals = (col.values for name, col in self.items() if name in subset)
6811 labels, shape = map(list, zip(*map(f, vals)))
6813 ids = get_group_index(
6814 labels,
6815 # error: Argument 1 to "tuple" has incompatible type "List[_T]";
6816 # expected "Iterable[int]"
6817 tuple(shape), # type: ignore[arg-type]
6818 sort=False,
6819 xnull=False,
6820 )
6821 result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
6822 return result.__finalize__(self, method="duplicated")
6824 # ----------------------------------------------------------------------
6825 # Sorting
6826 # error: Signature of "sort_values" incompatible with supertype "NDFrame"
    # Typing overloads: inplace=False (default) returns a new DataFrame,
    # inplace=True mutates self and returns None.
    @overload  # type: ignore[override]
    def sort_values(
        self,
        by: IndexLabel,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: Literal[False] = ...,
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool = ...,
        key: ValueKeyFunc = ...,
    ) -> DataFrame:
        ...

    @overload
    def sort_values(
        self,
        by: IndexLabel,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: Literal[True],
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool = ...,
        key: ValueKeyFunc = ...,
    ) -> None:
        ...
    # TODO: Just move the sort_values doc here.
    # error: Signature of "sort_values" incompatible with supertype "NDFrame"
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "by"])
    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.sort_values.__doc__)
    def sort_values(  # type: ignore[override]
        self,
        by: IndexLabel,
        axis: Axis = 0,
        ascending: bool | list[bool] | tuple[bool, ...] = True,
        inplace: bool = False,
        kind: str = "quicksort",
        na_position: str = "last",
        ignore_index: bool = False,
        key: ValueKeyFunc = None,
    ) -> DataFrame | None:
        # Docstring is inherited from NDFrame.sort_values via @Appender.
        inplace = validate_bool_kwarg(inplace, "inplace")
        axis = self._get_axis_number(axis)
        ascending = validate_ascending(ascending)
        # Normalize ``by`` to a list so the length checks below are uniform.
        if not isinstance(by, list):
            by = [by]
        # A list-like ``ascending`` must have one entry per sort key.
        # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
        # expected "Sized"
        if is_sequence(ascending) and (
            len(by) != len(ascending)  # type: ignore[arg-type]
        ):
            # error: Argument 1 to "len" has incompatible type "Union[bool,
            # List[bool]]"; expected "Sized"
            raise ValueError(
                f"Length of ascending ({len(ascending)})"  # type: ignore[arg-type]
                f" != length of by ({len(by)})"
            )
        if len(by) > 1:
            # Multi-key sort: collect each key's values and lexsort them.
            keys = [self._get_label_or_level_values(x, axis=axis) for x in by]

            # need to rewrap columns in Series to apply key function
            if key is not None:
                # error: List comprehension has incompatible type List[Series];
                # expected List[ndarray]
                keys = [
                    Series(k, name=name)  # type: ignore[misc]
                    for (k, name) in zip(keys, by)
                ]

            indexer = lexsort_indexer(
                keys, orders=ascending, na_position=na_position, key=key
            )
        elif len(by):
            # len(by) == 1
            by = by[0]
            k = self._get_label_or_level_values(by, axis=axis)

            # need to rewrap column in Series to apply key function
            if key is not None:
                # error: Incompatible types in assignment (expression has type
                # "Series", variable has type "ndarray")
                k = Series(k, name=by)  # type: ignore[assignment]

            if isinstance(ascending, (tuple, list)):
                ascending = ascending[0]

            indexer = nargsort(
                k, kind=kind, ascending=ascending, na_position=na_position, key=key
            )
        else:
            # Nothing to sort by: return an unsorted copy.
            return self.copy()

        # Reorder the underlying blocks; ``verify=False`` skips re-validating
        # the indexer we just computed.
        new_data = self._mgr.take(
            indexer, axis=self._get_block_manager_axis(axis), verify=False
        )

        if ignore_index:
            new_data.set_axis(
                self._get_block_manager_axis(axis), default_index(len(indexer))
            )

        result = self._constructor(new_data)
        if inplace:
            return self._update_inplace(result)
        else:
            return result.__finalize__(self, method="sort_values")
    # Typing overloads: the return type depends on ``inplace`` —
    # True -> None, False -> DataFrame, plain bool -> DataFrame | None.
    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: Literal[True],
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> None:
        ...

    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: Literal[False] = ...,
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> DataFrame:
        ...

    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: bool = ...,
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> DataFrame | None:
        ...
6989 # error: Signature of "sort_index" incompatible with supertype "NDFrame"
6990 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
6991 def sort_index( # type: ignore[override]
6992 self,
6993 axis: Axis = 0,
6994 level: IndexLabel = None,
6995 ascending: bool | Sequence[bool] = True,
6996 inplace: bool = False,
6997 kind: SortKind = "quicksort",
6998 na_position: NaPosition = "last",
6999 sort_remaining: bool = True,
7000 ignore_index: bool = False,
7001 key: IndexKeyFunc = None,
7002 ) -> DataFrame | None:
7003 """
7004 Sort object by labels (along an axis).
7006 Returns a new DataFrame sorted by label if `inplace` argument is
7007 ``False``, otherwise updates the original DataFrame and returns None.
7009 Parameters
7010 ----------
7011 axis : {0 or 'index', 1 or 'columns'}, default 0
7012 The axis along which to sort. The value 0 identifies the rows,
7013 and 1 identifies the columns.
7014 level : int or level name or list of ints or list of level names
7015 If not None, sort on values in specified index level(s).
7016 ascending : bool or list-like of bools, default True
7017 Sort ascending vs. descending. When the index is a MultiIndex the
7018 sort direction can be controlled for each level individually.
7019 inplace : bool, default False
7020 Whether to modify the DataFrame rather than creating a new one.
7021 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
7022 Choice of sorting algorithm. See also :func:`numpy.sort` for more
7023 information. `mergesort` and `stable` are the only stable algorithms. For
7024 DataFrames, this option is only applied when sorting on a single
7025 column or label.
7026 na_position : {'first', 'last'}, default 'last'
7027 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
7028 Not implemented for MultiIndex.
7029 sort_remaining : bool, default True
7030 If True and sorting by level and index is multilevel, sort by other
7031 levels too (in order) after sorting by specified level.
7032 ignore_index : bool, default False
7033 If True, the resulting axis will be labeled 0, 1, …, n - 1.
7035 .. versionadded:: 1.0.0
7037 key : callable, optional
7038 If not None, apply the key function to the index values
7039 before sorting. This is similar to the `key` argument in the
7040 builtin :meth:`sorted` function, with the notable difference that
7041 this `key` function should be *vectorized*. It should expect an
7042 ``Index`` and return an ``Index`` of the same shape. For MultiIndex
7043 inputs, the key is applied *per level*.
7045 .. versionadded:: 1.1.0
7047 Returns
7048 -------
7049 DataFrame or None
7050 The original DataFrame sorted by the labels or None if ``inplace=True``.
7052 See Also
7053 --------
7054 Series.sort_index : Sort Series by the index.
7055 DataFrame.sort_values : Sort DataFrame by the value.
7056 Series.sort_values : Sort Series by the value.
7058 Examples
7059 --------
7060 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
7061 ... columns=['A'])
7062 >>> df.sort_index()
7063 A
7064 1 4
7065 29 2
7066 100 1
7067 150 5
7068 234 3
7070 By default, it sorts in ascending order, to sort in descending order,
7071 use ``ascending=False``
7073 >>> df.sort_index(ascending=False)
7074 A
7075 234 3
7076 150 5
7077 100 1
7078 29 2
7079 1 4
7081 A key function can be specified which is applied to the index before
7082 sorting. For a ``MultiIndex`` this is applied to each level separately.
7084 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
7085 >>> df.sort_index(key=lambda x: x.str.lower())
7086 a
7087 A 1
7088 b 2
7089 C 3
7090 d 4
7091 """
7092 return super().sort_index(
7093 axis=axis,
7094 level=level,
7095 ascending=ascending,
7096 inplace=inplace,
7097 kind=kind,
7098 na_position=na_position,
7099 sort_remaining=sort_remaining,
7100 ignore_index=ignore_index,
7101 key=key,
7102 )
7104 def value_counts(
7105 self,
7106 subset: Sequence[Hashable] | None = None,
7107 normalize: bool = False,
7108 sort: bool = True,
7109 ascending: bool = False,
7110 dropna: bool = True,
7111 ) -> Series:
7112 """
7113 Return a Series containing counts of unique rows in the DataFrame.
7115 .. versionadded:: 1.1.0
7117 Parameters
7118 ----------
7119 subset : list-like, optional
7120 Columns to use when counting unique combinations.
7121 normalize : bool, default False
7122 Return proportions rather than frequencies.
7123 sort : bool, default True
7124 Sort by frequencies.
7125 ascending : bool, default False
7126 Sort in ascending order.
7127 dropna : bool, default True
7128 Don’t include counts of rows that contain NA values.
7130 .. versionadded:: 1.3.0
7132 Returns
7133 -------
7134 Series
7136 See Also
7137 --------
7138 Series.value_counts: Equivalent method on Series.
7140 Notes
7141 -----
7142 The returned Series will have a MultiIndex with one level per input
7143 column. By default, rows that contain any NA values are omitted from
7144 the result. By default, the resulting Series will be in descending
7145 order so that the first element is the most frequently-occurring row.
7147 Examples
7148 --------
7149 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
7150 ... 'num_wings': [2, 0, 0, 0]},
7151 ... index=['falcon', 'dog', 'cat', 'ant'])
7152 >>> df
7153 num_legs num_wings
7154 falcon 2 2
7155 dog 4 0
7156 cat 4 0
7157 ant 6 0
7159 >>> df.value_counts()
7160 num_legs num_wings
7161 4 0 2
7162 2 2 1
7163 6 0 1
7164 dtype: int64
7166 >>> df.value_counts(sort=False)
7167 num_legs num_wings
7168 2 2 1
7169 4 0 2
7170 6 0 1
7171 dtype: int64
7173 >>> df.value_counts(ascending=True)
7174 num_legs num_wings
7175 2 2 1
7176 6 0 1
7177 4 0 2
7178 dtype: int64
7180 >>> df.value_counts(normalize=True)
7181 num_legs num_wings
7182 4 0 0.50
7183 2 2 0.25
7184 6 0 0.25
7185 dtype: float64
7187 With `dropna` set to `False` we can also count rows with NA values.
7189 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
7190 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
7191 >>> df
7192 first_name middle_name
7193 0 John Smith
7194 1 Anne <NA>
7195 2 John <NA>
7196 3 Beth Louise
7198 >>> df.value_counts()
7199 first_name middle_name
7200 Beth Louise 1
7201 John Smith 1
7202 dtype: int64
7204 >>> df.value_counts(dropna=False)
7205 first_name middle_name
7206 Anne NaN 1
7207 Beth Louise 1
7208 John Smith 1
7209 NaN 1
7210 dtype: int64
7211 """
7212 if subset is None:
7213 subset = self.columns.tolist()
7215 counts = self.groupby(subset, dropna=dropna).grouper.size()
7217 if sort:
7218 counts = counts.sort_values(ascending=ascending)
7219 if normalize:
7220 counts /= counts.sum()
7222 # Force MultiIndex for single column
7223 if len(subset) == 1:
7224 counts.index = MultiIndex.from_arrays(
7225 [counts.index], names=[counts.index.name]
7226 )
7228 return counts
7230 def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
7231 """
7232 Return the first `n` rows ordered by `columns` in descending order.
7234 Return the first `n` rows with the largest values in `columns`, in
7235 descending order. The columns that are not specified are returned as
7236 well, but not used for ordering.
7238 This method is equivalent to
7239 ``df.sort_values(columns, ascending=False).head(n)``, but more
7240 performant.
7242 Parameters
7243 ----------
7244 n : int
7245 Number of rows to return.
7246 columns : label or list of labels
7247 Column label(s) to order by.
7248 keep : {'first', 'last', 'all'}, default 'first'
7249 Where there are duplicate values:
7251 - ``first`` : prioritize the first occurrence(s)
7252 - ``last`` : prioritize the last occurrence(s)
7253 - ``all`` : do not drop any duplicates, even it means
7254 selecting more than `n` items.
7256 Returns
7257 -------
7258 DataFrame
7259 The first `n` rows ordered by the given columns in descending
7260 order.
7262 See Also
7263 --------
7264 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
7265 ascending order.
7266 DataFrame.sort_values : Sort DataFrame by the values.
7267 DataFrame.head : Return the first `n` rows without re-ordering.
7269 Notes
7270 -----
7271 This function cannot be used with all column types. For example, when
7272 specifying columns with `object` or `category` dtypes, ``TypeError`` is
7273 raised.
7275 Examples
7276 --------
7277 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7278 ... 434000, 434000, 337000, 11300,
7279 ... 11300, 11300],
7280 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7281 ... 17036, 182, 38, 311],
7282 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7283 ... "IS", "NR", "TV", "AI"]},
7284 ... index=["Italy", "France", "Malta",
7285 ... "Maldives", "Brunei", "Iceland",
7286 ... "Nauru", "Tuvalu", "Anguilla"])
7287 >>> df
7288 population GDP alpha-2
7289 Italy 59000000 1937894 IT
7290 France 65000000 2583560 FR
7291 Malta 434000 12011 MT
7292 Maldives 434000 4520 MV
7293 Brunei 434000 12128 BN
7294 Iceland 337000 17036 IS
7295 Nauru 11300 182 NR
7296 Tuvalu 11300 38 TV
7297 Anguilla 11300 311 AI
7299 In the following example, we will use ``nlargest`` to select the three
7300 rows having the largest values in column "population".
7302 >>> df.nlargest(3, 'population')
7303 population GDP alpha-2
7304 France 65000000 2583560 FR
7305 Italy 59000000 1937894 IT
7306 Malta 434000 12011 MT
7308 When using ``keep='last'``, ties are resolved in reverse order:
7310 >>> df.nlargest(3, 'population', keep='last')
7311 population GDP alpha-2
7312 France 65000000 2583560 FR
7313 Italy 59000000 1937894 IT
7314 Brunei 434000 12128 BN
7316 When using ``keep='all'``, all duplicate items are maintained:
7318 >>> df.nlargest(3, 'population', keep='all')
7319 population GDP alpha-2
7320 France 65000000 2583560 FR
7321 Italy 59000000 1937894 IT
7322 Malta 434000 12011 MT
7323 Maldives 434000 4520 MV
7324 Brunei 434000 12128 BN
7326 To order by the largest values in column "population" and then "GDP",
7327 we can specify multiple columns like in the next example.
7329 >>> df.nlargest(3, ['population', 'GDP'])
7330 population GDP alpha-2
7331 France 65000000 2583560 FR
7332 Italy 59000000 1937894 IT
7333 Brunei 434000 12128 BN
7334 """
7335 return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
7337 def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
7338 """
7339 Return the first `n` rows ordered by `columns` in ascending order.
7341 Return the first `n` rows with the smallest values in `columns`, in
7342 ascending order. The columns that are not specified are returned as
7343 well, but not used for ordering.
7345 This method is equivalent to
7346 ``df.sort_values(columns, ascending=True).head(n)``, but more
7347 performant.
7349 Parameters
7350 ----------
7351 n : int
7352 Number of items to retrieve.
7353 columns : list or str
7354 Column name or names to order by.
7355 keep : {'first', 'last', 'all'}, default 'first'
7356 Where there are duplicate values:
7358 - ``first`` : take the first occurrence.
7359 - ``last`` : take the last occurrence.
7360 - ``all`` : do not drop any duplicates, even it means
7361 selecting more than `n` items.
7363 Returns
7364 -------
7365 DataFrame
7367 See Also
7368 --------
7369 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
7370 descending order.
7371 DataFrame.sort_values : Sort DataFrame by the values.
7372 DataFrame.head : Return the first `n` rows without re-ordering.
7374 Examples
7375 --------
7376 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7377 ... 434000, 434000, 337000, 337000,
7378 ... 11300, 11300],
7379 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7380 ... 17036, 182, 38, 311],
7381 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7382 ... "IS", "NR", "TV", "AI"]},
7383 ... index=["Italy", "France", "Malta",
7384 ... "Maldives", "Brunei", "Iceland",
7385 ... "Nauru", "Tuvalu", "Anguilla"])
7386 >>> df
7387 population GDP alpha-2
7388 Italy 59000000 1937894 IT
7389 France 65000000 2583560 FR
7390 Malta 434000 12011 MT
7391 Maldives 434000 4520 MV
7392 Brunei 434000 12128 BN
7393 Iceland 337000 17036 IS
7394 Nauru 337000 182 NR
7395 Tuvalu 11300 38 TV
7396 Anguilla 11300 311 AI
7398 In the following example, we will use ``nsmallest`` to select the
7399 three rows having the smallest values in column "population".
7401 >>> df.nsmallest(3, 'population')
7402 population GDP alpha-2
7403 Tuvalu 11300 38 TV
7404 Anguilla 11300 311 AI
7405 Iceland 337000 17036 IS
7407 When using ``keep='last'``, ties are resolved in reverse order:
7409 >>> df.nsmallest(3, 'population', keep='last')
7410 population GDP alpha-2
7411 Anguilla 11300 311 AI
7412 Tuvalu 11300 38 TV
7413 Nauru 337000 182 NR
7415 When using ``keep='all'``, all duplicate items are maintained:
7417 >>> df.nsmallest(3, 'population', keep='all')
7418 population GDP alpha-2
7419 Tuvalu 11300 38 TV
7420 Anguilla 11300 311 AI
7421 Iceland 337000 17036 IS
7422 Nauru 337000 182 NR
7424 To order by the smallest values in column "population" and then "GDP", we can
7425 specify multiple columns like in the next example.
7427 >>> df.nsmallest(3, ['population', 'GDP'])
7428 population GDP alpha-2
7429 Tuvalu 11300 38 TV
7430 Anguilla 11300 311 AI
7431 Nauru 337000 182 NR
7432 """
7433 return algorithms.SelectNFrame(
7434 self, n=n, keep=keep, columns=columns
7435 ).nsmallest()
    @doc(
        Series.swaplevel,
        klass=_shared_doc_kwargs["klass"],
        extra_params=dedent(
            """axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to swap levels on. 0 or 'index' for row-wise, 1 or
            'columns' for column-wise."""
        ),
        examples=dedent(
            """\
        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {"Grade": ["A", "B", "A", "C"]},
        ...     index=[
        ...         ["Final exam", "Final exam", "Coursework", "Coursework"],
        ...         ["History", "Geography", "History", "Geography"],
        ...         ["January", "February", "March", "April"],
        ...     ],
        ... )
        >>> df
                                          Grade
        Final exam  History     January       A
                    Geography   February      B
        Coursework  History     March         A
                    Geography   April         C

        In the following example, we will swap the levels of the indices.
        Here, we will swap the levels column-wise, but levels can be swapped row-wise
        in a similar manner. Note that column-wise is the default behaviour.
        By not supplying any arguments for i and j, we swap the last and second to
        last indices.

        >>> df.swaplevel()
                                          Grade
        Final exam  January     History       A
                    February    Geography     B
        Coursework  March       History       A
                    April       Geography     C

        By supplying one argument, we can choose which index to swap the last
        index with. We can for example swap the first index with the last one as
        follows.

        >>> df.swaplevel(0)
                                          Grade
        January     History     Final exam    A
        February    Geography   Final exam    B
        March       History     Coursework    A
        April       Geography   Coursework    C

        We can also define explicitly which indices we want to swap by supplying values
        for both i and j. Here, we for example swap the first and second indices.

        >>> df.swaplevel(0, 1)
                                          Grade
        History     Final exam  January       A
        Geography   Final exam  February      B
        History     Coursework  March         A
        Geography   Coursework  April         C"""
        ),
    )
    def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
        # Operate on a copy so the calling frame is never mutated.
        result = self.copy()

        axis = self._get_axis_number(axis)

        # Swapping levels only makes sense on a MultiIndex axis.
        if not isinstance(result._get_axis(axis), MultiIndex):  # pragma: no cover
            raise TypeError("Can only swap levels on a hierarchical axis.")

        if axis == 0:
            assert isinstance(result.index, MultiIndex)
            result.index = result.index.swaplevel(i, j)
        else:
            assert isinstance(result.columns, MultiIndex)
            result.columns = result.columns.swaplevel(i, j)
        return result
7515 def reorder_levels(self, order: Sequence[Axis], axis: Axis = 0) -> DataFrame:
7516 """
7517 Rearrange index levels using input order. May not drop or duplicate levels.
7519 Parameters
7520 ----------
7521 order : list of int or list of str
7522 List representing new level order. Reference level by number
7523 (position) or by key (label).
7524 axis : {0 or 'index', 1 or 'columns'}, default 0
7525 Where to reorder levels.
7527 Returns
7528 -------
7529 DataFrame
7531 Examples
7532 --------
7533 >>> data = {
7534 ... "class": ["Mammals", "Mammals", "Reptiles"],
7535 ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
7536 ... "species": ["Humans", "Dogs", "Snakes"],
7537 ... }
7538 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
7539 >>> df = df.set_index(["class", "diet"])
7540 >>> df
7541 species
7542 class diet
7543 Mammals Omnivore Humans
7544 Carnivore Dogs
7545 Reptiles Carnivore Snakes
7547 Let's reorder the levels of the index:
7549 >>> df.reorder_levels(["diet", "class"])
7550 species
7551 diet class
7552 Omnivore Mammals Humans
7553 Carnivore Mammals Dogs
7554 Reptiles Snakes
7555 """
7556 axis = self._get_axis_number(axis)
7557 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
7558 raise TypeError("Can only reorder levels on a hierarchical axis.")
7560 result = self.copy()
7562 if axis == 0:
7563 assert isinstance(result.index, MultiIndex)
7564 result.index = result.index.reorder_levels(order)
7565 else:
7566 assert isinstance(result.columns, MultiIndex)
7567 result.columns = result.columns.reorder_levels(order)
7568 return result
7570 # ----------------------------------------------------------------------
7571 # Arithmetic Methods
7573 def _cmp_method(self, other, op):
7574 axis = 1 # only relevant for Series other case
7576 self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
7578 # See GH#4537 for discussion of scalar op behavior
7579 new_data = self._dispatch_frame_op(other, op, axis=axis)
7580 return self._construct_result(new_data)
7582 def _arith_method(self, other, op):
7583 if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None):
7584 return ops.frame_arith_method_with_reindex(self, other, op)
7586 axis = 1 # only relevant for Series other case
7587 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
7589 self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)
7591 new_data = self._dispatch_frame_op(other, op, axis=axis)
7592 return self._construct_result(new_data)
7594 _logical_method = _arith_method
    def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
        """
        Evaluate the frame operation func(left, right) by evaluating
        column-by-column, dispatching to the Series implementation.

        Parameters
        ----------
        right : scalar, Series, or DataFrame
        func : arithmetic or comparison operator
        axis : {None, 0, 1}

        Returns
        -------
        DataFrame
        """
        # Get the appropriate array-op to apply to each column/block's values.
        array_op = ops.get_array_op(func)

        # Unwrap 0-dim ndarrays to their scalar so the scalar fast path below fires.
        right = lib.item_from_zerodim(right)
        if not is_list_like(right):
            # i.e. scalar, faster than checking np.ndim(right) == 0
            # errstate silences numpy warnings (e.g. divide-by-zero); pandas
            # handles those outcomes via NaN/inf semantics instead.
            with np.errstate(all="ignore"):
                bm = self._mgr.apply(array_op, right=right)
            return self._constructor(bm)

        elif isinstance(right, DataFrame):
            # Caller must have aligned the frames already.
            assert self.index.equals(right.index)
            assert self.columns.equals(right.columns)
            # TODO: The previous assertion `assert right._indexed_same(self)`
            #  fails in cases with empty columns reached via
            #  _frame_arith_method_with_reindex

            # TODO operate_blockwise expects a manager of the same type
            with np.errstate(all="ignore"):
                bm = self._mgr.operate_blockwise(
                    # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
                    # incompatible type "Union[ArrayManager, BlockManager]"; expected
                    # "ArrayManager"
                    # error: Argument 1 to "operate_blockwise" of "BlockManager" has
                    # incompatible type "Union[ArrayManager, BlockManager]"; expected
                    # "BlockManager"
                    right._mgr,  # type: ignore[arg-type]
                    array_op,
                )
            return self._constructor(bm)

        elif isinstance(right, Series) and axis == 1:
            # axis=1 means we want to operate row-by-row
            assert right.index.equals(self.columns)

            right = right._values
            # maybe_align_as_frame ensures we do not have an ndarray here
            assert not isinstance(right, np.ndarray)

            # Pair each column array with the matching element of ``right``.
            with np.errstate(all="ignore"):
                arrays = [
                    array_op(_left, _right)
                    for _left, _right in zip(self._iter_column_arrays(), right)
                ]

        elif isinstance(right, Series):
            assert right.index.equals(self.index)  # Handle other cases later
            right = right._values

            with np.errstate(all="ignore"):
                arrays = [array_op(left, right) for left in self._iter_column_arrays()]

        else:
            # Remaining cases have less-obvious dispatch rules
            raise NotImplementedError(right)

        # Series branches fall through to here: reassemble per-column results.
        return type(self)._from_arrays(
            arrays, self.columns, self.index, verify_integrity=False
        )
7671 def _combine_frame(self, other: DataFrame, func, fill_value=None):
7672 # at this point we have `self._indexed_same(other)`
7674 if fill_value is None:
7675 # since _arith_op may be called in a loop, avoid function call
7676 # overhead if possible by doing this check once
7677 _arith_op = func
7679 else:
7681 def _arith_op(left, right):
7682 # for the mixed_type case where we iterate over columns,
7683 # _arith_op(left, right) is equivalent to
7684 # left._binop(right, func, fill_value=fill_value)
7685 left, right = ops.fill_binop(left, right, fill_value)
7686 return func(left, right)
7688 new_data = self._dispatch_frame_op(other, _arith_op)
7689 return new_data
7691 def _construct_result(self, result) -> DataFrame:
7692 """
7693 Wrap the result of an arithmetic, comparison, or logical operation.
7695 Parameters
7696 ----------
7697 result : DataFrame
7699 Returns
7700 -------
7701 DataFrame
7702 """
7703 out = self._constructor(result, copy=False)
7704 # Pin columns instead of passing to constructor for compat with
7705 # non-unique columns case
7706 out.columns = self.columns
7707 out.index = self.index
7708 return out
7710 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
7711 # Naive implementation, room for optimization
7712 div = self // other
7713 mod = self - div * other
7714 return div, mod
7716 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
7717 # Naive implementation, room for optimization
7718 div = other // self
7719 mod = other - div * self
7720 return div, mod
7722 # ----------------------------------------------------------------------
7723 # Combination-Related
    @doc(
        _shared_docs["compare"],
        """
Returns
-------
DataFrame
    DataFrame that shows the differences stacked side by side.

    The resulting index will be a MultiIndex with 'self' and 'other'
    stacked alternately at the inner level.

Raises
------
ValueError
    When the two DataFrames don't have identical labels or shape.

See Also
--------
Series.compare : Compare with another Series and show differences.
DataFrame.equals : Test whether two objects contain the same elements.

Notes
-----
Matching NaNs will not appear as a difference.

Can only compare identically-labeled
(i.e. same shape, identical row and column labels) DataFrames

Examples
--------
>>> df = pd.DataFrame(
...     {{
...         "col1": ["a", "a", "b", "b", "a"],
...         "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
...     }},
...     columns=["col1", "col2", "col3"],
... )
>>> df
  col1  col2  col3
0    a   1.0   1.0
1    a   2.0   2.0
2    b   3.0   3.0
3    b   NaN   4.0
4    a   5.0   5.0

>>> df2 = df.copy()
>>> df2.loc[0, 'col1'] = 'c'
>>> df2.loc[2, 'col3'] = 4.0
>>> df2
  col1  col2  col3
0    c   1.0   1.0
1    a   2.0   2.0
2    b   3.0   4.0
3    b   NaN   4.0
4    a   5.0   5.0

Align the differences on columns

>>> df.compare(df2)
  col1       col3
  self other self other
0    a     c  NaN   NaN
2  NaN   NaN  3.0   4.0

Assign result_names

>>> df.compare(df2, result_names=("left", "right"))
  col1        col3
  left right  left right
0    a     c   NaN   NaN
2  NaN   NaN   3.0   4.0

Stack the differences on rows

>>> df.compare(df2, align_axis=0)
        col1  col3
0 self     a   NaN
  other    c   NaN
2 self   NaN   3.0
  other  NaN   4.0

Keep the equal values

>>> df.compare(df2, keep_equal=True)
  col1       col3
  self other self other
0    a     c  1.0   1.0
2    b     b  3.0   4.0

Keep all original rows and columns

>>> df.compare(df2, keep_shape=True)
  col1       col2       col3
  self other self other self other
0    a     c  NaN   NaN  NaN   NaN
1  NaN   NaN  NaN   NaN  NaN   NaN
2  NaN   NaN  NaN   NaN  3.0   4.0
3  NaN   NaN  NaN   NaN  NaN   NaN
4  NaN   NaN  NaN   NaN  NaN   NaN

Keep all original rows and columns and also all original values

>>> df.compare(df2, keep_shape=True, keep_equal=True)
  col1       col2       col3
  self other self other self other
0    a     c  1.0   1.0  1.0   1.0
1    a     a  2.0   2.0  2.0   2.0
2    b     b  3.0   3.0  3.0   4.0
3    b     b  NaN   NaN  4.0   4.0
4    a     a  5.0   5.0  5.0   5.0
""",
        klass=_shared_doc_kwargs["klass"],
    )
    def compare(
        self,
        other: DataFrame,
        align_axis: Axis = 1,
        keep_shape: bool = False,
        keep_equal: bool = False,
        result_names: Suffixes = ("self", "other"),
    ) -> DataFrame:
        # Thin wrapper: the shared implementation lives on the NDFrame base class.
        return super().compare(
            other=other,
            align_axis=align_axis,
            keep_shape=keep_shape,
            keep_equal=keep_equal,
            result_names=result_names,
        )
    def combine(
        self,
        other: DataFrame,
        func: Callable[[Series, Series], Series | Hashable],
        fill_value=None,
        overwrite: bool = True,
    ) -> DataFrame:
        """
        Perform column-wise combine with another DataFrame.

        Combines a DataFrame with `other` DataFrame using `func`
        to element-wise combine columns. The row and column indexes of the
        resulting DataFrame will be the union of the two.

        Parameters
        ----------
        other : DataFrame
            The DataFrame to merge column-wise.
        func : function
            Function that takes two series as inputs and return a Series or a
            scalar. Used to merge the two dataframes column by columns.
        fill_value : scalar value, default None
            The value to fill NaNs with prior to passing any column to the
            merge func.
        overwrite : bool, default True
            If True, columns in `self` that do not exist in `other` will be
            overwritten with NaNs.

        Returns
        -------
        DataFrame
            Combination of the provided DataFrames.

        See Also
        --------
        DataFrame.combine_first : Combine two DataFrame objects and default to
            non-null values in frame calling the method.

        Examples
        --------
        Combine using a simple function that chooses the smaller column.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
        >>> df1.combine(df2, take_smaller)
           A  B
        0  0  3
        1  0  3

        Example using a true element-wise combine function.

        >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine(df2, np.minimum)
           A  B
        0  1  2
        1  0  3

        Using `fill_value` fills Nones prior to passing the column to the
        merge function.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine(df2, take_smaller, fill_value=-5)
           A    B
        0  0 -5.0
        1  0  4.0

        However, if the same element in both dataframes is None, that None
        is preserved

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
        >>> df1.combine(df2, take_smaller, fill_value=-5)
           A    B
        0  0 -5.0
        1  0  3.0

        Example that demonstrates the use of `overwrite` and behavior when
        the axis differ between the dataframes.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
        >>> df1.combine(df2, take_smaller)
             A    B     C
        0  NaN  NaN   NaN
        1  NaN  3.0 -10.0
        2  NaN  3.0   1.0

        >>> df1.combine(df2, take_smaller, overwrite=False)
             A    B     C
        0  0.0  NaN   NaN
        1  0.0  3.0 -10.0
        2  NaN  3.0   1.0

        Demonstrating the preference of the passed in dataframe.

        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
        >>> df2.combine(df1, take_smaller)
             A    B   C
        0  0.0  NaN NaN
        1  0.0  3.0 NaN
        2  NaN  3.0 NaN

        >>> df2.combine(df1, take_smaller, overwrite=False)
             A    B    C
        0  0.0  NaN  NaN
        1  0.0  3.0  1.0
        2  NaN  3.0  1.0
        """
        other_idxlen = len(other.index)  # save for compare

        # Align both frames to the union of their row labels first.
        this, other = self.align(other, copy=False)
        new_index = this.index

        # Short-circuit: if either side contributes nothing, return a copy
        # of the other side unchanged.
        if other.empty and len(new_index) == len(self.index):
            return self.copy()

        if self.empty and len(other) == other_idxlen:
            return other.copy()

        # sorts if possible
        new_columns = this.columns.union(other.columns)
        do_fill = fill_value is not None
        result = {}
        for col in new_columns:
            series = this[col]
            otherSeries = other[col]

            this_dtype = series.dtype
            other_dtype = otherSeries.dtype

            this_mask = isna(series)
            other_mask = isna(otherSeries)

            # don't overwrite columns unnecessarily
            # DO propagate if this column is not in the intersection
            if not overwrite and other_mask.all():
                result[col] = this[col].copy()
                continue

            if do_fill:
                # Copy before filling so the aligned inputs are not mutated.
                series = series.copy()
                otherSeries = otherSeries.copy()
                series[this_mask] = fill_value
                otherSeries[other_mask] = fill_value

            if col not in self.columns:
                # If self DataFrame does not have col in other DataFrame,
                # try to promote series, which is all NaN, as other_dtype.
                new_dtype = other_dtype
                try:
                    series = series.astype(new_dtype, copy=False)
                except ValueError:
                    # e.g. new_dtype is integer types
                    pass
            else:
                # if we have different dtypes, possibly promote
                new_dtype = find_common_type([this_dtype, other_dtype])
                series = series.astype(new_dtype, copy=False)
                otherSeries = otherSeries.astype(new_dtype, copy=False)

            arr = func(series, otherSeries)
            if isinstance(new_dtype, np.dtype):
                # if new_dtype is an EA Dtype, then `func` is expected to return
                # the correct dtype without any additional casting
                # error: No overload variant of "maybe_downcast_to_dtype" matches
                # argument types "Union[Series, Hashable]", "dtype[Any]"
                arr = maybe_downcast_to_dtype(  # type: ignore[call-overload]
                    arr, new_dtype
                )

            result[col] = arr

        # convert_objects just in case
        return self._constructor(result, index=new_index, columns=new_columns)
    def combine_first(self, other: DataFrame) -> DataFrame:
        """
        Update null elements with value in the same location in `other`.

        Combine two DataFrame objects by filling null values in one DataFrame
        with non-null values from other DataFrame. The row and column indexes
        of the resulting DataFrame will be the union of the two. The resulting
        dataframe contains the 'first' dataframe values and overrides the
        second one values where both first.loc[index, col] and
        second.loc[index, col] are not missing values, upon calling
        first.combine_first(second).

        Parameters
        ----------
        other : DataFrame
            Provided DataFrame to use to fill null values.

        Returns
        -------
        DataFrame
            The result of combining the provided DataFrame with the other object.

        See Also
        --------
        DataFrame.combine : Perform series-wise operation on two DataFrames
            using a given function.

        Examples
        --------
        >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine_first(df2)
             A    B
        0  1.0  3.0
        1  0.0  4.0

        Null values still persist if the location of that null value
        does not exist in `other`

        >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
        >>> df1.combine_first(df2)
             A    B    C
        0  NaN  4.0  NaN
        1  0.0  3.0  1.0
        2  NaN  3.0  1.0
        """
        # Imported locally — presumably to avoid an import cycle; confirm
        # before moving to module level.
        import pandas.core.computation.expressions as expressions

        def combiner(x, y):
            # Mask of positions in `self`'s column that need filling.
            mask = extract_array(isna(x))

            x_values = extract_array(x, extract_numpy=True)
            y_values = extract_array(y, extract_numpy=True)

            # If the column y in other DataFrame is not in first DataFrame,
            # just return y_values.
            if y.name not in self.columns:
                return y_values

            return expressions.where(mask, y_values, x_values)

        combined = self.combine(other, combiner, overwrite=False)

        # Restore the narrowest common dtype for shared columns where
        # combine() widened them (e.g. int -> float via NaN alignment).
        dtypes = {
            col: find_common_type([self.dtypes[col], other.dtypes[col]])
            for col in self.columns.intersection(other.columns)
            if not is_dtype_equal(combined.dtypes[col], self.dtypes[col])
        }

        if dtypes:
            combined = combined.astype(dtypes)

        return combined
    def update(
        self,
        other,
        join: str = "left",
        overwrite: bool = True,
        filter_func=None,
        errors: str = "ignore",
    ) -> None:
        """
        Modify in place using non-NA values from another DataFrame.

        Aligns on indices. There is no return value.

        Parameters
        ----------
        other : DataFrame, or object coercible into a DataFrame
            Should have at least one matching index/column label
            with the original DataFrame. If a Series is passed,
            its name attribute must be set, and that will be
            used as the column name to align with the original DataFrame.
        join : {'left'}, default 'left'
            Only left join is implemented, keeping the index and columns of the
            original object.
        overwrite : bool, default True
            How to handle non-NA values for overlapping keys:

            * True: overwrite original DataFrame's values
              with values from `other`.
            * False: only update values that are NA in
              the original DataFrame.

        filter_func : callable(1d-array) -> bool 1d-array, optional
            Can choose to replace values other than NA. Return True for values
            that should be updated.
        errors : {'raise', 'ignore'}, default 'ignore'
            If 'raise', will raise a ValueError if the DataFrame and `other`
            both contain non-NA data in the same place.

        Returns
        -------
        None : method directly changes calling object

        Raises
        ------
        ValueError
            * When `errors='raise'` and there's overlapping non-NA data.
            * When `errors` is not either `'ignore'` or `'raise'`
        NotImplementedError
            * If `join != 'left'`

        See Also
        --------
        dict.update : Similar method for dictionaries.
        DataFrame.merge : For column(s)-on-column(s) operations.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2, 3],
        ...                    'B': [400, 500, 600]})
        >>> new_df = pd.DataFrame({'B': [4, 5, 6],
        ...                        'C': [7, 8, 9]})
        >>> df.update(new_df)
        >>> df
           A  B
        0  1  4
        1  2  5
        2  3  6

        The DataFrame's length does not increase as a result of the update,
        only values at matching index/column labels are updated.

        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
        >>> df.update(new_df)
        >>> df
           A  B
        0  a  d
        1  b  e
        2  c  f

        For Series, its name attribute must be set.

        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
        >>> df.update(new_column)
        >>> df
           A  B
        0  a  d
        1  b  y
        2  c  e
        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
        >>> df.update(new_df)
        >>> df
           A  B
        0  a  x
        1  b  d
        2  c  e

        If `other` contains NaNs the corresponding values are not updated
        in the original dataframe.

        >>> df = pd.DataFrame({'A': [1, 2, 3],
        ...                    'B': [400, 500, 600]})
        >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
        >>> df.update(new_df)
        >>> df
           A      B
        0  1    4.0
        1  2  500.0
        2  3    6.0
        """
        import pandas.core.computation.expressions as expressions

        # TODO: Support other joins
        if join != "left":  # pragma: no cover
            raise NotImplementedError("Only left join is supported")
        if errors not in ["ignore", "raise"]:
            raise ValueError("The parameter errors must be either 'ignore' or 'raise'")

        if not isinstance(other, DataFrame):
            other = DataFrame(other)

        # Align `other` onto self's labels; unmatched positions become NA and
        # are therefore excluded from the update by the masks below.
        other = other.reindex_like(self)

        for col in self.columns:
            this = self[col]._values
            that = other[col]._values
            if filter_func is not None:
                with np.errstate(all="ignore"):
                    # Keep (i.e. do NOT update) where filter says no or other is NA.
                    mask = ~filter_func(this) | isna(that)
            else:
                if errors == "raise":
                    mask_this = notna(that)
                    mask_that = notna(this)
                    if any(mask_this & mask_that):
                        raise ValueError("Data overlaps.")

                if overwrite:
                    mask = isna(that)
                else:
                    mask = notna(this)

            # don't overwrite columns unnecessarily
            if mask.all():
                continue

            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", "In a future version, `df.iloc")
                # True in ``mask`` keeps the existing value; False takes `other`'s.
                self.loc[:, col] = expressions.where(mask, this, that)
8262 # ----------------------------------------------------------------------
8263 # Data reshaping
    @Appender(
        """
Examples
--------
>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
...                               'Parrot', 'Parrot'],
...                    'Max Speed': [380., 370., 24., 26.]})
>>> df
   Animal  Max Speed
0  Falcon      380.0
1  Falcon      370.0
2  Parrot       24.0
3  Parrot       26.0
>>> df.groupby(['Animal']).mean()
        Max Speed
Animal
Falcon      375.0
Parrot       25.0

**Hierarchical Indexes**

We can groupby different levels of a hierarchical index
using the `level` parameter:

>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
...           ['Captive', 'Wild', 'Captive', 'Wild']]
>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
...                   index=index)
>>> df
                Max Speed
Animal Type
Falcon Captive      390.0
       Wild         350.0
Parrot Captive       30.0
       Wild          20.0
>>> df.groupby(level=0).mean()
        Max Speed
Animal
Falcon      370.0
Parrot       25.0
>>> df.groupby(level="Type").mean()
         Max Speed
Type
Captive      210.0
Wild         185.0

We can also choose to include NA in group keys or not by setting
`dropna` parameter, the default setting is `True`.

>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])

>>> df.groupby(by=["b"]).sum()
    a   c
b
1.0 2   3
2.0 2   5

>>> df.groupby(by=["b"], dropna=False).sum()
    a   c
b
1.0 2   3
2.0 2   5
NaN 1   4

>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])

>>> df.groupby(by="a").sum()
    b     c
a
a   13.0   13.0
b   12.3  123.0

>>> df.groupby(by="a", dropna=False).sum()
    b     c
a
a   13.0   13.0
b   12.3  123.0
NaN 12.3   33.0

When using ``.apply()``, use ``group_keys`` to include or exclude the group keys.
The ``group_keys`` argument defaults to ``True`` (include).

>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
...                               'Parrot', 'Parrot'],
...                    'Max Speed': [380., 370., 24., 26.]})
>>> df.groupby("Animal", group_keys=True).apply(lambda x: x)
          Animal  Max Speed
Animal
Falcon 0  Falcon      380.0
       1  Falcon      370.0
Parrot 2  Parrot       24.0
       3  Parrot       26.0

>>> df.groupby("Animal", group_keys=False).apply(lambda x: x)
   Animal  Max Speed
0  Falcon      380.0
1  Falcon      370.0
2  Parrot       24.0
3  Parrot       26.0
"""
    )
    @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
    def groupby(
        self,
        by=None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool | lib.NoDefault = no_default,
        squeeze: bool | lib.NoDefault = no_default,
        observed: bool = False,
        dropna: bool = True,
    ) -> DataFrameGroupBy:
        # Imported locally — presumably to avoid a circular import; confirm
        # before moving to module level.
        from pandas.core.groupby.generic import DataFrameGroupBy

        # `squeeze` is deprecated: warn only when the caller passed it
        # explicitly (the sentinel `no_default` means "not supplied").
        if squeeze is not no_default:
            warnings.warn(
                (
                    "The `squeeze` parameter is deprecated and "
                    "will be removed in a future version."
                ),
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        else:
            squeeze = False

        if level is None and by is None:
            raise TypeError("You have to supply one of 'by' and 'level'")
        axis = self._get_axis_number(axis)

        return DataFrameGroupBy(
            obj=self,
            keys=by,
            axis=axis,
            level=level,
            as_index=as_index,
            sort=sort,
            group_keys=group_keys,
            squeeze=squeeze,
            observed=observed,
            dropna=dropna,
        )
    # Shared docstring template; also consumed by pandas.pivot via @Appender.
    _shared_docs[
        "pivot"
    ] = """
        Return reshaped DataFrame organized by given index / column values.

        Reshape data (produce a "pivot" table) based on column values. Uses
        unique values from specified `index` / `columns` to form axes of the
        resulting DataFrame. This function does not support data
        aggregation, multiple values will result in a MultiIndex in the
        columns. See the :ref:`User Guide <reshaping>` for more on reshaping.

        Parameters
        ----------%s
        index : str or object or a list of str, optional
            Column to use to make new frame's index. If None, uses
            existing index.

            .. versionchanged:: 1.1.0
               Also accept list of index names.

        columns : str or object or a list of str
            Column to use to make new frame's columns.

            .. versionchanged:: 1.1.0
               Also accept list of columns names.

        values : str, object or a list of the previous, optional
            Column(s) to use for populating new frame's values. If not
            specified, all remaining columns will be used and the result will
            have hierarchically indexed columns.

        Returns
        -------
        DataFrame
            Returns reshaped DataFrame.

        Raises
        ------
        ValueError:
            When there are any `index`, `columns` combinations with multiple
            values. `DataFrame.pivot_table` when you need to aggregate.

        See Also
        --------
        DataFrame.pivot_table : Generalization of pivot that can handle
            duplicate values for one index/column pair.
        DataFrame.unstack : Pivot based on the index values instead of a
            column.
        wide_to_long : Wide panel to long format. Less flexible but more
            user-friendly than melt.

        Notes
        -----
        For finer-tuned control, see hierarchical indexing documentation along
        with the related stack/unstack methods.

        Reference :ref:`the user guide <reshaping.pivot>` for more examples.

        Examples
        --------
        >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
        ...                            'two'],
        ...                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
        ...                    'baz': [1, 2, 3, 4, 5, 6],
        ...                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        >>> df
            foo   bar  baz  zoo
        0   one   A    1    x
        1   one   B    2    y
        2   one   C    3    z
        3   two   A    4    q
        4   two   B    5    w
        5   two   C    6    t

        >>> df.pivot(index='foo', columns='bar', values='baz')
        bar  A   B   C
        foo
        one  1   2   3
        two  4   5   6

        >>> df.pivot(index='foo', columns='bar')['baz']
        bar  A   B   C
        foo
        one  1   2   3
        two  4   5   6

        >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
              baz       zoo
        bar   A  B  C   A  B  C
        foo
        one   1  2  3   x  y  z
        two   4  5  6   q  w  t

        You could also assign a list of column names or a list of index names.

        >>> df = pd.DataFrame({
        ...        "lev1": [1, 1, 1, 2, 2, 2],
        ...        "lev2": [1, 1, 2, 1, 1, 2],
        ...        "lev3": [1, 2, 1, 2, 1, 2],
        ...        "lev4": [1, 2, 3, 4, 5, 6],
        ...        "values": [0, 1, 2, 3, 4, 5]})
        >>> df
            lev1 lev2 lev3 lev4 values
        0   1    1    1    1    0
        1   1    1    2    2    1
        2   1    2    1    3    2
        3   2    1    2    4    3
        4   2    1    1    5    4
        5   2    2    2    6    5

        >>> df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")
        lev2    1         2
        lev3    1    2    1    2
        lev1
        1     0.0  1.0  2.0  NaN
        2     4.0  3.0  NaN  5.0

        >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")
              lev3    1    2
        lev1  lev2
           1     1  0.0  1.0
                 2  2.0  NaN
           2     1  4.0  3.0
                 2  NaN  5.0

        A ValueError is raised if there are any duplicates.

        >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
        ...                    "bar": ['A', 'A', 'B', 'C'],
        ...                    "baz": [1, 2, 3, 4]})
        >>> df
           foo bar  baz
        0  one   A    1
        1  one   A    2
        2  two   B    3
        3  two   C    4

        Notice that the first two rows are the same for our `index`
        and `columns` arguments.

        >>> df.pivot(index='foo', columns='bar', values='baz')
        Traceback (most recent call last):
           ...
        ValueError: Index contains duplicate entries, cannot reshape
        """

    @Substitution("")
    @Appender(_shared_docs["pivot"])
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
    def pivot(self, index=None, columns=None, values=None) -> DataFrame:
        # Imported locally — presumably to avoid a circular import; confirm
        # before moving to module level.
        from pandas.core.reshape.pivot import pivot

        # Thin wrapper: the implementation lives in pandas.core.reshape.pivot.
        return pivot(self, index=index, columns=columns, values=values)
    # Shared docstring template for DataFrame.pivot_table and the top-level
    # pandas.pivot_table function; the "%s" under Parameters is filled in by
    # @Substitution at decoration time.
    _shared_docs[
        "pivot_table"
    ] = """
    Create a spreadsheet-style pivot table as a DataFrame.

    The levels in the pivot table will be stored in MultiIndex objects
    (hierarchical indexes) on the index and columns of the result DataFrame.

    Parameters
    ----------%s
    values : column to aggregate, optional
    index : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any of the other types (except list).
        Keys to group by on the pivot table index. If an array is passed,
        it is being used as the same manner as column values.
    columns : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any of the other types (except list).
        Keys to group by on the pivot table column. If an array is passed,
        it is being used as the same manner as column values.
    aggfunc : function, list of functions, dict, default numpy.mean
        If list of functions passed, the resulting pivot table will have
        hierarchical columns whose top level are the function names
        (inferred from the function objects themselves)
        If dict is passed, the key is column to aggregate and value
        is function or list of functions.
    fill_value : scalar, default None
        Value to replace missing values with (in the resulting pivot table,
        after aggregation).
    margins : bool, default False
        Add all row / columns (e.g. for subtotal / grand totals).
    dropna : bool, default True
        Do not include columns whose entries are all NaN. If True,
        rows with a NaN value in any column will be omitted before
        computing margins.
    margins_name : str, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.
    observed : bool, default False
        This only applies if any of the groupers are Categoricals.
        If True: only show observed values for categorical groupers.
        If False: show all values for categorical groupers.

        .. versionchanged:: 0.25.0

    sort : bool, default True
        Specifies if the result should be sorted.

        .. versionadded:: 1.3.0

    Returns
    -------
    DataFrame
        An Excel style pivot table.

    See Also
    --------
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.melt: Unpivot a DataFrame from wide to long format,
        optionally leaving identifiers set.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.pivot>` for more examples.

    Examples
    --------
    >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
    ...                          "bar", "bar", "bar", "bar"],
    ...                    "B": ["one", "one", "one", "two", "two",
    ...                          "one", "one", "two", "two"],
    ...                    "C": ["small", "large", "large", "small",
    ...                          "small", "large", "small", "small",
    ...                          "large"],
    ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    ...                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
    >>> df
         A    B      C  D  E
    0  foo  one  small  1  2
    1  foo  one  large  2  4
    2  foo  one  large  2  5
    3  foo  two  small  3  5
    4  foo  two  small  3  6
    5  bar  one  large  4  6
    6  bar  one  small  5  8
    7  bar  two  small  6  9
    8  bar  two  large  7  9

    This first example aggregates values by taking the sum.

    >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum)
    >>> table
    C        large  small
    A   B
    bar one    4.0    5.0
        two    7.0    6.0
    foo one    4.0    1.0
        two    NaN    6.0

    We can also fill missing values using the `fill_value` parameter.

    >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum, fill_value=0)
    >>> table
    C        large  small
    A   B
    bar one      4      5
        two      7      6
    foo one      4      1
        two      0      6

    The next example aggregates by taking the mean across multiple columns.

    >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
    ...                        aggfunc={'D': np.mean,
    ...                                 'E': np.mean})
    >>> table
                    D         E
    A   C
    bar large  5.500000  7.500000
        small  5.500000  8.500000
    foo large  2.000000  4.500000
        small  2.333333  4.333333

    We can also calculate multiple types of aggregations for any given
    value column.

    >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
    ...                        aggfunc={'D': np.mean,
    ...                                 'E': [min, max, np.mean]})
    >>> table
                      D   E
                   mean max      mean  min
    A   C
    bar large  5.500000   9  7.500000    6
        small  5.500000   9  8.500000    8
    foo large  2.000000   5  4.500000    4
        small  2.333333   6  4.333333    2
    """
8711 @Substitution("")
8712 @Appender(_shared_docs["pivot_table"])
8713 def pivot_table(
8714 self,
8715 values=None,
8716 index=None,
8717 columns=None,
8718 aggfunc="mean",
8719 fill_value=None,
8720 margins=False,
8721 dropna=True,
8722 margins_name="All",
8723 observed=False,
8724 sort=True,
8725 ) -> DataFrame:
8726 from pandas.core.reshape.pivot import pivot_table
8728 return pivot_table(
8729 self,
8730 values=values,
8731 index=index,
8732 columns=columns,
8733 aggfunc=aggfunc,
8734 fill_value=fill_value,
8735 margins=margins,
8736 dropna=dropna,
8737 margins_name=margins_name,
8738 observed=observed,
8739 sort=sort,
8740 )
8742 def stack(self, level: Level = -1, dropna: bool = True):
8743 """
8744 Stack the prescribed level(s) from columns to index.
8746 Return a reshaped DataFrame or Series having a multi-level
8747 index with one or more new inner-most levels compared to the current
8748 DataFrame. The new inner-most levels are created by pivoting the
8749 columns of the current dataframe:
8751 - if the columns have a single level, the output is a Series;
8752 - if the columns have multiple levels, the new index
8753 level(s) is (are) taken from the prescribed level(s) and
8754 the output is a DataFrame.
8756 Parameters
8757 ----------
8758 level : int, str, list, default -1
8759 Level(s) to stack from the column axis onto the index
8760 axis, defined as one index or label, or a list of indices
8761 or labels.
8762 dropna : bool, default True
8763 Whether to drop rows in the resulting Frame/Series with
8764 missing values. Stacking a column level onto the index
8765 axis can create combinations of index and column values
8766 that are missing from the original dataframe. See Examples
8767 section.
8769 Returns
8770 -------
8771 DataFrame or Series
8772 Stacked dataframe or series.
8774 See Also
8775 --------
8776 DataFrame.unstack : Unstack prescribed level(s) from index axis
8777 onto column axis.
8778 DataFrame.pivot : Reshape dataframe from long format to wide
8779 format.
8780 DataFrame.pivot_table : Create a spreadsheet-style pivot table
8781 as a DataFrame.
8783 Notes
8784 -----
8785 The function is named by analogy with a collection of books
8786 being reorganized from being side by side on a horizontal
8787 position (the columns of the dataframe) to being stacked
8788 vertically on top of each other (in the index of the
8789 dataframe).
8791 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
8793 Examples
8794 --------
8795 **Single level columns**
8797 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
8798 ... index=['cat', 'dog'],
8799 ... columns=['weight', 'height'])
8801 Stacking a dataframe with a single level column axis returns a Series:
8803 >>> df_single_level_cols
8804 weight height
8805 cat 0 1
8806 dog 2 3
8807 >>> df_single_level_cols.stack()
8808 cat weight 0
8809 height 1
8810 dog weight 2
8811 height 3
8812 dtype: int64
8814 **Multi level columns: simple case**
8816 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
8817 ... ('weight', 'pounds')])
8818 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
8819 ... index=['cat', 'dog'],
8820 ... columns=multicol1)
8822 Stacking a dataframe with a multi-level column axis:
8824 >>> df_multi_level_cols1
8825 weight
8826 kg pounds
8827 cat 1 2
8828 dog 2 4
8829 >>> df_multi_level_cols1.stack()
8830 weight
8831 cat kg 1
8832 pounds 2
8833 dog kg 2
8834 pounds 4
8836 **Missing values**
8838 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
8839 ... ('height', 'm')])
8840 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
8841 ... index=['cat', 'dog'],
8842 ... columns=multicol2)
8844 It is common to have missing values when stacking a dataframe
8845 with multi-level columns, as the stacked dataframe typically
8846 has more values than the original dataframe. Missing values
8847 are filled with NaNs:
8849 >>> df_multi_level_cols2
8850 weight height
8851 kg m
8852 cat 1.0 2.0
8853 dog 3.0 4.0
8854 >>> df_multi_level_cols2.stack()
8855 height weight
8856 cat kg NaN 1.0
8857 m 2.0 NaN
8858 dog kg NaN 3.0
8859 m 4.0 NaN
8861 **Prescribing the level(s) to be stacked**
8863 The first parameter controls which level or levels are stacked:
8865 >>> df_multi_level_cols2.stack(0)
8866 kg m
8867 cat height NaN 2.0
8868 weight 1.0 NaN
8869 dog height NaN 4.0
8870 weight 3.0 NaN
8871 >>> df_multi_level_cols2.stack([0, 1])
8872 cat height m 2.0
8873 weight kg 1.0
8874 dog height m 4.0
8875 weight kg 3.0
8876 dtype: float64
8878 **Dropping missing values**
8880 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
8881 ... index=['cat', 'dog'],
8882 ... columns=multicol2)
8884 Note that rows where all values are missing are dropped by
8885 default but this behaviour can be controlled via the dropna
8886 keyword parameter:
8888 >>> df_multi_level_cols3
8889 weight height
8890 kg m
8891 cat NaN 1.0
8892 dog 2.0 3.0
8893 >>> df_multi_level_cols3.stack(dropna=False)
8894 height weight
8895 cat kg NaN NaN
8896 m 1.0 NaN
8897 dog kg NaN 2.0
8898 m 3.0 NaN
8899 >>> df_multi_level_cols3.stack(dropna=True)
8900 height weight
8901 cat m 1.0 NaN
8902 dog kg NaN 2.0
8903 m 3.0 NaN
8904 """
8905 from pandas.core.reshape.reshape import (
8906 stack,
8907 stack_multiple,
8908 )
8910 if isinstance(level, (tuple, list)):
8911 result = stack_multiple(self, level, dropna=dropna)
8912 else:
8913 result = stack(self, level, dropna=dropna)
8915 return result.__finalize__(self, method="stack")
8917 def explode(
8918 self,
8919 column: IndexLabel,
8920 ignore_index: bool = False,
8921 ) -> DataFrame:
8922 """
8923 Transform each element of a list-like to a row, replicating index values.
8925 .. versionadded:: 0.25.0
8927 Parameters
8928 ----------
8929 column : IndexLabel
8930 Column(s) to explode.
8931 For multiple columns, specify a non-empty list with each element
8932 be str or tuple, and all specified columns their list-like data
8933 on same row of the frame must have matching length.
8935 .. versionadded:: 1.3.0
8936 Multi-column explode
8938 ignore_index : bool, default False
8939 If True, the resulting index will be labeled 0, 1, …, n - 1.
8941 .. versionadded:: 1.1.0
8943 Returns
8944 -------
8945 DataFrame
8946 Exploded lists to rows of the subset columns;
8947 index will be duplicated for these rows.
8949 Raises
8950 ------
8951 ValueError :
8952 * If columns of the frame are not unique.
8953 * If specified columns to explode is empty list.
8954 * If specified columns to explode have not matching count of
8955 elements rowwise in the frame.
8957 See Also
8958 --------
8959 DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
8960 index labels.
8961 DataFrame.melt : Unpivot a DataFrame from wide format to long format.
8962 Series.explode : Explode a DataFrame from list-like columns to long format.
8964 Notes
8965 -----
8966 This routine will explode list-likes including lists, tuples, sets,
8967 Series, and np.ndarray. The result dtype of the subset rows will
8968 be object. Scalars will be returned unchanged, and empty list-likes will
8969 result in a np.nan for that row. In addition, the ordering of rows in the
8970 output will be non-deterministic when exploding sets.
8972 Reference :ref:`the user guide <reshaping.explode>` for more examples.
8974 Examples
8975 --------
8976 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
8977 ... 'B': 1,
8978 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
8979 >>> df
8980 A B C
8981 0 [0, 1, 2] 1 [a, b, c]
8982 1 foo 1 NaN
8983 2 [] 1 []
8984 3 [3, 4] 1 [d, e]
8986 Single-column explode.
8988 >>> df.explode('A')
8989 A B C
8990 0 0 1 [a, b, c]
8991 0 1 1 [a, b, c]
8992 0 2 1 [a, b, c]
8993 1 foo 1 NaN
8994 2 NaN 1 []
8995 3 3 1 [d, e]
8996 3 4 1 [d, e]
8998 Multi-column explode.
9000 >>> df.explode(list('AC'))
9001 A B C
9002 0 0 1 a
9003 0 1 1 b
9004 0 2 1 c
9005 1 foo 1 NaN
9006 2 NaN 1 NaN
9007 3 3 1 d
9008 3 4 1 e
9009 """
9010 if not self.columns.is_unique:
9011 raise ValueError("columns must be unique")
9013 columns: list[Hashable]
9014 if is_scalar(column) or isinstance(column, tuple):
9015 columns = [column]
9016 elif isinstance(column, list) and all(
9017 is_scalar(c) or isinstance(c, tuple) for c in column
9018 ):
9019 if not column:
9020 raise ValueError("column must be nonempty")
9021 if len(column) > len(set(column)):
9022 raise ValueError("column must be unique")
9023 columns = column
9024 else:
9025 raise ValueError("column must be a scalar, tuple, or list thereof")
9027 df = self.reset_index(drop=True)
9028 if len(columns) == 1:
9029 result = df[columns[0]].explode()
9030 else:
9031 mylen = lambda x: len(x) if is_list_like(x) else -1
9032 counts0 = self[columns[0]].apply(mylen)
9033 for c in columns[1:]:
9034 if not all(counts0 == self[c].apply(mylen)):
9035 raise ValueError("columns must have matching element counts")
9036 result = DataFrame({c: df[c].explode() for c in columns})
9037 result = df.drop(columns, axis=1).join(result)
9038 if ignore_index:
9039 result.index = default_index(len(result))
9040 else:
9041 result.index = self.index.take(result.index)
9042 result = result.reindex(columns=self.columns, copy=False)
9044 return result.__finalize__(self, method="explode")
9046 def unstack(self, level: Level = -1, fill_value=None):
9047 """
9048 Pivot a level of the (necessarily hierarchical) index labels.
9050 Returns a DataFrame having a new level of column labels whose inner-most level
9051 consists of the pivoted index labels.
9053 If the index is not a MultiIndex, the output will be a Series
9054 (the analogue of stack when the columns are not a MultiIndex).
9056 Parameters
9057 ----------
9058 level : int, str, or list of these, default -1 (last level)
9059 Level(s) of index to unstack, can pass level name.
9060 fill_value : int, str or dict
9061 Replace NaN with this value if the unstack produces missing values.
9063 Returns
9064 -------
9065 Series or DataFrame
9067 See Also
9068 --------
9069 DataFrame.pivot : Pivot a table based on column values.
9070 DataFrame.stack : Pivot a level of the column labels (inverse operation
9071 from `unstack`).
9073 Notes
9074 -----
9075 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
9077 Examples
9078 --------
9079 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
9080 ... ('two', 'a'), ('two', 'b')])
9081 >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
9082 >>> s
9083 one a 1.0
9084 b 2.0
9085 two a 3.0
9086 b 4.0
9087 dtype: float64
9089 >>> s.unstack(level=-1)
9090 a b
9091 one 1.0 2.0
9092 two 3.0 4.0
9094 >>> s.unstack(level=0)
9095 one two
9096 a 1.0 3.0
9097 b 2.0 4.0
9099 >>> df = s.unstack(level=0)
9100 >>> df.unstack()
9101 one a 1.0
9102 b 2.0
9103 two a 3.0
9104 b 4.0
9105 dtype: float64
9106 """
9107 from pandas.core.reshape.reshape import unstack
9109 result = unstack(self, level, fill_value)
9111 return result.__finalize__(self, method="unstack")
9113 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
9114 def melt(
9115 self,
9116 id_vars=None,
9117 value_vars=None,
9118 var_name=None,
9119 value_name="value",
9120 col_level: Level = None,
9121 ignore_index: bool = True,
9122 ) -> DataFrame:
9124 return melt(
9125 self,
9126 id_vars=id_vars,
9127 value_vars=value_vars,
9128 var_name=var_name,
9129 value_name=value_name,
9130 col_level=col_level,
9131 ignore_index=ignore_index,
9132 ).__finalize__(self, method="melt")
9134 # ----------------------------------------------------------------------
9135 # Time series-related
9137 @doc(
9138 Series.diff,
9139 klass="DataFrame",
9140 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
9141 "Take difference over rows (0) or columns (1).\n",
9142 other_klass="Series",
9143 examples=dedent(
9144 """
9145 Difference with previous row
9147 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
9148 ... 'b': [1, 1, 2, 3, 5, 8],
9149 ... 'c': [1, 4, 9, 16, 25, 36]})
9150 >>> df
9151 a b c
9152 0 1 1 1
9153 1 2 1 4
9154 2 3 2 9
9155 3 4 3 16
9156 4 5 5 25
9157 5 6 8 36
9159 >>> df.diff()
9160 a b c
9161 0 NaN NaN NaN
9162 1 1.0 0.0 3.0
9163 2 1.0 1.0 5.0
9164 3 1.0 1.0 7.0
9165 4 1.0 2.0 9.0
9166 5 1.0 3.0 11.0
9168 Difference with previous column
9170 >>> df.diff(axis=1)
9171 a b c
9172 0 NaN 0 0
9173 1 NaN -1 3
9174 2 NaN -1 7
9175 3 NaN -1 13
9176 4 NaN 0 20
9177 5 NaN 2 28
9179 Difference with 3rd previous row
9181 >>> df.diff(periods=3)
9182 a b c
9183 0 NaN NaN NaN
9184 1 NaN NaN NaN
9185 2 NaN NaN NaN
9186 3 3.0 2.0 15.0
9187 4 3.0 4.0 21.0
9188 5 3.0 6.0 27.0
9190 Difference with following row
9192 >>> df.diff(periods=-1)
9193 a b c
9194 0 -1.0 0.0 -3.0
9195 1 -1.0 -1.0 -5.0
9196 2 -1.0 -1.0 -7.0
9197 3 -1.0 -2.0 -9.0
9198 4 -1.0 -3.0 -11.0
9199 5 NaN NaN NaN
9201 Overflow in input dtype
9203 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
9204 >>> df.diff()
9205 a
9206 0 NaN
9207 1 255.0"""
9208 ),
9209 )
9210 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
9211 if not lib.is_integer(periods):
9212 if not (
9213 is_float(periods)
9214 # error: "int" has no attribute "is_integer"
9215 and periods.is_integer() # type: ignore[attr-defined]
9216 ):
9217 raise ValueError("periods must be an integer")
9218 periods = int(periods)
9220 axis = self._get_axis_number(axis)
9221 if axis == 1 and periods != 0:
9222 return self - self.shift(periods, axis=axis)
9224 new_data = self._mgr.diff(n=periods, axis=axis)
9225 return self._constructor(new_data).__finalize__(self, "diff")
9227 # ----------------------------------------------------------------------
9228 # Function application
9230 def _gotitem(
9231 self,
9232 key: IndexLabel,
9233 ndim: int,
9234 subset: DataFrame | Series | None = None,
9235 ) -> DataFrame | Series:
9236 """
9237 Sub-classes to define. Return a sliced object.
9239 Parameters
9240 ----------
9241 key : string / list of selections
9242 ndim : {1, 2}
9243 requested ndim of result
9244 subset : object, default None
9245 subset to act on
9246 """
9247 if subset is None:
9248 subset = self
9249 elif subset.ndim == 1: # is Series
9250 return subset
9252 # TODO: _shallow_copy(subset)?
9253 return subset[key]
    # Docstring fragments injected into DataFrame.aggregate via @doc below.
    _agg_summary_and_see_also_doc = dedent(
        """
    The aggregation operations are always performed over an axis, either the
    index (default) or the column axis. This behavior is different from
    `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
    `var`), where the default is to compute the aggregation of the flattened
    array, e.g., ``numpy.mean(arr_2d)`` as opposed to
    ``numpy.mean(arr_2d, axis=0)``.

    `agg` is an alias for `aggregate`. Use the alias.

    See Also
    --------
    DataFrame.apply : Perform any type of operations.
    DataFrame.transform : Perform transformation type operations.
    core.groupby.GroupBy : Perform operations over groups.
    core.resample.Resampler : Perform operations over resampled bins.
    core.window.Rolling : Perform operations over rolling window.
    core.window.Expanding : Perform operations over expanding window.
    core.window.ExponentialMovingWindow : Perform operation over exponential weighted
        window.
    """
    )

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame([[1, 2, 3],
    ...                    [4, 5, 6],
    ...                    [7, 8, 9],
    ...                    [np.nan, np.nan, np.nan]],
    ...                   columns=['A', 'B', 'C'])

    Aggregate these functions over the rows.

    >>> df.agg(['sum', 'min'])
            A     B     C
    sum  12.0  15.0  18.0
    min   1.0   2.0   3.0

    Different aggregations per column.

    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
            A    B
    sum  12.0  NaN
    min   1.0  2.0
    max   NaN  8.0

    Aggregate different functions over the columns and rename the index of the resulting
    DataFrame.

    >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
         A    B    C
    x  7.0  NaN  NaN
    y  NaN  2.0  NaN
    z  NaN  NaN  6.0

    Aggregate over the columns.

    >>> df.agg("mean", axis="columns")
    0    2.0
    1    5.0
    2    8.0
    3    NaN
    dtype: float64
    """
    )
9324 @doc(
9325 _shared_docs["aggregate"],
9326 klass=_shared_doc_kwargs["klass"],
9327 axis=_shared_doc_kwargs["axis"],
9328 see_also=_agg_summary_and_see_also_doc,
9329 examples=_agg_examples_doc,
9330 )
9331 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
9332 from pandas.core.apply import frame_apply
9334 axis = self._get_axis_number(axis)
9336 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
9338 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
9339 result = op.agg()
9341 if relabeling:
9342 # This is to keep the order to columns occurrence unchanged, and also
9343 # keep the order of new columns occurrence unchanged
9345 # For the return values of reconstruct_func, if relabeling is
9346 # False, columns and order will be None.
9347 assert columns is not None
9348 assert order is not None
9350 result_in_dict = relabel_result(result, func, columns, order)
9351 result = DataFrame(result_in_dict, index=columns)
9353 return result
9355 agg = aggregate
    # error: Signature of "any" incompatible with supertype "NDFrame" [override]
    @overload  # type: ignore[override]
    def any(
        self,
        *,
        axis: Axis = ...,
        bool_only: bool | None = ...,
        skipna: bool = ...,
        level: None = ...,
        **kwargs,
    ) -> Series:
        # Overload: without a `level`, the reduction collapses to a Series.
        ...

    @overload
    def any(
        self,
        *,
        axis: Axis = ...,
        bool_only: bool | None = ...,
        skipna: bool = ...,
        level: Level,
        **kwargs,
    ) -> DataFrame | Series:
        # Overload: with a `level`, the result may keep a frame shape.
        ...

    @doc(NDFrame.any, **_shared_doc_kwargs)
    def any(
        self,
        axis: Axis = 0,
        bool_only: bool | None = None,
        skipna: bool = True,
        level: Level = None,
        **kwargs,
    ) -> DataFrame | Series:
        # Body is intentionally `...`: this stub only narrows the signature
        # and attaches the shared docstring via @doc.
        # NOTE(review): the runtime implementation is presumably installed
        # on the class elsewhere (outside this view) — confirm where `any`
        # is actually bound before relying on this stub.
        ...
9393 @doc(
9394 _shared_docs["transform"],
9395 klass=_shared_doc_kwargs["klass"],
9396 axis=_shared_doc_kwargs["axis"],
9397 )
9398 def transform(
9399 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
9400 ) -> DataFrame:
9401 from pandas.core.apply import frame_apply
9403 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
9404 result = op.transform()
9405 assert isinstance(result, DataFrame)
9406 return result
9408 def apply(
9409 self,
9410 func: AggFuncType,
9411 axis: Axis = 0,
9412 raw: bool = False,
9413 result_type: Literal["expand", "reduce", "broadcast"] | None = None,
9414 args=(),
9415 **kwargs,
9416 ):
9417 """
9418 Apply a function along an axis of the DataFrame.
9420 Objects passed to the function are Series objects whose index is
9421 either the DataFrame's index (``axis=0``) or the DataFrame's columns
9422 (``axis=1``). By default (``result_type=None``), the final return type
9423 is inferred from the return type of the applied function. Otherwise,
9424 it depends on the `result_type` argument.
9426 Parameters
9427 ----------
9428 func : function
9429 Function to apply to each column or row.
9430 axis : {0 or 'index', 1 or 'columns'}, default 0
9431 Axis along which the function is applied:
9433 * 0 or 'index': apply function to each column.
9434 * 1 or 'columns': apply function to each row.
9436 raw : bool, default False
9437 Determines if row or column is passed as a Series or ndarray object:
9439 * ``False`` : passes each row or column as a Series to the
9440 function.
9441 * ``True`` : the passed function will receive ndarray objects
9442 instead.
9443 If you are just applying a NumPy reduction function this will
9444 achieve much better performance.
9446 result_type : {'expand', 'reduce', 'broadcast', None}, default None
9447 These only act when ``axis=1`` (columns):
9449 * 'expand' : list-like results will be turned into columns.
9450 * 'reduce' : returns a Series if possible rather than expanding
9451 list-like results. This is the opposite of 'expand'.
9452 * 'broadcast' : results will be broadcast to the original shape
9453 of the DataFrame, the original index and columns will be
9454 retained.
9456 The default behaviour (None) depends on the return value of the
9457 applied function: list-like results will be returned as a Series
9458 of those. However if the apply function returns a Series these
9459 are expanded to columns.
9460 args : tuple
9461 Positional arguments to pass to `func` in addition to the
9462 array/series.
9463 **kwargs
9464 Additional keyword arguments to pass as keywords arguments to
9465 `func`.
9467 Returns
9468 -------
9469 Series or DataFrame
9470 Result of applying ``func`` along the given axis of the
9471 DataFrame.
9473 See Also
9474 --------
9475 DataFrame.applymap: For elementwise operations.
9476 DataFrame.aggregate: Only perform aggregating type operations.
9477 DataFrame.transform: Only perform transforming type operations.
9479 Notes
9480 -----
9481 Functions that mutate the passed object can produce unexpected
9482 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
9483 for more details.
9485 Examples
9486 --------
9487 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
9488 >>> df
9489 A B
9490 0 4 9
9491 1 4 9
9492 2 4 9
9494 Using a numpy universal function (in this case the same as
9495 ``np.sqrt(df)``):
9497 >>> df.apply(np.sqrt)
9498 A B
9499 0 2.0 3.0
9500 1 2.0 3.0
9501 2 2.0 3.0
9503 Using a reducing function on either axis
9505 >>> df.apply(np.sum, axis=0)
9506 A 12
9507 B 27
9508 dtype: int64
9510 >>> df.apply(np.sum, axis=1)
9511 0 13
9512 1 13
9513 2 13
9514 dtype: int64
9516 Returning a list-like will result in a Series
9518 >>> df.apply(lambda x: [1, 2], axis=1)
9519 0 [1, 2]
9520 1 [1, 2]
9521 2 [1, 2]
9522 dtype: object
9524 Passing ``result_type='expand'`` will expand list-like results
9525 to columns of a Dataframe
9527 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
9528 0 1
9529 0 1 2
9530 1 1 2
9531 2 1 2
9533 Returning a Series inside the function is similar to passing
9534 ``result_type='expand'``. The resulting column names
9535 will be the Series index.
9537 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
9538 foo bar
9539 0 1 2
9540 1 1 2
9541 2 1 2
9543 Passing ``result_type='broadcast'`` will ensure the same shape
9544 result, whether list-like or scalar is returned by the function,
9545 and broadcast it along the axis. The resulting column names will
9546 be the originals.
9548 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
9549 A B
9550 0 1 2
9551 1 1 2
9552 2 1 2
9553 """
9554 from pandas.core.apply import frame_apply
9556 op = frame_apply(
9557 self,
9558 func=func,
9559 axis=axis,
9560 raw=raw,
9561 result_type=result_type,
9562 args=args,
9563 kwargs=kwargs,
9564 )
9565 return op.apply().__finalize__(self, method="apply")
9567 def applymap(
9568 self, func: PythonFuncType, na_action: str | None = None, **kwargs
9569 ) -> DataFrame:
9570 """
9571 Apply a function to a Dataframe elementwise.
9573 This method applies a function that accepts and returns a scalar
9574 to every element of a DataFrame.
9576 Parameters
9577 ----------
9578 func : callable
9579 Python function, returns a single value from a single value.
9580 na_action : {None, 'ignore'}, default None
9581 If ‘ignore’, propagate NaN values, without passing them to func.
9583 .. versionadded:: 1.2
9585 **kwargs
9586 Additional keyword arguments to pass as keywords arguments to
9587 `func`.
9589 .. versionadded:: 1.3.0
9591 Returns
9592 -------
9593 DataFrame
9594 Transformed DataFrame.
9596 See Also
9597 --------
9598 DataFrame.apply : Apply a function along input axis of DataFrame.
9600 Examples
9601 --------
9602 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
9603 >>> df
9604 0 1
9605 0 1.000 2.120
9606 1 3.356 4.567
9608 >>> df.applymap(lambda x: len(str(x)))
9609 0 1
9610 0 3 4
9611 1 5 5
9613 Like Series.map, NA values can be ignored:
9615 >>> df_copy = df.copy()
9616 >>> df_copy.iloc[0, 0] = pd.NA
9617 >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
9618 0 1
9619 0 NaN 4
9620 1 5.0 5
9622 Note that a vectorized version of `func` often exists, which will
9623 be much faster. You could square each number elementwise.
9625 >>> df.applymap(lambda x: x**2)
9626 0 1
9627 0 1.000000 4.494400
9628 1 11.262736 20.857489
9630 But it's better to avoid applymap in that case.
9632 >>> df ** 2
9633 0 1
9634 0 1.000000 4.494400
9635 1 11.262736 20.857489
9636 """
9637 if na_action not in {"ignore", None}:
9638 raise ValueError(
9639 f"na_action must be 'ignore' or None. Got {repr(na_action)}"
9640 )
9641 ignore_na = na_action == "ignore"
9642 func = functools.partial(func, **kwargs)
9644 # if we have a dtype == 'M8[ns]', provide boxed values
9645 def infer(x):
9646 if x.empty:
9647 return lib.map_infer(x, func, ignore_na=ignore_na)
9648 return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)
9650 return self.apply(infer).__finalize__(self, "applymap")
9652 # ----------------------------------------------------------------------
9653 # Merging / joining methods
9655 def append(
9656 self,
9657 other,
9658 ignore_index: bool = False,
9659 verify_integrity: bool = False,
9660 sort: bool = False,
9661 ) -> DataFrame:
9662 """
9663 Append rows of `other` to the end of caller, returning a new object.
9665 .. deprecated:: 1.4.0
9666 Use :func:`concat` instead. For further details see
9667 :ref:`whatsnew_140.deprecations.frame_series_append`
9669 Columns in `other` that are not in the caller are added as new columns.
9671 Parameters
9672 ----------
9673 other : DataFrame or Series/dict-like object, or list of these
9674 The data to append.
9675 ignore_index : bool, default False
9676 If True, the resulting axis will be labeled 0, 1, …, n - 1.
9677 verify_integrity : bool, default False
9678 If True, raise ValueError on creating index with duplicates.
9679 sort : bool, default False
9680 Sort columns if the columns of `self` and `other` are not aligned.
9682 .. versionchanged:: 1.0.0
9684 Changed to not sort by default.
9686 Returns
9687 -------
9688 DataFrame
9689 A new DataFrame consisting of the rows of caller and the rows of `other`.
9691 See Also
9692 --------
9693 concat : General function to concatenate DataFrame or Series objects.
9695 Notes
9696 -----
9697 If a list of dict/series is passed and the keys are all contained in
9698 the DataFrame's index, the order of the columns in the resulting
9699 DataFrame will be unchanged.
9701 Iteratively appending rows to a DataFrame can be more computationally
9702 intensive than a single concatenate. A better solution is to append
9703 those rows to a list and then concatenate the list with the original
9704 DataFrame all at once.
9706 Examples
9707 --------
9708 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), index=['x', 'y'])
9709 >>> df
9710 A B
9711 x 1 2
9712 y 3 4
9713 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), index=['x', 'y'])
9714 >>> df.append(df2)
9715 A B
9716 x 1 2
9717 y 3 4
9718 x 5 6
9719 y 7 8
9721 With `ignore_index` set to True:
9723 >>> df.append(df2, ignore_index=True)
9724 A B
9725 0 1 2
9726 1 3 4
9727 2 5 6
9728 3 7 8
9730 The following, while not recommended methods for generating DataFrames,
9731 show two ways to generate a DataFrame from multiple data sources.
9733 Less efficient:
9735 >>> df = pd.DataFrame(columns=['A'])
9736 >>> for i in range(5):
9737 ... df = df.append({'A': i}, ignore_index=True)
9738 >>> df
9739 A
9740 0 0
9741 1 1
9742 2 2
9743 3 3
9744 4 4
9746 More efficient:
9748 >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
9749 ... ignore_index=True)
9750 A
9751 0 0
9752 1 1
9753 2 2
9754 3 3
9755 4 4
9756 """
9757 warnings.warn(
9758 "The frame.append method is deprecated "
9759 "and will be removed from pandas in a future version. "
9760 "Use pandas.concat instead.",
9761 FutureWarning,
9762 stacklevel=find_stack_level(),
9763 )
9765 return self._append(other, ignore_index, verify_integrity, sort)
9767 def _append(
9768 self,
9769 other,
9770 ignore_index: bool = False,
9771 verify_integrity: bool = False,
9772 sort: bool = False,
9773 ) -> DataFrame:
9774 if isinstance(other, (Series, dict)):
9775 if isinstance(other, dict):
9776 if not ignore_index:
9777 raise TypeError("Can only append a dict if ignore_index=True")
9778 other = Series(other)
9779 if other.name is None and not ignore_index:
9780 raise TypeError(
9781 "Can only append a Series if ignore_index=True "
9782 "or if the Series has a name"
9783 )
9785 index = Index([other.name], name=self.index.name)
9786 row_df = other.to_frame().T
9787 # infer_objects is needed for
9788 # test_append_empty_frame_to_series_with_dateutil_tz
9789 other = row_df.infer_objects().rename_axis(index.names, copy=False)
9790 elif isinstance(other, list):
9791 if not other:
9792 pass
9793 elif not isinstance(other[0], DataFrame):
9794 other = DataFrame(other)
9795 if self.index.name is not None and not ignore_index:
9796 other.index.name = self.index.name
9798 from pandas.core.reshape.concat import concat
9800 if isinstance(other, (list, tuple)):
9801 to_concat = [self, *other]
9802 else:
9803 to_concat = [self, other]
9805 result = concat(
9806 to_concat,
9807 ignore_index=ignore_index,
9808 verify_integrity=verify_integrity,
9809 sort=sort,
9810 )
9811 return result.__finalize__(self, method="append")
9813 def join(
9814 self,
9815 other: DataFrame | Series | list[DataFrame | Series],
9816 on: IndexLabel | None = None,
9817 how: str = "left",
9818 lsuffix: str = "",
9819 rsuffix: str = "",
9820 sort: bool = False,
9821 validate: str | None = None,
9822 ) -> DataFrame:
9823 """
9824 Join columns of another DataFrame.
9826 Join columns with `other` DataFrame either on index or on a key
9827 column. Efficiently join multiple DataFrame objects by index at once by
9828 passing a list.
9830 Parameters
9831 ----------
9832 other : DataFrame, Series, or a list containing any combination of them
9833 Index should be similar to one of the columns in this one. If a
9834 Series is passed, its name attribute must be set, and that will be
9835 used as the column name in the resulting joined DataFrame.
9836 on : str, list of str, or array-like, optional
9837 Column or index level name(s) in the caller to join on the index
9838 in `other`, otherwise joins index-on-index. If multiple
9839 values given, the `other` DataFrame must have a MultiIndex. Can
9840 pass an array as the join key if it is not already contained in
9841 the calling DataFrame. Like an Excel VLOOKUP operation.
9842 how : {'left', 'right', 'outer', 'inner'}, default 'left'
9843 How to handle the operation of the two objects.
9845 * left: use calling frame's index (or column if on is specified)
9846 * right: use `other`'s index.
9847 * outer: form union of calling frame's index (or column if on is
9848 specified) with `other`'s index, and sort it.
9849 lexicographically.
9850 * inner: form intersection of calling frame's index (or column if
9851 on is specified) with `other`'s index, preserving the order
9852 of the calling's one.
9853 * cross: creates the cartesian product from both frames, preserves the order
9854 of the left keys.
9856 .. versionadded:: 1.2.0
9858 lsuffix : str, default ''
9859 Suffix to use from left frame's overlapping columns.
9860 rsuffix : str, default ''
9861 Suffix to use from right frame's overlapping columns.
9862 sort : bool, default False
9863 Order result DataFrame lexicographically by the join key. If False,
9864 the order of the join key depends on the join type (how keyword).
9865 validate : str, optional
9866 If specified, checks if join is of specified type.
9867 * "one_to_one" or "1:1": check if join keys are unique in both left
9868 and right datasets.
9869 * "one_to_many" or "1:m": check if join keys are unique in left dataset.
9870 * "many_to_one" or "m:1": check if join keys are unique in right dataset.
9871 * "many_to_many" or "m:m": allowed, but does not result in checks.
9872 .. versionadded:: 1.5.0
9874 Returns
9875 -------
9876 DataFrame
9877 A dataframe containing columns from both the caller and `other`.
9879 See Also
9880 --------
9881 DataFrame.merge : For column(s)-on-column(s) operations.
9883 Notes
9884 -----
9885 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
9886 passing a list of `DataFrame` objects.
9888 Support for specifying index levels as the `on` parameter was added
9889 in version 0.23.0.
9891 Examples
9892 --------
9893 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
9894 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
9896 >>> df
9897 key A
9898 0 K0 A0
9899 1 K1 A1
9900 2 K2 A2
9901 3 K3 A3
9902 4 K4 A4
9903 5 K5 A5
9905 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
9906 ... 'B': ['B0', 'B1', 'B2']})
9908 >>> other
9909 key B
9910 0 K0 B0
9911 1 K1 B1
9912 2 K2 B2
9914 Join DataFrames using their indexes.
9916 >>> df.join(other, lsuffix='_caller', rsuffix='_other')
9917 key_caller A key_other B
9918 0 K0 A0 K0 B0
9919 1 K1 A1 K1 B1
9920 2 K2 A2 K2 B2
9921 3 K3 A3 NaN NaN
9922 4 K4 A4 NaN NaN
9923 5 K5 A5 NaN NaN
9925 If we want to join using the key columns, we need to set key to be
9926 the index in both `df` and `other`. The joined DataFrame will have
9927 key as its index.
9929 >>> df.set_index('key').join(other.set_index('key'))
9930 A B
9931 key
9932 K0 A0 B0
9933 K1 A1 B1
9934 K2 A2 B2
9935 K3 A3 NaN
9936 K4 A4 NaN
9937 K5 A5 NaN
9939 Another option to join using the key columns is to use the `on`
9940 parameter. DataFrame.join always uses `other`'s index but we can use
9941 any column in `df`. This method preserves the original DataFrame's
9942 index in the result.
9944 >>> df.join(other.set_index('key'), on='key')
9945 key A B
9946 0 K0 A0 B0
9947 1 K1 A1 B1
9948 2 K2 A2 B2
9949 3 K3 A3 NaN
9950 4 K4 A4 NaN
9951 5 K5 A5 NaN
9953 Using non-unique key values shows how they are matched.
9955 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
9956 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
9958 >>> df
9959 key A
9960 0 K0 A0
9961 1 K1 A1
9962 2 K1 A2
9963 3 K3 A3
9964 4 K0 A4
9965 5 K1 A5
9967 >>> df.join(other.set_index('key'), on='key', validate='m:1')
9968 key A B
9969 0 K0 A0 B0
9970 1 K1 A1 B1
9971 2 K1 A2 B1
9972 3 K3 A3 NaN
9973 4 K0 A4 B0
9974 5 K1 A5 B1
9975 """
9976 return self._join_compat(
9977 other,
9978 on=on,
9979 how=how,
9980 lsuffix=lsuffix,
9981 rsuffix=rsuffix,
9982 sort=sort,
9983 validate=validate,
9984 )
9986 def _join_compat(
9987 self,
9988 other: DataFrame | Series | Iterable[DataFrame | Series],
9989 on: IndexLabel | None = None,
9990 how: str = "left",
9991 lsuffix: str = "",
9992 rsuffix: str = "",
9993 sort: bool = False,
9994 validate: str | None = None,
9995 ):
9996 from pandas.core.reshape.concat import concat
9997 from pandas.core.reshape.merge import merge
9999 if isinstance(other, Series):
10000 if other.name is None:
10001 raise ValueError("Other Series must have a name")
10002 other = DataFrame({other.name: other})
10004 if isinstance(other, DataFrame):
10005 if how == "cross":
10006 return merge(
10007 self,
10008 other,
10009 how=how,
10010 on=on,
10011 suffixes=(lsuffix, rsuffix),
10012 sort=sort,
10013 validate=validate,
10014 )
10015 return merge(
10016 self,
10017 other,
10018 left_on=on,
10019 how=how,
10020 left_index=on is None,
10021 right_index=True,
10022 suffixes=(lsuffix, rsuffix),
10023 sort=sort,
10024 validate=validate,
10025 )
10026 else:
10027 if on is not None:
10028 raise ValueError(
10029 "Joining multiple DataFrames only supported for joining on index"
10030 )
10032 if rsuffix or lsuffix:
10033 raise ValueError(
10034 "Suffixes not supported when joining multiple DataFrames"
10035 )
10037 # Mypy thinks the RHS is a
10038 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
10039 # the LHS is an "Iterable[DataFrame]", but in reality both types are
10040 # "Iterable[Union[DataFrame, Series]]" due to the if statements
10041 frames = [cast("DataFrame | Series", self)] + list(other)
10043 can_concat = all(df.index.is_unique for df in frames)
10045 # join indexes only using concat
10046 if can_concat:
10047 if how == "left":
10048 res = concat(
10049 frames, axis=1, join="outer", verify_integrity=True, sort=sort
10050 )
10051 return res.reindex(self.index, copy=False)
10052 else:
10053 return concat(
10054 frames, axis=1, join=how, verify_integrity=True, sort=sort
10055 )
10057 joined = frames[0]
10059 for frame in frames[1:]:
10060 joined = merge(
10061 joined,
10062 frame,
10063 how=how,
10064 left_index=True,
10065 right_index=True,
10066 validate=validate,
10067 )
10069 return joined
10071 @Substitution("")
10072 @Appender(_merge_doc, indents=2)
10073 def merge(
10074 self,
10075 right: DataFrame | Series,
10076 how: str = "inner",
10077 on: IndexLabel | None = None,
10078 left_on: IndexLabel | None = None,
10079 right_on: IndexLabel | None = None,
10080 left_index: bool = False,
10081 right_index: bool = False,
10082 sort: bool = False,
10083 suffixes: Suffixes = ("_x", "_y"),
10084 copy: bool = True,
10085 indicator: bool = False,
10086 validate: str | None = None,
10087 ) -> DataFrame:
10088 from pandas.core.reshape.merge import merge
10090 return merge(
10091 self,
10092 right,
10093 how=how,
10094 on=on,
10095 left_on=left_on,
10096 right_on=right_on,
10097 left_index=left_index,
10098 right_index=right_index,
10099 sort=sort,
10100 suffixes=suffixes,
10101 copy=copy,
10102 indicator=indicator,
10103 validate=validate,
10104 )
10106 def round(
10107 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
10108 ) -> DataFrame:
10109 """
10110 Round a DataFrame to a variable number of decimal places.
10112 Parameters
10113 ----------
10114 decimals : int, dict, Series
10115 Number of decimal places to round each column to. If an int is
10116 given, round each column to the same number of places.
10117 Otherwise dict and Series round to variable numbers of places.
10118 Column names should be in the keys if `decimals` is a
10119 dict-like, or in the index if `decimals` is a Series. Any
10120 columns not included in `decimals` will be left as is. Elements
10121 of `decimals` which are not columns of the input will be
10122 ignored.
10123 *args
10124 Additional keywords have no effect but might be accepted for
10125 compatibility with numpy.
10126 **kwargs
10127 Additional keywords have no effect but might be accepted for
10128 compatibility with numpy.
10130 Returns
10131 -------
10132 DataFrame
10133 A DataFrame with the affected columns rounded to the specified
10134 number of decimal places.
10136 See Also
10137 --------
10138 numpy.around : Round a numpy array to the given number of decimals.
10139 Series.round : Round a Series to the given number of decimals.
10141 Examples
10142 --------
10143 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
10144 ... columns=['dogs', 'cats'])
10145 >>> df
10146 dogs cats
10147 0 0.21 0.32
10148 1 0.01 0.67
10149 2 0.66 0.03
10150 3 0.21 0.18
10152 By providing an integer each column is rounded to the same number
10153 of decimal places
10155 >>> df.round(1)
10156 dogs cats
10157 0 0.2 0.3
10158 1 0.0 0.7
10159 2 0.7 0.0
10160 3 0.2 0.2
10162 With a dict, the number of places for specific columns can be
10163 specified with the column names as key and the number of decimal
10164 places as value
10166 >>> df.round({'dogs': 1, 'cats': 0})
10167 dogs cats
10168 0 0.2 0.0
10169 1 0.0 1.0
10170 2 0.7 0.0
10171 3 0.2 0.0
10173 Using a Series, the number of places for specific columns can be
10174 specified with the column names as index and the number of
10175 decimal places as value
10177 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
10178 >>> df.round(decimals)
10179 dogs cats
10180 0 0.2 0.0
10181 1 0.0 1.0
10182 2 0.7 0.0
10183 3 0.2 0.0
10184 """
10185 from pandas.core.reshape.concat import concat
10187 def _dict_round(df: DataFrame, decimals):
10188 for col, vals in df.items():
10189 try:
10190 yield _series_round(vals, decimals[col])
10191 except KeyError:
10192 yield vals
10194 def _series_round(ser: Series, decimals: int):
10195 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
10196 return ser.round(decimals)
10197 return ser
10199 nv.validate_round(args, kwargs)
10201 if isinstance(decimals, (dict, Series)):
10202 if isinstance(decimals, Series) and not decimals.index.is_unique:
10203 raise ValueError("Index of decimals must be unique")
10204 if is_dict_like(decimals) and not all(
10205 is_integer(value) for _, value in decimals.items()
10206 ):
10207 raise TypeError("Values in decimals must be integers")
10208 new_cols = list(_dict_round(self, decimals))
10209 elif is_integer(decimals):
10210 # Dispatch to Series.round
10211 new_cols = [_series_round(v, decimals) for _, v in self.items()]
10212 else:
10213 raise TypeError("decimals must be an integer, a dict-like or a Series")
10215 if len(new_cols) > 0:
10216 return self._constructor(
10217 concat(new_cols, axis=1), index=self.index, columns=self.columns
10218 ).__finalize__(self, method="round")
10219 else:
10220 return self
10222 # ----------------------------------------------------------------------
10223 # Statistical methods, etc.
10225 def corr(
10226 self,
10227 method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
10228 min_periods: int = 1,
10229 numeric_only: bool | lib.NoDefault = lib.no_default,
10230 ) -> DataFrame:
10231 """
10232 Compute pairwise correlation of columns, excluding NA/null values.
10234 Parameters
10235 ----------
10236 method : {'pearson', 'kendall', 'spearman'} or callable
10237 Method of correlation:
10239 * pearson : standard correlation coefficient
10240 * kendall : Kendall Tau correlation coefficient
10241 * spearman : Spearman rank correlation
10242 * callable: callable with input two 1d ndarrays
10243 and returning a float. Note that the returned matrix from corr
10244 will have 1 along the diagonals and will be symmetric
10245 regardless of the callable's behavior.
10246 min_periods : int, optional
10247 Minimum number of observations required per pair of columns
10248 to have a valid result. Currently only available for Pearson
10249 and Spearman correlation.
10250 numeric_only : bool, default True
10251 Include only `float`, `int` or `boolean` data.
10253 .. versionadded:: 1.5.0
10255 .. deprecated:: 1.5.0
10256 The default value of ``numeric_only`` will be ``False`` in a future
10257 version of pandas.
10259 Returns
10260 -------
10261 DataFrame
10262 Correlation matrix.
10264 See Also
10265 --------
10266 DataFrame.corrwith : Compute pairwise correlation with another
10267 DataFrame or Series.
10268 Series.corr : Compute the correlation between two Series.
10270 Notes
10271 -----
10272 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
10274 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
10275 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
10276 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
10278 Examples
10279 --------
10280 >>> def histogram_intersection(a, b):
10281 ... v = np.minimum(a, b).sum().round(decimals=1)
10282 ... return v
10283 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
10284 ... columns=['dogs', 'cats'])
10285 >>> df.corr(method=histogram_intersection)
10286 dogs cats
10287 dogs 1.0 0.3
10288 cats 0.3 1.0
10290 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
10291 ... columns=['dogs', 'cats'])
10292 >>> df.corr(min_periods=3)
10293 dogs cats
10294 dogs 1.0 NaN
10295 cats NaN 1.0
10296 """ # noqa:E501
10297 numeric_only_bool = com.resolve_numeric_only(numeric_only)
10298 data = self._get_numeric_data() if numeric_only_bool else self
10299 if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
10300 com.deprecate_numeric_only_default(type(self), "corr")
10302 cols = data.columns
10303 idx = cols.copy()
10304 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
10306 if method == "pearson":
10307 correl = libalgos.nancorr(mat, minp=min_periods)
10308 elif method == "spearman":
10309 correl = libalgos.nancorr_spearman(mat, minp=min_periods)
10310 elif method == "kendall" or callable(method):
10311 if min_periods is None:
10312 min_periods = 1
10313 mat = mat.T
10314 corrf = nanops.get_corr_func(method)
10315 K = len(cols)
10316 correl = np.empty((K, K), dtype=float)
10317 mask = np.isfinite(mat)
10318 for i, ac in enumerate(mat):
10319 for j, bc in enumerate(mat):
10320 if i > j:
10321 continue
10323 valid = mask[i] & mask[j]
10324 if valid.sum() < min_periods:
10325 c = np.nan
10326 elif i == j:
10327 c = 1.0
10328 elif not valid.all():
10329 c = corrf(ac[valid], bc[valid])
10330 else:
10331 c = corrf(ac, bc)
10332 correl[i, j] = c
10333 correl[j, i] = c
10334 else:
10335 raise ValueError(
10336 "method must be either 'pearson', "
10337 "'spearman', 'kendall', or a callable, "
10338 f"'{method}' was supplied"
10339 )
10341 return self._constructor(correl, index=idx, columns=cols)
10343 def cov(
10344 self,
10345 min_periods: int | None = None,
10346 ddof: int | None = 1,
10347 numeric_only: bool | lib.NoDefault = lib.no_default,
10348 ) -> DataFrame:
10349 """
10350 Compute pairwise covariance of columns, excluding NA/null values.
10352 Compute the pairwise covariance among the series of a DataFrame.
10353 The returned data frame is the `covariance matrix
10354 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
10355 of the DataFrame.
10357 Both NA and null values are automatically excluded from the
10358 calculation. (See the note below about bias from missing values.)
10359 A threshold can be set for the minimum number of
10360 observations for each value created. Comparisons with observations
10361 below this threshold will be returned as ``NaN``.
10363 This method is generally used for the analysis of time series data to
10364 understand the relationship between different measures
10365 across time.
10367 Parameters
10368 ----------
10369 min_periods : int, optional
10370 Minimum number of observations required per pair of columns
10371 to have a valid result.
10373 ddof : int, default 1
10374 Delta degrees of freedom. The divisor used in calculations
10375 is ``N - ddof``, where ``N`` represents the number of elements.
10377 .. versionadded:: 1.1.0
10379 numeric_only : bool, default True
10380 Include only `float`, `int` or `boolean` data.
10382 .. versionadded:: 1.5.0
10384 .. deprecated:: 1.5.0
10385 The default value of ``numeric_only`` will be ``False`` in a future
10386 version of pandas.
10388 Returns
10389 -------
10390 DataFrame
10391 The covariance matrix of the series of the DataFrame.
10393 See Also
10394 --------
10395 Series.cov : Compute covariance with another Series.
10396 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
10397 covariance.
10398 core.window.expanding.Expanding.cov : Expanding sample covariance.
10399 core.window.rolling.Rolling.cov : Rolling sample covariance.
10401 Notes
10402 -----
10403 Returns the covariance matrix of the DataFrame's time series.
10404 The covariance is normalized by N-ddof.
10406 For DataFrames that have Series that are missing data (assuming that
10407 data is `missing at random
10408 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
10409 the returned covariance matrix will be an unbiased estimate
10410 of the variance and covariance between the member Series.
10412 However, for many applications this estimate may not be acceptable
10413 because the estimate covariance matrix is not guaranteed to be positive
10414 semi-definite. This could lead to estimate correlations having
10415 absolute values which are greater than one, and/or a non-invertible
10416 covariance matrix. See `Estimation of covariance matrices
10417 <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
10418 matrices>`__ for more details.
10420 Examples
10421 --------
10422 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
10423 ... columns=['dogs', 'cats'])
10424 >>> df.cov()
10425 dogs cats
10426 dogs 0.666667 -1.000000
10427 cats -1.000000 1.666667
10429 >>> np.random.seed(42)
10430 >>> df = pd.DataFrame(np.random.randn(1000, 5),
10431 ... columns=['a', 'b', 'c', 'd', 'e'])
10432 >>> df.cov()
10433 a b c d e
10434 a 0.998438 -0.020161 0.059277 -0.008943 0.014144
10435 b -0.020161 1.059352 -0.008543 -0.024738 0.009826
10436 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
10437 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
10438 e 0.014144 0.009826 -0.000271 -0.013692 0.977795
10440 **Minimum number of periods**
10442 This method also supports an optional ``min_periods`` keyword
10443 that specifies the required minimum number of non-NA observations for
10444 each column pair in order to have a valid result:
10446 >>> np.random.seed(42)
10447 >>> df = pd.DataFrame(np.random.randn(20, 3),
10448 ... columns=['a', 'b', 'c'])
10449 >>> df.loc[df.index[:5], 'a'] = np.nan
10450 >>> df.loc[df.index[5:10], 'b'] = np.nan
10451 >>> df.cov(min_periods=12)
10452 a b c
10453 a 0.316741 NaN -0.150812
10454 b NaN 1.248003 0.191417
10455 c -0.150812 0.191417 0.895202
10456 """
10457 numeric_only_bool = com.resolve_numeric_only(numeric_only)
10458 data = self._get_numeric_data() if numeric_only_bool else self
10459 if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
10460 com.deprecate_numeric_only_default(type(self), "cov")
10462 cols = data.columns
10463 idx = cols.copy()
10464 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
10466 if notna(mat).all():
10467 if min_periods is not None and min_periods > len(mat):
10468 base_cov = np.empty((mat.shape[1], mat.shape[1]))
10469 base_cov.fill(np.nan)
10470 else:
10471 base_cov = np.cov(mat.T, ddof=ddof)
10472 base_cov = base_cov.reshape((len(cols), len(cols)))
10473 else:
10474 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
10476 return self._constructor(base_cov, index=idx, columns=cols)
    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis = 0,
        drop: bool = False,
        method: Literal["pearson", "kendall", "spearman"]
        | Callable[[np.ndarray, np.ndarray], float] = "pearson",
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ) -> Series:
        """
        Compute pairwise correlation.

        Pairwise correlation is computed between rows or columns of
        DataFrame with rows or columns of Series or DataFrame. DataFrames
        are first aligned along both axes before computing the
        correlations.

        Parameters
        ----------
        other : DataFrame, Series
            Object with which to compute correlations.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
            column-wise.
        drop : bool, default False
            Drop missing indices from result.
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
                and returning a float.

        numeric_only : bool, default True
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. deprecated:: 1.5.0
                The default value of ``numeric_only`` will be ``False`` in a future
                version of pandas.

        Returns
        -------
        Series
            Pairwise correlations.

        See Also
        --------
        DataFrame.corr : Compute pairwise correlation of columns.

        Examples
        --------
        >>> index = ["a", "b", "c", "d", "e"]
        >>> columns = ["one", "two", "three", "four"]
        >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
        >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
        >>> df1.corrwith(df2)
        one      1.0
        two      1.0
        three    1.0
        four     1.0
        dtype: float64

        >>> df2.corrwith(df1, axis=1)
        a    1.0
        b    1.0
        c    1.0
        d    1.0
        e    NaN
        dtype: float64
        """  # noqa:E501
        axis = self._get_axis_number(axis)
        numeric_only_bool = com.resolve_numeric_only(numeric_only)
        this = self._get_numeric_data() if numeric_only_bool else self
        # Warn when the legacy numeric_only default silently dropped
        # non-numeric columns from the computation.
        if numeric_only is lib.no_default and len(this.columns) < len(self.columns):
            com.deprecate_numeric_only_default(type(self), "corrwith")

        if isinstance(other, Series):
            # A single Series: correlate it against every row/column of self.
            return this.apply(lambda x: other.corr(x, method=method), axis=axis)

        if numeric_only_bool:
            other = other._get_numeric_data()
        # Only labels present in both frames take part in the correlation.
        left, right = this.align(other, join="inner", copy=False)

        if axis == 1:
            # Row-wise correlation: transpose so columns are the pair axis.
            left = left.T
            right = right.T

        if method == "pearson":
            # mask missing values: adding ``0 * other`` propagates each
            # side's NaNs to the other, so both operands share one support
            left = left + right * 0
            right = right + left * 0

            # demeaned data
            ldem = left - left.mean(numeric_only=numeric_only_bool)
            rdem = right - right.mean(numeric_only=numeric_only_bool)

            num = (ldem * rdem).sum()
            # Pearson denominator: (n - 1) * std(left) * std(right)
            dom = (
                (left.count() - 1)
                * left.std(numeric_only=numeric_only_bool)
                * right.std(numeric_only=numeric_only_bool)
            )

            correl = num / dom

        elif method in ["kendall", "spearman"] or callable(method):

            def c(x):
                # Pairwise correlation of one aligned column pair.
                return nanops.nancorr(x[0], x[1], method=method)

            correl = self._constructor_sliced(
                map(c, zip(left.values.T, right.values.T)), index=left.columns
            )

        else:
            raise ValueError(
                f"Invalid method {method} was passed, "
                "valid methods are: 'pearson', 'kendall', "
                "'spearman', or callable"
            )

        if not drop:
            # Find non-matching labels along the given axis
            # and append missing correlations (GH 22375)
            raxis = 1 if axis == 0 else 0
            result_index = this._get_axis(raxis).union(other._get_axis(raxis))
            idx_diff = result_index.difference(correl.index)

            if len(idx_diff) > 0:
                correl = correl._append(
                    Series([np.nan] * len(idx_diff), index=idx_diff)
                )

        return correl
10617 # ----------------------------------------------------------------------
10618 # ndarray-like stats methods
10620 def count(self, axis: Axis = 0, level: Level = None, numeric_only: bool = False):
10621 """
10622 Count non-NA cells for each column or row.
10624 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
10625 on `pandas.options.mode.use_inf_as_na`) are considered NA.
10627 Parameters
10628 ----------
10629 axis : {0 or 'index', 1 or 'columns'}, default 0
10630 If 0 or 'index' counts are generated for each column.
10631 If 1 or 'columns' counts are generated for each row.
10632 level : int or str, optional
10633 If the axis is a `MultiIndex` (hierarchical), count along a
10634 particular `level`, collapsing into a `DataFrame`.
10635 A `str` specifies the level name.
10636 numeric_only : bool, default False
10637 Include only `float`, `int` or `boolean` data.
10639 Returns
10640 -------
10641 Series or DataFrame
10642 For each column/row the number of non-NA/null entries.
10643 If `level` is specified returns a `DataFrame`.
10645 See Also
10646 --------
10647 Series.count: Number of non-NA elements in a Series.
10648 DataFrame.value_counts: Count unique combinations of columns.
10649 DataFrame.shape: Number of DataFrame rows and columns (including NA
10650 elements).
10651 DataFrame.isna: Boolean same-sized DataFrame showing places of NA
10652 elements.
10654 Examples
10655 --------
10656 Constructing DataFrame from a dictionary:
10658 >>> df = pd.DataFrame({"Person":
10659 ... ["John", "Myla", "Lewis", "John", "Myla"],
10660 ... "Age": [24., np.nan, 21., 33, 26],
10661 ... "Single": [False, True, True, True, False]})
10662 >>> df
10663 Person Age Single
10664 0 John 24.0 False
10665 1 Myla NaN True
10666 2 Lewis 21.0 True
10667 3 John 33.0 True
10668 4 Myla 26.0 False
10670 Notice the uncounted NA values:
10672 >>> df.count()
10673 Person 5
10674 Age 4
10675 Single 5
10676 dtype: int64
10678 Counts for each **row**:
10680 >>> df.count(axis='columns')
10681 0 3
10682 1 2
10683 2 3
10684 3 3
10685 4 3
10686 dtype: int64
10687 """
10688 axis = self._get_axis_number(axis)
10689 if level is not None:
10690 warnings.warn(
10691 "Using the level keyword in DataFrame and Series aggregations is "
10692 "deprecated and will be removed in a future version. Use groupby "
10693 "instead. df.count(level=1) should use df.groupby(level=1).count().",
10694 FutureWarning,
10695 stacklevel=find_stack_level(),
10696 )
10697 res = self._count_level(level, axis=axis, numeric_only=numeric_only)
10698 return res.__finalize__(self, method="count")
10700 if numeric_only:
10701 frame = self._get_numeric_data()
10702 else:
10703 frame = self
10705 # GH #423
10706 if len(frame._get_axis(axis)) == 0:
10707 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
10708 else:
10709 if frame._is_mixed_type or frame._mgr.any_extension_types:
10710 # the or any_extension_types is really only hit for single-
10711 # column frames with an extension array
10712 result = notna(frame).sum(axis=axis)
10713 else:
10714 # GH13407
10715 series_counts = notna(frame).sum(axis=axis)
10716 counts = series_counts.values
10717 result = self._constructor_sliced(
10718 counts, index=frame._get_agg_axis(axis)
10719 )
10721 return result.astype("int64").__finalize__(self, method="count")
    def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False):
        """
        Count non-NA cells grouped by one level of a hierarchical axis.

        Backs the deprecated ``level=`` path of :meth:`DataFrame.count`.

        Parameters
        ----------
        level : int or str
            Level (position or name) of the MultiIndex axis to collapse on.
        axis : int, default 0
            0 counts within columns, 1 within rows; the counted axis must
            be a MultiIndex.
        numeric_only : bool, default False
            Restrict the count to numeric columns first.

        Returns
        -------
        DataFrame
            Counts with one row (or column, for ``axis=1``) per level value.

        Raises
        ------
        TypeError
            If the counted axis is not hierarchical.
        """
        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        count_axis = frame._get_axis(axis)
        agg_axis = frame._get_agg_axis(axis)

        if not isinstance(count_axis, MultiIndex):
            raise TypeError(
                f"Can only count levels on hierarchical {self._get_axis_name(axis)}."
            )

        # Mask NaNs: Mask rows or columns where the index level is NaN, and all
        # values in the DataFrame that are NaN
        if frame._is_mixed_type:
            # Since we have mixed types, calling notna(frame.values) might
            # upcast everything to object
            values_mask = notna(frame).values
        else:
            # But use the speedup when we have homogeneous dtypes
            values_mask = notna(frame.values)

        # Positions whose level value is itself NaN contribute nothing.
        index_mask = notna(count_axis.get_level_values(level=level))
        if axis == 1:
            mask = index_mask & values_mask
        else:
            # Broadcast the per-row index mask across all columns.
            mask = index_mask.reshape(-1, 1) & values_mask

        if isinstance(level, int):
            level_number = level
        else:
            # Resolve a level name to its position.
            level_number = count_axis._get_level_number(level)

        level_name = count_axis._names[level_number]
        level_index = count_axis.levels[level_number]._rename(name=level_name)
        level_codes = ensure_platform_int(count_axis.codes[level_number])
        # C-level aggregation of valid cells per level code along ``axis``.
        counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis)

        if axis == 1:
            result = self._constructor(counts, index=agg_axis, columns=level_index)
        else:
            result = self._constructor(counts, index=level_index, columns=agg_axis)

        return result
    def _reduce(
        self,
        op,
        name: str,
        *,
        axis: Axis = 0,
        skipna: bool = True,
        numeric_only: bool | None = None,
        filter_type=None,
        **kwds,
    ):
        """
        Common engine behind DataFrame reductions (sum, mean, any, ...).

        Parameters
        ----------
        op : callable
            Array-level reduction; invoked as
            ``op(values, axis=..., skipna=..., **kwds)``.
        name : str
            Reduction name (e.g. "sum", "mean"); used for ExtensionArray
            dispatch and in deprecation messages.
        axis : Axis, default 0
        skipna : bool, default True
        numeric_only : bool or None, default None
            ``None`` enables the deprecated "drop nuisance columns" fallback.
        filter_type : {None, "bool"}, default None
            ``"bool"`` restricts to boolean data (any/all); ``None`` selects
            numeric data when filtering is needed.
        **kwds
            Forwarded to ``op``.
        """
        assert filter_type is None or filter_type == "bool", filter_type
        out_dtype = "bool" if filter_type == "bool" else None

        if numeric_only is None and name in ["mean", "median"]:
            # Deprecation path: mean/median currently silently drop
            # datetime columns when numeric_only is unspecified.
            own_dtypes = [arr.dtype for arr in self._mgr.arrays]

            dtype_is_dt = np.array(
                [is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
                dtype=bool,
            )
            if dtype_is_dt.any():
                warnings.warn(
                    "DataFrame.mean and DataFrame.median with numeric_only=None "
                    "will include datetime64 and datetime64tz columns in a "
                    "future version.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
                # Non-copy equivalent to
                #  dt64_cols = self.dtypes.apply(is_datetime64_any_dtype)
                #  cols = self.columns[~dt64_cols]
                #  self = self[cols]
                predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
                mgr = self._mgr._get_data_subset(predicate)
                self = type(self)(mgr)

        # TODO: Make other agg func handle axis=None properly GH#21597
        axis = self._get_axis_number(axis)
        labels = self._get_agg_axis(axis)
        assert axis in [0, 1]

        def func(values: np.ndarray):
            # We only use this in the case that operates on self.values
            return op(values, axis=axis, skipna=skipna, **kwds)

        def blk_func(values, axis=1):
            # Per-block reduction: ExtensionArrays dispatch to their own
            # _reduce; plain ndarrays go through ``op`` directly.
            if isinstance(values, ExtensionArray):
                if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
                    self._mgr, ArrayManager
                ):
                    return values._reduce(name, axis=1, skipna=skipna, **kwds)
                return values._reduce(name, skipna=skipna, **kwds)
            else:
                return op(values, axis=axis, skipna=skipna, **kwds)

        def _get_data() -> DataFrame:
            # Select only the columns the (filtered) reduction applies to.
            if filter_type is None:
                data = self._get_numeric_data()
            else:
                # GH#25101, GH#24434
                assert filter_type == "bool"
                data = self._get_bool_data()
            return data

        numeric_only_bool = com.resolve_numeric_only(numeric_only)
        if numeric_only is not None or axis == 0:
            # For numeric_only non-None and axis non-None, we know
            # which blocks to use and no try/except is needed.
            # For numeric_only=None only the case with axis==0 and no object
            # dtypes are unambiguous can be handled with BlockManager.reduce
            # Case with EAs see GH#35881
            df = self
            if numeric_only_bool:
                df = _get_data()
            if axis == 1:
                # Reduce along rows by transposing and reducing along columns.
                df = df.T
                axis = 0

            # With numeric_only=None, failing blocks are silently dropped
            # (deprecated behavior; warned about below).
            ignore_failures = numeric_only is None

            # After possibly _get_data and transposing, we are now in the
            # simple case where we can use BlockManager.reduce
            res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures)
            out = df._constructor(res).iloc[0]
            if out_dtype is not None:
                out = out.astype(out_dtype)
            if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
                # Even if we are object dtype, follow numpy and return
                # float64, see test_apply_funcs_over_empty
                out = out.astype(np.float64)

            if numeric_only is None and out.shape[0] != df.shape[1]:
                # columns have been dropped GH#41480
                com.deprecate_numeric_only_default(
                    type(self), name, deprecate_none=True
                )

            return out

        # Legacy fallback: numeric_only=None with axis=1 — try the reduction
        # on the full 2D ndarray, and only on TypeError fall back to the
        # filtered (numeric/bool) subset, with a deprecation warning.
        assert numeric_only is None

        data = self
        values = data.values

        try:
            result = func(values)

        except TypeError:
            # e.g. in nanops trying to convert strs to float

            data = _get_data()
            labels = data._get_agg_axis(axis)

            values = data.values
            with np.errstate(all="ignore"):
                result = func(values)

            # columns have been dropped GH#41480
            arg_name = "numeric_only"
            if name in ["all", "any"]:
                arg_name = "bool_only"
            warnings.warn(
                "Dropping of nuisance columns in DataFrame reductions "
                f"(with '{arg_name}=None') is deprecated; in a future "
                "version this will raise TypeError. Select only valid "
                "columns before calling the reduction.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        if hasattr(result, "dtype"):
            # Post-process dtype: bool filters coerce back to bool when no
            # NAs remain; object results try a float64 coercion.
            if filter_type == "bool" and notna(result).all():
                result = result.astype(np.bool_)
            elif filter_type is None and is_object_dtype(result.dtype):
                try:
                    result = result.astype(np.float64)
                except (ValueError, TypeError):
                    # try to coerce to the original dtypes item by item if we can
                    pass

        result = self._constructor_sliced(result, index=labels)
        return result
10914 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
10915 """
10916 Special case for _reduce to try to avoid a potentially-expensive transpose.
10918 Apply the reduction block-wise along axis=1 and then reduce the resulting
10919 1D arrays.
10920 """
10921 if name == "all":
10922 result = np.ones(len(self), dtype=bool)
10923 ufunc = np.logical_and
10924 elif name == "any":
10925 result = np.zeros(len(self), dtype=bool)
10926 # error: Incompatible types in assignment
10927 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
10928 # Literal[20], Literal[False]]", variable has type
10929 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
10930 # Literal[True]]")
10931 ufunc = np.logical_or # type: ignore[assignment]
10932 else:
10933 raise NotImplementedError(name)
10935 for arr in self._mgr.arrays:
10936 middle = func(arr, axis=0, skipna=skipna)
10937 result = ufunc(result, middle)
10939 res_ser = self._constructor_sliced(result, index=self.index)
10940 return res_ser
10942 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
10943 """
10944 Count number of distinct elements in specified axis.
10946 Return Series with number of distinct elements. Can ignore NaN
10947 values.
10949 Parameters
10950 ----------
10951 axis : {0 or 'index', 1 or 'columns'}, default 0
10952 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
10953 column-wise.
10954 dropna : bool, default True
10955 Don't include NaN in the counts.
10957 Returns
10958 -------
10959 Series
10961 See Also
10962 --------
10963 Series.nunique: Method nunique for Series.
10964 DataFrame.count: Count non-NA cells for each column or row.
10966 Examples
10967 --------
10968 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
10969 >>> df.nunique()
10970 A 3
10971 B 2
10972 dtype: int64
10974 >>> df.nunique(axis=1)
10975 0 1
10976 1 2
10977 2 2
10978 dtype: int64
10979 """
10980 return self.apply(Series.nunique, axis=axis, dropna=dropna)
10982 @doc(_shared_docs["idxmin"], numeric_only_default="False")
10983 def idxmin(
10984 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
10985 ) -> Series:
10986 axis = self._get_axis_number(axis)
10987 if numeric_only:
10988 data = self._get_numeric_data()
10989 else:
10990 data = self
10992 res = data._reduce(
10993 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
10994 )
10995 indices = res._values
10997 # indices will always be np.ndarray since axis is not None and
10998 # values is a 2d array for DataFrame
10999 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
11000 assert isinstance(indices, np.ndarray) # for mypy
11002 index = data._get_axis(axis)
11003 result = [index[i] if i >= 0 else np.nan for i in indices]
11004 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
11005 return final_result.__finalize__(self, method="idxmin")
11007 @doc(_shared_docs["idxmax"], numeric_only_default="False")
11008 def idxmax(
11009 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
11010 ) -> Series:
11012 axis = self._get_axis_number(axis)
11013 if numeric_only:
11014 data = self._get_numeric_data()
11015 else:
11016 data = self
11018 res = data._reduce(
11019 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
11020 )
11021 indices = res._values
11023 # indices will always be np.ndarray since axis is not None and
11024 # values is a 2d array for DataFrame
11025 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
11026 assert isinstance(indices, np.ndarray) # for mypy
11028 index = data._get_axis(axis)
11029 result = [index[i] if i >= 0 else np.nan for i in indices]
11030 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
11031 return final_result.__finalize__(self, method="idxmax")
11033 def _get_agg_axis(self, axis_num: int) -> Index:
11034 """
11035 Let's be explicit about this.
11036 """
11037 if axis_num == 0:
11038 return self.columns
11039 elif axis_num == 1:
11040 return self.index
11041 else:
11042 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
11044 def mode(
11045 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
11046 ) -> DataFrame:
11047 """
11048 Get the mode(s) of each element along the selected axis.
11050 The mode of a set of values is the value that appears most often.
11051 It can be multiple values.
11053 Parameters
11054 ----------
11055 axis : {0 or 'index', 1 or 'columns'}, default 0
11056 The axis to iterate over while searching for the mode:
11058 * 0 or 'index' : get mode of each column
11059 * 1 or 'columns' : get mode of each row.
11061 numeric_only : bool, default False
11062 If True, only apply to numeric columns.
11063 dropna : bool, default True
11064 Don't consider counts of NaN/NaT.
11066 Returns
11067 -------
11068 DataFrame
11069 The modes of each column or row.
11071 See Also
11072 --------
11073 Series.mode : Return the highest frequency value in a Series.
11074 Series.value_counts : Return the counts of values in a Series.
11076 Examples
11077 --------
11078 >>> df = pd.DataFrame([('bird', 2, 2),
11079 ... ('mammal', 4, np.nan),
11080 ... ('arthropod', 8, 0),
11081 ... ('bird', 2, np.nan)],
11082 ... index=('falcon', 'horse', 'spider', 'ostrich'),
11083 ... columns=('species', 'legs', 'wings'))
11084 >>> df
11085 species legs wings
11086 falcon bird 2 2.0
11087 horse mammal 4 NaN
11088 spider arthropod 8 0.0
11089 ostrich bird 2 NaN
11091 By default, missing values are not considered, and the mode of wings
11092 are both 0 and 2. Because the resulting DataFrame has two rows,
11093 the second row of ``species`` and ``legs`` contains ``NaN``.
11095 >>> df.mode()
11096 species legs wings
11097 0 bird 2.0 0.0
11098 1 NaN NaN 2.0
11100 Setting ``dropna=False`` ``NaN`` values are considered and they can be
11101 the mode (like for wings).
11103 >>> df.mode(dropna=False)
11104 species legs wings
11105 0 bird 2 NaN
11107 Setting ``numeric_only=True``, only the mode of numeric columns is
11108 computed, and columns of other types are ignored.
11110 >>> df.mode(numeric_only=True)
11111 legs wings
11112 0 2.0 0.0
11113 1 NaN 2.0
11115 To compute the mode over columns and not rows, use the axis parameter:
11117 >>> df.mode(axis='columns', numeric_only=True)
11118 0 1
11119 falcon 2.0 NaN
11120 horse 4.0 NaN
11121 spider 0.0 8.0
11122 ostrich 2.0 NaN
11123 """
11124 data = self if not numeric_only else self._get_numeric_data()
11126 def f(s):
11127 return s.mode(dropna=dropna)
11129 data = data.apply(f, axis=axis)
11130 # Ensure index is type stable (should always use int index)
11131 if data.empty:
11132 data.index = default_index(0)
11134 return data
    # Typing overloads for ``quantile``: a scalar ``q`` yields a Series,
    # a listlike ``q`` yields a Series or DataFrame.  The runtime
    # implementation follows immediately below.
    @overload
    def quantile(
        self,
        q: float = ...,
        axis: Axis = ...,
        numeric_only: bool | lib.NoDefault = ...,
        interpolation: QuantileInterpolation = ...,
    ) -> Series:
        ...

    @overload
    def quantile(
        self,
        q: AnyArrayLike | Sequence[float],
        axis: Axis = ...,
        numeric_only: bool | lib.NoDefault = ...,
        interpolation: QuantileInterpolation = ...,
    ) -> Series | DataFrame:
        ...

    @overload
    def quantile(
        self,
        q: float | AnyArrayLike | Sequence[float] = ...,
        axis: Axis = ...,
        numeric_only: bool | lib.NoDefault = ...,
        interpolation: QuantileInterpolation = ...,
    ) -> Series | DataFrame:
        ...
    def quantile(
        self,
        q: float | AnyArrayLike | Sequence[float] = 0.5,
        axis: Axis = 0,
        numeric_only: bool | lib.NoDefault = no_default,
        interpolation: QuantileInterpolation = "linear",
        method: Literal["single", "table"] = "single",
    ) -> Series | DataFrame:
        """
        Return values at the given quantile over requested axis.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value between 0 <= q <= 1, the quantile(s) to compute.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        numeric_only : bool, default True
            If False, the quantile of datetime and timedelta data will be
            computed as well.

            .. deprecated:: 1.5.0
                The default value of ``numeric_only`` will be ``False`` in a future
                version of pandas.

        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

            * linear: `i + (j - i) * fraction`, where `fraction` is the
              fractional part of the index surrounded by `i` and `j`.
            * lower: `i`.
            * higher: `j`.
            * nearest: `i` or `j` whichever is nearest.
            * midpoint: (`i` + `j`) / 2.
        method : {'single', 'table'}, default 'single'
            Whether to compute quantiles per-column ('single') or over all columns
            ('table'). When 'table', the only allowed interpolation methods are
            'nearest', 'lower', and 'higher'.

        Returns
        -------
        Series or DataFrame

            If ``q`` is an array, a DataFrame will be returned where the
              index is ``q``, the columns are the columns of self, and the
              values are the quantiles.
            If ``q`` is a float, a Series will be returned where the
              index is the columns of self and the values are the quantiles.

        See Also
        --------
        core.window.rolling.Rolling.quantile: Rolling quantile.
        numpy.percentile: Numpy function to compute the percentile.

        Examples
        --------
        >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
        ...                   columns=['a', 'b'])
        >>> df.quantile(.1)
        a    1.3
        b    3.7
        Name: 0.1, dtype: float64
        >>> df.quantile([.1, .5])
               a     b
        0.1  1.3   3.7
        0.5  2.5  55.0

        Specifying `method='table'` will compute the quantile over all columns.

        >>> df.quantile(.1, method="table", interpolation="nearest")
        a    1
        b    1
        Name: 0.1, dtype: int64
        >>> df.quantile([.1, .5], method="table", interpolation="nearest")
             a    b
        0.1  1    1
        0.5  3  100

        Specifying `numeric_only=False` will also compute the quantile of
        datetime and timedelta data.

        >>> df = pd.DataFrame({'A': [1, 2],
        ...                    'B': [pd.Timestamp('2010'),
        ...                          pd.Timestamp('2011')],
        ...                    'C': [pd.Timedelta('1 days'),
        ...                          pd.Timedelta('2 days')]})
        >>> df.quantile(0.5, numeric_only=False)
        A                    1.5
        B    2010-07-02 12:00:00
        C        1 days 12:00:00
        Name: 0.5, dtype: object
        """
        validate_percentile(q)
        axis = self._get_axis_number(axis)
        # Warn when the deprecated numeric_only default would actually
        # change the result (i.e. non-numeric columns are present).
        any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes)
        if numeric_only is no_default and any_not_numeric:
            com.deprecate_numeric_only_default(type(self), "quantile")
        numeric_only = com.resolve_numeric_only(numeric_only)

        if not is_list_like(q):
            # BlockManager.quantile expects listlike, so we wrap and unwrap here
            # error: List item 0 has incompatible type "Union[float, Union[Union[
            # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
            # expected "float"
            res_df = self.quantile(  # type: ignore[call-overload]
                [q],
                axis=axis,
                numeric_only=numeric_only,
                interpolation=interpolation,
                method=method,
            )
            if method == "single":
                res = res_df.iloc[0]
            else:
                # cannot directly iloc over sparse arrays
                res = res_df.T.iloc[:, 0]
            if axis == 1 and len(self) == 0:
                # GH#41544 try to get an appropriate dtype
                dtype = find_common_type(list(self.dtypes))
                if needs_i8_conversion(dtype):
                    return res.astype(dtype)
            return res

        # Listlike q: result is a DataFrame indexed by q.
        q = Index(q, dtype=np.float64)
        data = self._get_numeric_data() if numeric_only else self

        if axis == 1:
            # Row-wise quantiles are computed on the transpose.
            data = data.T

        if len(data.columns) == 0:
            # GH#23925 _get_numeric_data may have dropped all columns
            cols = Index([], name=self.columns.name)

            dtype = np.float64
            if axis == 1:
                # GH#41544 try to get an appropriate dtype
                cdtype = find_common_type(list(self.dtypes))
                if needs_i8_conversion(cdtype):
                    dtype = cdtype

            res = self._constructor([], index=q, columns=cols, dtype=dtype)
            return res.__finalize__(self, method="quantile")

        valid_method = {"single", "table"}
        if method not in valid_method:
            raise ValueError(
                f"Invalid method: {method}. Method must be in {valid_method}."
            )
        if method == "single":
            # Per-column quantiles, computed block-wise by the manager.
            # error: Argument "qs" to "quantile" of "BlockManager" has incompatible type
            # "Index"; expected "Float64Index"
            res = data._mgr.quantile(
                qs=q, axis=1, interpolation=interpolation  # type: ignore[arg-type]
            )
        elif method == "table":
            # Whole-row quantiles: sort rows lexicographically by all
            # columns and pick the row at each quantile position.
            valid_interpolation = {"nearest", "lower", "higher"}
            if interpolation not in valid_interpolation:
                raise ValueError(
                    f"Invalid interpolation: {interpolation}. "
                    f"Interpolation must be in {valid_interpolation}"
                )
            # handle degenerate case
            if len(data) == 0:
                if data.ndim == 2:
                    dtype = find_common_type(list(self.dtypes))
                else:
                    dtype = self.dtype
                return self._constructor([], index=q, columns=data.columns, dtype=dtype)

            # Positions (row indices) corresponding to each requested quantile.
            q_idx = np.quantile(  # type: ignore[call-overload]
                np.arange(len(data)), q, **{np_percentile_argname: interpolation}
            )

            by = data.columns
            if len(by) > 1:
                keys = [data._get_label_or_level_values(x) for x in by]
                indexer = lexsort_indexer(keys)
            else:
                by = by[0]
                k = data._get_label_or_level_values(by)  # type: ignore[arg-type]
                indexer = nargsort(k)

            res = data._mgr.take(indexer[q_idx], verify=False)
            res.axes[1] = q

        result = self._constructor(res)
        return result.__finalize__(self, method="quantile")
11355 @doc(NDFrame.asfreq, **_shared_doc_kwargs)
11356 def asfreq(
11357 self,
11358 freq: Frequency,
11359 method: FillnaOptions | None = None,
11360 how: str | None = None,
11361 normalize: bool = False,
11362 fill_value: Hashable = None,
11363 ) -> DataFrame:
11364 return super().asfreq(
11365 freq=freq,
11366 method=method,
11367 how=how,
11368 normalize=normalize,
11369 fill_value=fill_value,
11370 )
11372 @doc(NDFrame.resample, **_shared_doc_kwargs)
11373 def resample(
11374 self,
11375 rule,
11376 axis: Axis = 0,
11377 closed: str | None = None,
11378 label: str | None = None,
11379 convention: str = "start",
11380 kind: str | None = None,
11381 loffset=None,
11382 base: int | None = None,
11383 on: Level = None,
11384 level: Level = None,
11385 origin: str | TimestampConvertibleTypes = "start_day",
11386 offset: TimedeltaConvertibleTypes | None = None,
11387 group_keys: bool | lib.NoDefault = no_default,
11388 ) -> Resampler:
11389 return super().resample(
11390 rule=rule,
11391 axis=axis,
11392 closed=closed,
11393 label=label,
11394 convention=convention,
11395 kind=kind,
11396 loffset=loffset,
11397 base=base,
11398 on=on,
11399 level=level,
11400 origin=origin,
11401 offset=offset,
11402 group_keys=group_keys,
11403 )
11405 def to_timestamp(
11406 self,
11407 freq: Frequency | None = None,
11408 how: str = "start",
11409 axis: Axis = 0,
11410 copy: bool = True,
11411 ) -> DataFrame:
11412 """
11413 Cast to DatetimeIndex of timestamps, at *beginning* of period.
11415 Parameters
11416 ----------
11417 freq : str, default frequency of PeriodIndex
11418 Desired frequency.
11419 how : {'s', 'e', 'start', 'end'}
11420 Convention for converting period to timestamp; start of period
11421 vs. end.
11422 axis : {0 or 'index', 1 or 'columns'}, default 0
11423 The axis to convert (the index by default).
11424 copy : bool, default True
11425 If False then underlying input data is not copied.
11427 Returns
11428 -------
11429 DataFrame with DatetimeIndex
11430 """
11431 new_obj = self.copy(deep=copy)
11433 axis_name = self._get_axis_name(axis)
11434 old_ax = getattr(self, axis_name)
11435 if not isinstance(old_ax, PeriodIndex):
11436 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
11438 new_ax = old_ax.to_timestamp(freq=freq, how=how)
11440 setattr(new_obj, axis_name, new_ax)
11441 return new_obj
11443 def to_period(
11444 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool = True
11445 ) -> DataFrame:
11446 """
11447 Convert DataFrame from DatetimeIndex to PeriodIndex.
11449 Convert DataFrame from DatetimeIndex to PeriodIndex with desired
11450 frequency (inferred from index if not passed).
11452 Parameters
11453 ----------
11454 freq : str, default
11455 Frequency of the PeriodIndex.
11456 axis : {0 or 'index', 1 or 'columns'}, default 0
11457 The axis to convert (the index by default).
11458 copy : bool, default True
11459 If False then underlying input data is not copied.
11461 Returns
11462 -------
11463 DataFrame with PeriodIndex
11465 Examples
11466 --------
11467 >>> idx = pd.to_datetime(
11468 ... [
11469 ... "2001-03-31 00:00:00",
11470 ... "2002-05-31 00:00:00",
11471 ... "2003-08-31 00:00:00",
11472 ... ]
11473 ... )
11475 >>> idx
11476 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
11477 dtype='datetime64[ns]', freq=None)
11479 >>> idx.to_period("M")
11480 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
11482 For the yearly frequency
11484 >>> idx.to_period("Y")
11485 PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]')
11486 """
11487 new_obj = self.copy(deep=copy)
11489 axis_name = self._get_axis_name(axis)
11490 old_ax = getattr(self, axis_name)
11491 if not isinstance(old_ax, DatetimeIndex):
11492 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
11494 new_ax = old_ax.to_period(freq=freq)
11496 setattr(new_obj, axis_name, new_ax)
11497 return new_obj
11499 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
11500 """
11501 Whether each element in the DataFrame is contained in values.
11503 Parameters
11504 ----------
11505 values : iterable, Series, DataFrame or dict
11506 The result will only be true at a location if all the
11507 labels match. If `values` is a Series, that's the index. If
11508 `values` is a dict, the keys must be the column names,
11509 which must match. If `values` is a DataFrame,
11510 then both the index and column labels must match.
11512 Returns
11513 -------
11514 DataFrame
11515 DataFrame of booleans showing whether each element in the DataFrame
11516 is contained in values.
11518 See Also
11519 --------
11520 DataFrame.eq: Equality test for DataFrame.
11521 Series.isin: Equivalent method on Series.
11522 Series.str.contains: Test if pattern or regex is contained within a
11523 string of a Series or Index.
11525 Examples
11526 --------
11527 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
11528 ... index=['falcon', 'dog'])
11529 >>> df
11530 num_legs num_wings
11531 falcon 2 2
11532 dog 4 0
11534 When ``values`` is a list check whether every value in the DataFrame
11535 is present in the list (which animals have 0 or 2 legs or wings)
11537 >>> df.isin([0, 2])
11538 num_legs num_wings
11539 falcon True True
11540 dog False True
11542 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
11544 >>> ~df.isin([0, 2])
11545 num_legs num_wings
11546 falcon False False
11547 dog True False
11549 When ``values`` is a dict, we can pass values to check for each
11550 column separately:
11552 >>> df.isin({'num_wings': [0, 3]})
11553 num_legs num_wings
11554 falcon False False
11555 dog False True
11557 When ``values`` is a Series or DataFrame the index and column must
11558 match. Note that 'falcon' does not match based on the number of legs
11559 in other.
11561 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
11562 ... index=['spider', 'falcon'])
11563 >>> df.isin(other)
11564 num_legs num_wings
11565 falcon False True
11566 dog False False
11567 """
11568 if isinstance(values, dict):
11569 from pandas.core.reshape.concat import concat
11571 values = collections.defaultdict(list, values)
11572 result = concat(
11573 (
11574 self.iloc[:, [i]].isin(values[col])
11575 for i, col in enumerate(self.columns)
11576 ),
11577 axis=1,
11578 )
11579 elif isinstance(values, Series):
11580 if not values.index.is_unique:
11581 raise ValueError("cannot compute isin with a duplicate axis.")
11582 result = self.eq(values.reindex_like(self), axis="index")
11583 elif isinstance(values, DataFrame):
11584 if not (values.columns.is_unique and values.index.is_unique):
11585 raise ValueError("cannot compute isin with a duplicate axis.")
11586 result = self.eq(values.reindex_like(self))
11587 else:
11588 if not is_list_like(values):
11589 raise TypeError(
11590 "only list-like or dict-like objects are allowed "
11591 "to be passed to DataFrame.isin(), "
11592 f"you passed a '{type(values).__name__}'"
11593 )
11594 # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any],
11595 # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray,
11596 # ndarray[Any, Any]], Index, Series]"
11597 result = self._constructor(
11598 algorithms.isin(
11599 self.values.ravel(), values # type: ignore[arg-type]
11600 ).reshape(self.shape),
11601 self.index,
11602 self.columns,
11603 )
11604 return result.__finalize__(self, method="isin")
    # ----------------------------------------------------------------------
    # Add index and columns
    _AXIS_ORDERS = ["index", "columns"]
    # Extend the base mapping so that 1 / "columns" resolve to axis 1.
    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
        **NDFrame._AXIS_TO_AXIS_NUMBER,
        1: 1,
        "columns": 1,
    }
    _AXIS_LEN = len(_AXIS_ORDERS)
    _info_axis_number = 1
    _info_axis_name = "columns"

    # NOTE(review): the AxisProperty axis numbers here (index -> 1,
    # columns -> 0) appear to refer to the internal manager layout rather
    # than the user-facing axis numbering — confirm against
    # pandas._libs.properties.AxisProperty before changing.
    index = properties.AxisProperty(
        axis=1, doc="The index (row labels) of the DataFrame."
    )
    columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.")

    @property
    def _AXIS_NUMBERS(self) -> dict[str, int]:
        """.. deprecated:: 1.1.0"""
        # Parent property accessed and discarded — presumably to emit the
        # base-class deprecation warning; verify in NDFrame.
        super()._AXIS_NUMBERS
        return {"index": 0, "columns": 1}

    @property
    def _AXIS_NAMES(self) -> dict[int, str]:
        """.. deprecated:: 1.1.0"""
        # Same side-effect-only parent access as _AXIS_NUMBERS above.
        super()._AXIS_NAMES
        return {0: "index", 1: "columns"}

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
    hist = pandas.plotting.hist_frame
    boxplot = pandas.plotting.boxplot_frame
    sparse = CachedAccessor("sparse", SparseFrameAccessor)
11642 # ----------------------------------------------------------------------
11643 # Internal Interface Methods
11645 def _to_dict_of_blocks(self, copy: bool = True):
11646 """
11647 Return a dict of dtype -> Constructor Types that
11648 each is a homogeneous dtype.
11650 Internal ONLY - only works for BlockManager
11651 """
11652 mgr = self._mgr
11653 # convert to BlockManager if needed -> this way support ArrayManager as well
11654 mgr = mgr_to_mgr(mgr, "block")
11655 mgr = cast(BlockManager, mgr)
11656 return {
11657 k: self._constructor(v).__finalize__(self)
11658 for k, v, in mgr.to_dict(copy=copy).items()
11659 }
    @property
    def values(self) -> np.ndarray:
        """
        Return a Numpy representation of the DataFrame.

        .. warning::

           We recommend using :meth:`DataFrame.to_numpy` instead.

        Only the values in the DataFrame will be returned, the axes labels
        will be removed.

        Returns
        -------
        numpy.ndarray
            The values of the DataFrame.

        See Also
        --------
        DataFrame.to_numpy : Recommended alternative to this method.
        DataFrame.index : Retrieve the index labels.
        DataFrame.columns : Retrieving the column names.

        Notes
        -----
        The dtype will be a lower-common-denominator dtype (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen. Use this
        with care if you are not dealing with the blocks.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32.  If dtypes are int32 and uint8, dtype will be upcast to
        int32. By :func:`numpy.find_common_type` convention, mixing int64
        and uint64 will result in a float64 dtype.

        Examples
        --------
        A DataFrame where all columns are the same type (e.g., int64) results
        in an array of the same type.

        >>> df = pd.DataFrame({'age':    [ 3,  29],
        ...                    'height': [94, 170],
        ...                    'weight': [31, 115]})
        >>> df
           age  height  weight
        0    3      94      31
        1   29     170     115
        >>> df.dtypes
        age       int64
        height    int64
        weight    int64
        dtype: object
        >>> df.values
        array([[  3,  94,  31],
               [ 29, 170, 115]])

        A DataFrame with mixed type columns(e.g., str/object, int64, float32)
        results in an ndarray of the broadest type that accommodates these
        mixed types (e.g., object).

        >>> df2 = pd.DataFrame([('parrot',   24.0, 'second'),
        ...                     ('lion',     80.5, 1),
        ...                     ('monkey', np.nan, None)],
        ...                    columns=('name', 'max_speed', 'rank'))
        >>> df2.dtypes
        name          object
        max_speed    float64
        rank          object
        dtype: object
        >>> df2.values
        array([['parrot', 24.0, 'second'],
               ['lion', 80.5, 1],
               ['monkey', nan, None]], dtype=object)
        """
        # Consolidate blocks in place before materializing the 2D array.
        self._consolidate_inplace()
        return self._mgr.as_array()
    # Typing overloads for ``ffill``: inplace=True returns None, otherwise a
    # DataFrame.  The runtime implementation follows below.
    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame | None:
        ...
11771 # error: Signature of "ffill" incompatible with supertype "NDFrame"
11772 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
11773 def ffill( # type: ignore[override]
11774 self,
11775 axis: None | Axis = None,
11776 inplace: bool = False,
11777 limit: None | int = None,
11778 downcast: dict | None = None,
11779 ) -> DataFrame | None:
11780 return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
    # Typing overloads for ``bfill``: inplace=True returns None, otherwise a
    # DataFrame.  The runtime implementation follows below.
    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast=...,
    ) -> None:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame | None:
        ...
11815 # error: Signature of "bfill" incompatible with supertype "NDFrame"
11816 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
11817 def bfill( # type: ignore[override]
11818 self,
11819 axis: None | Axis = None,
11820 inplace: bool = False,
11821 limit: None | int = None,
11822 downcast=None,
11823 ) -> DataFrame | None:
11824 return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
    @deprecate_nonkeyword_arguments(
        version=None, allowed_args=["self", "lower", "upper"]
    )
    def clip(
        self: DataFrame,
        lower: float | None = None,
        upper: float | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        *args,
        **kwargs,
    ) -> DataFrame | None:
        # Thin delegation to NDFrame.clip.  Arguments are forwarded
        # positionally because the parent also accepts *args/**kwargs
        # (presumably for numpy-signature compatibility — verify in
        # NDFrame.clip before reshaping this call).
        return super().clip(lower, upper, axis, inplace, *args, **kwargs)
    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"])
    def interpolate(
        self: DataFrame,
        method: str = "linear",
        axis: Axis = 0,
        limit: int | None = None,
        inplace: bool = False,
        limit_direction: str | None = None,
        limit_area: str | None = None,
        downcast: str | None = None,
        **kwargs,
    ) -> DataFrame | None:
        # Thin delegation to NDFrame.interpolate; arguments are forwarded
        # positionally in the parent's parameter order.
        return super().interpolate(
            method,
            axis,
            limit,
            inplace,
            limit_direction,
            limit_area,
            downcast,
            **kwargs,
        )
    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> DataFrame:
        # Typing overload: inplace=False (the default) returns a new DataFrame.
        ...
    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> None:
        # Typing overload: inplace=True mutates self and returns None.
        ...
    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> DataFrame | None:
        # Typing overload: inplace not statically known -> DataFrame | None.
        ...
11905 # error: Signature of "where" incompatible with supertype "NDFrame"
11906 @deprecate_kwarg(old_arg_name="errors", new_arg_name=None)
11907 @deprecate_nonkeyword_arguments(
11908 version=None, allowed_args=["self", "cond", "other"]
11909 )
11910 def where( # type: ignore[override]
11911 self,
11912 cond,
11913 other=lib.no_default,
11914 inplace: bool = False,
11915 axis: Axis | None = None,
11916 level: Level = None,
11917 errors: IgnoreRaise | lib.NoDefault = "raise",
11918 try_cast: bool | lib.NoDefault = lib.no_default,
11919 ) -> DataFrame | None:
11920 return super().where(
11921 cond,
11922 other,
11923 inplace=inplace,
11924 axis=axis,
11925 level=level,
11926 try_cast=try_cast,
11927 )
    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> DataFrame:
        # Typing overload: inplace=False (the default) returns a new DataFrame.
        ...
    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> None:
        # Typing overload: inplace=True mutates self and returns None.
        ...
    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
        errors: IgnoreRaise | lib.NoDefault = ...,
        try_cast: bool | lib.NoDefault = ...,
    ) -> DataFrame | None:
        # Typing overload: inplace not statically known -> DataFrame | None.
        ...
11971 # error: Signature of "mask" incompatible with supertype "NDFrame"
11972 @deprecate_kwarg(old_arg_name="errors", new_arg_name=None)
11973 @deprecate_nonkeyword_arguments(
11974 version=None, allowed_args=["self", "cond", "other"]
11975 )
11976 def mask( # type: ignore[override]
11977 self,
11978 cond,
11979 other=np.nan,
11980 inplace: bool = False,
11981 axis: Axis | None = None,
11982 level: Level = None,
11983 errors: IgnoreRaise | lib.NoDefault = "raise",
11984 try_cast: bool | lib.NoDefault = lib.no_default,
11985 ) -> DataFrame | None:
11986 return super().mask(
11987 cond,
11988 other,
11989 inplace=inplace,
11990 axis=axis,
11991 level=level,
11992 try_cast=try_cast,
11993 )
# Import-time class patching:
# NOTE(review): _add_numeric_operations is an NDFrame hook defined elsewhere;
# presumably it installs the numeric reduction methods (sum, mean, ...) on
# DataFrame — confirm in generic.py.
DataFrame._add_numeric_operations()

# Install the flex arithmetic methods (add, sub, mul, ... variants) on
# DataFrame via the shared ops machinery.
ops.add_flex_arithmetic_methods(DataFrame)
12001def _from_nested_dict(data) -> collections.defaultdict:
12002 new_data: collections.defaultdict = collections.defaultdict(dict)
12003 for index, s in data.items():
12004 for col, v in s.items():
12005 new_data[col][index] = v
12006 return new_data
12009def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
12010 # reindex if necessary
12012 if value.index.equals(index) or not len(index):
12013 return value._values.copy()
12015 # GH#4107
12016 try:
12017 reindexed_value = value.reindex(index)._values
12018 except ValueError as err:
12019 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
12020 if not value.index.is_unique:
12021 # duplicate axis
12022 raise err
12024 raise TypeError(
12025 "incompatible index of inserted column with frame index"
12026 ) from err
12027 return reindexed_value