Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/frame.py: 14% of 2312 statements (coverage.py v6.4.4, created at 2023-07-17 14:22 -0600)

1""" 

2DataFrame 

3--------- 

4An efficient 2D container for potentially mixed-type time series or other 

5labeled data series. 

6 

7Similar to its R counterpart, data.frame, except providing automatic data 

8alignment and a host of useful data manipulation methods having to do with the 

9labeling information 

10""" 

11from __future__ import annotations 

12 

13import collections 

14from collections import abc 

15import datetime 

16import functools 

17from io import StringIO 

18import itertools 

19from textwrap import dedent 

20from typing import ( 

21 TYPE_CHECKING, 

22 Any, 

23 Callable, 

24 Hashable, 

25 Iterable, 

26 Iterator, 

27 Literal, 

28 Mapping, 

29 Sequence, 

30 cast, 

31 overload, 

32) 

33import warnings 

34 

35import numpy as np 

36import numpy.ma as ma 

37 

38from pandas._config import get_option 

39 

40from pandas._libs import ( 

41 algos as libalgos, 

42 lib, 

43 properties, 

44) 

45from pandas._libs.hashtable import duplicated 

46from pandas._libs.lib import ( 

47 NoDefault, 

48 no_default, 

49) 

50from pandas._typing import ( 

51 AggFuncType, 

52 AnyArrayLike, 

53 ArrayLike, 

54 Axes, 

55 Axis, 

56 ColspaceArgType, 

57 CompressionOptions, 

58 Dtype, 

59 DtypeObj, 

60 FilePath, 

61 FillnaOptions, 

62 FloatFormatType, 

63 FormattersType, 

64 Frequency, 

65 IgnoreRaise, 

66 IndexKeyFunc, 

67 IndexLabel, 

68 Level, 

69 NaPosition, 

70 PythonFuncType, 

71 QuantileInterpolation, 

72 ReadBuffer, 

73 Renamer, 

74 Scalar, 

75 SortKind, 

76 StorageOptions, 

77 Suffixes, 

78 TimedeltaConvertibleTypes, 

79 TimestampConvertibleTypes, 

80 ValueKeyFunc, 

81 WriteBuffer, 

82 npt, 

83) 

84from pandas.compat._optional import import_optional_dependency 

85from pandas.compat.numpy import ( 

86 function as nv, 

87 np_percentile_argname, 

88) 

89from pandas.errors import InvalidIndexError 

90from pandas.util._decorators import ( 

91 Appender, 

92 Substitution, 

93 deprecate_kwarg, 

94 deprecate_nonkeyword_arguments, 

95 doc, 

96 rewrite_axis_style_signature, 

97) 

98from pandas.util._exceptions import find_stack_level 

99from pandas.util._validators import ( 

100 validate_ascending, 

101 validate_axis_style_args, 

102 validate_bool_kwarg, 

103 validate_percentile, 

104) 

105 

106from pandas.core.dtypes.cast import ( 

107 can_hold_element, 

108 construct_1d_arraylike_from_scalar, 

109 construct_2d_arraylike_from_scalar, 

110 find_common_type, 

111 infer_dtype_from_scalar, 

112 invalidate_string_dtypes, 

113 maybe_box_native, 

114 maybe_downcast_to_dtype, 

115) 

116from pandas.core.dtypes.common import ( 

117 ensure_platform_int, 

118 infer_dtype_from_object, 

119 is_1d_only_ea_dtype, 

120 is_bool_dtype, 

121 is_dataclass, 

122 is_datetime64_any_dtype, 

123 is_dict_like, 

124 is_dtype_equal, 

125 is_extension_array_dtype, 

126 is_float, 

127 is_float_dtype, 

128 is_hashable, 

129 is_integer, 

130 is_integer_dtype, 

131 is_iterator, 

132 is_list_like, 

133 is_numeric_dtype, 

134 is_object_dtype, 

135 is_scalar, 

136 is_sequence, 

137 needs_i8_conversion, 

138 pandas_dtype, 

139) 

140from pandas.core.dtypes.dtypes import ExtensionDtype 

141from pandas.core.dtypes.missing import ( 

142 isna, 

143 notna, 

144) 

145 

146from pandas.core import ( 

147 algorithms, 

148 common as com, 

149 nanops, 

150 ops, 

151) 

152from pandas.core.accessor import CachedAccessor 

153from pandas.core.apply import ( 

154 reconstruct_func, 

155 relabel_result, 

156) 

157from pandas.core.array_algos.take import take_2d_multi 

158from pandas.core.arraylike import OpsMixin 

159from pandas.core.arrays import ( 

160 DatetimeArray, 

161 ExtensionArray, 

162 PeriodArray, 

163 TimedeltaArray, 

164) 

165from pandas.core.arrays.sparse import SparseFrameAccessor 

166from pandas.core.construction import ( 

167 extract_array, 

168 sanitize_array, 

169 sanitize_masked_array, 

170) 

171from pandas.core.generic import NDFrame 

172from pandas.core.indexers import check_key_length 

173from pandas.core.indexes.api import ( 

174 DatetimeIndex, 

175 Index, 

176 PeriodIndex, 

177 default_index, 

178 ensure_index, 

179 ensure_index_from_sequences, 

180) 

181from pandas.core.indexes.multi import ( 

182 MultiIndex, 

183 maybe_droplevels, 

184) 

185from pandas.core.indexing import ( 

186 check_bool_indexer, 

187 check_deprecated_indexers, 

188 convert_to_index_sliceable, 

189) 

190from pandas.core.internals import ( 

191 ArrayManager, 

192 BlockManager, 

193) 

194from pandas.core.internals.construction import ( 

195 arrays_to_mgr, 

196 dataclasses_to_dicts, 

197 dict_to_mgr, 

198 mgr_to_mgr, 

199 ndarray_to_mgr, 

200 nested_data_to_arrays, 

201 rec_array_to_mgr, 

202 reorder_arrays, 

203 to_arrays, 

204 treat_as_nested, 

205) 

206from pandas.core.reshape.melt import melt 

207from pandas.core.series import Series 

208from pandas.core.shared_docs import _shared_docs 

209from pandas.core.sorting import ( 

210 get_group_index, 

211 lexsort_indexer, 

212 nargsort, 

213) 

214 

215from pandas.io.common import get_handle 

216from pandas.io.formats import ( 

217 console, 

218 format as fmt, 

219) 

220from pandas.io.formats.info import ( 

221 INFO_DOCSTRING, 

222 DataFrameInfo, 

223 frame_sub_kwargs, 

224) 

225import pandas.plotting 

226 

227if TYPE_CHECKING: 227 ↛ 229line 227 didn't jump to line 229, because the condition on line 227 was never true

228 

229 from pandas.core.groupby.generic import DataFrameGroupBy 

230 from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg 

231 from pandas.core.internals import SingleDataManager 

232 from pandas.core.resample import Resampler 

233 

234 from pandas.io.formats.style import Styler 

235 

236# --------------------------------------------------------------------- 

237# Docstring templates 

238 

239_shared_doc_kwargs = { 

240 "axes": "index, columns", 

241 "klass": "DataFrame", 

242 "axes_single_arg": "{0 or 'index', 1 or 'columns'}", 

243 "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 

244 If 0 or 'index': apply function to each column. 

245 If 1 or 'columns': apply function to each row.""", 

246 "inplace": """ 

247 inplace : bool, default False 

248 Whether to modify the DataFrame rather than creating a new one.""", 

249 "optional_by": """ 

250 by : str or list of str 

251 Name or list of names to sort by. 

252 

253 - if `axis` is 0 or `'index'` then `by` may contain index 

254 levels and/or column labels. 

255 - if `axis` is 1 or `'columns'` then `by` may contain column 

256 levels and/or index labels.""", 

257 "optional_labels": """labels : array-like, optional 

258 New labels / index to conform the axis specified by 'axis' to.""", 

259 "optional_axis": """axis : int or str, optional 

260 Axis to target. Can be either the axis name ('index', 'columns') 

261 or number (0, 1).""", 

262 "replace_iloc": """ 

263 This differs from updating with ``.loc`` or ``.iloc``, which require 

264 you to specify a location to update with some value.""", 

265} 

266 

267_numeric_only_doc = """numeric_only : bool or None, default None 

268 Include only float, int, boolean data. If None, will attempt to use 

269 everything, then use only numeric data 

270""" 

271 

272_merge_doc = """ 

273Merge DataFrame or named Series objects with a database-style join. 

274 

275A named Series object is treated as a DataFrame with a single named column. 

276 

277The join is done on columns or indexes. If joining columns on 

278columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes 

279on indexes or indexes on a column or columns, the index will be passed on. 

280When performing a cross merge, no column specifications to merge on are 

281allowed. 

282 

283.. warning:: 

284 

285 If both key columns contain rows where the key is a null value, those 

286 rows will be matched against each other. This is different from usual SQL 

287 join behaviour and can lead to unexpected results. 

288 

289Parameters 

290----------%s 

291right : DataFrame or named Series 

292 Object to merge with. 

293how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' 

294 Type of merge to be performed. 

295 

296 * left: use only keys from left frame, similar to a SQL left outer join; 

297 preserve key order. 

298 * right: use only keys from right frame, similar to a SQL right outer join; 

299 preserve key order. 

300 * outer: use union of keys from both frames, similar to a SQL full outer 

301 join; sort keys lexicographically. 

302 * inner: use intersection of keys from both frames, similar to a SQL inner 

303 join; preserve the order of the left keys. 

304 * cross: creates the cartesian product from both frames, preserves the order 

305 of the left keys. 

306 

307 .. versionadded:: 1.2.0 

308 

309on : label or list 

310 Column or index level names to join on. These must be found in both 

311 DataFrames. If `on` is None and not merging on indexes then this defaults 

312 to the intersection of the columns in both DataFrames. 

313left_on : label or list, or array-like 

314 Column or index level names to join on in the left DataFrame. Can also 

315 be an array or list of arrays of the length of the left DataFrame. 

316 These arrays are treated as if they are columns. 

317right_on : label or list, or array-like 

318 Column or index level names to join on in the right DataFrame. Can also 

319 be an array or list of arrays of the length of the right DataFrame. 

320 These arrays are treated as if they are columns. 

321left_index : bool, default False 

322 Use the index from the left DataFrame as the join key(s). If it is a 

323 MultiIndex, the number of keys in the other DataFrame (either the index 

324 or a number of columns) must match the number of levels. 

325right_index : bool, default False 

326 Use the index from the right DataFrame as the join key. Same caveats as 

327 left_index. 

328sort : bool, default False 

329 Sort the join keys lexicographically in the result DataFrame. If False, 

330 the order of the join keys depends on the join type (how keyword). 

331suffixes : list-like, default is ("_x", "_y") 

332 A length-2 sequence where each element is optionally a string 

333 indicating the suffix to add to overlapping column names in 

334 `left` and `right` respectively. Pass a value of `None` instead 

335 of a string to indicate that the column name from `left` or 

336 `right` should be left as-is, with no suffix. At least one of the 

337 values must not be None. 

338copy : bool, default True 

339 If False, avoid copy if possible. 

340indicator : bool or str, default False 

341 If True, adds a column to the output DataFrame called "_merge" with 

342 information on the source of each row. The column can be given a different 

343 name by providing a string argument. The column will have a Categorical 

344 type with the value of "left_only" for observations whose merge key only 

345 appears in the left DataFrame, "right_only" for observations 

346 whose merge key only appears in the right DataFrame, and "both" 

347 if the observation's merge key is found in both DataFrames. 

348 

349validate : str, optional 

350 If specified, checks if merge is of specified type. 

351 

352 * "one_to_one" or "1:1": check if merge keys are unique in both 

353 left and right datasets. 

354 * "one_to_many" or "1:m": check if merge keys are unique in left 

355 dataset. 

356 * "many_to_one" or "m:1": check if merge keys are unique in right 

357 dataset. 

358 * "many_to_many" or "m:m": allowed, but does not result in checks. 

359 

360Returns 

361------- 

362DataFrame 

363 A DataFrame of the two merged objects. 

364 

365See Also 

366-------- 

367merge_ordered : Merge with optional filling/interpolation. 

368merge_asof : Merge on nearest keys. 

369DataFrame.join : Similar method using indices. 

370 

371Notes 

372----- 

373Support for specifying index levels as the `on`, `left_on`, and 

374`right_on` parameters was added in version 0.23.0 

375Support for merging named Series objects was added in version 0.24.0 

376 

377Examples 

378-------- 

379>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], 

380... 'value': [1, 2, 3, 5]}) 

381>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], 

382... 'value': [5, 6, 7, 8]}) 

383>>> df1 

384 lkey value 

3850 foo 1 

3861 bar 2 

3872 baz 3 

3883 foo 5 

389>>> df2 

390 rkey value 

3910 foo 5 

3921 bar 6 

3932 baz 7 

3943 foo 8 

395 

396Merge df1 and df2 on the lkey and rkey columns. The value columns have 

397the default suffixes, _x and _y, appended. 

398 

399>>> df1.merge(df2, left_on='lkey', right_on='rkey') 

400 lkey value_x rkey value_y 

4010 foo 1 foo 5 

4021 foo 1 foo 8 

4032 foo 5 foo 5 

4043 foo 5 foo 8 

4054 bar 2 bar 6 

4065 baz 3 baz 7 

407 

408Merge DataFrames df1 and df2 with specified left and right suffixes 

409appended to any overlapping columns. 

410 

411>>> df1.merge(df2, left_on='lkey', right_on='rkey', 

412... suffixes=('_left', '_right')) 

413 lkey value_left rkey value_right 

4140 foo 1 foo 5 

4151 foo 1 foo 8 

4162 foo 5 foo 5 

4173 foo 5 foo 8 

4184 bar 2 bar 6 

4195 baz 3 baz 7 

420 

421Merge DataFrames df1 and df2, but raise an exception if the DataFrames have 

422any overlapping columns. 

423 

424>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) 

425Traceback (most recent call last): 

426... 

427ValueError: columns overlap but no suffix specified: 

428 Index(['value'], dtype='object') 

429 

430>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) 

431>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) 

432>>> df1 

433 a b 

4340 foo 1 

4351 bar 2 

436>>> df2 

437 a c 

4380 foo 3 

4391 baz 4 

440 

441>>> df1.merge(df2, how='inner', on='a') 

442 a b c 

4430 foo 1 3 

444 

445>>> df1.merge(df2, how='left', on='a') 

446 a b c 

4470 foo 1 3.0 

4481 bar 2 NaN 

449 

450>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) 

451>>> df2 = pd.DataFrame({'right': [7, 8]}) 

452>>> df1 

453 left 

4540 foo 

4551 bar 

456>>> df2 

457 right 

4580 7 

4591 8 

460 

461>>> df1.merge(df2, how='cross') 

462 left right 

4630 foo 7 

4641 foo 8 

4652 bar 7 

4663 bar 8 

467""" 

468 

469 

470# ----------------------------------------------------------------------- 

471# DataFrame class 

472 

473 

474class DataFrame(NDFrame, OpsMixin): 

475 """ 

476 Two-dimensional, size-mutable, potentially heterogeneous tabular data. 

477 

478 Data structure also contains labeled axes (rows and columns). 

479 Arithmetic operations align on both row and column labels. Can be 

480 thought of as a dict-like container for Series objects. The primary 

481 pandas data structure. 

482 

483 Parameters 

484 ---------- 

485 data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame 

486 Dict can contain Series, arrays, constants, dataclass or list-like objects. If 

487 data is a dict, column order follows insertion-order. If a dict contains Series 

488 which have an index defined, it is aligned by its index. 

489 

490 .. versionchanged:: 0.25.0 

491 If data is a list of dicts, column order follows insertion-order. 

492 

493 index : Index or array-like 

494 Index to use for resulting frame. Will default to RangeIndex if 

495 no indexing information part of input data and no index provided. 

496 columns : Index or array-like 

497 Column labels to use for resulting frame when data does not have them, 

498 defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels, 

499 will perform column selection instead. 

500 dtype : dtype, default None 

501 Data type to force. Only a single dtype is allowed. If None, infer. 

502 copy : bool or None, default None 

503 Copy data from inputs. 

504 For dict data, the default of None behaves like ``copy=True``. For DataFrame 

505 or 2d ndarray input, the default of None behaves like ``copy=False``. 

506 If data is a dict containing one or more Series (possibly of different dtypes), 

507 ``copy=False`` will ensure that these inputs are not copied. 

508 

509 .. versionchanged:: 1.3.0 

510 

511 See Also 

512 -------- 

513 DataFrame.from_records : Constructor from tuples, also record arrays. 

514 DataFrame.from_dict : From dicts of Series, arrays, or dicts. 

515 read_csv : Read a comma-separated values (csv) file into DataFrame. 

516 read_table : Read general delimited file into DataFrame. 

517 read_clipboard : Read text from clipboard into DataFrame. 

518 

519 Notes 

520 ----- 

521 Please reference the :ref:`User Guide <basics.dataframe>` for more information. 

522 

523 Examples 

524 -------- 

525 Constructing DataFrame from a dictionary. 

526 

527 >>> d = {'col1': [1, 2], 'col2': [3, 4]} 

528 >>> df = pd.DataFrame(data=d) 

529 >>> df 

530 col1 col2 

531 0 1 3 

532 1 2 4 

533 

534 Notice that the inferred dtype is int64. 

535 

536 >>> df.dtypes 

537 col1 int64 

538 col2 int64 

539 dtype: object 

540 

541 To enforce a single dtype: 

542 

543 >>> df = pd.DataFrame(data=d, dtype=np.int8) 

544 >>> df.dtypes 

545 col1 int8 

546 col2 int8 

547 dtype: object 

548 

549 Constructing DataFrame from a dictionary including Series: 

550 

551 >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} 

552 >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) 

553 col1 col2 

554 0 0 NaN 

555 1 1 NaN 

556 2 2 2.0 

557 3 3 3.0 

558 

559 Constructing DataFrame from numpy ndarray: 

560 

561 >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), 

562 ... columns=['a', 'b', 'c']) 

563 >>> df2 

564 a b c 

565 0 1 2 3 

566 1 4 5 6 

567 2 7 8 9 

568 

569 Constructing DataFrame from a numpy ndarray that has labeled columns: 

570 

571 >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], 

572 ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) 

573 >>> df3 = pd.DataFrame(data, columns=['c', 'a']) 

574 ... 

575 >>> df3 

576 c a 

577 0 3 1 

578 1 6 4 

579 2 9 7 

580 

581 Constructing DataFrame from dataclass: 

582 

583 >>> from dataclasses import make_dataclass 

584 >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) 

585 >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) 

586 x y 

587 0 0 0 

588 1 0 3 

589 2 2 3 

590 """ 

591 

592 _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set 

593 _typ = "dataframe" 

594 _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) 

595 _accessors: set[str] = {"sparse"} 

596 _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) 

597 _mgr: BlockManager | ArrayManager 

598 

599 @property 

600 def _constructor(self) -> Callable[..., DataFrame]: 

601 return DataFrame 

602 

603 _constructor_sliced: Callable[..., Series] = Series 

604 

605 # ---------------------------------------------------------------------- 

606 # Constructors 

607 

608 def __init__( 

609 self, 

610 data=None, 

611 index: Axes | None = None, 

612 columns: Axes | None = None, 

613 dtype: Dtype | None = None, 

614 copy: bool | None = None, 

615 ) -> None: 

616 

617 if data is None: 

618 data = {} 

619 if dtype is not None: 

620 dtype = self._validate_dtype(dtype) 

621 

622 if isinstance(data, DataFrame): 

623 data = data._mgr 

624 

625 if isinstance(data, (BlockManager, ArrayManager)): 

626 # first check if a Manager is passed without any other arguments 

627 # -> use fastpath (without checking Manager type) 

628 if index is None and columns is None and dtype is None and not copy: 

629 # GH#33357 fastpath 

630 NDFrame.__init__(self, data) 

631 return 

632 

633 manager = get_option("mode.data_manager") 

634 

635 # GH47215 

636 if index is not None and isinstance(index, set): 

637 raise ValueError("index cannot be a set") 

638 if columns is not None and isinstance(columns, set): 

639 raise ValueError("columns cannot be a set") 

640 

641 if copy is None: 

642 if isinstance(data, dict): 

643 # retain pre-GH#38939 default behavior 

644 copy = True 

645 elif ( 

646 manager == "array" 

647 and isinstance(data, (np.ndarray, ExtensionArray)) 

648 and data.ndim == 2 

649 ): 

650 # INFO(ArrayManager) by default copy the 2D input array to get 

651 # contiguous 1D arrays 

652 copy = True 

653 else: 

654 copy = False 

655 

656 if isinstance(data, (BlockManager, ArrayManager)): 

657 mgr = self._init_mgr( 

658 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy 

659 ) 

660 

661 elif isinstance(data, dict): 

662 # GH#38939 de facto copy defaults to False only in non-dict cases 

663 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) 

664 elif isinstance(data, ma.MaskedArray): 

665 import numpy.ma.mrecords as mrecords 

666 

667 # masked recarray 

668 if isinstance(data, mrecords.MaskedRecords): 

669 mgr = rec_array_to_mgr( 

670 data, 

671 index, 

672 columns, 

673 dtype, 

674 copy, 

675 typ=manager, 

676 ) 

677 warnings.warn( 

678 "Support for MaskedRecords is deprecated and will be " 

679 "removed in a future version. Pass " 

680 "{name: data[name] for name in data.dtype.names} instead.", 

681 FutureWarning, 

682 stacklevel=find_stack_level(), 

683 ) 

684 

685 # a masked array 

686 else: 

687 data = sanitize_masked_array(data) 

688 mgr = ndarray_to_mgr( 

689 data, 

690 index, 

691 columns, 

692 dtype=dtype, 

693 copy=copy, 

694 typ=manager, 

695 ) 

696 

697 elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): 

698 if data.dtype.names: 

699 # i.e. numpy structured array 

700 data = cast(np.ndarray, data) 

701 mgr = rec_array_to_mgr( 

702 data, 

703 index, 

704 columns, 

705 dtype, 

706 copy, 

707 typ=manager, 

708 ) 

709 elif getattr(data, "name", None) is not None: 

710 # i.e. Series/Index with non-None name 

711 mgr = dict_to_mgr( 

712 # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no 

713 # attribute "name" 

714 {data.name: data}, # type: ignore[union-attr] 

715 index, 

716 columns, 

717 dtype=dtype, 

718 typ=manager, 

719 ) 

720 else: 

721 mgr = ndarray_to_mgr( 

722 data, 

723 index, 

724 columns, 

725 dtype=dtype, 

726 copy=copy, 

727 typ=manager, 

728 ) 

729 

730 # For data is list-like, or Iterable (will consume into list) 

731 elif is_list_like(data): 

732 if not isinstance(data, (abc.Sequence, ExtensionArray)): 

733 if hasattr(data, "__array__"): 

734 # GH#44616 big perf improvement for e.g. pytorch tensor 

735 data = np.asarray(data) 

736 else: 

737 data = list(data) 

738 if len(data) > 0: 

739 if is_dataclass(data[0]): 

740 data = dataclasses_to_dicts(data) 

741 if not isinstance(data, np.ndarray) and treat_as_nested(data): 

742 # exclude ndarray as we may have cast it a few lines above 

743 if columns is not None: 

744 columns = ensure_index(columns) 

745 arrays, columns, index = nested_data_to_arrays( 

746 # error: Argument 3 to "nested_data_to_arrays" has incompatible 

747 # type "Optional[Collection[Any]]"; expected "Optional[Index]" 

748 data, 

749 columns, 

750 index, # type: ignore[arg-type] 

751 dtype, 

752 ) 

753 mgr = arrays_to_mgr( 

754 arrays, 

755 columns, 

756 index, 

757 dtype=dtype, 

758 typ=manager, 

759 ) 

760 else: 

761 mgr = ndarray_to_mgr( 

762 data, 

763 index, 

764 columns, 

765 dtype=dtype, 

766 copy=copy, 

767 typ=manager, 

768 ) 

769 else: 

770 mgr = dict_to_mgr( 

771 {}, 

772 index, 

773 columns, 

774 dtype=dtype, 

775 typ=manager, 

776 ) 

777 # For data is scalar 

778 else: 

779 if index is None or columns is None: 

780 raise ValueError("DataFrame constructor not properly called!") 

781 

782 index = ensure_index(index) 

783 columns = ensure_index(columns) 

784 

785 if not dtype: 

786 dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) 

787 

788 # For data is a scalar extension dtype 

789 if isinstance(dtype, ExtensionDtype): 

790 # TODO(EA2D): special case not needed with 2D EAs 

791 

792 values = [ 

793 construct_1d_arraylike_from_scalar(data, len(index), dtype) 

794 for _ in range(len(columns)) 

795 ] 

796 mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) 

797 else: 

798 arr2d = construct_2d_arraylike_from_scalar( 

799 data, 

800 len(index), 

801 len(columns), 

802 dtype, 

803 copy, 

804 ) 

805 

806 mgr = ndarray_to_mgr( 

807 arr2d, 

808 index, 

809 columns, 

810 dtype=arr2d.dtype, 

811 copy=False, 

812 typ=manager, 

813 ) 

814 

815 # ensure correct Manager type according to settings 

816 mgr = mgr_to_mgr(mgr, typ=manager) 

817 

818 NDFrame.__init__(self, mgr) 

819 

820 # ---------------------------------------------------------------------- 

821 def __dataframe__( 

822 self, nan_as_null: bool = False, allow_copy: bool = True 

823 ) -> DataFrameXchg: 

824 """ 

825 Return the dataframe interchange object implementing the interchange protocol. 

826 

827 Parameters 

828 ---------- 

829 nan_as_null : bool, default False 

830 Whether to tell the DataFrame to overwrite null values in the data 

831 with ``NaN`` (or ``NaT``). 

832 allow_copy : bool, default True 

833 Whether to allow memory copying when exporting. If set to False 

834 it would cause non-zero-copy exports to fail. 

835 

836 Returns 

837 ------- 

838 DataFrame interchange object 

839 The object which consuming library can use to ingress the dataframe. 

840 

841 Notes 

842 ----- 

843 Details on the interchange protocol: 

844 https://data-apis.org/dataframe-protocol/latest/index.html 

845 

846 `nan_as_null` currently has no effect; once support for nullable extension 

847 dtypes is added, this value should be propagated to columns. 

848 """ 

849 

850 from pandas.core.interchange.dataframe import PandasDataFrameXchg 

851 

852 return PandasDataFrameXchg(self, nan_as_null, allow_copy) 

853 
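    # A minimal usage sketch of the interchange protocol (illustrative only;
    # it assumes pandas>=1.5, where ``pd.api.interchange.from_dataframe`` is
    # the consuming counterpart to this producer method):
    #
    #   df = pd.DataFrame({"a": [1, 2]})
    #   xchg = df.__dataframe__()                       # producer side
    #   df2 = pd.api.interchange.from_dataframe(xchg)   # consumer side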

    # ----------------------------------------------------------------------

    @property
    def axes(self) -> list[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape : Tuple of array dimensions.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if isinstance(self._mgr, ArrayManager):
            return len({arr.dtype for arr in self._mgr.arrays}) == 1
        if self._mgr.any_extension_types:
            return len({block.dtype for block in self._mgr.blocks}) == 1
        else:
            return not self._is_mixed_type

    @property
    def _can_fast_transpose(self) -> bool:
        """
        Can we transpose this DataFrame without creating any new array objects.
        """
        if isinstance(self._mgr, ArrayManager):
            return False
        blocks = self._mgr.blocks
        if len(blocks) != 1:
            return False

        dtype = blocks[0].dtype
        # TODO(EA2D) special case would be unnecessary with 2D EAs
        return not is_1d_only_ea_dtype(dtype)

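    # Illustrative sketch of when the fast-transpose path applies (assumed
    # behavior under the default BlockManager, not asserted by pandas itself):
    #
    #   pd.DataFrame(np.ones((2, 3)))._can_fast_transpose          # one block -> True
    #   pd.DataFrame({"a": [1], "b": ["x"]})._can_fast_transpose   # two blocks -> False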

    # error: Return type "Union[ndarray, DatetimeArray, TimedeltaArray]" of
    # "_values" incompatible with return type "ndarray" in supertype "NDFrame"
    @property
    def _values(  # type: ignore[override]
        self,
    ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
        """
        Analogue to ._values that may return a 2D ExtensionArray.
        """
        self._consolidate_inplace()

        mgr = self._mgr

        if isinstance(mgr, ArrayManager):
            if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
                # has no attribute "reshape"
                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
            return self.values

        blocks = mgr.blocks
        if len(blocks) != 1:
            return self.values

        arr = blocks[0].values
        if arr.ndim == 1:
            # non-2D ExtensionArray
            return self.values

        # more generally, whatever we allow in NDArrayBackedExtensionBlock
        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
        return arr.T

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or width is None or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:  # rows are capped, so render only that many
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            # unlimited rows: assume the repr fits
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width

    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

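    # Illustrative sketch of how these checks drive the repr (the option names
    # are real pandas display options; the behavior shown is what the checks
    # above imply, not a documented doctest):
    #
    #   with pd.option_context("display.large_repr", "info",
    #                          "display.max_rows", 5):
    #       repr(pd.DataFrame(np.ones((100, 100))))  # falls back to info() text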

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            return buf.getvalue()

        repr_params = fmt.get_dataframe_repr_params()
        return self.to_string(**repr_params)

    def _repr_html_(self) -> str | None:
        """
        Return a html representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return "<pre>" + val + "</pre>"

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            min_rows = get_option("display.min_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            formatter = fmt.DataFrameFormatter(
                self,
                columns=None,
                col_space=None,
                na_rep="NaN",
                formatters=None,
                float_format=None,
                sparsify=None,
                justify=None,
                index_names=True,
                header=True,
                index=True,
                bold_rows=True,
                escape=True,
                max_rows=max_rows,
                min_rows=min_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=".",
            )
            return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
        else:
            return None

    @overload
    def to_string(
        self,
        buf: None = ...,
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> str:
        ...

    @overload
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> None:
        ...

    @Substitution(
        header_type="bool or sequence of str",
        header="Write out the column names. If a list of strings "
        "is given, it is assumed to be aliases for the "
        "column names",
        col_space_type="int, list or dict of int",
        col_space="The minimum width of each column. If a list of ints is given "
        "every integer corresponds with one column. If a dict is given, the key "
        "references the column, while the value defines the space to use.",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str] | None = None,
        columns: Sequence[str] | None = None,
        col_space: int | list[int] | dict[Hashable, int] | None = None,
        header: bool | Sequence[str] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: fmt.FormattersType | None = None,
        float_format: fmt.FloatFormatType | None = None,
        sparsify: bool | None = None,
        index_names: bool = True,
        justify: str | None = None,
        max_rows: int | None = None,
        max_cols: int | None = None,
        show_dimensions: bool = False,
        decimal: str = ".",
        line_width: int | None = None,
        min_rows: int | None = None,
        max_colwidth: int | None = None,
        encoding: str | None = None,
    ) -> str | None:
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        min_rows : int, optional
            The number of rows to display in the console in a truncated repr
            (when number of rows is above `max_rows`).
        max_colwidth : int, optional
            Max width to truncate each column in characters. By default, no limit.

            .. versionadded:: 1.0.0
        encoding : str, default "utf-8"
            Set character encoding.

            .. versionadded:: 1.0
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        from pandas import option_context

        with option_context("display.max_colwidth", max_colwidth):
            formatter = fmt.DataFrameFormatter(
                self,
                columns=columns,
                col_space=col_space,
                na_rep=na_rep,
                formatters=formatters,
                float_format=float_format,
                sparsify=sparsify,
                justify=justify,
                index_names=index_names,
                header=header,
                index=index,
                min_rows=min_rows,
                max_rows=max_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=decimal,
            )
            return fmt.DataFrameRenderer(formatter).to_string(
                buf=buf,
                encoding=encoding,
                line_width=line_width,
            )

    # ----------------------------------------------------------------------

    @property
    def style(self) -> Styler:
        """
        Returns a Styler object.

        Contains methods for building a styled HTML representation of the DataFrame.

        See Also
        --------
        io.formats.style.Styler : Helps style a DataFrame or Series according to the
            data with HTML and CSS.
        """
        from pandas.io.formats.style import Styler

        return Styler(self)

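    # A minimal usage sketch (illustrative; ``Styler.highlight_max`` and
    # ``Styler.to_html`` are existing Styler methods, the chaining shown is
    # the intended pattern rather than pandas-documented output):
    #
    #   html = df.style.highlight_max(axis=0).to_html()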

    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                    'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
             species  population
    panda       bear        1864
    polar       bear       22000
    koala  marsupial       80000
    >>> for label, content in df.items():
    ...     print(f'label: {label}')
    ...     print(f'content: {content}', sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """

    @Appender(_shared_docs["items"])
    def items(self) -> Iterable[tuple[Hashable, Series]]:
        if self.columns.is_unique and hasattr(self, "_item_cache"):
            for k in self.columns:
                yield k, self._get_item_cache(k)
        else:
            for i, k in enumerate(self.columns):
                yield k, self._ixs(i, axis=1)

    _shared_docs[
        "iteritems"
    ] = r"""
    Iterate over (column name, Series) pairs.

    .. deprecated:: 1.5.0
        iteritems is deprecated and will be removed in a future version.
        Use .items instead.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.items : Recommended alternative.
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.
    """

    @Appender(_shared_docs["iteritems"])
    def iteritems(self) -> Iterable[tuple[Hashable, Series]]:
        warnings.warn(
            "iteritems is deprecated and will be removed in a future version. "
            "Use .items instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        yield from self.items()

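    # Migration sketch for the deprecation above (illustrative): any loop of
    # the form
    #
    #   for label, column in df.iteritems(): ...
    #
    # can be rewritten with identical semantics as
    #
    #   for label, column in df.items(): ...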

    def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Yields
        ------
        index : label or tuple of label
            The index of the row. A tuple for a `MultiIndex`.
        data : Series
            The data of the row as a Series.

        See Also
        --------
        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.
        """
        columns = self.columns
        klass = self._constructor_sliced
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k).__finalize__(self)
            yield k, s

    def itertuples(
        self, index: bool = True, name: str | None = "Pandas"
    ) -> Iterable[tuple[Any, ...]]:
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Returns
        -------
        iterator
            An object to iterate over namedtuples for each row in the
            DataFrame with the first field possibly being the index and
            following fields being the column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
        """
        arrays = []
        fields = list(self.columns)
        if index:
            arrays.append(self.index)
            fields.insert(0, "Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        if name is not None:
            # https://github.com/python/mypy/issues/9046
            # error: namedtuple() expects a string literal as the first argument
            itertuple = collections.namedtuple(  # type: ignore[misc]
                name, fields, rename=True
            )
            return map(itertuple._make, zip(*arrays))

        # fallback to regular tuples
        return zip(*arrays)

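    # Illustrative sketch of the renaming rule noted above: ``rename=True``
    # makes collections.namedtuple substitute positional names (``_0``, ``_1``,
    # ...) for invalid or duplicate field names:
    #
    #   df = pd.DataFrame([[1, 2]], columns=["a b", "a b"])
    #   next(df.itertuples(index=False))   # -> Pandas(_0=1, _1=2)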

    def __len__(self) -> int:
        """
        Returns length of info axis, but here we use the index.
        """
        return len(self.index)

    @overload
    def dot(self, other: Series) -> Series:
        ...

    @overload
    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
        ...

    def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other`` in Python >= 3.5.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other in a DataFrame or a np.array.

        See Also
        --------
        Series.dot: Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication. In addition, the column names of
        DataFrame and the index of other must contain the same values, as they
        will be aligned prior to the multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
           0  1
        0  1  4
        1  2  2

        Note that the dot method gives the same result as @

        >>> df @ other
           0  1
        0  1  4
        1  2  2

        The dot method also works if other is an np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
           0  1
        0  1  4
        1  2  2

        Note how shuffling of the objects does not change the result.

        >>> s2 = s.reindex([1, 0, 2, 3])
        >>> df.dot(s2)
        0    -4
        1     5
        dtype: int64
        """
        if isinstance(other, (Series, DataFrame)):
            common = self.columns.union(other.index)
            if len(common) > len(self.columns) or len(common) > len(other.index):
                raise ValueError("matrices are not aligned")

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right._values
        else:
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError(
                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
                )

        if isinstance(other, DataFrame):
            return self._constructor(
                np.dot(lvals, rvals), index=left.index, columns=other.columns
            )
        elif isinstance(other, Series):
            return self._constructor_sliced(np.dot(lvals, rvals), index=left.index)
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index)
            else:
                return self._constructor_sliced(result, index=left.index)
        else:  # pragma: no cover
            raise TypeError(f"unsupported type: {type(other)}")

    @overload
    def __matmul__(self, other: Series) -> Series:
        ...

    @overload
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        ...

    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        return self.dot(other)

    def __rmatmul__(self, other) -> DataFrame:
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        try:
            return self.T.dot(np.transpose(other)).T
        except ValueError as err:
            if "shape mismatch" not in str(err):
                raise
            # GH#21581 give exception message for original shapes
            msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
            raise ValueError(msg) from err

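    # Illustrative sketch of the reflected case handled by __rmatmul__, which
    # relies on the identity (A @ B) == (B.T @ A.T).T (the result shown is what
    # that identity implies, not pandas-documented output):
    #
    #   df = pd.DataFrame([[1, 2], [3, 4]])
    #   np.eye(2) @ df      # dispatches to df.__rmatmul__ -> df unchanged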

1648 # ---------------------------------------------------------------------- 

1649 # IO methods (to / from other formats) 

1650 

1651 @classmethod 

1652 def from_dict( 

1653 cls, 

1654 data: dict, 

1655 orient: str = "columns", 

1656 dtype: Dtype | None = None, 

1657 columns: Axes | None = None, 

1658 ) -> DataFrame: 

1659 """ 

1660 Construct DataFrame from dict of array-like or dicts. 

1661 

1662 Creates DataFrame object from dictionary by columns or by index 

1663 allowing dtype specification. 

1664 

1665 Parameters 

1666 ---------- 

1667 data : dict 

1668 Of the form {field : array-like} or {field : dict}. 

1669 orient : {'columns', 'index', 'tight'}, default 'columns' 

1670 The "orientation" of the data. If the keys of the passed dict 

1671 should be the columns of the resulting DataFrame, pass 'columns' 

1672 (default). Otherwise if the keys should be rows, pass 'index'. 

1673 If 'tight', assume a dict with keys ['index', 'columns', 'data', 

1674 'index_names', 'column_names']. 

1675 

1676 .. versionadded:: 1.4.0 

1677 'tight' as an allowed value for the ``orient`` argument 

1678 

1679 dtype : dtype, default None 

1680 Data type to force, otherwise infer. 

1681 columns : list, default None 

1682 Column labels to use when ``orient='index'``. Raises a ValueError 

1683 if used with ``orient='columns'`` or ``orient='tight'``. 

1684 

1685 Returns 

1686 ------- 

1687 DataFrame 

1688 

1689 See Also 

1690 -------- 

1691 DataFrame.from_records : DataFrame from structured ndarray, sequence 

1692 of tuples or dicts, or DataFrame. 

1693 DataFrame : DataFrame object creation using constructor. 

1694 DataFrame.to_dict : Convert the DataFrame to a dictionary. 

1695 

1696 Examples 

1697 -------- 

1698 By default the keys of the dict become the DataFrame columns: 

1699 

1700 >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} 

1701 >>> pd.DataFrame.from_dict(data) 

1702 col_1 col_2 

1703 0 3 a 

1704 1 2 b 

1705 2 1 c 

1706 3 0 d 

1707 

1708 Specify ``orient='index'`` to create the DataFrame using dictionary 

1709 keys as rows: 

1710 

1711 >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} 

1712 >>> pd.DataFrame.from_dict(data, orient='index') 

1713 0 1 2 3 

1714 row_1 3 2 1 0 

1715 row_2 a b c d 

1716 

1717 When using the 'index' orientation, the column names can be 

1718 specified manually: 

1719 

1720 >>> pd.DataFrame.from_dict(data, orient='index', 

1721 ... columns=['A', 'B', 'C', 'D']) 

1722 A B C D 

1723 row_1 3 2 1 0 

1724 row_2 a b c d 

1725 

1726 Specify ``orient='tight'`` to create the DataFrame using a 'tight' 

1727 format: 

1728 

1729 >>> data = {'index': [('a', 'b'), ('a', 'c')], 

1730 ... 'columns': [('x', 1), ('y', 2)], 

1731 ... 'data': [[1, 3], [2, 4]], 

1732 ... 'index_names': ['n1', 'n2'], 

1733 ... 'column_names': ['z1', 'z2']} 

1734 >>> pd.DataFrame.from_dict(data, orient='tight') 

1735 z1 x y 

1736 z2 1 2 

1737 n1 n2 

1738 a b 1 3 

1739 c 2 4 

1740 """ 

1741 index = None 

1742 orient = orient.lower() 

1743 if orient == "index": 

1744 if len(data) > 0: 

1745 # TODO speed up Series case 

1746 if isinstance(list(data.values())[0], (Series, dict)): 

1747 data = _from_nested_dict(data) 

1748 else: 

1749 index = list(data.keys()) 

1750 # error: Incompatible types in assignment (expression has type 

1751 # "List[Any]", variable has type "Dict[Any, Any]") 

1752 data = list(data.values()) # type: ignore[assignment] 

1753 elif orient == "columns" or orient == "tight": 

1754 if columns is not None: 

1755 raise ValueError(f"cannot use columns parameter with orient='{orient}'") 

1756 else: # pragma: no cover 

1757 raise ValueError( 

1758 f"Expected 'index', 'columns' or 'tight' for orient parameter. " 

1759 f"Got '{orient}' instead" 

1760 ) 

1761 

1762 if orient != "tight": 

1763 return cls(data, index=index, columns=columns, dtype=dtype) 

1764 else: 

1765 realdata = data["data"] 

1766 

1767 def create_index(indexlist, namelist): 

1768 index: Index 

1769 if len(namelist) > 1: 

1770 index = MultiIndex.from_tuples(indexlist, names=namelist) 

1771 else: 

1772 index = Index(indexlist, name=namelist[0]) 

1773 return index 

1774 

1775 index = create_index(data["index"], data["index_names"]) 

1776 columns = create_index(data["columns"], data["column_names"]) 

1777 return cls(realdata, index=index, columns=columns, dtype=dtype) 

1778 

1779 def to_numpy( 

1780 self, 

1781 dtype: npt.DTypeLike | None = None, 

1782 copy: bool = False, 

1783 na_value: object = lib.no_default, 

1784 ) -> np.ndarray: 

1785 """ 

1786 Convert the DataFrame to a NumPy array. 

1787 

1788 By default, the dtype of the returned array will be the common NumPy 

1789 dtype of all types in the DataFrame. For example, if the dtypes are 

1790 ``float16`` and ``float32``, the results dtype will be ``float32``. 

1791 This may require copying data and coercing values, which may be 

1792 expensive. 

1793 

1794 Parameters 

1795 ---------- 

1796 dtype : str or numpy.dtype, optional 

1797 The dtype to pass to :meth:`numpy.asarray`. 

1798 copy : bool, default False 

1799 Whether to ensure that the returned value is not a view on 

1800 another array. Note that ``copy=False`` does not *ensure* that 

1801 ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that 

1802 a copy is made, even if not strictly necessary. 

1803 na_value : Any, optional 

1804 The value to use for missing values. The default value depends 

1805 on `dtype` and the dtypes of the DataFrame columns. 

1806 

1807 .. versionadded:: 1.1.0 

1808 

1809 Returns 

1810 ------- 

1811 numpy.ndarray 

1812 

1813 See Also 

1814 -------- 

1815 Series.to_numpy : Similar method for Series. 

1816 

1817 Examples 

1818 -------- 

1819 >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() 

1820 array([[1, 3], 

1821 [2, 4]]) 

1822 

1823 With heterogeneous data, the lowest common type will have to 

1824 be used. 

1825 

1826 >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) 

1827 >>> df.to_numpy() 

1828 array([[1. , 3. ], 

1829 [2. , 4.5]]) 

1830 

1831 For a mix of numeric and non-numeric types, the output array will 

1832 have object dtype. 

1833 

1834 >>> df['C'] = pd.date_range('2000', periods=2) 

1835 >>> df.to_numpy() 

1836 array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], 

1837 [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) 

1838 """ 

1839 self._consolidate_inplace() 

1840 if dtype is not None: 

1841 dtype = np.dtype(dtype) 

1842 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) 

1843 if result.dtype is not dtype: 

1844 result = np.array(result, dtype=dtype, copy=False) 

1845 

1846 return result 

1847 
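A hedged sketch of the ``na_value`` parameter described above: converting a nullable integer column to a float array, filling missing entries with ``np.nan``. The column name and data are illustrative.

>>> df = pd.DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")})
>>> df.to_numpy(dtype="float64", na_value=np.nan)
array([[ 1.],
       [nan]])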

1848 @overload 

1849 def to_dict( 

1850 self, 

1851 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., 

1852 into: type[dict] = ..., 

1853 ) -> dict: 

1854 ... 

1855 

1856 @overload 

1857 def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]: 

1858 ... 

1859 

1860 def to_dict( 

1861 self, 

1862 orient: Literal[ 

1863 "dict", "list", "series", "split", "tight", "records", "index" 

1864 ] = "dict", 

1865 into: type[dict] = dict, 

1866 ) -> dict | list[dict]: 

1867 """ 

1868 Convert the DataFrame to a dictionary. 

1869 

1870 The type of the key-value pairs can be customized with the parameters 

1871 (see below). 

1872 

1873 Parameters 

1874 ---------- 

1875 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} 

1876 Determines the type of the values of the dictionary. 

1877 

1878 - 'dict' (default) : dict like {column -> {index -> value}} 

1879 - 'list' : dict like {column -> [values]} 

1880 - 'series' : dict like {column -> Series(values)} 

1881 - 'split' : dict like 

1882 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} 

1883 - 'tight' : dict like 

1884 {'index' -> [index], 'columns' -> [columns], 'data' -> [values], 

1885 'index_names' -> [index.names], 'column_names' -> [column.names]} 

1886 - 'records' : list like 

1887 [{column -> value}, ... , {column -> value}] 

1888 - 'index' : dict like {index -> {column -> value}} 

1889 

1890 Abbreviations are allowed. `s` indicates `series` and `sp` 

1891 indicates `split`. 

1892 

1893 .. versionadded:: 1.4.0 

1894 'tight' as an allowed value for the ``orient`` argument 

1895 

1896 into : class, default dict 

1897 The collections.abc.Mapping subclass used for all Mappings 

1898 in the return value. Can be the actual class or an empty 

1899 instance of the mapping type you want. If you want a 

1900 collections.defaultdict, you must pass it initialized. 

1901 

1902 Returns 

1903 ------- 

1904 dict, list or collections.abc.Mapping 

1905 Return a collections.abc.Mapping object representing the DataFrame. 

1906 The resulting transformation depends on the `orient` parameter. 

1907 

1908 See Also 

1909 -------- 

1910 DataFrame.from_dict: Create a DataFrame from a dictionary. 

1911 DataFrame.to_json: Convert a DataFrame to JSON format. 

1912 

1913 Examples 

1914 -------- 

1915 >>> df = pd.DataFrame({'col1': [1, 2], 

1916 ... 'col2': [0.5, 0.75]}, 

1917 ... index=['row1', 'row2']) 

1918 >>> df 

1919 col1 col2 

1920 row1 1 0.50 

1921 row2 2 0.75 

1922 >>> df.to_dict() 

1923 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} 

1924 

1925 You can specify the return orientation. 

1926 

1927 >>> df.to_dict('series') 

1928 {'col1': row1 1 

1929 row2 2 

1930 Name: col1, dtype: int64, 

1931 'col2': row1 0.50 

1932 row2 0.75 

1933 Name: col2, dtype: float64} 

1934 

1935 >>> df.to_dict('split') 

1936 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 

1937 'data': [[1, 0.5], [2, 0.75]]} 

1938 

1939 >>> df.to_dict('records') 

1940 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] 

1941 

1942 >>> df.to_dict('index') 

1943 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} 

1944 

1945 >>> df.to_dict('tight') 

1946 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 

1947 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} 

1948 

1949 You can also specify the mapping type. 

1950 

1951 >>> from collections import OrderedDict, defaultdict 

1952 >>> df.to_dict(into=OrderedDict) 

1953 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), 

1954 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) 

1955 

1956 If you want a `defaultdict`, you need to initialize it: 

1957 

1958 >>> dd = defaultdict(list) 

1959 >>> df.to_dict('records', into=dd) 

1960 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}), 

1961 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})] 

1962 """ 

1963 if not self.columns.is_unique: 

1964 warnings.warn( 

1965 "DataFrame columns are not unique, some columns will be omitted.", 

1966 UserWarning, 

1967 stacklevel=find_stack_level(), 

1968 ) 

1969 # GH16122 

1970 into_c = com.standardize_mapping(into) 

1971 

1972 # error: Incompatible types in assignment (expression has type "str", 

1973 # variable has type "Literal['dict', 'list', 'series', 'split', 'tight', 

1974 # 'records', 'index']") 

1975 orient = orient.lower() # type: ignore[assignment] 

1976 # GH32515 

1977 if orient.startswith(("d", "l", "s", "r", "i")) and orient not in { 

1978 "dict", 

1979 "list", 

1980 "series", 

1981 "split", 

1982 "records", 

1983 "index", 

1984 }: 

1985 warnings.warn( 

1986 "Using short name for 'orient' is deprecated. Only the " 

1987 "options: ('dict', list, 'series', 'split', 'records', 'index') " 

1988 "will be used in a future version. Use one of the above " 

1989 "to silence this warning.", 

1990 FutureWarning, 

1991 stacklevel=find_stack_level(), 

1992 ) 

1993 

1994 if orient.startswith("d"): 

1995 orient = "dict" 

1996 elif orient.startswith("l"): 

1997 orient = "list" 

1998 elif orient.startswith("sp"): 

1999 orient = "split" 

2000 elif orient.startswith("s"): 

2001 orient = "series" 

2002 elif orient.startswith("r"): 

2003 orient = "records" 

2004 elif orient.startswith("i"): 

2005 orient = "index" 

2006 

2007 if orient == "dict": 

2008 return into_c((k, v.to_dict(into)) for k, v in self.items()) 

2009 

2010 elif orient == "list": 

2011 return into_c( 

2012 (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items() 

2013 ) 

2014 

2015 elif orient == "split": 

2016 return into_c( 

2017 ( 

2018 ("index", self.index.tolist()), 

2019 ("columns", self.columns.tolist()), 

2020 ( 

2021 "data", 

2022 [ 

2023 list(map(maybe_box_native, t)) 

2024 for t in self.itertuples(index=False, name=None) 

2025 ], 

2026 ), 

2027 ) 

2028 ) 

2029 

2030 elif orient == "tight": 

2031 return into_c( 

2032 ( 

2033 ("index", self.index.tolist()), 

2034 ("columns", self.columns.tolist()), 

2035 ( 

2036 "data", 

2037 [ 

2038 list(map(maybe_box_native, t)) 

2039 for t in self.itertuples(index=False, name=None) 

2040 ], 

2041 ), 

2042 ("index_names", list(self.index.names)), 

2043 ("column_names", list(self.columns.names)), 

2044 ) 

2045 ) 

2046 

2047 elif orient == "series": 

2048 return into_c((k, v) for k, v in self.items()) 

2049 

2050 elif orient == "records": 

2051 columns = self.columns.tolist() 

2052 rows = ( 

2053 dict(zip(columns, row)) 

2054 for row in self.itertuples(index=False, name=None) 

2055 ) 

2056 return [ 

2057 into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows 

2058 ] 

2059 

2060 elif orient == "index": 

2061 if not self.index.is_unique: 

2062 raise ValueError("DataFrame index must be unique for orient='index'.") 

2063 return into_c( 

2064 (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:])))) 

2065 for t in self.itertuples(name=None) 

2066 ) 

2067 

2068 else: 

2069 raise ValueError(f"orient '{orient}' not understood") 

2070 
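Since abbreviated orient names are deprecated (GH32515, handled above), a brief sketch using the full spellings; the frame is illustrative.

>>> df = pd.DataFrame({'col1': [1, 2]}, index=['row1', 'row2'])
>>> df.to_dict(orient='list')       # not the deprecated 'l'
{'col1': [1, 2]}
>>> df.to_dict(orient='records')    # not the deprecated 'r'
[{'col1': 1}, {'col1': 2}]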

2071 def to_gbq( 

2072 self, 

2073 destination_table: str, 

2074 project_id: str | None = None, 

2075 chunksize: int | None = None, 

2076 reauth: bool = False, 

2077 if_exists: str = "fail", 

2078 auth_local_webserver: bool = True, 

2079 table_schema: list[dict[str, str]] | None = None, 

2080 location: str | None = None, 

2081 progress_bar: bool = True, 

2082 credentials=None, 

2083 ) -> None: 

2084 """ 

2085 Write a DataFrame to a Google BigQuery table. 

2086 

2087 This function requires the `pandas-gbq package 

2088 <https://pandas-gbq.readthedocs.io>`__. 

2089 

2090 See the `How to authenticate with Google BigQuery 

2091 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__ 

2092 guide for authentication instructions. 

2093 

2094 Parameters 

2095 ---------- 

2096 destination_table : str 

2097 Name of table to be written, in the form ``dataset.tablename``. 

2098 project_id : str, optional 

2099 Google BigQuery Account project ID. Optional when available from 

2100 the environment. 

2101 chunksize : int, optional 

2102 Number of rows to be inserted in each chunk from the dataframe. 

2103 Set to ``None`` to load the whole dataframe at once. 

2104 reauth : bool, default False 

2105 Force Google BigQuery to re-authenticate the user. This is useful 

2106 if multiple accounts are used. 

2107 if_exists : str, default 'fail' 

2108 Behavior when the destination table exists. Value can be one of: 

2109 

2110 ``'fail'`` 

2111 If table exists, raise pandas_gbq.gbq.TableCreationError. 

2112 ``'replace'`` 

2113 If table exists, drop it, recreate it, and insert data. 

2114 ``'append'`` 

2115 If table exists, insert data. Create if it does not exist. 

2116 auth_local_webserver : bool, default True 

2117 Use the `local webserver flow`_ instead of the `console flow`_ 

2118 when getting user credentials. 

2119 

2120 .. _local webserver flow: 

2121 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server 

2122 .. _console flow: 

2123 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console 

2124 

2125 *New in version 0.2.0 of pandas-gbq*. 

2126 

2127 .. versionchanged:: 1.5.0 

2128 The default value changed to ``True``. Google has deprecated the 

2129 ``auth_local_webserver = False`` `"out of band" (copy-paste) 

2130 flow 

2131 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_. 

2132 table_schema : list of dicts, optional 

2133 List of BigQuery table fields to which the DataFrame 

2134 columns conform, e.g. ``[{'name': 'col1', 'type': 

2135 'STRING'},...]``. If schema is not provided, it will be 

2136 generated according to dtypes of DataFrame columns. See 

2137 BigQuery API documentation on available names of a field. 

2138 

2139 *New in version 0.3.1 of pandas-gbq*. 

2140 location : str, optional 

2141 Location where the load job should run. See the `BigQuery locations 

2142 documentation 

2143 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a 

2144 list of available locations. The location must match that of the 

2145 target dataset. 

2146 

2147 *New in version 0.5.0 of pandas-gbq*. 

2148 progress_bar : bool, default True 

2149 Use the library `tqdm` to show the progress bar for the upload, 

2150 chunk by chunk. 

2151 

2152 *New in version 0.5.0 of pandas-gbq*. 

2153 credentials : google.auth.credentials.Credentials, optional 

2154 Credentials for accessing Google APIs. Use this parameter to 

2155 override default credentials, such as to use Compute Engine 

2156 :class:`google.auth.compute_engine.Credentials` or Service 

2157 Account :class:`google.oauth2.service_account.Credentials` 

2158 directly. 

2159 

2160 *New in version 0.8.0 of pandas-gbq*. 

2161 

2162 See Also 

2163 -------- 

2164 pandas_gbq.to_gbq : This function in the pandas-gbq library. 

2165 read_gbq : Read a DataFrame from Google BigQuery. 

2166 """ 

2167 from pandas.io import gbq 

2168 

2169 gbq.to_gbq( 

2170 self, 

2171 destination_table, 

2172 project_id=project_id, 

2173 chunksize=chunksize, 

2174 reauth=reauth, 

2175 if_exists=if_exists, 

2176 auth_local_webserver=auth_local_webserver, 

2177 table_schema=table_schema, 

2178 location=location, 

2179 progress_bar=progress_bar, 

2180 credentials=credentials, 

2181 ) 

2182 
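The docstring above has no Examples section; a hedged usage sketch follows. The table name, project id, and columns are placeholders, and the call is skipped because it needs live BigQuery credentials.

>>> df = pd.DataFrame({'my_string': ['a', 'b'], 'my_int64': [1, 2]})
>>> df.to_gbq('my_dataset.my_table',           # placeholder table name
...           project_id='my-project',         # placeholder project id
...           if_exists='append')  # doctest: +SKIP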

2183 @classmethod 

2184 def from_records( 

2185 cls, 

2186 data, 

2187 index=None, 

2188 exclude=None, 

2189 columns=None, 

2190 coerce_float: bool = False, 

2191 nrows: int | None = None, 

2192 ) -> DataFrame: 

2193 """ 

2194 Convert structured or record ndarray to DataFrame. 

2195 

2196 Creates a DataFrame object from a structured ndarray, sequence of 

2197 tuples or dicts, or DataFrame. 

2198 

2199 Parameters 

2200 ---------- 

2201 data : structured ndarray, sequence of tuples or dicts, or DataFrame 

2202 Structured input data. 

2203 index : str, list of fields, array-like 

2204 Field of array to use as the index, alternately a specific set of 

2205 input labels to use. 

2206 exclude : sequence, default None 

2207 Columns or fields to exclude. 

2208 columns : sequence, default None 

2209 Column names to use. If the passed data do not have names 

2210 associated with them, this argument provides names for the 

2211 columns. Otherwise this argument indicates the order of the columns 

2212 in the result (any names not found in the data will become all-NA 

2213 columns). 

2214 coerce_float : bool, default False 

2215 Attempt to convert values of non-string, non-numeric objects (like 

2216 decimal.Decimal) to floating point, useful for SQL result sets. 

2217 nrows : int, default None 

2218 Number of rows to read if data is an iterator. 

2219 

2220 Returns 

2221 ------- 

2222 DataFrame 

2223 

2224 See Also 

2225 -------- 

2226 DataFrame.from_dict : DataFrame from dict of array-like or dicts. 

2227 DataFrame : DataFrame object creation using constructor. 

2228 

2229 Examples 

2230 -------- 

2231 Data can be provided as a structured ndarray: 

2232 

2233 >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], 

2234 ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) 

2235 >>> pd.DataFrame.from_records(data) 

2236 col_1 col_2 

2237 0 3 a 

2238 1 2 b 

2239 2 1 c 

2240 3 0 d 

2241 

2242 Data can be provided as a list of dicts: 

2243 

2244 >>> data = [{'col_1': 3, 'col_2': 'a'}, 

2245 ... {'col_1': 2, 'col_2': 'b'}, 

2246 ... {'col_1': 1, 'col_2': 'c'}, 

2247 ... {'col_1': 0, 'col_2': 'd'}] 

2248 >>> pd.DataFrame.from_records(data) 

2249 col_1 col_2 

2250 0 3 a 

2251 1 2 b 

2252 2 1 c 

2253 3 0 d 

2254 

2255 Data can be provided as a list of tuples with corresponding columns: 

2256 

2257 >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] 

2258 >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) 

2259 col_1 col_2 

2260 0 3 a 

2261 1 2 b 

2262 2 1 c 

2263 3 0 d 

2264 """ 

2265 result_index = None 

2266 

2267 # Make a copy of the input columns so we can modify it 

2268 if columns is not None: 

2269 columns = ensure_index(columns) 

2270 

2271 def maybe_reorder( 

2272 arrays: list[ArrayLike], arr_columns: Index, columns: Index, index 

2273 ) -> tuple[list[ArrayLike], Index, Index | None]: 

2274 """ 

2275 If our desired 'columns' do not match the data's pre-existing 'arr_columns', 

2276 we re-order our arrays. This is like a pre-emptive (cheap) reindex. 

2277 """ 

2278 if len(arrays): 

2279 length = len(arrays[0]) 

2280 else: 

2281 length = 0 

2282 

2283 result_index = None 

2284 if len(arrays) == 0 and index is None and length == 0: 

2285 # for backward compat use an object Index instead of RangeIndex 

2286 result_index = Index([]) 

2287 

2288 arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) 

2289 return arrays, arr_columns, result_index 

2290 

2291 if is_iterator(data): 

2292 if nrows == 0: 

2293 return cls() 

2294 

2295 try: 

2296 first_row = next(data) 

2297 except StopIteration: 

2298 return cls(index=index, columns=columns) 

2299 

2300 dtype = None 

2301 if hasattr(first_row, "dtype") and first_row.dtype.names: 

2302 dtype = first_row.dtype 

2303 

2304 values = [first_row] 

2305 

2306 if nrows is None: 

2307 values += data 

2308 else: 

2309 values.extend(itertools.islice(data, nrows - 1)) 

2310 

2311 if dtype is not None: 

2312 data = np.array(values, dtype=dtype) 

2313 else: 

2314 data = values 

2315 

2316 if isinstance(data, dict): 

2317 if columns is None: 

2318 columns = arr_columns = ensure_index(sorted(data)) 

2319 arrays = [data[k] for k in columns] 

2320 else: 

2321 arrays = [] 

2322 arr_columns_list = [] 

2323 for k, v in data.items(): 

2324 if k in columns: 

2325 arr_columns_list.append(k) 

2326 arrays.append(v) 

2327 

2328 arr_columns = Index(arr_columns_list) 

2329 arrays, arr_columns, result_index = maybe_reorder( 

2330 arrays, arr_columns, columns, index 

2331 ) 

2332 

2333 elif isinstance(data, (np.ndarray, DataFrame)): 

2334 arrays, columns = to_arrays(data, columns) 

2335 arr_columns = columns 

2336 else: 

2337 arrays, arr_columns = to_arrays(data, columns) 

2338 if coerce_float: 

2339 for i, arr in enumerate(arrays): 

2340 if arr.dtype == object: 

2341 # error: Argument 1 to "maybe_convert_objects" has 

2342 # incompatible type "Union[ExtensionArray, ndarray]"; 

2343 # expected "ndarray" 

2344 arrays[i] = lib.maybe_convert_objects( 

2345 arr, # type: ignore[arg-type] 

2346 try_float=True, 

2347 ) 

2348 

2349 arr_columns = ensure_index(arr_columns) 

2350 if columns is None: 

2351 columns = arr_columns 

2352 else: 

2353 arrays, arr_columns, result_index = maybe_reorder( 

2354 arrays, arr_columns, columns, index 

2355 ) 

2356 

2357 if exclude is None: 

2358 exclude = set() 

2359 else: 

2360 exclude = set(exclude) 

2361 

2362 if index is not None: 

2363 if isinstance(index, str) or not hasattr(index, "__iter__"): 

2364 i = columns.get_loc(index) 

2365 exclude.add(index) 

2366 if len(arrays) > 0: 

2367 result_index = Index(arrays[i], name=index) 

2368 else: 

2369 result_index = Index([], name=index) 

2370 else: 

2371 try: 

2372 index_data = [arrays[arr_columns.get_loc(field)] for field in index] 

2373 except (KeyError, TypeError): 

2374 # raised by get_loc, see GH#29258 

2375 result_index = index 

2376 else: 

2377 result_index = ensure_index_from_sequences(index_data, names=index) 

2378 exclude.update(index) 

2379 

2380 if any(exclude): 

2381 arr_exclude = [x for x in exclude if x in arr_columns] 

2382 to_remove = [arr_columns.get_loc(col) for col in arr_exclude] 

2383 arrays = [v for i, v in enumerate(arrays) if i not in to_remove] 

2384 

2385 columns = columns.drop(exclude) 

2386 

2387 manager = get_option("mode.data_manager") 

2388 mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) 

2389 

2390 return cls(mgr) 

2391 
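A sketch of the iterator path handled above: when ``data`` is a generator, ``nrows`` caps how many records are consumed. The generator contents are illustrative.

>>> gen = ((i, chr(ord('a') + i)) for i in range(10))
>>> pd.DataFrame.from_records(gen, columns=['col_1', 'col_2'], nrows=3)
   col_1 col_2
0      0     a
1      1     b
2      2     c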

2392 def to_records( 

2393 self, index: bool = True, column_dtypes=None, index_dtypes=None 

2394 ) -> np.recarray: 

2395 """ 

2396 Convert DataFrame to a NumPy record array. 

2397 

2398 Index will be included as the first field of the record array if 

2399 requested. 

2400 

2401 Parameters 

2402 ---------- 

2403 index : bool, default True 

2404 Include index in resulting record array, stored in 'index' 

2405 field or using the index label, if set. 

2406 column_dtypes : str, type, dict, default None 

2407 If a string or type, the data type used to store all columns. If 

2408 a dictionary, a mapping of column names and indices (zero-indexed) 

2409 to specific data types. 

2410 index_dtypes : str, type, dict, default None 

2411 If a string or type, the data type used to store all index levels. If 

2412 a dictionary, a mapping of index level names and indices 

2413 (zero-indexed) to specific data types. 

2414 

2415 This mapping is applied only if `index=True`. 

2416 

2417 Returns 

2418 ------- 

2419 numpy.recarray 

2420 NumPy ndarray with the DataFrame labels as fields and each row 

2421 of the DataFrame as entries. 

2422 

2423 See Also 

2424 -------- 

2425 DataFrame.from_records: Convert structured or record ndarray 

2426 to DataFrame. 

2427 numpy.recarray: An ndarray that allows field access using 

2428 attributes, analogous to typed columns in a 

2429 spreadsheet. 

2430 

2431 Examples 

2432 -------- 

2433 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, 

2434 ... index=['a', 'b']) 

2435 >>> df 

2436 A B 

2437 a 1 0.50 

2438 b 2 0.75 

2439 >>> df.to_records() 

2440 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2441 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')]) 

2442 

2443 If the DataFrame index has no label then the recarray field name 

2444 is set to 'index'. If the index has a label then this is used as the 

2445 field name: 

2446 

2447 >>> df.index = df.index.rename("I") 

2448 >>> df.to_records() 

2449 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2450 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')]) 

2451 

2452 The index can be excluded from the record array: 

2453 

2454 >>> df.to_records(index=False) 

2455 rec.array([(1, 0.5 ), (2, 0.75)], 

2456 dtype=[('A', '<i8'), ('B', '<f8')]) 

2457 

2458 Data types can be specified for the columns: 

2459 

2460 >>> df.to_records(column_dtypes={"A": "int32"}) 

2461 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2462 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')]) 

2463 

2464 As well as for the index: 

2465 

2466 >>> df.to_records(index_dtypes="<S2") 

2467 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], 

2468 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')]) 

2469 

2470 >>> index_dtypes = f"<S{df.index.str.len().max()}" 

2471 >>> df.to_records(index_dtypes=index_dtypes) 

2472 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], 

2473 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')]) 

2474 """ 

2475 if index: 

2476 ix_vals = [ 

2477 np.asarray(self.index.get_level_values(i)) 

2478 for i in range(self.index.nlevels) 

2479 ] 

2480 

2481 arrays = ix_vals + [ 

2482 np.asarray(self.iloc[:, i]) for i in range(len(self.columns)) 

2483 ] 

2484 

2485 index_names = list(self.index.names) 

2486 

2487 if isinstance(self.index, MultiIndex): 

2488 index_names = com.fill_missing_names(index_names) 

2489 elif index_names[0] is None: 

2490 index_names = ["index"] 

2491 

2492 names = [str(name) for name in itertools.chain(index_names, self.columns)] 

2493 else: 

2494 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))] 

2495 names = [str(c) for c in self.columns] 

2496 index_names = [] 

2497 

2498 index_len = len(index_names) 

2499 formats = [] 

2500 

2501 for i, v in enumerate(arrays): 

2502 index_int = i 

2503 

2504 # When the names and arrays are collected, we 

2505 # first collect those in the DataFrame's index, 

2506 # followed by those in its columns. 

2507 # 

2508 # Thus, the total length of the array is: 

2509 # len(index_names) + len(DataFrame.columns). 

2510 # 

2511 # This check allows us to see whether we are 

2512 # handling a name / array in the index or column. 

2513 if index_int < index_len: 

2514 dtype_mapping = index_dtypes 

2515 name = index_names[index_int] 

2516 else: 

2517 index_int -= index_len 

2518 dtype_mapping = column_dtypes 

2519 name = self.columns[index_int] 

2520 

2521 # We have a dictionary, so we get the data type 

2522 # associated with the index or column (which can 

2523 # be denoted by its name in the DataFrame or its 

2524 # position in DataFrame's array of indices or 

2525 # columns, whichever is applicable). 

2526 if is_dict_like(dtype_mapping): 

2527 if name in dtype_mapping: 

2528 dtype_mapping = dtype_mapping[name] 

2529 elif index_int in dtype_mapping: 

2530 dtype_mapping = dtype_mapping[index_int] 

2531 else: 

2532 dtype_mapping = None 

2533 

2534 # If no mapping can be found, use the array's 

2535 # dtype attribute for formatting. 

2536 # 

2537 # A valid dtype must either be a type or 

2538 # string naming a type. 

2539 if dtype_mapping is None: 

2540 formats.append(v.dtype) 

2541 elif isinstance(dtype_mapping, (type, np.dtype, str)): 

2542 # error: Argument 1 to "append" of "list" has incompatible 

2543 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]" 

2544 formats.append(dtype_mapping) # type: ignore[arg-type] 

2545 else: 

2546 element = "row" if i < index_len else "column" 

2547 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" 

2548 raise ValueError(msg) 

2549 

2550 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) 

2551 
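A small sketch of the positional (zero-indexed) form of ``column_dtypes`` described above, reusing the frame from the docstring examples:

>>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, index=['a', 'b'])
>>> df.to_records(index=False, column_dtypes={0: 'int32'})
rec.array([(1, 0.5 ), (2, 0.75)],
          dtype=[('A', '<i4'), ('B', '<f8')])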

2552 @classmethod 

2553 def _from_arrays( 

2554 cls, 

2555 arrays, 

2556 columns, 

2557 index, 

2558 dtype: Dtype | None = None, 

2559 verify_integrity: bool = True, 

2560 ) -> DataFrame: 

2561 """ 

2562 Create DataFrame from a list of arrays corresponding to the columns. 

2563 

2564 Parameters 

2565 ---------- 

2566 arrays : list-like of arrays 

2567 Each array in the list corresponds to one column, in order. 

2568 columns : list-like, Index 

2569 The column names for the resulting DataFrame. 

2570 index : list-like, Index 

2571 The row labels for the resulting DataFrame. 

2572 dtype : dtype, optional 

2573 Optional dtype to enforce for all arrays. 

2574 verify_integrity : bool, default True 

2575 Validate and homogenize all input. If set to False, it is assumed 

2576 that all elements of `arrays` are actual arrays as they will be 

2577 stored in a block (numpy ndarray or ExtensionArray), have the same 

2578 length as and are aligned with the index, and that `columns` and 

2579 `index` are already Index objects. 

2580 

2581 Returns 

2582 ------- 

2583 DataFrame 

2584 """ 

2585 if dtype is not None: 

2586 dtype = pandas_dtype(dtype) 

2587 

2588 manager = get_option("mode.data_manager") 

2589 columns = ensure_index(columns) 

2590 if len(columns) != len(arrays): 

2591 raise ValueError("len(columns) must match len(arrays)") 

2592 mgr = arrays_to_mgr( 

2593 arrays, 

2594 columns, 

2595 index, 

2596 dtype=dtype, 

2597 verify_integrity=verify_integrity, 

2598 typ=manager, 

2599 ) 

2600 return cls(mgr) 

2601 

2602 @doc( 

2603 storage_options=_shared_docs["storage_options"], 

2604 compression_options=_shared_docs["compression_options"] % "path", 

2605 ) 

2606 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") 

2607 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "path"]) 

2608 def to_stata( 

2609 self, 

2610 path: FilePath | WriteBuffer[bytes], 

2611 convert_dates: dict[Hashable, str] | None = None, 

2612 write_index: bool = True, 

2613 byteorder: str | None = None, 

2614 time_stamp: datetime.datetime | None = None, 

2615 data_label: str | None = None, 

2616 variable_labels: dict[Hashable, str] | None = None, 

2617 version: int | None = 114, 

2618 convert_strl: Sequence[Hashable] | None = None, 

2619 compression: CompressionOptions = "infer", 

2620 storage_options: StorageOptions = None, 

2621 *, 

2622 value_labels: dict[Hashable, dict[float, str]] | None = None, 

2623 ) -> None: 

2624 """ 

2625 Export DataFrame object to Stata dta format. 

2626 

2627 Writes the DataFrame to a Stata dataset file. 

2628 "dta" files contain a Stata dataset. 

2629 

2630 Parameters 

2631 ---------- 

2632 path : str, path object, or buffer 

2633 String, path object (implementing ``os.PathLike[str]``), or file-like 

2634 object implementing a binary ``write()`` function. 

2635 

2636 .. versionchanged:: 1.0.0 

2637 

2638 Previously this was "fname" 

2639 

2640 convert_dates : dict 

2641 Dictionary mapping columns containing datetime types to stata 

2642 internal format to use when writing the dates. Options are 'tc', 

2643 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer 

2644 or a name. Datetime columns that do not have a conversion type 

2645 specified will be converted to 'tc'. Raises NotImplementedError if 

2646 a datetime column has timezone information. 

2647 write_index : bool 

2648 Write the index to the Stata dataset. 

2649 byteorder : str 

2650 Can be ">", "<", "little", or "big". Default is `sys.byteorder`. 

2651 time_stamp : datetime 

2652 A datetime to use as file creation date. Default is the current 

2653 time. 

2654 data_label : str, optional 

2655 A label for the data set. Must be 80 characters or smaller. 

2656 variable_labels : dict 

2657 Dictionary containing columns as keys and variable labels as 

2658 values. Each label must be 80 characters or smaller. 

2659 version : {{114, 117, 118, 119, None}}, default 114 

2660 Version to use in the output dta file. Set to None to let pandas 

2661 decide between 118 or 119 formats depending on the number of 

2662 columns in the frame. Version 114 can be read by Stata 10 and 

2663 later. Version 117 can be read by Stata 13 or later. Version 118 

2664 is supported in Stata 14 and later. Version 119 is supported in 

2665 Stata 15 and later. Version 114 limits string variables to 244 

2666 characters or fewer while versions 117 and later allow strings 

2667 with lengths up to 2,000,000 characters. Versions 118 and 119 

2668 support Unicode characters, and version 119 supports more than 

2669 32,767 variables. 

2670 

2671 Version 119 should usually only be used when the number of 

2672 variables exceeds the capacity of dta format 118. Exporting 

2673 smaller datasets in format 119 may have unintended consequences, 

2674 and, as of November 2020, Stata SE cannot read version 119 files. 

2675 

2676 .. versionchanged:: 1.0.0 

2677 

2678 Added support for formats 118 and 119. 

2679 

2680 convert_strl : list, optional 

2681 List of column names to convert to string columns to Stata StrL 

2682 format. Only available if version is 117. Storing strings in the 

2683 StrL format can produce smaller dta files if strings have more than 

2684 8 characters and values are repeated. 

2685 {compression_options} 

2686 

2687 .. versionadded:: 1.1.0 

2688 

2689 .. versionchanged:: 1.4.0 Zstandard support. 

2690 

2691 {storage_options} 

2692 

2693 .. versionadded:: 1.2.0 

2694 

2695 value_labels : dict of dicts 

2696 Dictionary containing columns as keys and dictionaries of column value 

2697 to labels as values. Labels for a single variable must be 32,000 

2698 characters or smaller. 

2699 

2700 .. versionadded:: 1.4.0 

2701 

2702 Raises 

2703 ------ 

2704 NotImplementedError 

2705 * If datetimes contain timezone information 

2706 * Column dtype is not representable in Stata 

2707 ValueError 

2708 * Columns listed in convert_dates are neither datetime64[ns] 

2709 nor datetime.datetime 

2710 * A column listed in convert_dates is not in the DataFrame 

2711 * Categorical label contains more than 32,000 characters 

2712 

2713 See Also 

2714 -------- 

2715 read_stata : Import Stata data files. 

2716 io.stata.StataWriter : Low-level writer for Stata data files. 

2717 io.stata.StataWriter117 : Low-level writer for version 117 files. 

2718 

2719 Examples 

2720 -------- 

2721 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 

2722 ... 'parrot'], 

2723 ... 'speed': [350, 18, 361, 15]}}) 

2724 >>> df.to_stata('animals.dta') # doctest: +SKIP 

2725 """ 

2726 if version not in (114, 117, 118, 119, None): 

2727 raise ValueError("Only formats 114, 117, 118 and 119 are supported.") 

2728 if version == 114: 

2729 if convert_strl is not None: 

2730 raise ValueError("strl is not supported in format 114") 

2731 from pandas.io.stata import StataWriter as statawriter 

2732 elif version == 117: 

2733 # mypy: Name 'statawriter' already defined (possibly by an import) 

2734 from pandas.io.stata import ( # type: ignore[no-redef] 

2735 StataWriter117 as statawriter, 

2736 ) 

2737 else: # versions 118 and 119 

2738 # mypy: Name 'statawriter' already defined (possibly by an import) 

2739 from pandas.io.stata import ( # type: ignore[no-redef] 

2740 StataWriterUTF8 as statawriter, 

2741 ) 

2742 

2743 kwargs: dict[str, Any] = {} 

2744 if version is None or version >= 117: 

2745 # strl conversion is only supported >= 117 

2746 kwargs["convert_strl"] = convert_strl 

2747 if version is None or version >= 118: 

2748 # Specifying the version is only supported for UTF8 (118 or 119) 

2749 kwargs["version"] = version 

2750 

2751 writer = statawriter( 

2752 path, 

2753 self, 

2754 convert_dates=convert_dates, 

2755 byteorder=byteorder, 

2756 time_stamp=time_stamp, 

2757 data_label=data_label, 

2758 write_index=write_index, 

2759 variable_labels=variable_labels, 

2760 compression=compression, 

2761 storage_options=storage_options, 

2762 value_labels=value_labels, 

2763 **kwargs, 

2764 ) 

2765 writer.write_file() 

2766 
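A hedged sketch of the version gate implemented above: StrL storage requires format 117 or later, so the call below would raise ValueError under the default ``version=114``. The path is a placeholder and the call is skipped.

>>> df = pd.DataFrame({'text': ['x' * 300, 'y' * 300]})
>>> df.to_stata('strings.dta', version=117,
...             convert_strl=['text'])  # doctest: +SKIP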

2767 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") 

2768 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: 

2769 """ 

2770 Write a DataFrame to the binary Feather format. 

2771 

2772 Parameters 

2773 ---------- 

2774 path : str, path object, file-like object 

2775 String, path object (implementing ``os.PathLike[str]``), or file-like 

2776 object implementing a binary ``write()`` function. If a string or a path, 

2777 it will be used as Root Directory path when writing a partitioned dataset. 

2778 **kwargs : 

2779 Additional keywords passed to :func:`pyarrow.feather.write_feather`. 

2780 Starting with pyarrow 0.17, this includes the `compression`, 

2781 `compression_level`, `chunksize` and `version` keywords. 

2782 

2783 .. versionadded:: 1.1.0 

2784 

2785 Notes 

2786 ----- 

2787 This function writes the dataframe as a `feather file 

2788 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default 

2789 index. For saving the DataFrame with your custom index use a method that 

2790 supports custom indices, e.g. `to_parquet`. 

2791 """ 

2792 from pandas.io.feather_format import to_feather 

2793 

2794 to_feather(self, path, **kwargs) 

2795 
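No Examples section exists above; a hedged round-trip sketch (the file name is a placeholder, pyarrow is assumed installed, and the frame keeps the default RangeIndex that feather requires):

>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.to_feather('df.feather')  # doctest: +SKIP
>>> pd.read_feather('df.feather')  # doctest: +SKIP
   col1  col2
0     1     3
1     2     4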

2796 @doc( 

2797 Series.to_markdown, 

2798 klass=_shared_doc_kwargs["klass"], 

2799 storage_options=_shared_docs["storage_options"], 

2800 examples="""Examples 

2801 -------- 

2802 >>> df = pd.DataFrame( 

2803 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} 

2804 ... ) 

2805 >>> print(df.to_markdown()) 

2806 | | animal_1 | animal_2 | 

2807 |---:|:-----------|:-----------| 

2808 | 0 | elk | dog | 

2809 | 1 | pig | quetzal | 

2810 

2811 Output markdown with a tabulate option. 

2812 

2813 >>> print(df.to_markdown(tablefmt="grid")) 

2814 +----+------------+------------+ 

2815 | | animal_1 | animal_2 | 

2816 +====+============+============+ 

2817 | 0 | elk | dog | 

2818 +----+------------+------------+ 

2819 | 1 | pig | quetzal | 

2820 +----+------------+------------+""", 

2821 ) 

2822 def to_markdown( 

2823 self, 

2824 buf: FilePath | WriteBuffer[str] | None = None, 

2825 mode: str = "wt", 

2826 index: bool = True, 

2827 storage_options: StorageOptions = None, 

2828 **kwargs, 

2829 ) -> str | None: 

2830 if "showindex" in kwargs: 

2831 warnings.warn( 

2832 "'showindex' is deprecated. Only 'index' will be used " 

2833 "in a future version. Use 'index' to silence this warning.", 

2834 FutureWarning, 

2835 stacklevel=find_stack_level(), 

2836 ) 

2837 

2838 kwargs.setdefault("headers", "keys") 

2839 kwargs.setdefault("tablefmt", "pipe") 

2840 kwargs.setdefault("showindex", index) 

2841 tabulate = import_optional_dependency("tabulate") 

2842 result = tabulate.tabulate(self, **kwargs) 

2843 if buf is None: 

2844 return result 

2845 

2846 with get_handle(buf, mode, storage_options=storage_options) as handles: 

2847 handles.handle.write(result) 

2848 return None 

2849 

2850 @overload 

2851 def to_parquet( 

2852 self, 

2853 path: None = ..., 

2854 engine: str = ..., 

2855 compression: str | None = ..., 

2856 index: bool | None = ..., 

2857 partition_cols: list[str] | None = ..., 

2858 storage_options: StorageOptions = ..., 

2859 **kwargs, 

2860 ) -> bytes: 

2861 ... 

2862 

2863 @overload 

2864 def to_parquet( 

2865 self, 

2866 path: FilePath | WriteBuffer[bytes], 

2867 engine: str = ..., 

2868 compression: str | None = ..., 

2869 index: bool | None = ..., 

2870 partition_cols: list[str] | None = ..., 

2871 storage_options: StorageOptions = ..., 

2872 **kwargs, 

2873 ) -> None: 

2874 ... 

2875 

2876 @doc(storage_options=_shared_docs["storage_options"]) 

2877 @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") 

2878 def to_parquet( 

2879 self, 

2880 path: FilePath | WriteBuffer[bytes] | None = None, 

2881 engine: str = "auto", 

2882 compression: str | None = "snappy", 

2883 index: bool | None = None, 

2884 partition_cols: list[str] | None = None, 

2885 storage_options: StorageOptions = None, 

2886 **kwargs, 

2887 ) -> bytes | None: 

2888 """ 

2889 Write a DataFrame to the binary parquet format. 

2890 

2891 This function writes the dataframe as a `parquet file 

2892 <https://parquet.apache.org/>`_. You can choose different parquet 

2893 backends, and have the option of compression. See 

2894 :ref:`the user guide <io.parquet>` for more details. 

2895 

2896 Parameters 

2897 ---------- 

2898 path : str, path object, file-like object, or None, default None 

2899 String, path object (implementing ``os.PathLike[str]``), or file-like 

2900 object implementing a binary ``write()`` function. If None, the result is 

2901 returned as bytes. If a string or path, it will be used as Root Directory 

2902 path when writing a partitioned dataset. 

2903 

2904 .. versionchanged:: 1.2.0 

2905 

2906 Previously this was "fname" 

2907 

2908 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' 

2909 Parquet library to use. If 'auto', then the option 

2910 ``io.parquet.engine`` is used. The default ``io.parquet.engine`` 

2911 behavior is to try 'pyarrow', falling back to 'fastparquet' if 

2912 'pyarrow' is unavailable. 

2913 compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' 

2914 Name of the compression to use. Use ``None`` for no compression. 

2915 index : bool, default None 

2916 If ``True``, include the dataframe's index(es) in the file output. 

2917 If ``False``, they will not be written to the file. 

2918 If ``None``, similar to ``True``, the dataframe's index(es) 

2919 will be saved. However, instead of being saved as values, 

2920 the RangeIndex will be stored as a range in the metadata so it 

2921 doesn't require much space and is faster. Other indexes will 

2922 be included as columns in the file output. 

2923 partition_cols : list, optional, default None 

2924 Column names by which to partition the dataset. 

2925 Columns are partitioned in the order they are given. 

2926 Must be None if path is not a string. 

2927 {storage_options} 

2928 

2929 .. versionadded:: 1.2.0 

2930 

2931 **kwargs 

2932 Additional arguments passed to the parquet library. See 

2933 :ref:`pandas io <io.parquet>` for more details. 

2934 

2935 Returns 

2936 ------- 

2937 bytes if no path argument is provided else None 

2938 

2939 See Also 

2940 -------- 

2941 read_parquet : Read a parquet file. 

2942 DataFrame.to_orc : Write an orc file. 

2943 DataFrame.to_csv : Write a csv file. 

2944 DataFrame.to_sql : Write to a sql table. 

2945 DataFrame.to_hdf : Write to hdf. 

2946 

2947 Notes 

2948 ----- 

2949 This function requires either the `fastparquet 

2950 <https://pypi.org/project/fastparquet>`_ or `pyarrow 

2951 <https://arrow.apache.org/docs/python/>`_ library. 

2952 

2953 Examples 

2954 -------- 

2955 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) 

2956 >>> df.to_parquet('df.parquet.gzip', 

2957 ... compression='gzip') # doctest: +SKIP 

2958 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP 

2959 col1 col2 

2960 0 1 3 

2961 1 2 4 

2962 

2963 If you want to get a buffer to the parquet content you can use an io.BytesIO 

2964 object, as long as you don't use partition_cols, which creates multiple files. 

2965 

2966 >>> import io 

2967 >>> f = io.BytesIO() 

2968 >>> df.to_parquet(f) 

2969 >>> f.seek(0) 

2970 0 

2971 >>> content = f.read() 

2972 """ 

2973 from pandas.io.parquet import to_parquet 

2974 

2975 return to_parquet( 

2976 self, 

2977 path, 

2978 engine, 

2979 compression=compression, 

2980 index=index, 

2981 partition_cols=partition_cols, 

2982 storage_options=storage_options, 

2983 **kwargs, 

2984 ) 

2985 
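A hedged sketch of ``partition_cols``: the path below is a placeholder directory, and the expected output is one Hive-style subdirectory per distinct ``year`` value (e.g. ``year=2020``).

>>> df = pd.DataFrame({'year': [2020, 2020, 2021], 'value': [1, 2, 3]})
>>> df.to_parquet('dataset_dir', partition_cols=['year'])  # doctest: +SKIP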

2986 def to_orc( 

2987 self, 

2988 path: FilePath | WriteBuffer[bytes] | None = None, 

2989 *, 

2990 engine: Literal["pyarrow"] = "pyarrow", 

2991 index: bool | None = None, 

2992 engine_kwargs: dict[str, Any] | None = None, 

2993 ) -> bytes | None: 

2994 """ 

2995 Write a DataFrame to the ORC format. 

2996 

2997 .. versionadded:: 1.5.0 

2998 

2999 Parameters 

3000 ---------- 

3001 path : str, file-like object or None, default None 

3002 If a string, it will be used as Root Directory path 

3003 when writing a partitioned dataset. By file-like object, 

3004 we refer to objects with a write() method, such as a file handle 

3005 (e.g. via the builtin open function). If path is None, 

3006 a bytes object is returned. 

3007 engine : str, default 'pyarrow' 

3008 ORC library to use. Pyarrow must be >= 7.0.0. 

3009 index : bool, optional 

3010 If ``True``, include the dataframe's index(es) in the file output. 

3011 If ``False``, they will not be written to the file. 

3012 If ``None``, similar to ``infer``, the dataframe's index(es) 

3013 will be saved. However, instead of being saved as values, 

3014 the RangeIndex will be stored as a range in the metadata so it 

3015 doesn't require much space and is faster. Other indexes will 

3016 be included as columns in the file output. 

3017 engine_kwargs : dict[str, Any] or None, default None 

3018 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. 

3019 

3020 Returns 

3021 ------- 

3022 bytes if no path argument is provided else None 

3023 

3024 Raises 

3025 ------ 

3026 NotImplementedError 

3027 Dtype of one or more columns is category, unsigned integers, interval, 

3028 period or sparse. 

3029 ValueError 

3030 engine is not pyarrow. 

3031 

3032 See Also 

3033 -------- 

3034 read_orc : Read an ORC file. 

3035 DataFrame.to_parquet : Write a parquet file. 

3036 DataFrame.to_csv : Write a csv file. 

3037 DataFrame.to_sql : Write to a sql table. 

3038 DataFrame.to_hdf : Write to hdf. 

3039 

3040 Notes 

3041 ----- 

3042 * Before using this function you should read the :ref:`user guide about 

3043 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`. 

3044 * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_ 

3045 library. 

3046 * For supported dtypes please refer to `supported ORC features in Arrow 

3047 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__. 

3048 * Currently timezones in datetime columns are not preserved when a 

3049 dataframe is converted into ORC files. 

3050 

3051 Examples 

3052 -------- 

3053 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) 

3054 >>> df.to_orc('df.orc') # doctest: +SKIP 

3055 >>> pd.read_orc('df.orc') # doctest: +SKIP 

3056 col1 col2 

3057 0 1 4 

3058 1 2 3 

3059 

3060 If you want to get a buffer to the ORC content you can write it to io.BytesIO: 

3061 >>> import io 

3062 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP 

3063 >>> b.seek(0) # doctest: +SKIP 

3064 0 

3065 >>> content = b.read() # doctest: +SKIP 

3066 """ 

3067 from pandas.io.orc import to_orc 

3068 

3069 return to_orc( 

3070 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs 

3071 ) 

3072 

3073 @overload 

3074 def to_html( 

3075 self, 

3076 buf: FilePath | WriteBuffer[str], 

3077 columns: Sequence[Level] | None = ..., 

3078 col_space: ColspaceArgType | None = ..., 

3079 header: bool | Sequence[str] = ..., 

3080 index: bool = ..., 

3081 na_rep: str = ..., 

3082 formatters: FormattersType | None = ..., 

3083 float_format: FloatFormatType | None = ..., 

3084 sparsify: bool | None = ..., 

3085 index_names: bool = ..., 

3086 justify: str | None = ..., 

3087 max_rows: int | None = ..., 

3088 max_cols: int | None = ..., 

3089 show_dimensions: bool | str = ..., 

3090 decimal: str = ..., 

3091 bold_rows: bool = ..., 

3092 classes: str | list | tuple | None = ..., 

3093 escape: bool = ..., 

3094 notebook: bool = ..., 

3095 border: int | bool | None = ..., 

3096 table_id: str | None = ..., 

3097 render_links: bool = ..., 

3098 encoding: str | None = ..., 

3099 ) -> None: 

3100 ... 

3101 

3102 @overload 

3103 def to_html( 

3104 self, 

3105 buf: None = ..., 

3106 columns: Sequence[Level] | None = ..., 

3107 col_space: ColspaceArgType | None = ..., 

3108 header: bool | Sequence[str] = ..., 

3109 index: bool = ..., 

3110 na_rep: str = ..., 

3111 formatters: FormattersType | None = ..., 

3112 float_format: FloatFormatType | None = ..., 

3113 sparsify: bool | None = ..., 

3114 index_names: bool = ..., 

3115 justify: str | None = ..., 

3116 max_rows: int | None = ..., 

3117 max_cols: int | None = ..., 

3118 show_dimensions: bool | str = ..., 

3119 decimal: str = ..., 

3120 bold_rows: bool = ..., 

3121 classes: str | list | tuple | None = ..., 

3122 escape: bool = ..., 

3123 notebook: bool = ..., 

3124 border: int | bool | None = ..., 

3125 table_id: str | None = ..., 

3126 render_links: bool = ..., 

3127 encoding: str | None = ..., 

3128 ) -> str: 

3129 ... 

3130 

3131 @Substitution( 

3132 header_type="bool", 

3133 header="Whether to print column labels, default True", 

3134 col_space_type="str or int, list or dict of int or str", 

3135 col_space="The minimum width of each column in CSS length " 

3136 "units. An int is assumed to be px units.\n\n" 

3137 " .. versionadded:: 0.25.0\n" 

3138 " Ability to use str", 

3139 ) 

3140 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) 

3141 def to_html( 

3142 self, 

3143 buf: FilePath | WriteBuffer[str] | None = None, 

3144 columns: Sequence[Level] | None = None, 

3145 col_space: ColspaceArgType | None = None, 

3146 header: bool | Sequence[str] = True, 

3147 index: bool = True, 

3148 na_rep: str = "NaN", 

3149 formatters: FormattersType | None = None, 

3150 float_format: FloatFormatType | None = None, 

3151 sparsify: bool | None = None, 

3152 index_names: bool = True, 

3153 justify: str | None = None, 

3154 max_rows: int | None = None, 

3155 max_cols: int | None = None, 

3156 show_dimensions: bool | str = False, 

3157 decimal: str = ".", 

3158 bold_rows: bool = True, 

3159 classes: str | list | tuple | None = None, 

3160 escape: bool = True, 

3161 notebook: bool = False, 

3162 border: int | bool | None = None, 

3163 table_id: str | None = None, 

3164 render_links: bool = False, 

3165 encoding: str | None = None, 

3166 ) -> str | None: 

3167 """ 

3168 Render a DataFrame as an HTML table. 

3169 %(shared_params)s 

3170 bold_rows : bool, default True 

3171 Make the row labels bold in the output. 

3172 classes : str or list or tuple, default None 

3173 CSS class(es) to apply to the resulting html table. 

3174 escape : bool, default True 

3175 Convert the characters <, >, and & to HTML-safe sequences. 

3176 notebook : {True, False}, default False 

3177 Whether the generated HTML is for IPython Notebook. 

3178 border : int 

3179 A ``border=border`` attribute is included in the opening 

3180 `<table>` tag. Default ``pd.options.display.html.border``. 

3181 table_id : str, optional 

3182 A css id is included in the opening `<table>` tag if specified. 

3183 render_links : bool, default False 

3184 Convert URLs to HTML links. 

3185 encoding : str, default "utf-8" 

3186 Set character encoding. 

3187 

3188 .. versionadded:: 1.0 

3189 %(returns)s 

3190 See Also 

3191 -------- 

3192 to_string : Convert DataFrame to a string. 

3193 """ 

3194 if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: 

3195 raise ValueError("Invalid value for justify parameter") 

3196 

3197 formatter = fmt.DataFrameFormatter( 

3198 self, 

3199 columns=columns, 

3200 col_space=col_space, 

3201 na_rep=na_rep, 

3202 header=header, 

3203 index=index, 

3204 formatters=formatters, 

3205 float_format=float_format, 

3206 bold_rows=bold_rows, 

3207 sparsify=sparsify, 

3208 justify=justify, 

3209 index_names=index_names, 

3210 escape=escape, 

3211 decimal=decimal, 

3212 max_rows=max_rows, 

3213 max_cols=max_cols, 

3214 show_dimensions=show_dimensions, 

3215 ) 

3216 # TODO: a generic formatter would belong in DataFrameFormatter 

3217 return fmt.DataFrameRenderer(formatter).to_html( 

3218 buf=buf, 

3219 classes=classes, 

3220 notebook=notebook, 

3221 border=border, 

3222 encoding=encoding, 

3223 table_id=table_id, 

3224 render_links=render_links, 

3225 ) 

3226 
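The docstring above has no Examples section; a minimal sketch of the ``classes`` and ``table_id`` parameters (the class name and id are illustrative; pandas prepends its own ``dataframe`` class):

>>> df = pd.DataFrame({'col1': [1, 2]})
>>> html = df.to_html(classes='my-table', table_id='tbl')
>>> 'class="dataframe my-table"' in html and 'id="tbl"' in html
True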

3227 @doc( 

3228 storage_options=_shared_docs["storage_options"], 

3229 compression_options=_shared_docs["compression_options"] % "path_or_buffer", 

3230 ) 

3231 def to_xml( 

3232 self, 

3233 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, 

3234 index: bool = True, 

3235 root_name: str | None = "data", 

3236 row_name: str | None = "row", 

3237 na_rep: str | None = None, 

3238 attr_cols: list[str] | None = None, 

3239 elem_cols: list[str] | None = None, 

3240 namespaces: dict[str | None, str] | None = None, 

3241 prefix: str | None = None, 

3242 encoding: str = "utf-8", 

3243 xml_declaration: bool | None = True, 

3244 pretty_print: bool | None = True, 

3245 parser: str | None = "lxml", 

3246 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, 

3247 compression: CompressionOptions = "infer", 

3248 storage_options: StorageOptions = None, 

3249 ) -> str | None: 

3250 """ 

3251 Render a DataFrame to an XML document. 

3252 

3253 .. versionadded:: 1.3.0 

3254 

3255 Parameters 

3256 ---------- 

3257 path_or_buffer : str, path object, file-like object, or None, default None 

3258 String, path object (implementing ``os.PathLike[str]``), or file-like 

3259 object implementing a ``write()`` function. If None, the result is returned 

3260 as a string. 

3261 index : bool, default True 

3262 Whether to include index in XML document. 

3263 root_name : str, default 'data' 

3264 The name of root element in XML document. 

3265 row_name : str, default 'row' 

3266 The name of row element in XML document. 

3267 na_rep : str, optional 

3268 Missing data representation. 

3269 attr_cols : list-like, optional 

3270 List of columns to write as attributes in row element. 

3271 Hierarchical columns will be flattened with underscore 

3272 delimiting the different levels. 

3273 elem_cols : list-like, optional 

3274 List of columns to write as children in row element. By default, 

3275 all columns are output as children of the row element. Hierarchical 

3276 columns will be flattened with underscore delimiting the 

3277 different levels. 

3278 namespaces : dict, optional 

3279 All namespaces to be defined in root element. Keys of dict 

3280 should be prefix names and values of dict the corresponding URIs. 

3281 Default namespaces should be given empty string key. For 

3282 example, :: 

3283 

3284 namespaces = {{"": "https://example.com"}} 

3285 

3286 prefix : str, optional 

3287 Namespace prefix to be used for every element and/or attribute 

3288 in document. This should be one of the keys in ``namespaces`` 

3289 dict. 

3290 encoding : str, default 'utf-8' 

3291 Encoding of the resulting document. 

3292 xml_declaration : bool, default True 

3293 Whether to include the XML declaration at start of document. 

3294 pretty_print : bool, default True 

3295 Whether output should be pretty printed with indentation and 

3296 line breaks. 

3297 parser : {{'lxml','etree'}}, default 'lxml' 

3298 Parser module to use for building of tree. Only 'lxml' and 

3299 'etree' are supported. With 'lxml', the ability to use XSLT 

3300 stylesheet is supported. 

3301 stylesheet : str, path object or file-like object, optional 

3302 A URL, file-like object, or a raw string containing an XSLT 

3303 script used to transform the raw XML output. Script should use 

3304 layout of elements and attributes from original output. This 

3305 argument requires ``lxml`` to be installed. Only XSLT 1.0 

3306 scripts, and not later versions, are currently supported. 

3307 {compression_options} 

3308 

3309 .. versionchanged:: 1.4.0 Zstandard support. 

3310 

3311 {storage_options} 

3312 

3313 Returns 

3314 ------- 

3315 None or str 

3316 If ``io`` is None, returns the resulting XML format as a 

3317 string. Otherwise returns None. 

3318 

3319 See Also 

3320 -------- 

3321 to_json : Convert the pandas object to a JSON string. 

3322 to_html : Convert DataFrame to HTML. 

3323 

3324 Examples 

3325 -------- 

3326 >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'], 

3327 ... 'degrees': [360, 360, 180], 

3328 ... 'sides': [4, np.nan, 3]}}) 

3329 

3330 >>> df.to_xml() # doctest: +SKIP 

3331 <?xml version='1.0' encoding='utf-8'?> 

3332 <data> 

3333 <row> 

3334 <index>0</index> 

3335 <shape>square</shape> 

3336 <degrees>360</degrees> 

3337 <sides>4.0</sides> 

3338 </row> 

3339 <row> 

3340 <index>1</index> 

3341 <shape>circle</shape> 

3342 <degrees>360</degrees> 

3343 <sides/> 

3344 </row> 

3345 <row> 

3346 <index>2</index> 

3347 <shape>triangle</shape> 

3348 <degrees>180</degrees> 

3349 <sides>3.0</sides> 

3350 </row> 

3351 </data> 

3352 

3353 >>> df.to_xml(attr_cols=[ 

3354 ... 'index', 'shape', 'degrees', 'sides' 

3355 ... ]) # doctest: +SKIP 

3356 <?xml version='1.0' encoding='utf-8'?> 

3357 <data> 

3358 <row index="0" shape="square" degrees="360" sides="4.0"/> 

3359 <row index="1" shape="circle" degrees="360"/> 

3360 <row index="2" shape="triangle" degrees="180" sides="3.0"/> 

3361 </data> 

3362 

3363 >>> df.to_xml(namespaces={{"doc": "https://example.com"}}, 

3364 ... prefix="doc") # doctest: +SKIP 

3365 <?xml version='1.0' encoding='utf-8'?> 

3366 <doc:data xmlns:doc="https://example.com"> 

3367 <doc:row> 

3368 <doc:index>0</doc:index> 

3369 <doc:shape>square</doc:shape> 

3370 <doc:degrees>360</doc:degrees> 

3371 <doc:sides>4.0</doc:sides> 

3372 </doc:row> 

3373 <doc:row> 

3374 <doc:index>1</doc:index> 

3375 <doc:shape>circle</doc:shape> 

3376 <doc:degrees>360</doc:degrees> 

3377 <doc:sides/> 

3378 </doc:row> 

3379 <doc:row> 

3380 <doc:index>2</doc:index> 

3381 <doc:shape>triangle</doc:shape> 

3382 <doc:degrees>180</doc:degrees> 

3383 <doc:sides>3.0</doc:sides> 

3384 </doc:row> 

3385 </doc:data> 

3386 """ 

3387 

3388 from pandas.io.formats.xml import ( 

3389 EtreeXMLFormatter, 

3390 LxmlXMLFormatter, 

3391 ) 

3392 

3393 lxml = import_optional_dependency("lxml.etree", errors="ignore") 

3394 

3395 TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter] 

3396 

3397 if parser == "lxml": 

3398 if lxml is not None: 

3399 TreeBuilder = LxmlXMLFormatter 

3400 else: 

3401 raise ImportError( 

3402 "lxml not found, please install or use the etree parser." 

3403 ) 

3404 

3405 elif parser == "etree": 

3406 TreeBuilder = EtreeXMLFormatter 

3407 

3408 else: 

3409 raise ValueError("Values for parser can only be lxml or etree.") 

3410 

3411 xml_formatter = TreeBuilder( 

3412 self, 

3413 path_or_buffer=path_or_buffer, 

3414 index=index, 

3415 root_name=root_name, 

3416 row_name=row_name, 

3417 na_rep=na_rep, 

3418 attr_cols=attr_cols, 

3419 elem_cols=elem_cols, 

3420 namespaces=namespaces, 

3421 prefix=prefix, 

3422 encoding=encoding, 

3423 xml_declaration=xml_declaration, 

3424 pretty_print=pretty_print, 

3425 stylesheet=stylesheet, 

3426 compression=compression, 

3427 storage_options=storage_options, 

3428 ) 

3429 

3430 return xml_formatter.write_output() 

3431 
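# A minimal usage sketch (illustrative, not part of this module): the
# "etree" parser falls back to the standard library and avoids the lxml
# dependency, at the cost of XSLT (``stylesheet``) support.
import pandas as pd

df = pd.DataFrame({"shape": ["square", "circle"], "sides": [4.0, None]})
xml_str = df.to_xml(parser="etree")  # str is returned because no path is given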

3432 # ---------------------------------------------------------------------- 

3433 @doc(INFO_DOCSTRING, **frame_sub_kwargs) 

3434 def info( 

3435 self, 

3436 verbose: bool | None = None, 

3437 buf: WriteBuffer[str] | None = None, 

3438 max_cols: int | None = None, 

3439 memory_usage: bool | str | None = None, 

3440 show_counts: bool | None = None, 

3441 null_counts: bool | None = None, 

3442 ) -> None: 

3443 if null_counts is not None: 

3444 if show_counts is not None: 

3445 raise ValueError("null_counts used with show_counts. Use show_counts.") 

3446 warnings.warn( 

3447 "null_counts is deprecated. Use show_counts instead", 

3448 FutureWarning, 

3449 stacklevel=find_stack_level(), 

3450 ) 

3451 show_counts = null_counts 

3452 info = DataFrameInfo( 

3453 data=self, 

3454 memory_usage=memory_usage, 

3455 ) 

3456 info.render( 

3457 buf=buf, 

3458 max_cols=max_cols, 

3459 verbose=verbose, 

3460 show_counts=show_counts, 

3461 ) 

3462 
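# A short sketch of the deprecation path above: ``null_counts`` still
# works but emits a FutureWarning, so new code should pass
# ``show_counts`` instead.
import io

import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0]})
buf = io.StringIO()
df.info(buf=buf, show_counts=True)  # preferred over null_counts=True
print(buf.getvalue())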

3463 def memory_usage(self, index: bool = True, deep: bool = False) -> Series: 

3464 """ 

3465 Return the memory usage of each column in bytes. 

3466 

3467 The memory usage can optionally include the contribution of 

3468 the index and elements of `object` dtype. 

3469 

3470 This value is displayed in `DataFrame.info` by default. This can be 

3471 suppressed by setting ``pandas.options.display.memory_usage`` to False. 

3472 

3473 Parameters 

3474 ---------- 

3475 index : bool, default True 

3476 Specifies whether to include the memory usage of the DataFrame's 

3477 index in returned Series. If ``index=True``, the memory usage of 

3478 the index is the first item in the output. 

3479 deep : bool, default False 

3480 If True, introspect the data deeply by interrogating 

3481 `object` dtypes for system-level memory consumption, and include 

3482 it in the returned values. 

3483 

3484 Returns 

3485 ------- 

3486 Series 

3487 A Series whose index is the original column names and whose values 

3488 are the memory usage of each column in bytes. 

3489 

3490 See Also 

3491 -------- 

3492 numpy.ndarray.nbytes : Total bytes consumed by the elements of an 

3493 ndarray. 

3494 Series.memory_usage : Bytes consumed by a Series. 

3495 Categorical : Memory-efficient array for string values with 

3496 many repeated values. 

3497 DataFrame.info : Concise summary of a DataFrame. 

3498 

3499 Notes 

3500 ----- 

3501 See the :ref:`Frequently Asked Questions <df-memory-usage>` for more 

3502 details. 

3503 

3504 Examples 

3505 -------- 

3506 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] 

3507 >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) 

3508 ... for t in dtypes]) 

3509 >>> df = pd.DataFrame(data) 

3510 >>> df.head() 

3511 int64 float64 complex128 object bool 

3512 0 1 1.0 1.0+0.0j 1 True 

3513 1 1 1.0 1.0+0.0j 1 True 

3514 2 1 1.0 1.0+0.0j 1 True 

3515 3 1 1.0 1.0+0.0j 1 True 

3516 4 1 1.0 1.0+0.0j 1 True 

3517 

3518 >>> df.memory_usage() 

3519 Index 128 

3520 int64 40000 

3521 float64 40000 

3522 complex128 80000 

3523 object 40000 

3524 bool 5000 

3525 dtype: int64 

3526 

3527 >>> df.memory_usage(index=False) 

3528 int64 40000 

3529 float64 40000 

3530 complex128 80000 

3531 object 40000 

3532 bool 5000 

3533 dtype: int64 

3534 

3535 By default, the memory footprint of `object` dtype columns is ignored; pass ``deep=True`` to include it: 

3536 

3537 >>> df.memory_usage(deep=True) 

3538 Index 128 

3539 int64 40000 

3540 float64 40000 

3541 complex128 80000 

3542 object 180000 

3543 bool 5000 

3544 dtype: int64 

3545 

3546 Use a Categorical for efficient storage of an object-dtype column with 

3547 many repeated values. 

3548 

3549 >>> df['object'].astype('category').memory_usage(deep=True) 

3550 5244 

3551 """ 

3552 result = self._constructor_sliced( 

3553 [c.memory_usage(index=False, deep=deep) for col, c in self.items()], 

3554 index=self.columns, 

3555 ) 

3556 if index: 

3557 index_memory_usage = self._constructor_sliced( 

3558 self.index.memory_usage(deep=deep), index=["Index"] 

3559 ) 

3560 result = index_memory_usage._append(result) 

3561 return result 

3562 
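# A small sketch building on the docstring examples: summing the returned
# Series gives a total footprint, and ``deep=True`` also counts the Python
# objects behind ``object`` columns.
import numpy as np
import pandas as pd

df = pd.DataFrame({"tag": ["a", "b"] * 1000, "val": np.arange(2000)})
shallow = df.memory_usage().sum()        # index plus column buffers only
deep = df.memory_usage(deep=True).sum()  # also counts the str objects
assert deep >= shallow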

3563 def transpose(self, *args, copy: bool = False) -> DataFrame: 

3564 """ 

3565 Transpose index and columns. 

3566 

3567 Reflect the DataFrame over its main diagonal by writing rows as columns 

3568 and vice-versa. The property :attr:`.T` is an accessor to the method 

3569 :meth:`transpose`. 

3570 

3571 Parameters 

3572 ---------- 

3573 *args : tuple, optional 

3574 Accepted for compatibility with NumPy. 

3575 copy : bool, default False 

3576 Whether to copy the data after transposing, even for DataFrames 

3577 with a single dtype. 

3578 

3579 Note that a copy is always required for mixed dtype DataFrames, 

3580 or for DataFrames with any extension types. 

3581 

3582 Returns 

3583 ------- 

3584 DataFrame 

3585 The transposed DataFrame. 

3586 

3587 See Also 

3588 -------- 

3589 numpy.transpose : Permute the dimensions of a given array. 

3590 

3591 Notes 

3592 ----- 

3593 Transposing a DataFrame with mixed dtypes will result in a homogeneous 

3594 DataFrame with the `object` dtype. In such a case, a copy of the data 

3595 is always made. 

3596 

3597 Examples 

3598 -------- 

3599 **Square DataFrame with homogeneous dtype** 

3600 

3601 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} 

3602 >>> df1 = pd.DataFrame(data=d1) 

3603 >>> df1 

3604 col1 col2 

3605 0 1 3 

3606 1 2 4 

3607 

3608 >>> df1_transposed = df1.T # or df1.transpose() 

3609 >>> df1_transposed 

3610 0 1 

3611 col1 1 2 

3612 col2 3 4 

3613 

3614 When the dtype is homogeneous in the original DataFrame, we get a 

3615 transposed DataFrame with the same dtype: 

3616 

3617 >>> df1.dtypes 

3618 col1 int64 

3619 col2 int64 

3620 dtype: object 

3621 >>> df1_transposed.dtypes 

3622 0 int64 

3623 1 int64 

3624 dtype: object 

3625 

3626 **Non-square DataFrame with mixed dtypes** 

3627 

3628 >>> d2 = {'name': ['Alice', 'Bob'], 

3629 ... 'score': [9.5, 8], 

3630 ... 'employed': [False, True], 

3631 ... 'kids': [0, 0]} 

3632 >>> df2 = pd.DataFrame(data=d2) 

3633 >>> df2 

3634 name score employed kids 

3635 0 Alice 9.5 False 0 

3636 1 Bob 8.0 True 0 

3637 

3638 >>> df2_transposed = df2.T # or df2.transpose() 

3639 >>> df2_transposed 

3640 0 1 

3641 name Alice Bob 

3642 score 9.5 8.0 

3643 employed False True 

3644 kids 0 0 

3645 

3646 When the DataFrame has mixed dtypes, we get a transposed DataFrame with 

3647 the `object` dtype: 

3648 

3649 >>> df2.dtypes 

3650 name object 

3651 score float64 

3652 employed bool 

3653 kids int64 

3654 dtype: object 

3655 >>> df2_transposed.dtypes 

3656 0 object 

3657 1 object 

3658 dtype: object 

3659 """ 

3660 nv.validate_transpose(args, {}) 

3661 # construct the args 

3662 

3663 dtypes = list(self.dtypes) 

3664 

3665 if self._can_fast_transpose: 

3666 # Note: tests pass without this, but this improves perf quite a bit. 

3667 new_vals = self._values.T 

3668 if copy: 

3669 new_vals = new_vals.copy() 

3670 

3671 result = self._constructor(new_vals, index=self.columns, columns=self.index) 

3672 

3673 elif ( 

3674 self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]) 

3675 ): 

3676 # We have EAs with the same dtype. We can preserve that dtype in transpose. 

3677 dtype = dtypes[0] 

3678 arr_type = dtype.construct_array_type() 

3679 values = self.values 

3680 

3681 new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] 

3682 result = type(self)._from_arrays( 

3683 new_values, index=self.columns, columns=self.index 

3684 ) 

3685 

3686 else: 

3687 new_arr = self.values.T 

3688 if copy: 

3689 new_arr = new_arr.copy() 

3690 result = self._constructor(new_arr, index=self.columns, columns=self.index) 

3691 

3692 return result.__finalize__(self, method="transpose") 

3693 
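# A sketch of the extension-array branch above: a frame homogeneous in a
# single EA dtype keeps that dtype through ``.T`` instead of being upcast
# to object.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="Int64")
assert (df.T.dtypes == "Int64").all()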

3694 @property 

3695 def T(self) -> DataFrame: 

3696 return self.transpose() 

3697 

3698 # ---------------------------------------------------------------------- 

3699 # Indexing Methods 

3700 

3701 def _ixs(self, i: int, axis: int = 0) -> Series: 

3702 """ 

3703 Parameters 

3704 ---------- 

3705 i : int 

3706 axis : int 

3707 

3708 Returns 

3709 ------- 

3710 Series 

3711 """ 

3712 # irow 

3713 if axis == 0: 

3714 new_mgr = self._mgr.fast_xs(i) 

3715 

3716 # if we are a copy, mark as such 

3717 copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None 

3718 result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__( 

3719 self 

3720 ) 

3721 result._set_is_copy(self, copy=copy) 

3722 return result 

3723 

3724 # icol 

3725 else: 

3726 label = self.columns[i] 

3727 

3728 col_mgr = self._mgr.iget(i) 

3729 result = self._box_col_values(col_mgr, i) 

3730 

3731 # this is a cached value, mark it so 

3732 result._set_as_cached(label, self) 

3733 return result 

3734 

3735 def _get_column_array(self, i: int) -> ArrayLike: 

3736 """ 

3737 Get the values of the i'th column (ndarray or ExtensionArray, as stored 

3738 in the Block) 

3739 

3740 Warning! The returned array is a view but doesn't handle Copy-on-Write, 

3741 so this should be used with caution (for read-only purposes). 

3742 """ 

3743 return self._mgr.iget_values(i) 

3744 

3745 def _iter_column_arrays(self) -> Iterator[ArrayLike]: 

3746 """ 

3747 Iterate over the arrays of all columns in order. 

3748 This returns the values as stored in the Block (ndarray or ExtensionArray). 

3749 

3750 Warning! The returned array is a view but doesn't handle Copy-on-Write, 

3751 so this should be used with caution (for read-only purposes). 

3752 """ 

3753 for i in range(len(self.columns)): 

3754 yield self._get_column_array(i) 

3755 

3756 def __getitem__(self, key): 

3757 check_deprecated_indexers(key) 

3758 key = lib.item_from_zerodim(key) 

3759 key = com.apply_if_callable(key, self) 

3760 

3761 if is_hashable(key) and not is_iterator(key): 

3762 # is_iterator to exclude generator e.g. test_getitem_listlike 

3763 # shortcut if the key is in columns 

3764 is_mi = isinstance(self.columns, MultiIndex) 

3765 # GH#45316 Return view if key is not duplicated 

3766 # Only use drop_duplicates with duplicates for performance 

3767 if not is_mi and ( 

3768 self.columns.is_unique 

3769 and key in self.columns 

3770 or key in self.columns.drop_duplicates(keep=False) 

3771 ): 

3772 return self._get_item_cache(key) 

3773 

3774 elif is_mi and self.columns.is_unique and key in self.columns: 

3775 return self._getitem_multilevel(key) 

3776 # Do we have a slicer (on rows)? 

3777 indexer = convert_to_index_sliceable(self, key) 

3778 if indexer is not None: 

3779 if isinstance(indexer, np.ndarray): 

3780 indexer = lib.maybe_indices_to_slice( 

3781 indexer.astype(np.intp, copy=False), len(self) 

3782 ) 

3783 if isinstance(indexer, np.ndarray): 

3784 # GH#43223 If we can not convert, use take 

3785 return self.take(indexer, axis=0) 

3786 # either we have a slice or we have a string that can be converted 

3787 # to a slice for partial-string date indexing 

3788 return self._slice(indexer, axis=0) 

3789 

3790 # Do we have a (boolean) DataFrame? 

3791 if isinstance(key, DataFrame): 

3792 return self.where(key) 

3793 

3794 # Do we have a (boolean) 1d indexer? 

3795 if com.is_bool_indexer(key): 

3796 return self._getitem_bool_array(key) 

3797 

3798 # We are left with two options: a single key, and a collection of keys, 

3799 # We interpret tuples as collections only for non-MultiIndex 

3800 is_single_key = isinstance(key, tuple) or not is_list_like(key) 

3801 

3802 if is_single_key: 

3803 if self.columns.nlevels > 1: 

3804 return self._getitem_multilevel(key) 

3805 indexer = self.columns.get_loc(key) 

3806 if is_integer(indexer): 

3807 indexer = [indexer] 

3808 else: 

3809 if is_iterator(key): 

3810 key = list(key) 

3811 indexer = self.columns._get_indexer_strict(key, "columns")[1] 

3812 

3813 # take() does not accept boolean indexers 

3814 if getattr(indexer, "dtype", None) == bool: 

3815 indexer = np.where(indexer)[0] 

3816 

3817 data = self._take_with_is_copy(indexer, axis=1) 

3818 

3819 if is_single_key: 

3820 # What does looking for a single key in a non-unique index return? 

3821 # The behavior is inconsistent. It returns a Series, except when 

3822 # - the key itself is repeated (test on data.shape, #9519), or 

3823 # - we have a MultiIndex on columns (test on self.columns, #21309) 

3824 if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): 

3825 # GH#26490 using data[key] can cause RecursionError 

3826 return data._get_item_cache(key) 

3827 

3828 return data 

3829 
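# A sketch of the dispatch above: a hashable key returns a column, a
# boolean 1d indexer filters rows, and a boolean DataFrame routes
# through ``where``.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
col = df["a"]            # single hashable key -> Series
rows = df[df["a"] > 1]   # boolean Series -> row subset
masked = df[df > 2]      # boolean DataFrame -> same shape, NaN where False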

3830 def _getitem_bool_array(self, key): 

3831 # also raises Exception if object array with NA values 

3832 # warning here just in case -- previously __setitem__ was 

3833 # reindexing but __getitem__ was not; it seems more reasonable to 

3834 # go with the __setitem__ behavior since that is more consistent 

3835 # with all other indexing behavior 

3836 if isinstance(key, Series) and not key.index.equals(self.index): 

3837 warnings.warn( 

3838 "Boolean Series key will be reindexed to match DataFrame index.", 

3839 UserWarning, 

3840 stacklevel=find_stack_level(), 

3841 ) 

3842 elif len(key) != len(self.index): 

3843 raise ValueError( 

3844 f"Item wrong length {len(key)} instead of {len(self.index)}." 

3845 ) 

3846 

3847 # check_bool_indexer will throw exception if Series key cannot 

3848 # be reindexed to match DataFrame rows 

3849 key = check_bool_indexer(self.index, key) 

3850 indexer = key.nonzero()[0] 

3851 return self._take_with_is_copy(indexer, axis=0) 

3852 

3853 def _getitem_multilevel(self, key): 

3854 # self.columns is a MultiIndex 

3855 loc = self.columns.get_loc(key) 

3856 if isinstance(loc, (slice, np.ndarray)): 

3857 new_columns = self.columns[loc] 

3858 result_columns = maybe_droplevels(new_columns, key) 

3859 if self._is_mixed_type: 

3860 result = self.reindex(columns=new_columns) 

3861 result.columns = result_columns 

3862 else: 

3863 new_values = self.values[:, loc] 

3864 result = self._constructor( 

3865 new_values, index=self.index, columns=result_columns 

3866 ) 

3867 result = result.__finalize__(self) 

3868 

3869 # If there is only one column being returned, and its name is 

3870 # either an empty string, or a tuple with an empty string as its 

3871 # first element, then treat the empty string as a placeholder 

3872 # and return the column as if the user had provided that empty 

3873 # string in the key. If the result is a Series, exclude the 

3874 # implied empty string from its name. 

3875 if len(result.columns) == 1: 

3876 top = result.columns[0] 

3877 if isinstance(top, tuple): 

3878 top = top[0] 

3879 if top == "": 

3880 result = result[""] 

3881 if isinstance(result, Series): 

3882 result = self._constructor_sliced( 

3883 result, index=self.index, name=key 

3884 ) 

3885 

3886 result._set_is_copy(self) 

3887 return result 

3888 else: 

3889 # loc is neither a slice nor ndarray, so must be an int 

3890 return self._ixs(loc, axis=1) 

3891 

3892 def _get_value(self, index, col, takeable: bool = False) -> Scalar: 

3893 """ 

3894 Quickly retrieve single value at passed column and index. 

3895 

3896 Parameters 

3897 ---------- 

3898 index : row label 

3899 col : column label 

3900 takeable : interpret the index/col as indexers, default False 

3901 

3902 Returns 

3903 ------- 

3904 scalar 

3905 

3906 Notes 

3907 ----- 

3908 Assumes that both `self.index._index_as_unique` and 

3909 `self.columns._index_as_unique` hold; the caller is responsible for checking. 

3910 """ 

3911 if takeable: 

3912 series = self._ixs(col, axis=1) 

3913 return series._values[index] 

3914 

3915 series = self._get_item_cache(col) 

3916 engine = self.index._engine 

3917 

3918 if not isinstance(self.index, MultiIndex): 

3919 # CategoricalIndex: Trying to use the engine fastpath may give incorrect 

3920 # results if our categories are integers that don't match our codes 

3921 # IntervalIndex: IntervalTree has no get_loc 

3922 row = self.index.get_loc(index) 

3923 return series._values[row] 

3924 

3925 # For MultiIndex going through engine effectively restricts us to 

3926 # same-length tuples; see test_get_set_value_no_partial_indexing 

3927 loc = engine.get_loc(index) 

3928 return series._values[loc] 

3929 

3930 def isetitem(self, loc, value) -> None: 

3931 """ 

3932 Set the given value in the column with position 'loc'. 

3933 

3934 This is a positional analogue to __setitem__. 

3935 

3936 Parameters 

3937 ---------- 

3938 loc : int or sequence of ints 

3939 value : scalar or arraylike 

3940 

3941 Notes 

3942 ----- 

3943 Unlike `frame.iloc[:, i] = value`, `frame.isetitem(loc, value)` will 

3944 _never_ try to set the values in place, but will always insert a new 

3945 array. 

3946 

3947 In cases where `frame.columns` is unique, this is equivalent to 

3948 `frame[frame.columns[i]] = value`. 

3949 """ 

3950 arraylike = self._sanitize_column(value) 

3951 self._iset_item_mgr(loc, arraylike, inplace=False) 

3952 
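# A sketch contrasting the two positional setters discussed above
# (assuming a pandas version that provides ``isetitem``): ``isetitem``
# always inserts a fresh array, while ``iloc`` may write in place.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.isetitem(0, [10, 20])    # replace column 0 by position
df.iloc[:, 1] = [30, 40]    # may instead set values in place
assert df["a"].tolist() == [10, 20]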

3953 def __setitem__(self, key, value): 

3954 key = com.apply_if_callable(key, self) 

3955 

3956 # see if we can slice the rows 

3957 indexer = convert_to_index_sliceable(self, key) 

3958 if indexer is not None: 

3959 # either we have a slice or we have a string that can be converted 

3960 # to a slice for partial-string date indexing 

3961 return self._setitem_slice(indexer, value) 

3962 

3963 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: 

3964 self._setitem_frame(key, value) 

3965 elif isinstance(key, (Series, np.ndarray, list, Index)): 

3966 self._setitem_array(key, value) 

3967 elif isinstance(value, DataFrame): 

3968 self._set_item_frame_value(key, value) 

3969 elif ( 

3970 is_list_like(value) 

3971 and not self.columns.is_unique 

3972 and 1 < len(self.columns.get_indexer_for([key])) == len(value) 

3973 ): 

3974 # Column to set is duplicated 

3975 self._setitem_array([key], value) 

3976 else: 

3977 # set column 

3978 self._set_item(key, value) 

3979 

3980 def _setitem_slice(self, key: slice, value): 

3981 # NB: we can't just use self.loc[key] = value because that 

3982 # operates on labels and we need to operate positional for 

3983 # backwards-compat, xref GH#31469 

3984 self._check_setitem_copy() 

3985 self.iloc[key] = value 

3986 
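# A sketch of the positional behavior preserved above (GH#31469): a bare
# integer slice assigns by position, not by label.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=list("wxyz"))
df[1:3] = 0    # rows at positions 1 and 2, i.e. labels "x" and "y"
assert df["a"].tolist() == [1, 0, 0, 4]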

3987 def _setitem_array(self, key, value): 

3988 # also raises Exception if object array with NA values 

3989 if com.is_bool_indexer(key): 

3990 # bool indexer is indexing along rows 

3991 if len(key) != len(self.index): 

3992 raise ValueError( 

3993 f"Item wrong length {len(key)} instead of {len(self.index)}!" 

3994 ) 

3995 key = check_bool_indexer(self.index, key) 

3996 indexer = key.nonzero()[0] 

3997 self._check_setitem_copy() 

3998 if isinstance(value, DataFrame): 

3999 # GH#39931 reindex since iloc does not align 

4000 value = value.reindex(self.index.take(indexer)) 

4001 self.iloc[indexer] = value 

4002 

4003 else: 

4004 # Note: unlike self.iloc[:, indexer] = value, this will 

4005 # never try to overwrite values inplace 

4006 

4007 if isinstance(value, DataFrame): 

4008 check_key_length(self.columns, key, value) 

4009 for k1, k2 in zip(key, value.columns): 

4010 self[k1] = value[k2] 

4011 

4012 elif not is_list_like(value): 

4013 for col in key: 

4014 self[col] = value 

4015 

4016 elif isinstance(value, np.ndarray) and value.ndim == 2: 

4017 self._iset_not_inplace(key, value) 

4018 

4019 elif np.ndim(value) > 1: 

4020 # list of lists 

4021 value = DataFrame(value).values 

4022 return self._setitem_array(key, value) 

4023 

4024 else: 

4025 self._iset_not_inplace(key, value) 

4026 

4027 def _iset_not_inplace(self, key, value): 

4028 # GH#39510 when setting with df[key] = obj with a list-like key and 

4029 # list-like value, we iterate over those listlikes and set columns 

4030 # one at a time. This is different from dispatching to 

4031 # `self.loc[:, key]= value` because loc.__setitem__ may overwrite 

4032 # data inplace, whereas this will insert new arrays. 

4033 

4034 def igetitem(obj, i: int): 

4035 # Note: we catch DataFrame obj before getting here, but 

4036 # hypothetically would return obj.iloc[:, i] 

4037 if isinstance(obj, np.ndarray): 

4038 return obj[..., i] 

4039 else: 

4040 return obj[i] 

4041 

4042 if self.columns.is_unique: 

4043 if np.shape(value)[-1] != len(key): 

4044 raise ValueError("Columns must be same length as key") 

4045 

4046 for i, col in enumerate(key): 

4047 self[col] = igetitem(value, i) 

4048 

4049 else: 

4050 

4051 ilocs = self.columns.get_indexer_non_unique(key)[0] 

4052 if (ilocs < 0).any(): 

4053 # key entries not in self.columns 

4054 raise NotImplementedError 

4055 

4056 if np.shape(value)[-1] != len(ilocs): 

4057 raise ValueError("Columns must be same length as key") 

4058 

4059 assert np.ndim(value) <= 2 

4060 

4061 orig_columns = self.columns 

4062 

4063 # Using self.iloc[:, i] = ... may set values inplace, which 

4064 # by convention we do not do in __setitem__ 

4065 try: 

4066 self.columns = Index(range(len(self.columns))) 

4067 for i, iloc in enumerate(ilocs): 

4068 self[iloc] = igetitem(value, i) 

4069 finally: 

4070 self.columns = orig_columns 

4071 

4072 def _setitem_frame(self, key, value): 

4073 # support boolean setting with DataFrame input, e.g. 

4074 # df[df > df2] = 0 

4075 if isinstance(key, np.ndarray): 

4076 if key.shape != self.shape: 

4077 raise ValueError("Array conditional must be same shape as self") 

4078 key = self._constructor(key, **self._construct_axes_dict()) 

4079 

4080 if key.size and not is_bool_dtype(key.values): 

4081 raise TypeError( 

4082 "Must pass DataFrame or 2-d ndarray with boolean values only" 

4083 ) 

4084 

4085 self._check_inplace_setting(value) 

4086 self._check_setitem_copy() 

4087 self._where(-key, value, inplace=True) 

4088 
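# A sketch of the boolean-frame assignment handled above:
# ``df[mask] = value`` writes ``value`` wherever ``mask`` is True.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df[df > 2] = 0
assert df.values.tolist() == [[1, 0], [2, 0]]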

4089 def _set_item_frame_value(self, key, value: DataFrame) -> None: 

4090 self._ensure_valid_index(value) 

4091 

4092 # align columns 

4093 if key in self.columns: 

4094 loc = self.columns.get_loc(key) 

4095 cols = self.columns[loc] 

4096 len_cols = 1 if is_scalar(cols) else len(cols) 

4097 if len_cols != len(value.columns): 

4098 raise ValueError("Columns must be same length as key") 

4099 

4100 # align right-hand-side columns if self.columns 

4101 # is multi-index and self[key] is a sub-frame 

4102 if isinstance(self.columns, MultiIndex) and isinstance( 

4103 loc, (slice, Series, np.ndarray, Index) 

4104 ): 

4105 cols_droplevel = maybe_droplevels(cols, key) 

4106 if len(cols_droplevel) and not cols_droplevel.equals(value.columns): 

4107 value = value.reindex(cols_droplevel, axis=1) 

4108 

4109 for col, col_droplevel in zip(cols, cols_droplevel): 

4110 self[col] = value[col_droplevel] 

4111 return 

4112 

4113 if is_scalar(cols): 

4114 self[cols] = value[value.columns[0]] 

4115 return 

4116 

4117 # now align rows 

4118 arraylike = _reindex_for_setitem(value, self.index) 

4119 self._set_item_mgr(key, arraylike) 

4120 return 

4121 

4122 if len(value.columns) != 1: 

4123 raise ValueError( 

4124 "Cannot set a DataFrame with multiple columns to the single " 

4125 f"column {key}" 

4126 ) 

4127 

4128 self[key] = value[value.columns[0]] 

4129 

4130 def _iset_item_mgr( 

4131 self, loc: int | slice | np.ndarray, value, inplace: bool = False 

4132 ) -> None: 

4133 # when called from _set_item_mgr loc can be anything returned from get_loc 

4134 self._mgr.iset(loc, value, inplace=inplace) 

4135 self._clear_item_cache() 

4136 

4137 def _set_item_mgr(self, key, value: ArrayLike) -> None: 

4138 try: 

4139 loc = self._info_axis.get_loc(key) 

4140 except KeyError: 

4141 # This item wasn't present, just insert at end 

4142 self._mgr.insert(len(self._info_axis), key, value) 

4143 else: 

4144 self._iset_item_mgr(loc, value) 

4145 

4146 # check if we are modifying a copy 

4147 # try to set first as we want an invalid 

4148 # value exception to occur first 

4149 if len(self): 

4150 self._check_setitem_copy() 

4151 

4152 def _iset_item(self, loc: int, value) -> None: 

4153 arraylike = self._sanitize_column(value) 

4154 self._iset_item_mgr(loc, arraylike, inplace=True) 

4155 

4156 # check if we are modifying a copy 

4157 # try to set first as we want an invalid 

4158 # value exception to occur first 

4159 if len(self): 

4160 self._check_setitem_copy() 

4161 

4162 def _set_item(self, key, value) -> None: 

4163 """ 

4164 Add a Series to the DataFrame in the specified column. 

4165 

4166 If value is a numpy array (not a Series), it must be the same 

4167 length as the DataFrame's index or an error will be raised. 

4168 

4169 A Series will be conformed to the DataFrame's index to 

4170 ensure homogeneity. 

4171 """ 

4172 value = self._sanitize_column(value) 

4173 

4174 if ( 

4175 key in self.columns 

4176 and value.ndim == 1 

4177 and not is_extension_array_dtype(value) 

4178 ): 

4179 # broadcast across multiple columns if necessary 

4180 if not self.columns.is_unique or isinstance(self.columns, MultiIndex): 

4181 existing_piece = self[key] 

4182 if isinstance(existing_piece, DataFrame): 

4183 value = np.tile(value, (len(existing_piece.columns), 1)).T 

4184 

4185 self._set_item_mgr(key, value) 

4186 

4187 def _set_value( 

4188 self, index: IndexLabel, col, value: Scalar, takeable: bool = False 

4189 ) -> None: 

4190 """ 

4191 Put single value at passed column and index. 

4192 

4193 Parameters 

4194 ---------- 

4195 index : Label 

4196 row label 

4197 col : Label 

4198 column label 

4199 value : scalar 

4200 takeable : bool, default False 

4201 Sets whether or not index/col are interpreted as indexers 

4202 """ 

4203 try: 

4204 if takeable: 

4205 icol = col 

4206 iindex = cast(int, index) 

4207 else: 

4208 icol = self.columns.get_loc(col) 

4209 iindex = self.index.get_loc(index) 

4210 self._mgr.column_setitem(icol, iindex, value) 

4211 self._clear_item_cache() 

4212 

4213 except (KeyError, TypeError, ValueError): 

4214 # get_loc might raise a KeyError for missing labels (falling back 

4215 # to (i)loc will do expansion of the index) 

4216 # column_setitem will do validation that may raise TypeError or ValueError 

4217 # set using a non-recursive method & reset the cache 

4218 if takeable: 

4219 self.iloc[index, col] = value 

4220 else: 

4221 self.loc[index, col] = value 

4222 self._item_cache.pop(col, None) 

4223 

4224 except InvalidIndexError as ii_err: 

4225 # GH48729: Seems like you are trying to assign a value to a 

4226 # row when only scalar options are permitted 

4227 raise InvalidIndexError( 

4228 f"You can only assign a scalar value not a {type(value)}" 

4229 ) from ii_err 

4230 

4231 def _ensure_valid_index(self, value) -> None: 

4232 """ 

4233 Ensure that if we don't have an index, that we can create one from the 

4234 passed value. 

4235 """ 

4236 # GH5632, make sure that we are a Series convertible 

4237 if not len(self.index) and is_list_like(value) and len(value): 

4238 if not isinstance(value, DataFrame): 

4239 try: 

4240 value = Series(value) 

4241 except (ValueError, NotImplementedError, TypeError) as err: 

4242 raise ValueError( 

4243 "Cannot set a frame with no defined index " 

4244 "and a value that cannot be converted to a Series" 

4245 ) from err 

4246 

4247 # GH31368 preserve name of index 

4248 index_copy = value.index.copy() 

4249 if self.index.name is not None: 

4250 index_copy.name = self.index.name 

4251 

4252 self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) 

4253 

4254 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: 

4255 """ 

4256 Provide boxed values for a column. 

4257 """ 

4258 # Lookup in columns so that if e.g. a str datetime was passed 

4259 # we attach the Timestamp object as the name. 

4260 name = self.columns[loc] 

4261 klass = self._constructor_sliced 

4262 # We get index=self.index bc values is a SingleDataManager 

4263 return klass(values, name=name, fastpath=True).__finalize__(self) 

4264 

4265 # ---------------------------------------------------------------------- 

4266 # Lookup Caching 

4267 

4268 def _clear_item_cache(self) -> None: 

4269 self._item_cache.clear() 

4270 

4271 def _get_item_cache(self, item: Hashable) -> Series: 

4272 """Return the cached item, item represents a label indexer.""" 

4273 cache = self._item_cache 

4274 res = cache.get(item) 

4275 if res is None: 

4276 # All places that call _get_item_cache have unique columns, 

4277 # pending resolution of GH#33047 

4278 

4279 loc = self.columns.get_loc(item) 

4280 res = self._ixs(loc, axis=1) 

4281 

4282 cache[item] = res 

4283 

4284 # for a chain 

4285 res._is_copy = self._is_copy 

4286 return res 

4287 

4288 def _reset_cacher(self) -> None: 

4289 # no-op for DataFrame 

4290 pass 

4291 

4292 def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: 

4293 """ 

4294 The object has called back to us saying maybe it has changed. 

4295 """ 

4296 loc = self._info_axis.get_loc(item) 

4297 arraylike = value._values 

4298 

4299 old = self._ixs(loc, axis=1) 

4300 if old._values is value._values and inplace: 

4301 # GH#46149 avoid making unnecessary copies/block-splitting 

4302 return 

4303 

4304 self._mgr.iset(loc, arraylike, inplace=inplace) 

4305 

4306 # ---------------------------------------------------------------------- 

4307 # Unsorted 

4308 

4309 @overload 

4310 def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame: 

4311 ... 

4312 

4313 @overload 

4314 def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: 

4315 ... 

4316 

4317 @overload 

4318 def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None: 

4319 ... 

4320 

4321 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "expr"]) 

4322 def query(self, expr: str, inplace: bool = False, **kwargs) -> DataFrame | None: 

4323 """ 

4324 Query the columns of a DataFrame with a boolean expression. 

4325 

4326 Parameters 

4327 ---------- 

4328 expr : str 

4329 The query string to evaluate. 

4330 

4331 You can refer to variables 

4332 in the environment by prefixing them with an '@' character like 

4333 ``@a + b``. 

4334 

4335 You can refer to column names that are not valid Python variable names 

4336 by surrounding them in backticks. Thus, column names containing spaces 

4337 or punctuation (besides underscores), or starting with digits, must be 

4338 surrounded by backticks. (For example, a column named "Area (cm^2)" would 

4339 be referenced as ```Area (cm^2)```). Column names which are Python keywords 

4340 (like "list", "for", "import", etc) cannot be used. 

4341 

4342 For example, if one of your columns is called ``a a`` and you want 

4343 to sum it with ``b``, your query should be ```a a` + b``. 

4344 

4345 .. versionadded:: 0.25.0 

4346 Backtick quoting introduced. 

4347 

4348 .. versionadded:: 1.0.0 

4349 Expanding functionality of backtick quoting for more than only spaces. 

4350 

4351 inplace : bool 

4352 Whether to modify the DataFrame rather than creating a new one. 

4353 **kwargs 

4354 See the documentation for :func:`eval` for complete details 

4355 on the keyword arguments accepted by :meth:`DataFrame.query`. 

4356 

4357 Returns 

4358 ------- 

4359 DataFrame or None 

4360 DataFrame resulting from the provided query expression or 

4361 None if ``inplace=True``. 

4362 

4363 See Also 

4364 -------- 

4365 eval : Evaluate a string describing operations on 

4366 DataFrame columns. 

4367 DataFrame.eval : Evaluate a string describing operations on 

4368 DataFrame columns. 

4369 

4370 Notes 

4371 ----- 

4372 The result of the evaluation of this expression is first passed to 

4373 :attr:`DataFrame.loc` and if that fails because of a 

4374 multidimensional key (e.g., a DataFrame) then the result will be passed 

4375 to :meth:`DataFrame.__getitem__`. 

4376 

4377 This method uses the top-level :func:`eval` function to 

4378 evaluate the passed query. 

4379 

4380 The :meth:`~pandas.DataFrame.query` method uses a slightly 

4381 modified Python syntax by default. For example, the ``&`` and ``|`` 

4382 (bitwise) operators have the precedence of their boolean cousins, 

4383 :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, 

4384 however the semantics are different. 

4385 

4386 You can change the semantics of the expression by passing the keyword 

4387 argument ``parser='python'``. This enforces the same semantics as 

4388 evaluation in Python space. Likewise, you can pass ``engine='python'`` 

4389 to evaluate an expression using Python itself as a backend. This is not 

4390 recommended as it is inefficient compared to using ``numexpr`` as the 

4391 engine. 

4392 

4393 The :attr:`DataFrame.index` and 

4394 :attr:`DataFrame.columns` attributes of the 

4395 :class:`~pandas.DataFrame` instance are placed in the query namespace 

4396 by default, which allows you to treat both the index and columns of the 

4397 frame as a column in the frame. 

4398 The identifier ``index`` is used for the frame index; you can also 

4399 use the name of the index to identify it in a query. Please note that 

4400 Python keywords may not be used as identifiers. 

4401 

4402 For further details and examples see the ``query`` documentation in 

4403 :ref:`indexing <indexing.query>`. 

4404 

4405 *Backtick quoted variables* 

4406 

4407 Backtick quoted variables are parsed as literal Python code and 

4408 are converted internally to a valid Python identifier. 

4409 This can lead to the following problems. 

4410 

4411 During parsing, a number of disallowed characters inside the backtick 

4412 quoted string are replaced by strings that are allowed as a Python identifier. 

4413 These characters include all operators in Python, the space character, the 

4414 question mark, the exclamation mark, the dollar sign, and the euro sign. 

4415 For other characters that fall outside the ASCII range (U+0001..U+007F) 

4416 and those that are not further specified in PEP 3131, 

4417 the query parser will raise an error. 

4418 This excludes whitespace other than the space character, 

4419 as well as the hash character (as it is used for comments) and the 

4420 backtick itself (the backtick cannot be escaped). 

4421 

4422 In a special case, quotes that make a pair around a backtick can 

4423 confuse the parser. 

4424 For example, ```it's` > `that's``` will raise an error, 

4425 as it forms a quoted string (``'s > `that'``) with a backtick inside. 

4426 

4427 See also the Python documentation about lexical analysis 

4428 (https://docs.python.org/3/reference/lexical_analysis.html) 

4429 in combination with the source code in :mod:`pandas.core.computation.parsing`. 

4430 

4431 Examples 

4432 -------- 

4433 >>> df = pd.DataFrame({'A': range(1, 6), 

4434 ... 'B': range(10, 0, -2), 

4435 ... 'C C': range(10, 5, -1)}) 

4436 >>> df 

4437 A B C C 

4438 0 1 10 10 

4439 1 2 8 9 

4440 2 3 6 8 

4441 3 4 4 7 

4442 4 5 2 6 

4443 >>> df.query('A > B') 

4444 A B C C 

4445 4 5 2 6 

4446 

4447 The previous expression is equivalent to 

4448 

4449 >>> df[df.A > df.B] 

4450 A B C C 

4451 4 5 2 6 

4452 

4453 For columns with spaces in their name, you can use backtick quoting. 

4454 

4455 >>> df.query('B == `C C`') 

4456 A B C C 

4457 0 1 10 10 

4458 

4459 The previous expression is equivalent to 

4460 

4461 >>> df[df.B == df['C C']] 

4462 A B C C 

4463 0 1 10 10 

4464 """ 

4465 inplace = validate_bool_kwarg(inplace, "inplace") 

4466 if not isinstance(expr, str): 

4467 msg = f"expr must be a string to be evaluated, {type(expr)} given" 

4468 raise ValueError(msg) 

4469 kwargs["level"] = kwargs.pop("level", 0) + 2 

4470 kwargs["target"] = None 

4471 res = self.eval(expr, **kwargs) 

4472 

4473 try: 

4474 result = self.loc[res] 

4475 except ValueError: 

4476 # when res is multi-dimensional loc raises, but this is sometimes a 

4477 # valid query 

4478 result = self[res] 

4479 

4480 if inplace: 

4481 self._update_inplace(result) 

4482 return None 

4483 else: 

4484 return result 

4485 
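# A sketch of the ``@`` environment references described in the docstring
# above: prefixed names resolve from the calling scope rather than from
# the frame's columns.
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [3, 2, 1]})
threshold = 1
out = df.query("A > @threshold and B < 3")
assert out["A"].tolist() == [2, 3]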

4486 @overload 

4487 def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: 

4488 ... 

4489 

4490 @overload 

4491 def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: 

4492 ... 

4493 

4494 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "expr"]) 

4495 def eval(self, expr: str, inplace: bool = False, **kwargs) -> Any | None: 

4496 """ 

4497 Evaluate a string describing operations on DataFrame columns. 

4498 

4499 Operates on columns only, not specific rows or elements. This allows 

4500 `eval` to run arbitrary code, which can make you vulnerable to code 

4501 injection if you pass user input to this function. 

4502 

4503 Parameters 

4504 ---------- 

4505 expr : str 

4506 The expression string to evaluate. 

4507 inplace : bool, default False 

4508 If the expression contains an assignment, whether to perform the 

4509 operation inplace and mutate the existing DataFrame. Otherwise, 

4510 a new DataFrame is returned. 

4511 **kwargs 

4512 See the documentation for :func:`eval` for complete details 

4513 on the keyword arguments accepted by 

4514 :meth:`~pandas.DataFrame.query`. 

4515 

4516 Returns 

4517 ------- 

4518 ndarray, scalar, pandas object, or None 

4519 The result of the evaluation or None if ``inplace=True``. 

4520 

4521 See Also 

4522 -------- 

4523 DataFrame.query : Evaluates a boolean expression to query the columns 

4524 of a frame. 

4525 DataFrame.assign : Can evaluate an expression or function to create new 

4526 values for a column. 

4527 eval : Evaluate a Python expression as a string using various 

4528 backends. 

4529 

4530 Notes 

4531 ----- 

4532 For more details see the API documentation for :func:`~eval`. 

4533 For detailed examples see :ref:`enhancing performance with eval 

4534 <enhancingperf.eval>`. 

4535 

4536 Examples 

4537 -------- 

4538 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) 

4539 >>> df 

4540 A B 

4541 0 1 10 

4542 1 2 8 

4543 2 3 6 

4544 3 4 4 

4545 4 5 2 

4546 >>> df.eval('A + B') 

4547 0 11 

4548 1 10 

4549 2 9 

4550 3 8 

4551 4 7 

4552 dtype: int64 

4553 

4554 Assignment is allowed though by default the original DataFrame is not 

4555 modified. 

4556 

4557 >>> df.eval('C = A + B') 

4558 A B C 

4559 0 1 10 11 

4560 1 2 8 10 

4561 2 3 6 9 

4562 3 4 4 8 

4563 4 5 2 7 

4564 >>> df 

4565 A B 

4566 0 1 10 

4567 1 2 8 

4568 2 3 6 

4569 3 4 4 

4570 4 5 2 

4571 

4572 Use ``inplace=True`` to modify the original DataFrame. 

4573 

4574 >>> df.eval('C = A + B', inplace=True) 

4575 >>> df 

4576 A B C 

4577 0 1 10 11 

4578 1 2 8 10 

4579 2 3 6 9 

4580 3 4 4 8 

4581 4 5 2 7 

4582 

4583 Multiple columns can be assigned to using multi-line expressions: 

4584 

4585 >>> df.eval( 

4586 ... ''' 

4587 ... C = A + B 

4588 ... D = A - B 

4589 ... ''' 

4590 ... ) 

4591 A B C D 

4592 0 1 10 11 -9 

4593 1 2 8 10 -6 

4594 2 3 6 9 -3 

4595 3 4 4 8 0 

4596 4 5 2 7 3 

4597 """ 

4598 from pandas.core.computation.eval import eval as _eval 

4599 

4600 inplace = validate_bool_kwarg(inplace, "inplace") 

4601 kwargs["level"] = kwargs.pop("level", 0) + 2 

4602 index_resolvers = self._get_index_resolvers() 

4603 column_resolvers = self._get_cleaned_column_resolvers() 

4604 resolvers = column_resolvers, index_resolvers 

4605 if "target" not in kwargs: 

4606 kwargs["target"] = self 

4607 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers 

4608 

4609 return _eval(expr, inplace=inplace, **kwargs) 

4610 

4611 def select_dtypes(self, include=None, exclude=None) -> DataFrame: 

4612 """ 

4613 Return a subset of the DataFrame's columns based on the column dtypes. 

4614 

4615 Parameters 

4616 ---------- 

4617 include, exclude : scalar or list-like 

4618 A selection of dtypes or strings to be included/excluded. At least 

4619 one of these parameters must be supplied. 

4620 

4621 Returns 

4622 ------- 

4623 DataFrame 

4624 The subset of the frame including the dtypes in ``include`` and 

4625 excluding the dtypes in ``exclude``. 

4626 

4627 Raises 

4628 ------ 

4629 ValueError 

4630 * If both of ``include`` and ``exclude`` are empty 

4631 * If ``include`` and ``exclude`` have overlapping elements 

4632 * If any kind of string dtype is passed in. 

4633 

4634 See Also 

4635 -------- 

4636 DataFrame.dtypes: Return Series with the data type of each column. 

4637 

4638 Notes 

4639 ----- 

4640 * To select all *numeric* types, use ``np.number`` or ``'number'`` 

4641 * To select strings you must use the ``object`` dtype, but note that 

4642 this will return *all* object dtype columns 

4643 * See the `numpy dtype hierarchy 

4644 <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__ 

4645 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or 

4646 ``'datetime64'`` 

4647 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or 

4648 ``'timedelta64'`` 

4649 * To select Pandas categorical dtypes, use ``'category'`` 

4650 * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in 

4651 0.20.0) or ``'datetime64[ns, tz]'`` 

4652 

4653 Examples 

4654 -------- 

4655 >>> df = pd.DataFrame({'a': [1, 2] * 3, 

4656 ... 'b': [True, False] * 3, 

4657 ... 'c': [1.0, 2.0] * 3}) 

4658 >>> df 

4659 a b c 

4660 0 1 True 1.0 

4661 1 2 False 2.0 

4662 2 1 True 1.0 

4663 3 2 False 2.0 

4664 4 1 True 1.0 

4665 5 2 False 2.0 

4666 

4667 >>> df.select_dtypes(include='bool') 

4668 b 

4669 0 True 

4670 1 False 

4671 2 True 

4672 3 False 

4673 4 True 

4674 5 False 

4675 

4676 >>> df.select_dtypes(include=['float64']) 

4677 c 

4678 0 1.0 

4679 1 2.0 

4680 2 1.0 

4681 3 2.0 

4682 4 1.0 

4683 5 2.0 

4684 

4685 >>> df.select_dtypes(exclude=['int64']) 

4686 b c 

4687 0 True 1.0 

4688 1 False 2.0 

4689 2 True 1.0 

4690 3 False 2.0 

4691 4 True 1.0 

4692 5 False 2.0 

4693 """ 

4694 if not is_list_like(include): 

4695 include = (include,) if include is not None else () 

4696 if not is_list_like(exclude): 

4697 exclude = (exclude,) if exclude is not None else () 

4698 

4699 selection = (frozenset(include), frozenset(exclude)) 

4700 

4701 if not any(selection): 

4702 raise ValueError("at least one of include or exclude must be nonempty") 

4703 

4704 # convert the myriad valid dtypes object to a single representation 

4705 def check_int_infer_dtype(dtypes): 

4706 converted_dtypes: list[type] = [] 

4707 for dtype in dtypes: 

4708 # Numpy maps int to different types (int32, int64) on Windows and Linux 

4709 # see https://github.com/numpy/numpy/issues/9464 

4710 if (isinstance(dtype, str) and dtype == "int") or (dtype is int): 

4711 converted_dtypes.append(np.int32) 

4712 converted_dtypes.append(np.int64) 

4713 elif dtype == "float" or dtype is float: 

4714 # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20 

4715 converted_dtypes.extend([np.float64, np.float32]) 

4716 else: 

4717 converted_dtypes.append(infer_dtype_from_object(dtype)) 

4718 return frozenset(converted_dtypes) 

4719 

4720 include = check_int_infer_dtype(include) 

4721 exclude = check_int_infer_dtype(exclude) 

4722 

4723 for dtypes in (include, exclude): 

4724 invalidate_string_dtypes(dtypes) 

4725 

4726 # can't both include AND exclude! 

4727 if not include.isdisjoint(exclude): 

4728 raise ValueError(f"include and exclude overlap on {(include & exclude)}") 

4729 

4730 def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: 

4731 # GH 46870: BooleanDtype._is_numeric == True but should be excluded 

4732 return issubclass(dtype.type, tuple(dtypes_set)) or ( 

4733 np.number in dtypes_set 

4734 and getattr(dtype, "_is_numeric", False) 

4735 and not is_bool_dtype(dtype) 

4736 ) 

4737 

4738 def predicate(arr: ArrayLike) -> bool: 

4739 dtype = arr.dtype 

4740 if include: 

4741 if not dtype_predicate(dtype, include): 

4742 return False 

4743 

4744 if exclude: 

4745 if dtype_predicate(dtype, exclude): 

4746 return False 

4747 

4748 return True 

4749 

4750 mgr = self._mgr._get_data_subset(predicate).copy(deep=None) 

4751 return type(self)(mgr).__finalize__(self) 

4752 
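# A sketch of the ``int`` normalization in check_int_infer_dtype above:
# plain ``int`` matches both int32 and int64 columns, keeping results
# stable across platforms (see numpy/numpy#9464).
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "i32": np.array([1, 2], dtype=np.int32),
    "i64": np.array([1, 2], dtype=np.int64),
    "f": [1.0, 2.0],
})
assert list(df.select_dtypes(include="int").columns) == ["i32", "i64"]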

4753 def insert( 

4754 self, 

4755 loc: int, 

4756 column: Hashable, 

4757 value: Scalar | AnyArrayLike, 

4758 allow_duplicates: bool | lib.NoDefault = lib.no_default, 

4759 ) -> None: 

4760 """ 

4761 Insert column into DataFrame at specified location. 

4762 

4763 Raises a ValueError if `column` is already contained in the DataFrame, 

4764 unless `allow_duplicates` is set to True. 

4765 

4766 Parameters 

4767 ---------- 

4768 loc : int 

4769 Insertion index. Must satisfy 0 <= loc <= len(columns). 

4770 column : str, number, or hashable object 

4771 Label of the inserted column. 

4772 value : Scalar, Series, or array-like 

4773 allow_duplicates : bool, optional, default lib.no_default 

4774 

4775 See Also 

4776 -------- 

4777 Index.insert : Insert new item by index. 

4778 

4779 Examples 

4780 -------- 

4781 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 

4782 >>> df 

4783 col1 col2 

4784 0 1 3 

4785 1 2 4 

4786 >>> df.insert(1, "newcol", [99, 99]) 

4787 >>> df 

4788 col1 newcol col2 

4789 0 1 99 3 

4790 1 2 99 4 

4791 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True) 

4792 >>> df 

4793 col1 col1 newcol col2 

4794 0 100 1 99 3 

4795 1 100 2 99 4 

4796 

4797 Notice that pandas uses index alignment when `value` is a `Series`: 

4798 

4799 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2])) 

4800 >>> df 

4801 col0 col1 col1 newcol col2 

4802 0 NaN 100 1 99 3 

4803 1 5.0 100 2 99 4 

4804 """ 

4805 if allow_duplicates is lib.no_default: 

4806 allow_duplicates = False 

4807 if allow_duplicates and not self.flags.allows_duplicate_labels: 

4808 raise ValueError( 

4809 "Cannot specify 'allow_duplicates=True' when " 

4810 "'self.flags.allows_duplicate_labels' is False." 

4811 ) 

4812 if not allow_duplicates and column in self.columns: 

4813 # Should this be a different kind of error?? 

4814 raise ValueError(f"cannot insert {column}, already exists") 

4815 if not isinstance(loc, int): 

4816 raise TypeError("loc must be int") 

4817 

4818 value = self._sanitize_column(value) 

4819 self._mgr.insert(loc, column, value) 

4820 
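# A sketch of the flags guard above: ``allow_duplicates=True`` is refused
# when the frame disallows duplicate labels via its flags.
import pandas as pd

df = pd.DataFrame({"a": [1]}).set_flags(allows_duplicate_labels=False)
try:
    df.insert(0, "a", [2], allow_duplicates=True)
except ValueError as err:
    print(err)  # allow_duplicates=True conflicts with the flag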

4821 def assign(self, **kwargs) -> DataFrame: 

4822 r""" 

4823 Assign new columns to a DataFrame. 

4824 

4825 Returns a new object with all original columns in addition to new ones. 

4826 Existing columns that are re-assigned will be overwritten. 

4827 

4828 Parameters 

4829 ---------- 

4830 **kwargs : dict of {str: callable or Series} 

4831 The column names are keywords. If the values are 

4832 callable, they are computed on the DataFrame and 

4833 assigned to the new columns. The callable must not 

4834 change input DataFrame (though pandas doesn't check it). 

4835 If the values are not callable, (e.g. a Series, scalar, or array), 

4836 they are simply assigned. 

4837 

4838 Returns 

4839 ------- 

4840 DataFrame 

4841 A new DataFrame with the new columns in addition to 

4842 all the existing columns. 

4843 

4844 Notes 

4845 ----- 

4846 Assigning multiple columns within the same ``assign`` is possible. 

4847 Later items in '\*\*kwargs' may refer to newly created or modified 

4848 columns in 'df'; items are computed and assigned into 'df' in order. 

4849 

4850 Examples 

4851 -------- 

4852 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, 

4853 ... index=['Portland', 'Berkeley']) 

4854 >>> df 

4855 temp_c 

4856 Portland 17.0 

4857 Berkeley 25.0 

4858 

4859 Where the value is a callable, evaluated on `df`: 

4860 

4861 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) 

4862 temp_c temp_f 

4863 Portland 17.0 62.6 

4864 Berkeley 25.0 77.0 

4865 

4866 Alternatively, the same behavior can be achieved by directly 

4867 referencing an existing Series or sequence: 

4868 

4869 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) 

4870 temp_c temp_f 

4871 Portland 17.0 62.6 

4872 Berkeley 25.0 77.0 

4873 

4874 You can create multiple columns within the same assign where one 

4875 of the columns depends on another one defined within the same assign: 

4876 

4877 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, 

4878 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) 

4879 temp_c temp_f temp_k 

4880 Portland 17.0 62.6 290.15 

4881 Berkeley 25.0 77.0 298.15 

4882 """ 

4883 data = self.copy() 

4884 

4885 for k, v in kwargs.items(): 

4886 data[k] = com.apply_if_callable(v, data) 

4887 return data 

4888 

4889 def _sanitize_column(self, value) -> ArrayLike: 

4890 """ 

4891 Ensures new columns (which go into the BlockManager as new blocks) are 

4892 always copied and converted into an array. 

4893 

4894 Parameters 

4895 ---------- 

4896 value : scalar, Series, or array-like 

4897 

4898 Returns 

4899 ------- 

4900 numpy.ndarray or ExtensionArray 

4901 """ 

4902 self._ensure_valid_index(value) 

4903 

4904 # We can get there through isetitem with a DataFrame 

4905 # or through loc single_block_path 

4906 if isinstance(value, DataFrame): 

4907 return _reindex_for_setitem(value, self.index) 

4908 elif is_dict_like(value): 

4909 return _reindex_for_setitem(Series(value), self.index) 

4910 

4911 if is_list_like(value): 

4912 com.require_length_match(value, self.index) 

4913 return sanitize_array(value, self.index, copy=True, allow_2d=True) 

4914 

4915 @property 

4916 def _series(self): 

4917 return { 

4918 item: Series( 

4919 self._mgr.iget(idx), index=self.index, name=item, fastpath=True 

4920 ) 

4921 for idx, item in enumerate(self.columns) 

4922 } 

4923 

4924 def lookup( 

4925 self, row_labels: Sequence[IndexLabel], col_labels: Sequence[IndexLabel] 

4926 ) -> np.ndarray: 

4927 """ 

4928 Label-based "fancy indexing" function for DataFrame. 

4929 

4930 .. deprecated:: 1.2.0 

4931 DataFrame.lookup is deprecated, 

4932 use pandas.factorize and NumPy indexing instead. 

4933 For further details see 

4934 :ref:`Looking up values by index/column labels <indexing.lookup>`. 

4935 

4936 Given equal-length arrays of row and column labels, return an 

4937 array of the values corresponding to each (row, col) pair. 

4938 

4939 Parameters 

4940 ---------- 

4941 row_labels : sequence 

4942 The row labels to use for lookup. 

4943 col_labels : sequence 

4944 The column labels to use for lookup. 

4945 

4946 Returns 

4947 ------- 

4948 numpy.ndarray 

4949 The found values. 

4950 """ 

4951 msg = ( 

4952 "The 'lookup' method is deprecated and will be " 

4953 "removed in a future version. " 

4954 "You can use DataFrame.melt and DataFrame.loc " 

4955 "as a substitute." 

4956 ) 

4957 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) 

4958 

4959 n = len(row_labels) 

4960 if n != len(col_labels): 

4961 raise ValueError("Row labels must have same size as column labels") 

4962 if not (self.index.is_unique and self.columns.is_unique): 

4963 # GH#33041 

4964 raise ValueError("DataFrame.lookup requires unique index and columns") 

4965 

4966 thresh = 1000 

4967 if not self._is_mixed_type or n > thresh: 

4968 values = self.values 

4969 ridx = self.index.get_indexer(row_labels) 

4970 cidx = self.columns.get_indexer(col_labels) 

4971 if (ridx == -1).any(): 

4972 raise KeyError("One or more row labels was not found") 

4973 if (cidx == -1).any(): 

4974 raise KeyError("One or more column labels was not found") 

4975 flat_index = ridx * len(self.columns) + cidx 

4976 result = values.flat[flat_index] 

4977 else: 

4978 result = np.empty(n, dtype="O") 

4979 for i, (r, c) in enumerate(zip(row_labels, col_labels)): 

4980 result[i] = self._get_value(r, c) 

4981 

4982 if is_object_dtype(result): 

4983 result = lib.maybe_convert_objects(result) 

4984 

4985 return result 

4986 
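# A sketch of one replacement for the deprecated ``lookup`` above, using
# indexer arrays plus NumPy fancy indexing (one of the substitutes the
# deprecation notes point to).
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
rows = df.index.get_indexer(["y", "x"])
cols = df.columns.get_indexer(["a", "b"])
vals = df.to_numpy()[rows, cols]
assert vals.tolist() == [2, 3]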

4987 # ---------------------------------------------------------------------- 

4988 # Reindexing and alignment 

4989 

4990 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): 

4991 frame = self 

4992 

4993 columns = axes["columns"] 

4994 if columns is not None: 

4995 frame = frame._reindex_columns( 

4996 columns, method, copy, level, fill_value, limit, tolerance 

4997 ) 

4998 

4999 index = axes["index"] 

5000 if index is not None: 

5001 frame = frame._reindex_index( 

5002 index, method, copy, level, fill_value, limit, tolerance 

5003 ) 

5004 

5005 return frame 

5006 

5007 def _reindex_index( 

5008 self, 

5009 new_index, 

5010 method, 

5011 copy: bool, 

5012 level: Level, 

5013 fill_value=np.nan, 

5014 limit=None, 

5015 tolerance=None, 

5016 ): 

5017 new_index, indexer = self.index.reindex( 

5018 new_index, method=method, level=level, limit=limit, tolerance=tolerance 

5019 ) 

5020 return self._reindex_with_indexers( 

5021 {0: [new_index, indexer]}, 

5022 copy=copy, 

5023 fill_value=fill_value, 

5024 allow_dups=False, 

5025 ) 

5026 

5027 def _reindex_columns( 

5028 self, 

5029 new_columns, 

5030 method, 

5031 copy: bool, 

5032 level: Level, 

5033 fill_value=None, 

5034 limit=None, 

5035 tolerance=None, 

5036 ): 

5037 new_columns, indexer = self.columns.reindex( 

5038 new_columns, method=method, level=level, limit=limit, tolerance=tolerance 

5039 ) 

5040 return self._reindex_with_indexers( 

5041 {1: [new_columns, indexer]}, 

5042 copy=copy, 

5043 fill_value=fill_value, 

5044 allow_dups=False, 

5045 ) 

5046 

5047 def _reindex_multi( 

5048 self, axes: dict[str, Index], copy: bool, fill_value 

5049 ) -> DataFrame: 

5050 """ 

5051 We are guaranteed non-Nones in the axes. 

5052 """ 

5053 

5054 new_index, row_indexer = self.index.reindex(axes["index"]) 

5055 new_columns, col_indexer = self.columns.reindex(axes["columns"]) 

5056 

5057 if row_indexer is not None and col_indexer is not None: 

5058 # Fastpath. By doing two 'take's at once we avoid making an 

5059 # unnecessary copy. 

5060 # We only get here with `not self._is_mixed_type`, which (almost) 

5061 # ensures that self.values is cheap. It may be worth making this 

5062 # condition more specific. 

5063 indexer = row_indexer, col_indexer 

5064 new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) 

5065 return self._constructor(new_values, index=new_index, columns=new_columns) 

5066 else: 

5067 return self._reindex_with_indexers( 

5068 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, 

5069 copy=copy, 

5070 fill_value=fill_value, 

5071 ) 

5072 

5073 @doc(NDFrame.align, **_shared_doc_kwargs) 

5074 def align( 

5075 self, 

5076 other: DataFrame, 

5077 join: Literal["outer", "inner", "left", "right"] = "outer", 

5078 axis: Axis | None = None, 

5079 level: Level = None, 

5080 copy: bool = True, 

5081 fill_value=None, 

5082 method: FillnaOptions | None = None, 

5083 limit: int | None = None, 

5084 fill_axis: Axis = 0, 

5085 broadcast_axis: Axis | None = None, 

5086 ) -> DataFrame: 

5087 return super().align( 

5088 other, 

5089 join=join, 

5090 axis=axis, 

5091 level=level, 

5092 copy=copy, 

5093 fill_value=fill_value, 

5094 method=method, 

5095 limit=limit, 

5096 fill_axis=fill_axis, 

5097 broadcast_axis=broadcast_axis, 

5098 ) 

5099 

5100 @overload 

5101 def set_axis( 

5102 self, 

5103 labels, 

5104 *, 

5105 axis: Axis = ..., 

5106 inplace: Literal[False] | lib.NoDefault = ..., 

5107 copy: bool | lib.NoDefault = ..., 

5108 ) -> DataFrame: 

5109 ... 

5110 

5111 @overload 

5112 def set_axis( 

5113 self, 

5114 labels, 

5115 *, 

5116 axis: Axis = ..., 

5117 inplace: Literal[True], 

5118 copy: bool | lib.NoDefault = ..., 

5119 ) -> None: 

5120 ... 

5121 

5122 @overload 

5123 def set_axis( 

5124 self, 

5125 labels, 

5126 *, 

5127 axis: Axis = ..., 

5128 inplace: bool | lib.NoDefault = ..., 

5129 copy: bool | lib.NoDefault = ..., 

5130 ) -> DataFrame | None: 

5131 ... 

5132 

5133 # error: Signature of "set_axis" incompatible with supertype "NDFrame" 

5134 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) 

5135 @Appender( 

5136 """ 

5137 Examples 

5138 -------- 

5139 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 

5140 

5141 Change the row labels. 

5142 

5143 >>> df.set_axis(['a', 'b', 'c'], axis='index') 

5144 A B 

5145 a 1 4 

5146 b 2 5 

5147 c 3 6 

5148 

5149 Change the column labels. 

5150 

5151 >>> df.set_axis(['I', 'II'], axis='columns') 

5152 I II 

5153 0 1 4 

5154 1 2 5 

5155 2 3 6 

5156 

5157 Now, update the labels without copying the underlying data. 

5158 

5159 >>> df.set_axis(['i', 'ii'], axis='columns', copy=False) 

5160 i ii 

5161 0 1 4 

5162 1 2 5 

5163 2 3 6 

5164 """ 

5165 ) 

5166 @Substitution( 

5167 **_shared_doc_kwargs, 

5168 extended_summary_sub=" column or", 

5169 axis_description_sub=", and 1 identifies the columns", 

5170 see_also_sub=" or columns", 

5171 ) 

5172 @Appender(NDFrame.set_axis.__doc__) 

5173 def set_axis( 

5174 self, 

5175 labels, 

5176 axis: Axis = 0, 

5177 inplace: bool | lib.NoDefault = lib.no_default, 

5178 *, 

5179 copy: bool | lib.NoDefault = lib.no_default, 

5180 ): 

5181 return super().set_axis(labels, axis=axis, inplace=inplace, copy=copy) 

5182 

5183 @Substitution(**_shared_doc_kwargs) 

5184 @Appender(NDFrame.reindex.__doc__) 

5185 @rewrite_axis_style_signature( 

5186 "labels", 

5187 [ 

5188 ("method", None), 

5189 ("copy", None), 

5190 ("level", None), 

5191 ("fill_value", np.nan), 

5192 ("limit", None), 

5193 ("tolerance", None), 

5194 ], 

5195 ) 

5196 def reindex(self, *args, **kwargs) -> DataFrame: 

5197 axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") 

5198 kwargs.update(axes) 

5199 # Pop these, since the values are in `kwargs` under different names 

5200 kwargs.pop("axis", None) 

5201 kwargs.pop("labels", None) 

5202 return super().reindex(**kwargs) 

5203 
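# --- Editorial sketch (not part of frame.py) ----------------------------
# The equivalent calling conventions that validate_axis_style_args
# normalizes into index=/columns= kwargs above; the toy frame is an
# assumption:
# >>> import pandas as pd
# >>> df = pd.DataFrame({"A": [1, 2]}, index=["x", "y"])
# >>> a = df.reindex(["y", "x"])                 # positional labels, axis 0
# >>> b = df.reindex(["y", "x"], axis="index")   # axis-style
# >>> c = df.reindex(index=["y", "x"])           # keyword style
# >>> a.equals(b) and b.equals(c)
# True
# -------------------------------------------------------------------------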

5204 @overload 

5205 def drop( 

5206 self, 

5207 labels: IndexLabel = ..., 

5208 *, 

5209 axis: Axis = ..., 

5210 index: IndexLabel = ..., 

5211 columns: IndexLabel = ..., 

5212 level: Level = ..., 

5213 inplace: Literal[True], 

5214 errors: IgnoreRaise = ..., 

5215 ) -> None: 

5216 ... 

5217 

5218 @overload 

5219 def drop( 

5220 self, 

5221 labels: IndexLabel = ..., 

5222 *, 

5223 axis: Axis = ..., 

5224 index: IndexLabel = ..., 

5225 columns: IndexLabel = ..., 

5226 level: Level = ..., 

5227 inplace: Literal[False] = ..., 

5228 errors: IgnoreRaise = ..., 

5229 ) -> DataFrame: 

5230 ... 

5231 

5232 @overload 

5233 def drop( 

5234 self, 

5235 labels: IndexLabel = ..., 

5236 *, 

5237 axis: Axis = ..., 

5238 index: IndexLabel = ..., 

5239 columns: IndexLabel = ..., 

5240 level: Level = ..., 

5241 inplace: bool = ..., 

5242 errors: IgnoreRaise = ..., 

5243 ) -> DataFrame | None: 

5244 ... 

5245 

5246 # error: Signature of "drop" incompatible with supertype "NDFrame" 

5247 # github.com/python/mypy/issues/12387 

5248 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) 

5249 def drop( # type: ignore[override] 

5250 self, 

5251 labels: IndexLabel = None, 

5252 axis: Axis = 0, 

5253 index: IndexLabel = None, 

5254 columns: IndexLabel = None, 

5255 level: Level = None, 

5256 inplace: bool = False, 

5257 errors: IgnoreRaise = "raise", 

5258 ) -> DataFrame | None: 

5259 """ 

5260 Drop specified labels from rows or columns. 

5261 

5262 Remove rows or columns by specifying label names and corresponding 

5263 axis, or by specifying directly index or column names. When using a 

5264 multi-index, labels on different levels can be removed by specifying 

5265 the level. See the :ref:`user guide <advanced.shown_levels>` 

5266 for more information about the now unused levels. 

5267 

5268 Parameters 

5269 ---------- 

5270 labels : single label or list-like 

5271 Index or column labels to drop. A tuple will be used as a single 

5272 label and not treated as a list-like. 

5273 axis : {0 or 'index', 1 or 'columns'}, default 0 

5274 Whether to drop labels from the index (0 or 'index') or 

5275 columns (1 or 'columns'). 

5276 index : single label or list-like 

5277 Alternative to specifying axis (``labels, axis=0`` 

5278 is equivalent to ``index=labels``). 

5279 columns : single label or list-like 

5280 Alternative to specifying axis (``labels, axis=1`` 

5281 is equivalent to ``columns=labels``). 

5282 level : int or level name, optional 

5283 For MultiIndex, level from which the labels will be removed. 

5284 inplace : bool, default False 

5285 If False, return a copy. Otherwise, do operation 

5286 inplace and return None. 

5287 errors : {'ignore', 'raise'}, default 'raise' 

5288 If 'ignore', suppress the error and only existing labels are 

5289 dropped. 

5290 

5291 Returns 

5292 ------- 

5293 DataFrame or None 

5294 DataFrame without the removed index or column labels or 

5295 None if ``inplace=True``. 

5296 

5297 Raises 

5298 ------ 

5299 KeyError 

5300 If any of the labels is not found in the selected axis. 

5301 

5302 See Also 

5303 -------- 

5304 DataFrame.loc : Label-location based indexer for selection by label. 

5305 DataFrame.dropna : Return DataFrame with labels on given axis omitted 

5306 where (all or any) data are missing. 

5307 DataFrame.drop_duplicates : Return DataFrame with duplicate rows 

5308 removed, optionally only considering certain columns. 

5309 Series.drop : Return Series with specified index labels removed. 

5310 

5311 Examples 

5312 -------- 

5313 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), 

5314 ... columns=['A', 'B', 'C', 'D']) 

5315 >>> df 

5316 A B C D 

5317 0 0 1 2 3 

5318 1 4 5 6 7 

5319 2 8 9 10 11 

5320 

5321 Drop columns 

5322 

5323 >>> df.drop(['B', 'C'], axis=1) 

5324 A D 

5325 0 0 3 

5326 1 4 7 

5327 2 8 11 

5328 

5329 >>> df.drop(columns=['B', 'C']) 

5330 A D 

5331 0 0 3 

5332 1 4 7 

5333 2 8 11 

5334 

5335 Drop a row by index 

5336 

5337 >>> df.drop([0, 1]) 

5338 A B C D 

5339 2 8 9 10 11 

5340 

5341 Drop columns and/or rows of MultiIndex DataFrame 

5342 

5343 >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], 

5344 ... ['speed', 'weight', 'length']], 

5345 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], 

5346 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) 

5347 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], 

5348 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], 

5349 ... [250, 150], [1.5, 0.8], [320, 250], 

5350 ... [1, 0.8], [0.3, 0.2]]) 

5351 >>> df 

5352 big small 

5353 lama speed 45.0 30.0 

5354 weight 200.0 100.0 

5355 length 1.5 1.0 

5356 cow speed 30.0 20.0 

5357 weight 250.0 150.0 

5358 length 1.5 0.8 

5359 falcon speed 320.0 250.0 

5360 weight 1.0 0.8 

5361 length 0.3 0.2 

5362 

5363 Drop a specific index combination from the MultiIndex 

5364 DataFrame, i.e., drop the combination ``'falcon'`` and 

5365 ``'weight'``, which deletes only the corresponding row 

5366 

5367 >>> df.drop(index=('falcon', 'weight')) 

5368 big small 

5369 lama speed 45.0 30.0 

5370 weight 200.0 100.0 

5371 length 1.5 1.0 

5372 cow speed 30.0 20.0 

5373 weight 250.0 150.0 

5374 length 1.5 0.8 

5375 falcon speed 320.0 250.0 

5376 length 0.3 0.2 

5377 

5378 >>> df.drop(index='cow', columns='small') 

5379 big 

5380 lama speed 45.0 

5381 weight 200.0 

5382 length 1.5 

5383 falcon speed 320.0 

5384 weight 1.0 

5385 length 0.3 

5386 

5387 >>> df.drop(index='length', level=1) 

5388 big small 

5389 lama speed 45.0 30.0 

5390 weight 200.0 100.0 

5391 cow speed 30.0 20.0 

5392 weight 250.0 150.0 

5393 falcon speed 320.0 250.0 

5394 weight 1.0 0.8 

5395 """ 

5396 return super().drop( 

5397 labels=labels, 

5398 axis=axis, 

5399 index=index, 

5400 columns=columns, 

5401 level=level, 

5402 inplace=inplace, 

5403 errors=errors, 

5404 ) 

5405 

5406 @overload 

5407 def rename( 

5408 self, 

5409 mapper: Renamer | None = ..., 

5410 *, 

5411 index: Renamer | None = ..., 

5412 columns: Renamer | None = ..., 

5413 axis: Axis | None = ..., 

5414 copy: bool | None = ..., 

5415 inplace: Literal[True], 

5416 level: Level = ..., 

5417 errors: IgnoreRaise = ..., 

5418 ) -> None: 

5419 ... 

5420 

5421 @overload 

5422 def rename( 

5423 self, 

5424 mapper: Renamer | None = ..., 

5425 *, 

5426 index: Renamer | None = ..., 

5427 columns: Renamer | None = ..., 

5428 axis: Axis | None = ..., 

5429 copy: bool | None = ..., 

5430 inplace: Literal[False] = ..., 

5431 level: Level = ..., 

5432 errors: IgnoreRaise = ..., 

5433 ) -> DataFrame: 

5434 ... 

5435 

5436 @overload 

5437 def rename( 

5438 self, 

5439 mapper: Renamer | None = ..., 

5440 *, 

5441 index: Renamer | None = ..., 

5442 columns: Renamer | None = ..., 

5443 axis: Axis | None = ..., 

5444 copy: bool | None = ..., 

5445 inplace: bool = ..., 

5446 level: Level = ..., 

5447 errors: IgnoreRaise = ..., 

5448 ) -> DataFrame | None: 

5449 ... 

5450 

5451 def rename( 

5452 self, 

5453 mapper: Renamer | None = None, 

5454 *, 

5455 index: Renamer | None = None, 

5456 columns: Renamer | None = None, 

5457 axis: Axis | None = None, 

5458 copy: bool | None = None, 

5459 inplace: bool = False, 

5460 level: Level = None, 

5461 errors: IgnoreRaise = "ignore", 

5462 ) -> DataFrame | None: 

5463 """ 

5464 Alter axes labels. 

5465 

5466 Function / dict values must be unique (1-to-1). Labels not contained in 

5467 a dict / Series will be left as-is. Extra labels listed don't throw an 

5468 error. 

5469 

5470 See the :ref:`user guide <basics.rename>` for more. 

5471 

5472 Parameters 

5473 ---------- 

5474 mapper : dict-like or function 

5475 Dict-like or function transformations to apply to 

5476 that axis' values. Use either ``mapper`` and ``axis`` to 

5477 specify the axis to target with ``mapper``, or ``index`` and 

5478 ``columns``. 

5479 index : dict-like or function 

5480 Alternative to specifying axis (``mapper, axis=0`` 

5481 is equivalent to ``index=mapper``). 

5482 columns : dict-like or function 

5483 Alternative to specifying axis (``mapper, axis=1`` 

5484 is equivalent to ``columns=mapper``). 

5485 axis : {0 or 'index', 1 or 'columns'}, default 0 

5486 Axis to target with ``mapper``. Can be either the axis name 

5487 ('index', 'columns') or number (0, 1). The default is 'index'. 

5488 copy : bool, default True 

5489 Also copy underlying data. 

5490 inplace : bool, default False 

5491 Whether to modify the DataFrame rather than creating a new one. 

5492 If True then value of copy is ignored. 

5493 level : int or level name, default None 

5494 In case of a MultiIndex, only rename labels in the specified 

5495 level. 

5496 errors : {'ignore', 'raise'}, default 'ignore' 

5497 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, 

5498 or `columns` contains labels that are not present in the Index 

5499 being transformed. 

5500 If 'ignore', existing keys will be renamed and extra keys will be 

5501 ignored. 

5502 

5503 Returns 

5504 ------- 

5505 DataFrame or None 

5506 DataFrame with the renamed axis labels or None if ``inplace=True``. 

5507 

5508 Raises 

5509 ------ 

5510 KeyError 

5511 If any of the labels is not found in the selected axis and 

5512 "errors='raise'". 

5513 

5514 See Also 

5515 -------- 

5516 DataFrame.rename_axis : Set the name of the axis. 

5517 

5518 Examples 

5519 -------- 

5520 ``DataFrame.rename`` supports two calling conventions 

5521 

5522 * ``(index=index_mapper, columns=columns_mapper, ...)`` 

5523 * ``(mapper, axis={'index', 'columns'}, ...)`` 

5524 

5525 We *highly* recommend using keyword arguments to clarify your 

5526 intent. 

5527 

5528 Rename columns using a mapping: 

5529 

5530 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 

5531 >>> df.rename(columns={"A": "a", "B": "c"}) 

5532 a c 

5533 0 1 4 

5534 1 2 5 

5535 2 3 6 

5536 

5537 Rename index using a mapping: 

5538 

5539 >>> df.rename(index={0: "x", 1: "y", 2: "z"}) 

5540 A B 

5541 x 1 4 

5542 y 2 5 

5543 z 3 6 

5544 

5545 Cast index labels to a different type: 

5546 

5547 >>> df.index 

5548 RangeIndex(start=0, stop=3, step=1) 

5549 >>> df.rename(index=str).index 

5550 Index(['0', '1', '2'], dtype='object') 

5551 

5552 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") 

5553 Traceback (most recent call last): 

5554 KeyError: ['C'] not found in axis 

5555 

5556 Using axis-style parameters: 

5557 

5558 >>> df.rename(str.lower, axis='columns') 

5559 a b 

5560 0 1 4 

5561 1 2 5 

5562 2 3 6 

5563 

5564 >>> df.rename({1: 2, 2: 4}, axis='index') 

5565 A B 

5566 0 1 4 

5567 2 2 5 

5568 4 3 6 

5569 """ 

5570 return super()._rename( 

5571 mapper=mapper, 

5572 index=index, 

5573 columns=columns, 

5574 axis=axis, 

5575 copy=copy, 

5576 inplace=inplace, 

5577 level=level, 

5578 errors=errors, 

5579 ) 

5580 

5581 @overload 

5582 def fillna( 

5583 self, 

5584 value: Hashable | Mapping | Series | DataFrame = ..., 

5585 *, 

5586 method: FillnaOptions | None = ..., 

5587 axis: Axis | None = ..., 

5588 inplace: Literal[False] = ..., 

5589 limit: int | None = ..., 

5590 downcast: dict | None = ..., 

5591 ) -> DataFrame: 

5592 ... 

5593 

5594 @overload 

5595 def fillna( 

5596 self, 

5597 value: Hashable | Mapping | Series | DataFrame = ..., 

5598 *, 

5599 method: FillnaOptions | None = ..., 

5600 axis: Axis | None = ..., 

5601 inplace: Literal[True], 

5602 limit: int | None = ..., 

5603 downcast: dict | None = ..., 

5604 ) -> None: 

5605 ... 

5606 

5607 @overload 

5608 def fillna( 

5609 self, 

5610 value: Hashable | Mapping | Series | DataFrame = ..., 

5611 *, 

5612 method: FillnaOptions | None = ..., 

5613 axis: Axis | None = ..., 

5614 inplace: bool = ..., 

5615 limit: int | None = ..., 

5616 downcast: dict | None = ..., 

5617 ) -> DataFrame | None: 

5618 ... 

5619 

5620 # error: Signature of "fillna" incompatible with supertype "NDFrame" 

5621 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) 

5622 @doc(NDFrame.fillna, **_shared_doc_kwargs) 

5623 def fillna( # type: ignore[override] 

5624 self, 

5625 value: Hashable | Mapping | Series | DataFrame = None, 

5626 method: FillnaOptions | None = None, 

5627 axis: Axis | None = None, 

5628 inplace: bool = False, 

5629 limit: int | None = None, 

5630 downcast: dict | None = None, 

5631 ) -> DataFrame | None: 

5632 return super().fillna( 

5633 value=value, 

5634 method=method, 

5635 axis=axis, 

5636 inplace=inplace, 

5637 limit=limit, 

5638 downcast=downcast, 

5639 ) 

5640 
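# --- Editorial sketch (not part of frame.py) ----------------------------
# The Mapping form of `value` fills each column with its own scalar; a
# hedged example on an assumed toy frame:
# >>> import numpy as np, pandas as pd
# >>> df = pd.DataFrame({"A": [np.nan, 2], "B": [3, np.nan]})
# >>> df.fillna({"A": 0, "B": -1})
#      A    B
# 0  0.0  3.0
# 1  2.0 -1.0
# -------------------------------------------------------------------------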

5641 def pop(self, item: Hashable) -> Series: 

5642 """ 

5643 Return item and drop from frame. Raise KeyError if not found. 

5644 

5645 Parameters 

5646 ---------- 

5647 item : label 

5648 Label of column to be popped. 

5649 

5650 Returns 

5651 ------- 

5652 Series 

5653 

5654 Examples 

5655 -------- 

5656 >>> df = pd.DataFrame([('falcon', 'bird', 389.0), 

5657 ... ('parrot', 'bird', 24.0), 

5658 ... ('lion', 'mammal', 80.5), 

5659 ... ('monkey', 'mammal', np.nan)], 

5660 ... columns=('name', 'class', 'max_speed')) 

5661 >>> df 

5662 name class max_speed 

5663 0 falcon bird 389.0 

5664 1 parrot bird 24.0 

5665 2 lion mammal 80.5 

5666 3 monkey mammal NaN 

5667 

5668 >>> df.pop('class') 

5669 0 bird 

5670 1 bird 

5671 2 mammal 

5672 3 mammal 

5673 Name: class, dtype: object 

5674 

5675 >>> df 

5676 name max_speed 

5677 0 falcon 389.0 

5678 1 parrot 24.0 

5679 2 lion 80.5 

5680 3 monkey NaN 

5681 """ 

5682 return super().pop(item=item) 

5683 

5684 # error: Signature of "replace" incompatible with supertype "NDFrame" 

5685 @overload # type: ignore[override] 

5686 def replace( 

5687 self, 

5688 to_replace=..., 

5689 value=..., 

5690 *, 

5691 inplace: Literal[False] = ..., 

5692 limit: int | None = ..., 

5693 regex: bool = ..., 

5694 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., 

5695 ) -> DataFrame: 

5696 ... 

5697 

5698 @overload 

5699 def replace( 

5700 self, 

5701 to_replace=..., 

5702 value=..., 

5703 *, 

5704 inplace: Literal[True], 

5705 limit: int | None = ..., 

5706 regex: bool = ..., 

5707 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., 

5708 ) -> None: 

5709 ... 

5710 

5711 # error: Signature of "replace" incompatible with supertype "NDFrame" 

5712 @deprecate_nonkeyword_arguments( 

5713 version=None, allowed_args=["self", "to_replace", "value"] 

5714 ) 

5715 @doc(NDFrame.replace, **_shared_doc_kwargs) 

5716 def replace( # type: ignore[override] 

5717 self, 

5718 to_replace=None, 

5719 value=lib.no_default, 

5720 inplace: bool = False, 

5721 limit: int | None = None, 

5722 regex: bool = False, 

5723 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default, 

5724 ) -> DataFrame | None: 

5725 return super().replace( 

5726 to_replace=to_replace, 

5727 value=value, 

5728 inplace=inplace, 

5729 limit=limit, 

5730 regex=regex, 

5731 method=method, 

5732 ) 

5733 
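# --- Editorial sketch (not part of frame.py) ----------------------------
# A dict `to_replace` maps old values to new ones frame-wide; assumed
# toy data:
# >>> import pandas as pd
# >>> df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]})
# >>> df.replace({0: 10, "a": "z"})
#     A  B
# 0  10  z
# 1   1  b
# 2   2  c
# -------------------------------------------------------------------------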

5734 def _replace_columnwise( 

5735 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex 

5736 ): 

5737 """ 

5738 Dispatch to Series.replace column-wise. 

5739 

5740 Parameters 

5741 ---------- 

5742 mapping : dict 

5743 of the form {col: (target, value)} 

5744 inplace : bool 

5745 regex : bool or same types as `to_replace` in DataFrame.replace 

5746 

5747 Returns 

5748 ------- 

5749 DataFrame or None 

5750 """ 

5751 # Operate column-wise 

5752 res = self if inplace else self.copy() 

5753 ax = self.columns 

5754 

5755 for i in range(len(ax)): 

5756 if ax[i] in mapping: 

5757 ser = self.iloc[:, i] 

5758 

5759 target, value = mapping[ax[i]] 

5760 newobj = ser.replace(target, value, regex=regex) 

5761 

5762 res._iset_item(i, newobj) 

5763 

5764 if inplace: 

5765 return 

5766 return res.__finalize__(self) 

5767 
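# --- Editorial sketch (not part of frame.py) ----------------------------
# The mapping above has the form {col: (target, value)}, e.g.
# {"A": (0, 99)} means: within column "A" only, replace 0 with 99.
# Roughly the public-API equivalent, on an assumed toy frame:
# >>> import pandas as pd
# >>> df = pd.DataFrame({"A": [0, 1], "B": [0, 1]})
# >>> df.replace({"A": 0}, 99)   # only column "A" is touched
#     A  B
# 0  99  0
# 1   1  1
# -------------------------------------------------------------------------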

5768 @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) 

5769 def shift( 

5770 self, 

5771 periods: int = 1, 

5772 freq: Frequency | None = None, 

5773 axis: Axis = 0, 

5774 fill_value: Hashable = lib.no_default, 

5775 ) -> DataFrame: 

5776 axis = self._get_axis_number(axis) 

5777 

5778 ncols = len(self.columns) 

5779 if ( 

5780 axis == 1 

5781 and periods != 0 

5782 and freq is None 

5783 and fill_value is lib.no_default 

5784 and ncols > 0 

5785 ): 

5786 # We will infer fill_value to match the closest column 

5787 

5788 # Use a column that we know is valid for our column's dtype GH#38434 

5789 label = self.columns[0] 

5790 

5791 if periods > 0: 

5792 result = self.iloc[:, :-periods] 

5793 for col in range(min(ncols, abs(periods))): 

5794 # TODO(EA2D): doing this in a loop unnecessary with 2D EAs 

5795 # Define filler inside loop so we get a copy 

5796 filler = self.iloc[:, 0].shift(len(self)) 

5797 result.insert(0, label, filler, allow_duplicates=True) 

5798 else: 

5799 result = self.iloc[:, -periods:] 

5800 for col in range(min(ncols, abs(periods))): 

5801 # Define filler inside loop so we get a copy 

5802 filler = self.iloc[:, -1].shift(len(self)) 

5803 result.insert( 

5804 len(result.columns), label, filler, allow_duplicates=True 

5805 ) 

5806 

5807 result.columns = self.columns.copy() 

5808 return result 

5809 elif ( 

5810 axis == 1 

5811 and periods != 0 

5812 and fill_value is not lib.no_default 

5813 and ncols > 0 

5814 ): 

5815 arrays = self._mgr.arrays 

5816 if len(arrays) > 1 or ( 

5817 # If we only have one block and we know that we can't 

5818 # keep the same dtype (i.e. the _can_hold_element check) 

5819 # then we can go through the reindex_indexer path 

5820 # (and avoid casting logic in the Block method). 

5821 # The exception to this (until 2.0) is datetimelike 

5822 # dtypes with integers, which cast. 

5823 not can_hold_element(arrays[0], fill_value) 

5824 # TODO(2.0): remove special case for integer-with-datetimelike 

5825 # once deprecation is enforced 

5826 and not ( 

5827 lib.is_integer(fill_value) and needs_i8_conversion(arrays[0].dtype) 

5828 ) 

5829 ): 

5830 # GH#35488 we need to watch out for multi-block cases 

5831 # We only get here with fill_value not-lib.no_default 

5832 nper = abs(periods) 

5833 nper = min(nper, ncols) 

5834 if periods > 0: 

5835 indexer = np.array( 

5836 [-1] * nper + list(range(ncols - periods)), dtype=np.intp 

5837 ) 

5838 else: 

5839 indexer = np.array( 

5840 list(range(nper, ncols)) + [-1] * nper, dtype=np.intp 

5841 ) 

5842 mgr = self._mgr.reindex_indexer( 

5843 self.columns, 

5844 indexer, 

5845 axis=0, 

5846 fill_value=fill_value, 

5847 allow_dups=True, 

5848 ) 

5849 res_df = self._constructor(mgr) 

5850 return res_df.__finalize__(self, method="shift") 

5851 

5852 return super().shift( 

5853 periods=periods, freq=freq, axis=axis, fill_value=fill_value 

5854 ) 

5855 
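# --- Editorial sketch (not part of frame.py) ----------------------------
# The two axis=1 branches above, on an assumed integer frame: without
# fill_value the filler column is inferred by shifting an existing
# column (so its ints become NaN-capable floats); with fill_value the
# reindex_indexer path keeps the dtype.
# >>> import pandas as pd
# >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# >>> df.shift(periods=1, axis=1)
#     a  b
# 0 NaN  1
# 1 NaN  2
# >>> df.shift(periods=1, axis=1, fill_value=0)
#    a  b
# 0  0  1
# 1  0  2
# -------------------------------------------------------------------------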

5856 @overload 

5857 def set_index( 

5858 self, 

5859 keys, 

5860 *, 

5861 drop: bool = ..., 

5862 append: bool = ..., 

5863 inplace: Literal[False] = ..., 

5864 verify_integrity: bool = ..., 

5865 ) -> DataFrame: 

5866 ... 

5867 

5868 @overload 

5869 def set_index( 

5870 self, 

5871 keys, 

5872 *, 

5873 drop: bool = ..., 

5874 append: bool = ..., 

5875 inplace: Literal[True], 

5876 verify_integrity: bool = ..., 

5877 ) -> None: 

5878 ... 

5879 

5880 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"]) 

5881 def set_index( 

5882 self, 

5883 keys, 

5884 drop: bool = True, 

5885 append: bool = False, 

5886 inplace: bool = False, 

5887 verify_integrity: bool = False, 

5888 ) -> DataFrame | None: 

5889 """ 

5890 Set the DataFrame index using existing columns. 

5891 

5892 Set the DataFrame index (row labels) using one or more existing 

5893 columns or arrays (of the correct length). The index can replace the 

5894 existing index or expand on it. 

5895 

5896 Parameters 

5897 ---------- 

5898 keys : label or array-like or list of labels/arrays 

5899 This parameter can be either a single column key, a single array of 

5900 the same length as the calling DataFrame, or a list containing an 

5901 arbitrary combination of column keys and arrays. Here, "array" 

5902 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and 

5903 instances of :class:`~collections.abc.Iterator`. 

5904 drop : bool, default True 

5905 Delete columns to be used as the new index. 

5906 append : bool, default False 

5907 Whether to append columns to existing index. 

5908 inplace : bool, default False 

5909 Whether to modify the DataFrame rather than creating a new one. 

5910 verify_integrity : bool, default False 

5911 Check the new index for duplicates. Otherwise defer the check until 

5912 necessary. Setting to False will improve the performance of this 

5913 method. 

5914 

5915 Returns 

5916 ------- 

5917 DataFrame or None 

5918 Changed row labels or None if ``inplace=True``. 

5919 

5920 See Also 

5921 -------- 

5922 DataFrame.reset_index : Opposite of set_index. 

5923 DataFrame.reindex : Change to new indices or expand indices. 

5924 DataFrame.reindex_like : Change to same indices as other DataFrame. 

5925 

5926 Examples 

5927 -------- 

5928 >>> df = pd.DataFrame({'month': [1, 4, 7, 10], 

5929 ... 'year': [2012, 2014, 2013, 2014], 

5930 ... 'sale': [55, 40, 84, 31]}) 

5931 >>> df 

5932 month year sale 

5933 0 1 2012 55 

5934 1 4 2014 40 

5935 2 7 2013 84 

5936 3 10 2014 31 

5937 

5938 Set the index to become the 'month' column: 

5939 

5940 >>> df.set_index('month') 

5941 year sale 

5942 month 

5943 1 2012 55 

5944 4 2014 40 

5945 7 2013 84 

5946 10 2014 31 

5947 

5948 Create a MultiIndex using columns 'year' and 'month': 

5949 

5950 >>> df.set_index(['year', 'month']) 

5951 sale 

5952 year month 

5953 2012 1 55 

5954 2014 4 40 

5955 2013 7 84 

5956 2014 10 31 

5957 

5958 Create a MultiIndex using an Index and a column: 

5959 

5960 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) 

5961 month sale 

5962 year 

5963 1 2012 1 55 

5964 2 2014 4 40 

5965 3 2013 7 84 

5966 4 2014 10 31 

5967 

5968 Create a MultiIndex using two Series: 

5969 

5970 >>> s = pd.Series([1, 2, 3, 4]) 

5971 >>> df.set_index([s, s**2]) 

5972 month year sale 

5973 1 1 1 2012 55 

5974 2 4 4 2014 40 

5975 3 9 7 2013 84 

5976 4 16 10 2014 31 

5977 """ 

5978 inplace = validate_bool_kwarg(inplace, "inplace") 

5979 self._check_inplace_and_allows_duplicate_labels(inplace) 

5980 if not isinstance(keys, list): 

5981 keys = [keys] 

5982 

5983 err_msg = ( 

5984 'The parameter "keys" may be a column key, one-dimensional ' 

5985 "array, or a list containing only valid column keys and " 

5986 "one-dimensional arrays." 

5987 ) 

5988 

5989 missing: list[Hashable] = [] 

5990 for col in keys: 

5991 if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)): 

5992 # arrays are fine as long as they are one-dimensional 

5993 # iterators get converted to list below 

5994 if getattr(col, "ndim", 1) != 1: 

5995 raise ValueError(err_msg) 

5996 else: 

5997 # everything else gets tried as a key; see GH 24969 

5998 try: 

5999 found = col in self.columns 

6000 except TypeError as err: 

6001 raise TypeError( 

6002 f"{err_msg}. Received column of type {type(col)}" 

6003 ) from err 

6004 else: 

6005 if not found: 

6006 missing.append(col) 

6007 

6008 if missing: 

6009 raise KeyError(f"None of {missing} are in the columns") 

6010 

6011 if inplace: 

6012 frame = self 

6013 else: 

6014 frame = self.copy() 

6015 

6016 arrays = [] 

6017 names: list[Hashable] = [] 

6018 if append: 

6019 names = list(self.index.names) 

6020 if isinstance(self.index, MultiIndex): 

6021 for i in range(self.index.nlevels): 

6022 arrays.append(self.index._get_level_values(i)) 

6023 else: 

6024 arrays.append(self.index) 

6025 

6026 to_remove: list[Hashable] = [] 

6027 for col in keys: 

6028 if isinstance(col, MultiIndex): 

6029 for n in range(col.nlevels): 

6030 arrays.append(col._get_level_values(n)) 

6031 names.extend(col.names) 

6032 elif isinstance(col, (Index, Series)): 

6033 # if Index then not MultiIndex (treated above) 

6034 

6035 # error: Argument 1 to "append" of "list" has incompatible type 

6036 # "Union[Index, Series]"; expected "Index" 

6037 arrays.append(col) # type:ignore[arg-type] 

6038 names.append(col.name) 

6039 elif isinstance(col, (list, np.ndarray)): 

6040 # error: Argument 1 to "append" of "list" has incompatible type 

6041 # "Union[List[Any], ndarray]"; expected "Index" 

6042 arrays.append(col) # type: ignore[arg-type] 

6043 names.append(None) 

6044 elif isinstance(col, abc.Iterator): 

6045 # error: Argument 1 to "append" of "list" has incompatible type 

6046 # "List[Any]"; expected "Index" 

6047 arrays.append(list(col)) # type: ignore[arg-type] 

6048 names.append(None) 

6049 # from here, col can only be a column label 

6050 else: 

6051 arrays.append(frame[col]._values) 

6052 names.append(col) 

6053 if drop: 

6054 to_remove.append(col) 

6055 

6056 if len(arrays[-1]) != len(self): 

6057 # check newest element against length of calling frame, since 

6058 # ensure_index_from_sequences would not raise for append=False. 

6059 raise ValueError( 

6060 f"Length mismatch: Expected {len(self)} rows, " 

6061 f"received array of length {len(arrays[-1])}" 

6062 ) 

6063 

6064 index = ensure_index_from_sequences(arrays, names) 

6065 

6066 if verify_integrity and not index.is_unique: 

6067 duplicates = index[index.duplicated()].unique() 

6068 raise ValueError(f"Index has duplicate keys: {duplicates}") 

6069 

6070 # use set to handle duplicate column names gracefully in case of drop 

6071 for c in set(to_remove): 

6072 del frame[c] 

6073 

6074 # clear up memory usage 

6075 index._cleanup() 

6076 

6077 frame.index = index 

6078 

6079 if not inplace: 

6080 return frame 

6081 return None 

6082 
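# --- Editorial sketch (not part of frame.py) ----------------------------
# Two options the docstring above does not demonstrate, on an assumed
# toy frame: drop=False keeps the key as a column, and append=True
# stacks the key under the existing index instead of replacing it
# (yielding a two-level MultiIndex here).
# >>> import pandas as pd
# >>> df = pd.DataFrame({"m": [1, 4], "s": [55, 40]})
# >>> df.set_index("m", drop=False).columns.tolist()
# ['m', 's']
# >>> df.set_index("m", append=True).index.nlevels
# 2
# -------------------------------------------------------------------------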

6083 @overload 

6084 def reset_index( 

6085 self, 

6086 level: IndexLabel = ..., 

6087 *, 

6088 drop: bool = ..., 

6089 inplace: Literal[False] = ..., 

6090 col_level: Hashable = ..., 

6091 col_fill: Hashable = ..., 

6092 allow_duplicates: bool | lib.NoDefault = ..., 

6093 names: Hashable | Sequence[Hashable] = None, 

6094 ) -> DataFrame: 

6095 ... 

6096 

6097 @overload 

6098 def reset_index( 

6099 self, 

6100 level: IndexLabel = ..., 

6101 *, 

6102 drop: bool = ..., 

6103 inplace: Literal[True], 

6104 col_level: Hashable = ..., 

6105 col_fill: Hashable = ..., 

6106 allow_duplicates: bool | lib.NoDefault = ..., 

6107 names: Hashable | Sequence[Hashable] = None, 

6108 ) -> None: 

6109 ... 

6110 

6111 @overload 

6112 def reset_index( 

6113 self, 

6114 level: IndexLabel = ..., 

6115 *, 

6116 drop: bool = ..., 

6117 inplace: bool = ..., 

6118 col_level: Hashable = ..., 

6119 col_fill: Hashable = ..., 

6120 allow_duplicates: bool | lib.NoDefault = ..., 

6121 names: Hashable | Sequence[Hashable] = None, 

6122 ) -> DataFrame | None: 

6123 ... 

6124 

6125 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) 

6126 def reset_index( 

6127 self, 

6128 level: IndexLabel = None, 

6129 drop: bool = False, 

6130 inplace: bool = False, 

6131 col_level: Hashable = 0, 

6132 col_fill: Hashable = "", 

6133 allow_duplicates: bool | lib.NoDefault = lib.no_default, 

6134 names: Hashable | Sequence[Hashable] = None, 

6135 ) -> DataFrame | None: 

6136 """ 

6137 Reset the index, or a level of it. 

6138 

6139 Reset the index of the DataFrame, and use the default one instead. 

6140 If the DataFrame has a MultiIndex, this method can remove one or more 

6141 levels. 

6142 

6143 Parameters 

6144 ---------- 

6145 level : int, str, tuple, or list, default None 

6146 Only remove the given levels from the index. Removes all levels by 

6147 default. 

6148 drop : bool, default False 

6149 Do not try to insert index into dataframe columns. This resets 

6150 the index to the default integer index. 

6151 inplace : bool, default False 

6152 Whether to modify the DataFrame rather than creating a new one. 

6153 col_level : int or str, default 0 

6154 If the columns have multiple levels, determines which level the 

6155 labels are inserted into. By default it is inserted into the first 

6156 level. 

6157 col_fill : object, default '' 

6158 If the columns have multiple levels, determines how the other 

6159 levels are named. If None then the index name is repeated. 

6160 allow_duplicates : bool, optional, default lib.no_default 

6161 Allow duplicate column labels to be created. 

6162 

6163 .. versionadded:: 1.5.0 

6164 

6165 names : int, str or 1-dimensional list, default None 

6166 Using the given string, rename the DataFrame column which contains the 

6167 index data. If the DataFrame has a MultiIndex, this has to be a list or 

6168 tuple with length equal to the number of levels. 

6169 

6170 .. versionadded:: 1.5.0 

6171 

6172 Returns 

6173 ------- 

6174 DataFrame or None 

6175 DataFrame with the new index or None if ``inplace=True``. 

6176 

6177 See Also 

6178 -------- 

6179 DataFrame.set_index : Opposite of reset_index. 

6180 DataFrame.reindex : Change to new indices or expand indices. 

6181 DataFrame.reindex_like : Change to same indices as other DataFrame. 

6182 

6183 Examples 

6184 -------- 

6185 >>> df = pd.DataFrame([('bird', 389.0), 

6186 ... ('bird', 24.0), 

6187 ... ('mammal', 80.5), 

6188 ... ('mammal', np.nan)], 

6189 ... index=['falcon', 'parrot', 'lion', 'monkey'], 

6190 ... columns=('class', 'max_speed')) 

6191 >>> df 

6192 class max_speed 

6193 falcon bird 389.0 

6194 parrot bird 24.0 

6195 lion mammal 80.5 

6196 monkey mammal NaN 

6197 

6198 When we reset the index, the old index is added as a column, and a 

6199 new sequential index is used: 

6200 

6201 >>> df.reset_index() 

6202 index class max_speed 

6203 0 falcon bird 389.0 

6204 1 parrot bird 24.0 

6205 2 lion mammal 80.5 

6206 3 monkey mammal NaN 

6207 

6208 We can use the `drop` parameter to avoid the old index being added as 

6209 a column: 

6210 

6211 >>> df.reset_index(drop=True) 

6212 class max_speed 

6213 0 bird 389.0 

6214 1 bird 24.0 

6215 2 mammal 80.5 

6216 3 mammal NaN 

6217 

6218 You can also use `reset_index` with `MultiIndex`. 

6219 

6220 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), 

6221 ... ('bird', 'parrot'), 

6222 ... ('mammal', 'lion'), 

6223 ... ('mammal', 'monkey')], 

6224 ... names=['class', 'name']) 

6225 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), 

6226 ... ('species', 'type')]) 

6227 >>> df = pd.DataFrame([(389.0, 'fly'), 

6228 ... ( 24.0, 'fly'), 

6229 ... ( 80.5, 'run'), 

6230 ... (np.nan, 'jump')], 

6231 ... index=index, 

6232 ... columns=columns) 

6233 >>> df 

6234 speed species 

6235 max type 

6236 class name 

6237 bird falcon 389.0 fly 

6238 parrot 24.0 fly 

6239 mammal lion 80.5 run 

6240 monkey NaN jump 

6241 

6242 Using the `names` parameter, choose a name for the index column: 

6243 

6244 >>> df.reset_index(names=['classes', 'names']) 

6245 classes names speed species 

6246 max type 

6247 0 bird falcon 389.0 fly 

6248 1 bird parrot 24.0 fly 

6249 2 mammal lion 80.5 run 

6250 3 mammal monkey NaN jump 

6251 

6252 If the index has multiple levels, we can reset a subset of them: 

6253 

6254 >>> df.reset_index(level='class') 

6255 class speed species 

6256 max type 

6257 name 

6258 falcon bird 389.0 fly 

6259 parrot bird 24.0 fly 

6260 lion mammal 80.5 run 

6261 monkey mammal NaN jump 

6262 

6263 If we are not dropping the index, by default, it is placed in the top 

6264 level. We can place it in another level: 

6265 

6266 >>> df.reset_index(level='class', col_level=1) 

6267 speed species 

6268 class max type 

6269 name 

6270 falcon bird 389.0 fly 

6271 parrot bird 24.0 fly 

6272 lion mammal 80.5 run 

6273 monkey mammal NaN jump 

6274 

6275 When the index is inserted under another level, we can specify under 

6276 which one with the parameter `col_fill`: 

6277 

6278 >>> df.reset_index(level='class', col_level=1, col_fill='species') 

6279 species speed species 

6280 class max type 

6281 name 

6282 falcon bird 389.0 fly 

6283 parrot bird 24.0 fly 

6284 lion mammal 80.5 run 

6285 monkey mammal NaN jump 

6286 

6287 If we specify a nonexistent level for `col_fill`, it is created: 

6288 

6289 >>> df.reset_index(level='class', col_level=1, col_fill='genus') 

6290 genus speed species 

6291 class max type 

6292 name 

6293 falcon bird 389.0 fly 

6294 parrot bird 24.0 fly 

6295 lion mammal 80.5 run 

6296 monkey mammal NaN jump 

6297 """ 

6298 inplace = validate_bool_kwarg(inplace, "inplace") 

6299 self._check_inplace_and_allows_duplicate_labels(inplace) 

6300 if inplace: 

6301 new_obj = self 

6302 else: 

6303 new_obj = self.copy(deep=None) 

6304 if allow_duplicates is not lib.no_default: 

6305 allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") 

6306 

6307 new_index = default_index(len(new_obj)) 

6308 if level is not None: 

6309 if not isinstance(level, (tuple, list)): 

6310 level = [level] 

6311 level = [self.index._get_level_number(lev) for lev in level] 

6312 if len(level) < self.index.nlevels: 

6313 new_index = self.index.droplevel(level) 

6314 

6315 if not drop: 

6316 to_insert: Iterable[tuple[Any, Any | None]] 

6317 

6318 default = "index" if "index" not in self else "level_0" 

6319 names = self.index._get_default_index_names(names, default) 

6320 

6321 if isinstance(self.index, MultiIndex): 

6322 to_insert = zip(self.index.levels, self.index.codes) 

6323 else: 

6324 to_insert = ((self.index, None),) 

6325 

6326 multi_col = isinstance(self.columns, MultiIndex) 

6327 for i, (lev, lab) in reversed(list(enumerate(to_insert))): 

6328 if level is not None and i not in level: 

6329 continue 

6330 name = names[i] 

6331 if multi_col: 

6332 col_name = list(name) if isinstance(name, tuple) else [name] 

6333 if col_fill is None: 

6334 if len(col_name) not in (1, self.columns.nlevels): 

6335 raise ValueError( 

6336 "col_fill=None is incompatible " 

6337 f"with incomplete column name {name}" 

6338 ) 

6339 col_fill = col_name[0] 

6340 

6341 lev_num = self.columns._get_level_number(col_level) 

6342 name_lst = [col_fill] * lev_num + col_name 

6343 missing = self.columns.nlevels - len(name_lst) 

6344 name_lst += [col_fill] * missing 

6345 name = tuple(name_lst) 

6346 

6347 # to ndarray and maybe infer different dtype 

6348 level_values = lev._values 

6349 if level_values.dtype == np.object_: 

6350 level_values = lib.maybe_convert_objects(level_values) 

6351 

6352 if lab is not None: 

6353 # if we have the codes, extract the values with a mask 

6354 level_values = algorithms.take( 

6355 level_values, lab, allow_fill=True, fill_value=lev._na_value 

6356 ) 

6357 

6358 new_obj.insert( 

6359 0, 

6360 name, 

6361 level_values, 

6362 allow_duplicates=allow_duplicates, 

6363 ) 

6364 

6365 new_obj.index = new_index 

6366 if not inplace: 

6367 return new_obj 

6368 

6369 return None 

6370 

6371 # ---------------------------------------------------------------------- 

6372 # Reindex-based selection methods 

6373 

6374 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) 

6375 def isna(self) -> DataFrame: 

6376 result = self._constructor(self._mgr.isna(func=isna)) 

6377 return result.__finalize__(self, method="isna") 

6378 

6379 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) 

6380 def isnull(self) -> DataFrame: 

6381 """ 

6382 DataFrame.isnull is an alias for DataFrame.isna. 

6383 """ 

6384 return self.isna() 

6385 

6386 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) 

6387 def notna(self) -> DataFrame: 

6388 return ~self.isna() 

6389 

6390 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) 

6391 def notnull(self) -> DataFrame: 

6392 """ 

6393 DataFrame.notnull is an alias for DataFrame.notna. 

6394 """ 

6395 return ~self.isna() 

6396 
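# --- Editorial sketch (not part of frame.py) ----------------------------
# isna/notna (and their isnull/notnull aliases) are elementwise
# complements; assumed toy data:
# >>> import numpy as np, pandas as pd
# >>> df = pd.DataFrame({"x": [1.0, np.nan]})
# >>> df.isna()
#        x
# 0  False
# 1   True
# >>> df.notna()
#        x
# 0   True
# 1  False
# -------------------------------------------------------------------------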

6397 @overload 

6398 def dropna( 

6399 self, 

6400 *, 

6401 axis: Axis = ..., 

6402 how: str | NoDefault = ..., 

6403 thresh: int | NoDefault = ..., 

6404 subset: IndexLabel = ..., 

6405 inplace: Literal[False] = ..., 

6406 ) -> DataFrame: 

6407 ... 

6408 

6409 @overload 

6410 def dropna( 

6411 self, 

6412 *, 

6413 axis: Axis = ..., 

6414 how: str | NoDefault = ..., 

6415 thresh: int | NoDefault = ..., 

6416 subset: IndexLabel = ..., 

6417 inplace: Literal[True], 

6418 ) -> None: 

6419 ... 

6420 

6421 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) 

6422 def dropna( 

6423 self, 

6424 axis: Axis = 0, 

6425 how: str | NoDefault = no_default, 

6426 thresh: int | NoDefault = no_default, 

6427 subset: IndexLabel = None, 

6428 inplace: bool = False, 

6429 ) -> DataFrame | None: 

6430 """ 

6431 Remove missing values. 

6432 

6433 See the :ref:`User Guide <missing_data>` for more on which values are 

6434 considered missing, and how to work with missing data. 

6435 

6436 Parameters 

6437 ---------- 

6438 axis : {0 or 'index', 1 or 'columns'}, default 0 

6439 Determine whether rows or columns that contain missing values are 

6440 removed. 

6441 

6442 * 0, or 'index' : Drop rows which contain missing values. 

6443 * 1, or 'columns' : Drop columns which contain missing values. 

6444 

6445 .. versionchanged:: 1.0.0 

6446 

6447 Passing a tuple or list to drop on multiple axes is no longer 

6448 supported; only a single axis is allowed. 

6449 

6450 how : {'any', 'all'}, default 'any' 

6451 Determine whether a row or column is removed from the DataFrame 

6452 when it has at least one NA value or all NA values. 

6453 

6454 * 'any' : If any NA values are present, drop that row or column. 

6455 * 'all' : If all values are NA, drop that row or column. 

6456 

6457 thresh : int, optional 

6458 Require that many non-NA values. Cannot be combined with how. 

6459 subset : column label or sequence of labels, optional 

6460 Labels along other axis to consider, e.g. if you are dropping rows 

6461 these would be a list of columns to include. 

6462 inplace : bool, default False 

6463 Whether to modify the DataFrame rather than creating a new one. 

6464 

6465 Returns 

6466 ------- 

6467 DataFrame or None 

6468 DataFrame with NA entries dropped from it or None if ``inplace=True``. 

6469 

6470 See Also 

6471 -------- 

6472 DataFrame.isna: Indicate missing values. 

6473 DataFrame.notna : Indicate existing (non-missing) values. 

6474 DataFrame.fillna : Replace missing values. 

6475 Series.dropna : Drop missing values. 

6476 Index.dropna : Drop missing indices. 

6477 

6478 Examples 

6479 -------- 

6480 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], 

6481 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], 

6482 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), 

6483 ... pd.NaT]}) 

6484 >>> df 

6485 name toy born 

6486 0 Alfred NaN NaT 

6487 1 Batman Batmobile 1940-04-25 

6488 2 Catwoman Bullwhip NaT 

6489 

6490 Drop the rows where at least one element is missing. 

6491 

6492 >>> df.dropna() 

6493 name toy born 

6494 1 Batman Batmobile 1940-04-25 

6495 

6496 Drop the columns where at least one element is missing. 

6497 

6498 >>> df.dropna(axis='columns') 

6499 name 

6500 0 Alfred 

6501 1 Batman 

6502 2 Catwoman 

6503 

6504 Drop the rows where all elements are missing. 

6505 

6506 >>> df.dropna(how='all') 

6507 name toy born 

6508 0 Alfred NaN NaT 

6509 1 Batman Batmobile 1940-04-25 

6510 2 Catwoman Bullwhip NaT 

6511 

6512 Keep only the rows with at least 2 non-NA values. 

6513 

6514 >>> df.dropna(thresh=2) 

6515 name toy born 

6516 1 Batman Batmobile 1940-04-25 

6517 2 Catwoman Bullwhip NaT 

6518 

6519 Define in which columns to look for missing values. 

6520 

6521 >>> df.dropna(subset=['name', 'toy']) 

6522 name toy born 

6523 1 Batman Batmobile 1940-04-25 

6524 2 Catwoman Bullwhip NaT 

6525 

6526 Keep the DataFrame with valid entries in the same variable. 

6527 

6528 >>> df.dropna(inplace=True) 

6529 >>> df 

6530 name toy born 

6531 1 Batman Batmobile 1940-04-25 

6532 """ 

6533 if (how is not no_default) and (thresh is not no_default): 

6534 raise TypeError( 

6535 "You cannot set both the how and thresh arguments at the same time." 

6536 ) 

6537 

6538 if how is no_default: 

6539 how = "any" 

6540 

6541 inplace = validate_bool_kwarg(inplace, "inplace") 

6542 if isinstance(axis, (tuple, list)): 

6543 # GH20987 

6544 raise TypeError("supplying multiple axes to axis is no longer supported.") 

6545 

6546 axis = self._get_axis_number(axis) 

6547 agg_axis = 1 - axis 

6548 

6549 agg_obj = self 

6550 if subset is not None: 

6551 # subset needs to be list 

6552 if not is_list_like(subset): 

6553 subset = [subset] 

6554 ax = self._get_axis(agg_axis) 

6555 indices = ax.get_indexer_for(subset) 

6556 check = indices == -1 

6557 if check.any(): 

6558 raise KeyError(np.array(subset)[check].tolist()) 

6559 agg_obj = self.take(indices, axis=agg_axis) 

6560 

6561 if thresh is not no_default: 

6562 count = agg_obj.count(axis=agg_axis) 

6563 mask = count >= thresh 

6564 elif how == "any": 

6565 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]' 

6566 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False) 

6567 elif how == "all": 

6568 # faster equivalent to 'agg_obj.count(agg_axis) > 0' 

6569 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) 

6570 else: 

6571 raise ValueError(f"invalid how option: {how}") 

6572 

6573 if np.all(mask): 

6574 result = self.copy() 

6575 else: 

6576 result = self.loc(axis=axis)[mask] 

6577 

6578 if not inplace: 

6579 return result 

6580 self._update_inplace(result) 

6581 return None 

6582 

6583 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"]) 

6584 def drop_duplicates( 

6585 self, 

6586 subset: Hashable | Sequence[Hashable] | None = None, 

6587 keep: Literal["first", "last", False] = "first", 

6588 inplace: bool = False, 

6589 ignore_index: bool = False, 

6590 ) -> DataFrame | None: 

6591 """ 

6592 Return DataFrame with duplicate rows removed. 

6593 

6594 Considering certain columns is optional. Indexes, including time 

6595 indexes, are ignored. 

6596 

6597 Parameters 

6598 ---------- 

6599 subset : column label or sequence of labels, optional 

6600 Only consider certain columns for identifying duplicates; by 

6601 default, use all of the columns. 

6602 keep : {'first', 'last', False}, default 'first' 

6603 Determines which duplicates (if any) to keep. 

6604 - ``first`` : Drop duplicates except for the first occurrence. 

6605 - ``last`` : Drop duplicates except for the last occurrence. 

6606 - False : Drop all duplicates. 

6607 inplace : bool, default False 

6608 Whether to modify the DataFrame rather than creating a new one. 

6609 ignore_index : bool, default False 

6610 If True, the resulting axis will be labeled 0, 1, …, n - 1. 

6611 

6612 .. versionadded:: 1.0.0 

6613 

6614 Returns 

6615 ------- 

6616 DataFrame or None 

6617 DataFrame with duplicates removed or None if ``inplace=True``. 

6618 

6619 See Also 

6620 -------- 

6621 DataFrame.value_counts: Count unique combinations of columns. 

6622 

6623 Examples 

6624 -------- 

6625 Consider a dataset containing ramen ratings. 

6626 

6627 >>> df = pd.DataFrame({ 

6628 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 

6629 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 

6630 ... 'rating': [4, 4, 3.5, 15, 5] 

6631 ... }) 

6632 >>> df 

6633 brand style rating 

6634 0 Yum Yum cup 4.0 

6635 1 Yum Yum cup 4.0 

6636 2 Indomie cup 3.5 

6637 3 Indomie pack 15.0 

6638 4 Indomie pack 5.0 

6639 

6640 By default, it removes duplicate rows based on all columns. 

6641 

6642 >>> df.drop_duplicates() 

6643 brand style rating 

6644 0 Yum Yum cup 4.0 

6645 2 Indomie cup 3.5 

6646 3 Indomie pack 15.0 

6647 4 Indomie pack 5.0 

6648 

6649 To remove duplicates on specific column(s), use ``subset``. 

6650 

6651 >>> df.drop_duplicates(subset=['brand']) 

6652 brand style rating 

6653 0 Yum Yum cup 4.0 

6654 2 Indomie cup 3.5 

6655 

6656 To remove duplicates and keep last occurrences, use ``keep``. 

6657 

6658 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') 

6659 brand style rating 

6660 1 Yum Yum cup 4.0 

6661 2 Indomie cup 3.5 

6662 4 Indomie pack 5.0 

6663 """ 

6664 if self.empty: 

6665 return self.copy() 

6666 

6667 inplace = validate_bool_kwarg(inplace, "inplace") 

6668 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") 

6669 duplicated = self.duplicated(subset, keep=keep) 

6670 

6671 result = self[-duplicated] 

6672 if ignore_index: 

6673 result.index = default_index(len(result)) 

6674 

6675 if inplace: 

6676 self._update_inplace(result) 

6677 return None 

6678 else: 

6679 return result 

6680 

6681 def duplicated( 

6682 self, 

6683 subset: Hashable | Sequence[Hashable] | None = None, 

6684 keep: Literal["first", "last", False] = "first", 

6685 ) -> Series: 

6686 """ 

6687 Return boolean Series denoting duplicate rows. 

6688 

6689 Considering certain columns is optional. 

6690 

6691 Parameters 

6692 ---------- 

6693 subset : column label or sequence of labels, optional 

6694 Only consider certain columns for identifying duplicates; by 

6695 default, use all of the columns. 

6696 keep : {'first', 'last', False}, default 'first' 

6697 Determines which duplicates (if any) to mark. 

6698 

6699 - ``first`` : Mark duplicates as ``True`` except for the first occurrence. 

6700 - ``last`` : Mark duplicates as ``True`` except for the last occurrence. 

6701 - False : Mark all duplicates as ``True``. 

6702 

6703 Returns 

6704 ------- 

6705 Series 

6706 Boolean series indicating which rows are duplicated. 

6707 

6708 See Also 

6709 -------- 

6710 Index.duplicated : Equivalent method on index. 

6711 Series.duplicated : Equivalent method on Series. 

6712 Series.drop_duplicates : Remove duplicate values from Series. 

6713 DataFrame.drop_duplicates : Remove duplicate values from DataFrame. 

6714 

6715 Examples 

6716 -------- 

6717 Consider a dataset containing ramen ratings. 

6718 

6719 >>> df = pd.DataFrame({ 

6720 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 

6721 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 

6722 ... 'rating': [4, 4, 3.5, 15, 5] 

6723 ... }) 

6724 >>> df 

6725 brand style rating 

6726 0 Yum Yum cup 4.0 

6727 1 Yum Yum cup 4.0 

6728 2 Indomie cup 3.5 

6729 3 Indomie pack 15.0 

6730 4 Indomie pack 5.0 

6731 

6732 By default, for each set of duplicated values, the first occurrence 

6733 is set to False and all others to True. 

6734 

6735 >>> df.duplicated() 

6736 0 False 

6737 1 True 

6738 2 False 

6739 3 False 

6740 4 False 

6741 dtype: bool 

6742 

6743 By using 'last', the last occurrence of each set of duplicated values 

6744 is set to False and all others to True. 

6745 

6746 >>> df.duplicated(keep='last') 

6747 0 True 

6748 1 False 

6749 2 False 

6750 3 False 

6751 4 False 

6752 dtype: bool 

6753 

6754 By setting ``keep`` to False, all duplicates are marked True. 

6755 

6756 >>> df.duplicated(keep=False) 

6757 0 True 

6758 1 True 

6759 2 False 

6760 3 False 

6761 4 False 

6762 dtype: bool 

6763 

6764 To find duplicates on specific column(s), use ``subset``. 

6765 

6766 >>> df.duplicated(subset=['brand']) 

6767 0 False 

6768 1 True 

6769 2 False 

6770 3 True 

6771 4 True 

6772 dtype: bool 

6773 """ 

6774 

6775 if self.empty: 

6776 return self._constructor_sliced(dtype=bool) 

6777 

6778 def f(vals) -> tuple[np.ndarray, int]: 

6779 labels, shape = algorithms.factorize(vals, size_hint=len(self)) 

6780 return labels.astype("i8", copy=False), len(shape) 

6781 

6782 if subset is None: 

6783 # https://github.com/pandas-dev/pandas/issues/28770 

6784 # Incompatible types in assignment (expression has type "Index", variable 

6785 # has type "Sequence[Any]") 

6786 subset = self.columns # type: ignore[assignment] 

6787 elif ( 

6788 not np.iterable(subset) 

6789 or isinstance(subset, str) 

6790 or isinstance(subset, tuple) 

6791 and subset in self.columns 

6792 ): 

6793 subset = (subset,) 

6794 

6795 # needed for mypy since can't narrow types using np.iterable 

6796 subset = cast(Sequence, subset) 

6797 

6798 # Verify all columns in subset exist in the queried dataframe 

6799 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a 

6800 # key that doesn't exist. 

6801 diff = set(subset) - set(self.columns) 

6802 if diff: 

6803 raise KeyError(Index(diff)) 

6804 

6805 if len(subset) == 1 and self.columns.is_unique: 

6806 # GH#45236 This is faster than get_group_index below 

6807 result = self[subset[0]].duplicated(keep) 

6808 result.name = None 

6809 else: 

6810 vals = (col.values for name, col in self.items() if name in subset) 

6811 labels, shape = map(list, zip(*map(f, vals))) 

6812 

6813 ids = get_group_index( 

6814 labels, 

6815 # error: Argument 1 to "tuple" has incompatible type "List[_T]"; 

6816 # expected "Iterable[int]" 

6817 tuple(shape), # type: ignore[arg-type] 

6818 sort=False, 

6819 xnull=False, 

6820 ) 

6821 result = self._constructor_sliced(duplicated(ids, keep), index=self.index) 

6822 return result.__finalize__(self, method="duplicated") 

6823 

6824 # ---------------------------------------------------------------------- 

6825 # Sorting 

6826 # error: Signature of "sort_values" incompatible with supertype "NDFrame" 

6827 @overload # type: ignore[override] 

6828 def sort_values( 

6829 self, 

6830 by: IndexLabel, 

6831 *, 

6832 axis: Axis = ..., 

6833 ascending=..., 

6834 inplace: Literal[False] = ..., 

6835 kind: str = ..., 

6836 na_position: str = ..., 

6837 ignore_index: bool = ..., 

6838 key: ValueKeyFunc = ..., 

6839 ) -> DataFrame: 

6840 ... 

6841 

6842 @overload 

6843 def sort_values( 

6844 self, 

6845 by: IndexLabel, 

6846 *, 

6847 axis: Axis = ..., 

6848 ascending=..., 

6849 inplace: Literal[True], 

6850 kind: str = ..., 

6851 na_position: str = ..., 

6852 ignore_index: bool = ..., 

6853 key: ValueKeyFunc = ..., 

6854 ) -> None: 

6855 ... 

6856 

6857 # TODO: Just move the sort_values doc here. 

6858 # error: Signature of "sort_values" incompatible with supertype "NDFrame" 

6859 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "by"]) 

6860 @Substitution(**_shared_doc_kwargs) 

6861 @Appender(NDFrame.sort_values.__doc__) 

6862 def sort_values( # type: ignore[override] 

6863 self, 

6864 by: IndexLabel, 

6865 axis: Axis = 0, 

6866 ascending: bool | list[bool] | tuple[bool, ...] = True, 

6867 inplace: bool = False, 

6868 kind: str = "quicksort", 

6869 na_position: str = "last", 

6870 ignore_index: bool = False, 

6871 key: ValueKeyFunc = None, 

6872 ) -> DataFrame | None: 

6873 inplace = validate_bool_kwarg(inplace, "inplace") 

6874 axis = self._get_axis_number(axis) 

6875 ascending = validate_ascending(ascending) 

6876 if not isinstance(by, list): 

6877 by = [by] 

6878 # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]"; 

6879 # expected "Sized" 

6880 if is_sequence(ascending) and ( 

6881 len(by) != len(ascending) # type: ignore[arg-type] 

6882 ): 

6883 # error: Argument 1 to "len" has incompatible type "Union[bool, 

6884 # List[bool]]"; expected "Sized" 

6885 raise ValueError( 

6886 f"Length of ascending ({len(ascending)})" # type: ignore[arg-type] 

6887 f" != length of by ({len(by)})" 

6888 ) 

6889 if len(by) > 1: 

6890 

6891 keys = [self._get_label_or_level_values(x, axis=axis) for x in by] 

6892 

6893 # need to rewrap columns in Series to apply key function 

6894 if key is not None: 

6895 # error: List comprehension has incompatible type List[Series]; 

6896 # expected List[ndarray] 

6897 keys = [ 

6898 Series(k, name=name) # type: ignore[misc] 

6899 for (k, name) in zip(keys, by) 

6900 ] 

6901 

6902 indexer = lexsort_indexer( 

6903 keys, orders=ascending, na_position=na_position, key=key 

6904 ) 

6905 elif len(by): 

6906 # len(by) == 1 

6907 

6908 by = by[0] 

6909 k = self._get_label_or_level_values(by, axis=axis) 

6910 

6911 # need to rewrap column in Series to apply key function 

6912 if key is not None: 

6913 # error: Incompatible types in assignment (expression has type 

6914 # "Series", variable has type "ndarray") 

6915 k = Series(k, name=by) # type: ignore[assignment] 

6916 

6917 if isinstance(ascending, (tuple, list)): 

6918 ascending = ascending[0] 

6919 

6920 indexer = nargsort( 

6921 k, kind=kind, ascending=ascending, na_position=na_position, key=key 

6922 ) 

6923 else: 

6924 return self.copy() 

6925 

6926 new_data = self._mgr.take( 

6927 indexer, axis=self._get_block_manager_axis(axis), verify=False 

6928 ) 

6929 

6930 if ignore_index: 

6931 new_data.set_axis( 

6932 self._get_block_manager_axis(axis), default_index(len(indexer)) 

6933 ) 

6934 

6935 result = self._constructor(new_data) 

6936 if inplace: 

6937 return self._update_inplace(result) 

6938 else: 

6939 return result.__finalize__(self, method="sort_values") 

6940 
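# --- Editorial sketch (not part of frame.py) ----------------------------
# Multi-key sorting goes through lexsort_indexer above, single-key
# through nargsort; a hedged example with per-key ascending flags on
# assumed toy data:
# >>> import pandas as pd
# >>> df = pd.DataFrame({"g": ["b", "a", "b"], "v": [2, 3, 1]})
# >>> df.sort_values(["g", "v"], ascending=[True, False])
#    g  v
# 1  a  3
# 0  b  2
# 2  b  1
# -------------------------------------------------------------------------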

6941 @overload 

6942 def sort_index( 

6943 self, 

6944 *, 

6945 axis: Axis = ..., 

6946 level: IndexLabel = ..., 

6947 ascending: bool | Sequence[bool] = ..., 

6948 inplace: Literal[True], 

6949 kind: SortKind = ..., 

6950 na_position: NaPosition = ..., 

6951 sort_remaining: bool = ..., 

6952 ignore_index: bool = ..., 

6953 key: IndexKeyFunc = ..., 

6954 ) -> None: 

6955 ... 

6956 

6957 @overload 

6958 def sort_index( 

6959 self, 

6960 *, 

6961 axis: Axis = ..., 

6962 level: IndexLabel = ..., 

6963 ascending: bool | Sequence[bool] = ..., 

6964 inplace: Literal[False] = ..., 

6965 kind: SortKind = ..., 

6966 na_position: NaPosition = ..., 

6967 sort_remaining: bool = ..., 

6968 ignore_index: bool = ..., 

6969 key: IndexKeyFunc = ..., 

6970 ) -> DataFrame: 

6971 ... 

6972 

6973 @overload 

6974 def sort_index( 

6975 self, 

6976 *, 

6977 axis: Axis = ..., 

6978 level: IndexLabel = ..., 

6979 ascending: bool | Sequence[bool] = ..., 

6980 inplace: bool = ..., 

6981 kind: SortKind = ..., 

6982 na_position: NaPosition = ..., 

6983 sort_remaining: bool = ..., 

6984 ignore_index: bool = ..., 

6985 key: IndexKeyFunc = ..., 

6986 ) -> DataFrame | None: 

6987 ... 

6988 

6989 # error: Signature of "sort_index" incompatible with supertype "NDFrame" 

6990 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) 

6991 def sort_index( # type: ignore[override] 

6992 self, 

6993 axis: Axis = 0, 

6994 level: IndexLabel = None, 

6995 ascending: bool | Sequence[bool] = True, 

6996 inplace: bool = False, 

6997 kind: SortKind = "quicksort", 

6998 na_position: NaPosition = "last", 

6999 sort_remaining: bool = True, 

7000 ignore_index: bool = False, 

7001 key: IndexKeyFunc = None, 

7002 ) -> DataFrame | None: 

7003 """ 

7004 Sort object by labels (along an axis). 

7005 

7006 Returns a new DataFrame sorted by label if `inplace` argument is 

7007 ``False``, otherwise updates the original DataFrame and returns None. 

7008 

7009 Parameters 

7010 ---------- 

7011 axis : {0 or 'index', 1 or 'columns'}, default 0 

7012 The axis along which to sort. The value 0 identifies the rows, 

7013 and 1 identifies the columns. 

7014 level : int or level name or list of ints or list of level names 

7015 If not None, sort on values in specified index level(s). 

7016 ascending : bool or list-like of bools, default True 

7017 Sort ascending vs. descending. When the index is a MultiIndex the 

7018 sort direction can be controlled for each level individually. 

7019 inplace : bool, default False 

7020 Whether to modify the DataFrame rather than creating a new one. 

7021 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' 

7022 Choice of sorting algorithm. See also :func:`numpy.sort` for more 

7023 information. `mergesort` and `stable` are the only stable algorithms. For 

7024 DataFrames, this option is only applied when sorting on a single 

7025 column or label. 

7026 na_position : {'first', 'last'}, default 'last' 

7027 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. 

7028 Not implemented for MultiIndex. 

7029 sort_remaining : bool, default True 

7030 If True and sorting by level and the index is a MultiIndex, sort by the 

7031 other levels too (in order) after sorting by the specified level. 

7032 ignore_index : bool, default False 

7033 If True, the resulting axis will be labeled 0, 1, …, n - 1. 

7034 

7035 .. versionadded:: 1.0.0 

7036 

7037 key : callable, optional 

7038 If not None, apply the key function to the index values 

7039 before sorting. This is similar to the `key` argument in the 

7040 builtin :meth:`sorted` function, with the notable difference that 

7041 this `key` function should be *vectorized*. It should expect an 

7042 ``Index`` and return an ``Index`` of the same shape. For MultiIndex 

7043 inputs, the key is applied *per level*. 

7044 

7045 .. versionadded:: 1.1.0 

7046 

7047 Returns 

7048 ------- 

7049 DataFrame or None 

7050 The original DataFrame sorted by the labels or None if ``inplace=True``. 

7051 

7052 See Also 

7053 -------- 

7054 Series.sort_index : Sort Series by the index. 

7055 DataFrame.sort_values : Sort DataFrame by the value. 

7056 Series.sort_values : Sort Series by the value. 

7057 

7058 Examples 

7059 -------- 

7060 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], 

7061 ... columns=['A']) 

7062 >>> df.sort_index() 

7063 A 

7064 1 4 

7065 29 2 

7066 100 1 

7067 150 5 

7068 234 3 

7069 

7070 By default, it sorts in ascending order; to sort in descending order, 

7071 use ``ascending=False``. 

7072 

7073 >>> df.sort_index(ascending=False) 

7074 A 

7075 234 3 

7076 150 5 

7077 100 1 

7078 29 2 

7079 1 4 

7080 

7081 A key function can be specified which is applied to the index before 

7082 sorting. For a ``MultiIndex`` this is applied to each level separately. 

7083 

7084 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) 

7085 >>> df.sort_index(key=lambda x: x.str.lower()) 

7086 a 

7087 A 1 

7088 b 2 

7089 C 3 

7090 d 4 

7091 """ 

7092 return super().sort_index( 

7093 axis=axis, 

7094 level=level, 

7095 ascending=ascending, 

7096 inplace=inplace, 

7097 kind=kind, 

7098 na_position=na_position, 

7099 sort_remaining=sort_remaining, 

7100 ignore_index=ignore_index, 

7101 key=key, 

7102 ) 

7103 
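# --- Editor's sketch (not pandas source): the per-level `key` behaviour
# documented above. For a MultiIndex, the callable receives each level as an
# Index in turn, so it must handle every level's dtype. Hypothetical data.
import pandas as pd

mi = pd.MultiIndex.from_tuples([("b", 2), ("A", 1), ("a", 3)])
df = pd.DataFrame({"v": [10, 20, 30]}, index=mi)

# Lower-case the string level; pass the integer level through unchanged.
out = df.sort_index(
    key=lambda level: level.str.lower() if level.dtype == object else level
)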

7104 def value_counts( 

7105 self, 

7106 subset: Sequence[Hashable] | None = None, 

7107 normalize: bool = False, 

7108 sort: bool = True, 

7109 ascending: bool = False, 

7110 dropna: bool = True, 

7111 ) -> Series: 

7112 """ 

7113 Return a Series containing counts of unique rows in the DataFrame. 

7114 

7115 .. versionadded:: 1.1.0 

7116 

7117 Parameters 

7118 ---------- 

7119 subset : list-like, optional 

7120 Columns to use when counting unique combinations. 

7121 normalize : bool, default False 

7122 Return proportions rather than frequencies. 

7123 sort : bool, default True 

7124 Sort by frequencies. 

7125 ascending : bool, default False 

7126 Sort in ascending order. 

7127 dropna : bool, default True 

7128 Don’t include counts of rows that contain NA values. 

7129 

7130 .. versionadded:: 1.3.0 

7131 

7132 Returns 

7133 ------- 

7134 Series 

7135 

7136 See Also 

7137 -------- 

7138 Series.value_counts: Equivalent method on Series. 

7139 

7140 Notes 

7141 ----- 

7142 The returned Series will have a MultiIndex with one level per input 

7143 column. By default, rows that contain any NA values are omitted from 

7144 the result. By default, the resulting Series will be in descending 

7145 order so that the first element is the most frequently-occurring row. 

7146 

7147 Examples 

7148 -------- 

7149 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], 

7150 ... 'num_wings': [2, 0, 0, 0]}, 

7151 ... index=['falcon', 'dog', 'cat', 'ant']) 

7152 >>> df 

7153 num_legs num_wings 

7154 falcon 2 2 

7155 dog 4 0 

7156 cat 4 0 

7157 ant 6 0 

7158 

7159 >>> df.value_counts() 

7160 num_legs num_wings 

7161 4 0 2 

7162 2 2 1 

7163 6 0 1 

7164 dtype: int64 

7165 

7166 >>> df.value_counts(sort=False) 

7167 num_legs num_wings 

7168 2 2 1 

7169 4 0 2 

7170 6 0 1 

7171 dtype: int64 

7172 

7173 >>> df.value_counts(ascending=True) 

7174 num_legs num_wings 

7175 2 2 1 

7176 6 0 1 

7177 4 0 2 

7178 dtype: int64 

7179 

7180 >>> df.value_counts(normalize=True) 

7181 num_legs num_wings 

7182 4 0 0.50 

7183 2 2 0.25 

7184 6 0 0.25 

7185 dtype: float64 

7186 

7187 With `dropna` set to `False` we can also count rows with NA values. 

7188 

7189 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], 

7190 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) 

7191 >>> df 

7192 first_name middle_name 

7193 0 John Smith 

7194 1 Anne <NA> 

7195 2 John <NA> 

7196 3 Beth Louise 

7197 

7198 >>> df.value_counts() 

7199 first_name middle_name 

7200 Beth Louise 1 

7201 John Smith 1 

7202 dtype: int64 

7203 

7204 >>> df.value_counts(dropna=False) 

7205 first_name middle_name 

7206 Anne NaN 1 

7207 Beth Louise 1 

7208 John Smith 1 

7209 NaN 1 

7210 dtype: int64 

7211 """ 

7212 if subset is None: 

7213 subset = self.columns.tolist() 

7214 

7215 counts = self.groupby(subset, dropna=dropna).grouper.size() 

7216 

7217 if sort: 

7218 counts = counts.sort_values(ascending=ascending) 

7219 if normalize: 

7220 counts /= counts.sum() 

7221 

7222 # Force MultiIndex for single column 

7223 if len(subset) == 1: 

7224 counts.index = MultiIndex.from_arrays( 

7225 [counts.index], names=[counts.index.name] 

7226 ) 

7227 

7228 return counts 

7229 
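# --- Editor's sketch (not pandas source): what the implementation above does,
# restated with public API only. Hypothetical single-column frame.
import pandas as pd

df = pd.DataFrame({"x": ["a", "a", "b"]})

# groupby(...).size() is the core of the count; sort/normalize are optional.
counts = df.groupby(["x"], dropna=True).size().sort_values(ascending=False)
proportions = counts / counts.sum()  # what normalize=True computes

# Even a single-column subset yields a (one-level) MultiIndex, matching the
# "Force MultiIndex for single column" step above.
assert isinstance(df.value_counts().index, pd.MultiIndex)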

7230 def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: 

7231 """ 

7232 Return the first `n` rows ordered by `columns` in descending order. 

7233 

7234 Return the first `n` rows with the largest values in `columns`, in 

7235 descending order. The columns that are not specified are returned as 

7236 well, but not used for ordering. 

7237 

7238 This method is equivalent to 

7239 ``df.sort_values(columns, ascending=False).head(n)``, but more 

7240 performant. 

7241 

7242 Parameters 

7243 ---------- 

7244 n : int 

7245 Number of rows to return. 

7246 columns : label or list of labels 

7247 Column label(s) to order by. 

7248 keep : {'first', 'last', 'all'}, default 'first' 

7249 Where there are duplicate values: 

7250 

7251 - ``first`` : prioritize the first occurrence(s) 

7252 - ``last`` : prioritize the last occurrence(s) 

7253 - ``all`` : do not drop any duplicates, even if it means 

7254 selecting more than `n` items. 

7255 

7256 Returns 

7257 ------- 

7258 DataFrame 

7259 The first `n` rows ordered by the given columns in descending 

7260 order. 

7261 

7262 See Also 

7263 -------- 

7264 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in 

7265 ascending order. 

7266 DataFrame.sort_values : Sort DataFrame by the values. 

7267 DataFrame.head : Return the first `n` rows without re-ordering. 

7268 

7269 Notes 

7270 ----- 

7271 This function cannot be used with all column types. For example, when 

7272 specifying columns with `object` or `category` dtypes, ``TypeError`` is 

7273 raised. 

7274 

7275 Examples 

7276 -------- 

7277 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, 

7278 ... 434000, 434000, 337000, 11300, 

7279 ... 11300, 11300], 

7280 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, 

7281 ... 17036, 182, 38, 311], 

7282 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", 

7283 ... "IS", "NR", "TV", "AI"]}, 

7284 ... index=["Italy", "France", "Malta", 

7285 ... "Maldives", "Brunei", "Iceland", 

7286 ... "Nauru", "Tuvalu", "Anguilla"]) 

7287 >>> df 

7288 population GDP alpha-2 

7289 Italy 59000000 1937894 IT 

7290 France 65000000 2583560 FR 

7291 Malta 434000 12011 MT 

7292 Maldives 434000 4520 MV 

7293 Brunei 434000 12128 BN 

7294 Iceland 337000 17036 IS 

7295 Nauru 11300 182 NR 

7296 Tuvalu 11300 38 TV 

7297 Anguilla 11300 311 AI 

7298 

7299 In the following example, we will use ``nlargest`` to select the three 

7300 rows having the largest values in column "population". 

7301 

7302 >>> df.nlargest(3, 'population') 

7303 population GDP alpha-2 

7304 France 65000000 2583560 FR 

7305 Italy 59000000 1937894 IT 

7306 Malta 434000 12011 MT 

7307 

7308 When using ``keep='last'``, ties are resolved in reverse order: 

7309 

7310 >>> df.nlargest(3, 'population', keep='last') 

7311 population GDP alpha-2 

7312 France 65000000 2583560 FR 

7313 Italy 59000000 1937894 IT 

7314 Brunei 434000 12128 BN 

7315 

7316 When using ``keep='all'``, all duplicate items are maintained: 

7317 

7318 >>> df.nlargest(3, 'population', keep='all') 

7319 population GDP alpha-2 

7320 France 65000000 2583560 FR 

7321 Italy 59000000 1937894 IT 

7322 Malta 434000 12011 MT 

7323 Maldives 434000 4520 MV 

7324 Brunei 434000 12128 BN 

7325 

7326 To order by the largest values in column "population" and then "GDP", 

7327 we can specify multiple columns like in the next example. 

7328 

7329 >>> df.nlargest(3, ['population', 'GDP']) 

7330 population GDP alpha-2 

7331 France 65000000 2583560 FR 

7332 Italy 59000000 1937894 IT 

7333 Brunei 434000 12128 BN 

7334 """ 

7335 return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() 

7336 
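# --- Editor's sketch (not pandas source): checking the docstring's claim that
# nlargest is equivalent to a descending sort followed by head(). Hypothetical
# data with distinct values, so tie-breaking rules do not matter here.
import pandas as pd

df = pd.DataFrame({"population": [100, 300, 200]})

fast = df.nlargest(2, "population")
slow = df.sort_values("population", ascending=False).head(2)
assert fast.equals(slow)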

7337 def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: 

7338 """ 

7339 Return the first `n` rows ordered by `columns` in ascending order. 

7340 

7341 Return the first `n` rows with the smallest values in `columns`, in 

7342 ascending order. The columns that are not specified are returned as 

7343 well, but not used for ordering. 

7344 

7345 This method is equivalent to 

7346 ``df.sort_values(columns, ascending=True).head(n)``, but more 

7347 performant. 

7348 

7349 Parameters 

7350 ---------- 

7351 n : int 

7352 Number of items to retrieve. 

7353 columns : list or str 

7354 Column name or names to order by. 

7355 keep : {'first', 'last', 'all'}, default 'first' 

7356 Where there are duplicate values: 

7357 

7358 - ``first`` : take the first occurrence. 

7359 - ``last`` : take the last occurrence. 

7360 - ``all`` : do not drop any duplicates, even if it means 

7361 selecting more than `n` items. 

7362 

7363 Returns 

7364 ------- 

7365 DataFrame 

7366 

7367 See Also 

7368 -------- 

7369 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in 

7370 descending order. 

7371 DataFrame.sort_values : Sort DataFrame by the values. 

7372 DataFrame.head : Return the first `n` rows without re-ordering. 

7373 

7374 Examples 

7375 -------- 

7376 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, 

7377 ... 434000, 434000, 337000, 337000, 

7378 ... 11300, 11300], 

7379 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, 

7380 ... 17036, 182, 38, 311], 

7381 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", 

7382 ... "IS", "NR", "TV", "AI"]}, 

7383 ... index=["Italy", "France", "Malta", 

7384 ... "Maldives", "Brunei", "Iceland", 

7385 ... "Nauru", "Tuvalu", "Anguilla"]) 

7386 >>> df 

7387 population GDP alpha-2 

7388 Italy 59000000 1937894 IT 

7389 France 65000000 2583560 FR 

7390 Malta 434000 12011 MT 

7391 Maldives 434000 4520 MV 

7392 Brunei 434000 12128 BN 

7393 Iceland 337000 17036 IS 

7394 Nauru 337000 182 NR 

7395 Tuvalu 11300 38 TV 

7396 Anguilla 11300 311 AI 

7397 

7398 In the following example, we will use ``nsmallest`` to select the 

7399 three rows having the smallest values in column "population". 

7400 

7401 >>> df.nsmallest(3, 'population') 

7402 population GDP alpha-2 

7403 Tuvalu 11300 38 TV 

7404 Anguilla 11300 311 AI 

7405 Iceland 337000 17036 IS 

7406 

7407 When using ``keep='last'``, ties are resolved in reverse order: 

7408 

7409 >>> df.nsmallest(3, 'population', keep='last') 

7410 population GDP alpha-2 

7411 Anguilla 11300 311 AI 

7412 Tuvalu 11300 38 TV 

7413 Nauru 337000 182 NR 

7414 

7415 When using ``keep='all'``, all duplicate items are maintained: 

7416 

7417 >>> df.nsmallest(3, 'population', keep='all') 

7418 population GDP alpha-2 

7419 Tuvalu 11300 38 TV 

7420 Anguilla 11300 311 AI 

7421 Iceland 337000 17036 IS 

7422 Nauru 337000 182 NR 

7423 

7424 To order by the smallest values in column "population" and then "GDP", we can 

7425 specify multiple columns like in the next example. 

7426 

7427 >>> df.nsmallest(3, ['population', 'GDP']) 

7428 population GDP alpha-2 

7429 Tuvalu 11300 38 TV 

7430 Anguilla 11300 311 AI 

7431 Nauru 337000 182 NR 

7432 """ 

7433 return algorithms.SelectNFrame( 

7434 self, n=n, keep=keep, columns=columns 

7435 ).nsmallest() 

7436 

7437 @doc( 

7438 Series.swaplevel, 

7439 klass=_shared_doc_kwargs["klass"], 

7440 extra_params=dedent( 

7441 """axis : {0 or 'index', 1 or 'columns'}, default 0 

7442 The axis to swap levels on. 0 or 'index' for row-wise, 1 or 

7443 'columns' for column-wise.""" 

7444 ), 

7445 examples=dedent( 

7446 """\ 

7447 Examples 

7448 -------- 

7449 >>> df = pd.DataFrame( 

7450 ... {"Grade": ["A", "B", "A", "C"]}, 

7451 ... index=[ 

7452 ... ["Final exam", "Final exam", "Coursework", "Coursework"], 

7453 ... ["History", "Geography", "History", "Geography"], 

7454 ... ["January", "February", "March", "April"], 

7455 ... ], 

7456 ... ) 

7457 >>> df 

7458 Grade 

7459 Final exam History January A 

7460 Geography February B 

7461 Coursework History March A 

7462 Geography April C 

7463 

7464 In the following example, we will swap the levels of the indices. 

7465 Here, we will swap the levels of the row index, but levels can be swapped 

7466 column-wise in a similar manner. Note that row-wise (``axis=0``) is the 

7467 default behaviour. By not supplying any arguments for i and j, we swap the 

7468 last and second-to-last levels. 

7469 

7470 >>> df.swaplevel() 

7471 Grade 

7472 Final exam January History A 

7473 February Geography B 

7474 Coursework March History A 

7475 April Geography C 

7476 

7477 By supplying one argument, we can choose which level to swap the last 

7478 level with. We can, for example, swap the first level with the last one as 

7479 follows. 

7480 

7481 >>> df.swaplevel(0) 

7482 Grade 

7483 January History Final exam A 

7484 February Geography Final exam B 

7485 March History Coursework A 

7486 April Geography Coursework C 

7487 

7488 We can also define explicitly which levels we want to swap by supplying values 

7489 for both i and j. Here, for example, we swap the first and second levels. 

7490 

7491 >>> df.swaplevel(0, 1) 

7492 Grade 

7493 History Final exam January A 

7494 Geography Final exam February B 

7495 History Coursework March A 

7496 Geography Coursework April C""" 

7497 ), 

7498 ) 

7499 def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: 

7500 result = self.copy() 

7501 

7502 axis = self._get_axis_number(axis) 

7503 

7504 if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover 

7505 raise TypeError("Can only swap levels on a hierarchical axis.") 

7506 

7507 if axis == 0: 

7508 assert isinstance(result.index, MultiIndex) 

7509 result.index = result.index.swaplevel(i, j) 

7510 else: 

7511 assert isinstance(result.columns, MultiIndex) 

7512 result.columns = result.columns.swaplevel(i, j) 

7513 return result 

7514 

7515 def reorder_levels(self, order: Sequence[Axis], axis: Axis = 0) -> DataFrame: 

7516 """ 

7517 Rearrange index levels using input order. May not drop or duplicate levels. 

7518 

7519 Parameters 

7520 ---------- 

7521 order : list of int or list of str 

7522 List representing new level order. Reference level by number 

7523 (position) or by key (label). 

7524 axis : {0 or 'index', 1 or 'columns'}, default 0 

7525 Where to reorder levels. 

7526 

7527 Returns 

7528 ------- 

7529 DataFrame 

7530 

7531 Examples 

7532 -------- 

7533 >>> data = { 

7534 ... "class": ["Mammals", "Mammals", "Reptiles"], 

7535 ... "diet": ["Omnivore", "Carnivore", "Carnivore"], 

7536 ... "species": ["Humans", "Dogs", "Snakes"], 

7537 ... } 

7538 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"]) 

7539 >>> df = df.set_index(["class", "diet"]) 

7540 >>> df 

7541 species 

7542 class diet 

7543 Mammals Omnivore Humans 

7544 Carnivore Dogs 

7545 Reptiles Carnivore Snakes 

7546 

7547 Let's reorder the levels of the index: 

7548 

7549 >>> df.reorder_levels(["diet", "class"]) 

7550 species 

7551 diet class 

7552 Omnivore Mammals Humans 

7553 Carnivore Mammals Dogs 

7554 Reptiles Snakes 

7555 """ 

7556 axis = self._get_axis_number(axis) 

7557 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover 

7558 raise TypeError("Can only reorder levels on a hierarchical axis.") 

7559 

7560 result = self.copy() 

7561 

7562 if axis == 0: 

7563 assert isinstance(result.index, MultiIndex) 

7564 result.index = result.index.reorder_levels(order) 

7565 else: 

7566 assert isinstance(result.columns, MultiIndex) 

7567 result.columns = result.columns.reorder_levels(order) 

7568 return result 

7569 

7570 # ---------------------------------------------------------------------- 

7571 # Arithmetic Methods 

7572 

7573 def _cmp_method(self, other, op): 

7574 axis = 1 # only relevant for Series other case 

7575 

7576 self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None) 

7577 

7578 # See GH#4537 for discussion of scalar op behavior 

7579 new_data = self._dispatch_frame_op(other, op, axis=axis) 

7580 return self._construct_result(new_data) 

7581 

7582 def _arith_method(self, other, op): 

7583 if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None): 

7584 return ops.frame_arith_method_with_reindex(self, other, op) 

7585 

7586 axis = 1 # only relevant for Series other case 

7587 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],)) 

7588 

7589 self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) 

7590 

7591 new_data = self._dispatch_frame_op(other, op, axis=axis) 

7592 return self._construct_result(new_data) 

7593 

7594 _logical_method = _arith_method 

7595 

7596 def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): 

7597 """ 

7598 Evaluate the frame operation func(left, right) by evaluating 

7599 column-by-column, dispatching to the Series implementation. 

7600 

7601 Parameters 

7602 ---------- 

7603 right : scalar, Series, or DataFrame 

7604 func : arithmetic or comparison operator 

7605 axis : {None, 0, 1} 

7606 

7607 Returns 

7608 ------- 

7609 DataFrame 

7610 """ 

7611 # Get the appropriate array-op to apply to each column/block's values. 

7612 array_op = ops.get_array_op(func) 

7613 

7614 right = lib.item_from_zerodim(right) 

7615 if not is_list_like(right): 

7616 # i.e. scalar, faster than checking np.ndim(right) == 0 

7617 with np.errstate(all="ignore"): 

7618 bm = self._mgr.apply(array_op, right=right) 

7619 return self._constructor(bm) 

7620 

7621 elif isinstance(right, DataFrame): 

7622 assert self.index.equals(right.index) 

7623 assert self.columns.equals(right.columns) 

7624 # TODO: The previous assertion `assert right._indexed_same(self)` 

7625 # fails in cases with empty columns reached via 

7626 # _frame_arith_method_with_reindex 

7627 

7628 # TODO operate_blockwise expects a manager of the same type 

7629 with np.errstate(all="ignore"): 

7630 bm = self._mgr.operate_blockwise( 

7631 # error: Argument 1 to "operate_blockwise" of "ArrayManager" has 

7632 # incompatible type "Union[ArrayManager, BlockManager]"; expected 

7633 # "ArrayManager" 

7634 # error: Argument 1 to "operate_blockwise" of "BlockManager" has 

7635 # incompatible type "Union[ArrayManager, BlockManager]"; expected 

7636 # "BlockManager" 

7637 right._mgr, # type: ignore[arg-type] 

7638 array_op, 

7639 ) 

7640 return self._constructor(bm) 

7641 

7642 elif isinstance(right, Series) and axis == 1: 

7643 # axis=1 means we want to operate row-by-row 

7644 assert right.index.equals(self.columns) 

7645 

7646 right = right._values 

7647 # maybe_align_as_frame ensures we do not have an ndarray here 

7648 assert not isinstance(right, np.ndarray) 

7649 

7650 with np.errstate(all="ignore"): 

7651 arrays = [ 

7652 array_op(_left, _right) 

7653 for _left, _right in zip(self._iter_column_arrays(), right) 

7654 ] 

7655 

7656 elif isinstance(right, Series): 

7657 assert right.index.equals(self.index) # Handle other cases later 

7658 right = right._values 

7659 

7660 with np.errstate(all="ignore"): 

7661 arrays = [array_op(left, right) for left in self._iter_column_arrays()] 

7662 

7663 else: 

7664 # Remaining cases have less-obvious dispatch rules 

7665 raise NotImplementedError(right) 

7666 

7667 return type(self)._from_arrays( 

7668 arrays, self.columns, self.index, verify_integrity=False 

7669 ) 

7670 
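# --- Editor's sketch (not pandas source): the dispatch branches above, seen
# through public operations. Hypothetical data; alignment is assumed done.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

scalar_case = df + 10                                   # scalar branch
frame_case = df + df                                    # DataFrame branch
row_case = df.add(pd.Series({"a": 1, "b": 2}), axis=1)  # Series, axis=1
col_case = df.add(pd.Series([10, 20]), axis=0)          # Series, axis=0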

7671 def _combine_frame(self, other: DataFrame, func, fill_value=None): 

7672 # at this point we have `self._indexed_same(other)` 

7673 

7674 if fill_value is None: 

7675 # since _arith_op may be called in a loop, avoid function call 

7676 # overhead if possible by doing this check once 

7677 _arith_op = func 

7678 

7679 else: 

7680 

7681 def _arith_op(left, right): 

7682 # for the mixed_type case where we iterate over columns, 

7683 # _arith_op(left, right) is equivalent to 

7684 # left._binop(right, func, fill_value=fill_value) 

7685 left, right = ops.fill_binop(left, right, fill_value) 

7686 return func(left, right) 

7687 

7688 new_data = self._dispatch_frame_op(other, _arith_op) 

7689 return new_data 

7690 

7691 def _construct_result(self, result) -> DataFrame: 

7692 """ 

7693 Wrap the result of an arithmetic, comparison, or logical operation. 

7694 

7695 Parameters 

7696 ---------- 

7697 result : DataFrame 

7698 

7699 Returns 

7700 ------- 

7701 DataFrame 

7702 """ 

7703 out = self._constructor(result, copy=False) 

7704 # Pin columns instead of passing to constructor for compat with 

7705 # non-unique columns case 

7706 out.columns = self.columns 

7707 out.index = self.index 

7708 return out 

7709 

7710 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]: 

7711 # Naive implementation, room for optimization 

7712 div = self // other 

7713 mod = self - div * other 

7714 return div, mod 

7715 

7716 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: 

7717 # Naive implementation, room for optimization 

7718 div = other // self 

7719 mod = other - div * self 

7720 return div, mod 

7721 
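# --- Editor's sketch (not pandas source): the naive implementations above
# satisfy the usual floor-division identity, checkable with public API only.
import pandas as pd

df = pd.DataFrame({"a": [7, -7], "b": [5, 5]})
div, mod = divmod(df, 3)

# self == div * other + mod, including for negative values
assert df.equals(div * 3 + mod)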

7722 # ---------------------------------------------------------------------- 

7723 # Combination-Related 

7724 

7725 @doc( 

7726 _shared_docs["compare"], 

7727 """ 

7728Returns 

7729------- 

7730DataFrame 

7731 DataFrame that shows the differences stacked side by side. 

7732 

7733 The resulting index will be a MultiIndex with 'self' and 'other' 

7734 stacked alternately at the inner level. 

7735 

7736Raises 

7737------ 

7738ValueError 

7739 When the two DataFrames don't have identical labels or shape. 

7740 

7741See Also 

7742-------- 

7743Series.compare : Compare with another Series and show differences. 

7744DataFrame.equals : Test whether two objects contain the same elements. 

7745 

7746Notes 

7747----- 

7748Matching NaNs will not appear as a difference. 

7749 

7750Can only compare identically-labeled 

7751 (i.e. same shape, identical row and column labels) DataFrames. 

7752 

7753Examples 

7754-------- 

7755>>> df = pd.DataFrame( 

7756... {{ 

7757... "col1": ["a", "a", "b", "b", "a"], 

7758... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], 

7759... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] 

7760... }}, 

7761... columns=["col1", "col2", "col3"], 

7762... ) 

7763>>> df 

7764 col1 col2 col3 

77650 a 1.0 1.0 

77661 a 2.0 2.0 

77672 b 3.0 3.0 

77683 b NaN 4.0 

77694 a 5.0 5.0 

7770 

7771>>> df2 = df.copy() 

7772>>> df2.loc[0, 'col1'] = 'c' 

7773>>> df2.loc[2, 'col3'] = 4.0 

7774>>> df2 

7775 col1 col2 col3 

77760 c 1.0 1.0 

77771 a 2.0 2.0 

77782 b 3.0 4.0 

77793 b NaN 4.0 

77804 a 5.0 5.0 

7781 

7782Align the differences on columns 

7783 

7784>>> df.compare(df2) 

7785 col1 col3 

7786 self other self other 

77870 a c NaN NaN 

77882 NaN NaN 3.0 4.0 

7789 

7790Assign result_names 

7791 

7792>>> df.compare(df2, result_names=("left", "right")) 

7793 col1 col3 

7794 left right left right 

77950 a c NaN NaN 

77962 NaN NaN 3.0 4.0 

7797 

7798Stack the differences on rows 

7799 

7800>>> df.compare(df2, align_axis=0) 

7801 col1 col3 

78020 self a NaN 

7803 other c NaN 

78042 self NaN 3.0 

7805 other NaN 4.0 

7806 

7807Keep the equal values 

7808 

7809>>> df.compare(df2, keep_equal=True) 

7810 col1 col3 

7811 self other self other 

78120 a c 1.0 1.0 

78132 b b 3.0 4.0 

7814 

7815Keep all original rows and columns 

7816 

7817>>> df.compare(df2, keep_shape=True) 

7818 col1 col2 col3 

7819 self other self other self other 

78200 a c NaN NaN NaN NaN 

78211 NaN NaN NaN NaN NaN NaN 

78222 NaN NaN NaN NaN 3.0 4.0 

78233 NaN NaN NaN NaN NaN NaN 

78244 NaN NaN NaN NaN NaN NaN 

7825 

7826Keep all original rows and columns and also all original values 

7827 

7828>>> df.compare(df2, keep_shape=True, keep_equal=True) 

7829 col1 col2 col3 

7830 self other self other self other 

78310 a c 1.0 1.0 1.0 1.0 

78321 a a 2.0 2.0 2.0 2.0 

78332 b b 3.0 3.0 3.0 4.0 

78343 b b NaN NaN 4.0 4.0 

78354 a a 5.0 5.0 5.0 5.0 

7836""", 

7837 klass=_shared_doc_kwargs["klass"], 

7838 ) 

7839 def compare( 

7840 self, 

7841 other: DataFrame, 

7842 align_axis: Axis = 1, 

7843 keep_shape: bool = False, 

7844 keep_equal: bool = False, 

7845 result_names: Suffixes = ("self", "other"), 

7846 ) -> DataFrame: 

7847 return super().compare( 

7848 other=other, 

7849 align_axis=align_axis, 

7850 keep_shape=keep_shape, 

7851 keep_equal=keep_equal, 

7852 result_names=result_names, 

7853 ) 

7854 

7855 def combine( 

7856 self, 

7857 other: DataFrame, 

7858 func: Callable[[Series, Series], Series | Hashable], 

7859 fill_value=None, 

7860 overwrite: bool = True, 

7861 ) -> DataFrame: 

7862 """ 

7863 Perform column-wise combine with another DataFrame. 

7864 

7865 Combines a DataFrame with `other` DataFrame using `func` 

7866 to element-wise combine columns. The row and column indexes of the 

7867 resulting DataFrame will be the union of the two. 

7868 

7869 Parameters 

7870 ---------- 

7871 other : DataFrame 

7872 The DataFrame to merge column-wise. 

7873 func : function 

7874 Function that takes two Series as inputs and returns a Series or a 

7875 scalar. Used to merge the two dataframes column by column. 

7876 fill_value : scalar value, default None 

7877 The value to fill NaNs with prior to passing any column to the 

7878 merge func. 

7879 overwrite : bool, default True 

7880 If True, columns in `self` that do not exist in `other` will be 

7881 overwritten with NaNs. 

7882 

7883 Returns 

7884 ------- 

7885 DataFrame 

7886 Combination of the provided DataFrames. 

7887 

7888 See Also 

7889 -------- 

7890 DataFrame.combine_first : Combine two DataFrame objects and default to 

7891 non-null values in frame calling the method. 

7892 

7893 Examples 

7894 -------- 

7895 Combine using a simple function that chooses the smaller column. 

7896 

7897 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 

7898 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

7899 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 

7900 >>> df1.combine(df2, take_smaller) 

7901 A B 

7902 0 0 3 

7903 1 0 3 

7904 

7905 Example using a true element-wise combine function. 

7906 

7907 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) 

7908 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

7909 >>> df1.combine(df2, np.minimum) 

7910 A B 

7911 0 1 2 

7912 1 0 3 

7913 

7914 Using `fill_value` fills Nones prior to passing the column to the 

7915 merge function. 

7916 

7917 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 

7918 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

7919 >>> df1.combine(df2, take_smaller, fill_value=-5) 

7920 A B 

7921 0 0 -5.0 

7922 1 0 4.0 

7923 

7924 However, if the same element in both dataframes is None, that None 

7925 is preserved. 

7926 

7927 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 

7928 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) 

7929 >>> df1.combine(df2, take_smaller, fill_value=-5) 

7930 A B 

7931 0 0 -5.0 

7932 1 0 3.0 

7933 

7934 Example that demonstrates the use of `overwrite` and the behavior when 

7935 the axes differ between the dataframes. 

7936 

7937 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 

7938 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) 

7939 >>> df1.combine(df2, take_smaller) 

7940 A B C 

7941 0 NaN NaN NaN 

7942 1 NaN 3.0 -10.0 

7943 2 NaN 3.0 1.0 

7944 

7945 >>> df1.combine(df2, take_smaller, overwrite=False) 

7946 A B C 

7947 0 0.0 NaN NaN 

7948 1 0.0 3.0 -10.0 

7949 2 NaN 3.0 1.0 

7950 

7951 Demonstrating the preference of the passed-in dataframe. 

7952 

7953 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) 

7954 >>> df2.combine(df1, take_smaller) 

7955 A B C 

7956 0 0.0 NaN NaN 

7957 1 0.0 3.0 NaN 

7958 2 NaN 3.0 NaN 

7959 

7960 >>> df2.combine(df1, take_smaller, overwrite=False) 

7961 A B C 

7962 0 0.0 NaN NaN 

7963 1 0.0 3.0 1.0 

7964 2 NaN 3.0 1.0 

7965 """ 

7966 other_idxlen = len(other.index) # save for compare 

7967 

7968 this, other = self.align(other, copy=False) 

7969 new_index = this.index 

7970 

7971 if other.empty and len(new_index) == len(self.index): 

7972 return self.copy() 

7973 

7974 if self.empty and len(other) == other_idxlen: 

7975 return other.copy() 

7976 

7977 # sorts if possible 

7978 new_columns = this.columns.union(other.columns) 

7979 do_fill = fill_value is not None 

7980 result = {} 

7981 for col in new_columns: 

7982 series = this[col] 

7983 otherSeries = other[col] 

7984 

7985 this_dtype = series.dtype 

7986 other_dtype = otherSeries.dtype 

7987 

7988 this_mask = isna(series) 

7989 other_mask = isna(otherSeries) 

7990 

7991 # don't overwrite columns unnecessarily 

7992 # DO propagate if this column is not in the intersection 

7993 if not overwrite and other_mask.all(): 

7994 result[col] = this[col].copy() 

7995 continue 

7996 

7997 if do_fill: 

7998 series = series.copy() 

7999 otherSeries = otherSeries.copy() 

8000 series[this_mask] = fill_value 

8001 otherSeries[other_mask] = fill_value 

8002 

8003 if col not in self.columns: 

8004 # If self DataFrame does not have col in other DataFrame, 

8005 # try to promote series, which is all NaN, as other_dtype. 

8006 new_dtype = other_dtype 

8007 try: 

8008 series = series.astype(new_dtype, copy=False) 

8009 except ValueError: 

8010 # e.g. new_dtype is integer types 

8011 pass 

8012 else: 

8013 # if we have different dtypes, possibly promote 

8014 new_dtype = find_common_type([this_dtype, other_dtype]) 

8015 series = series.astype(new_dtype, copy=False) 

8016 otherSeries = otherSeries.astype(new_dtype, copy=False) 

8017 

8018 arr = func(series, otherSeries) 

8019 if isinstance(new_dtype, np.dtype): 

8020 # if new_dtype is an EA Dtype, then `func` is expected to return 

8021 # the correct dtype without any additional casting 

8022 # error: No overload variant of "maybe_downcast_to_dtype" matches 

8023 # argument types "Union[Series, Hashable]", "dtype[Any]" 

8024 arr = maybe_downcast_to_dtype( # type: ignore[call-overload] 

8025 arr, new_dtype 

8026 ) 

8027 

8028 result[col] = arr 

8029 

8030 # convert_objects just in case 

8031 return self._constructor(result, index=new_index, columns=new_columns) 

8032 
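# --- Editor's sketch (not pandas source): the find_common_type promotion step
# above, observed from the result dtype. Hypothetical int/float columns.
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"A": [1, 2]})        # int64
df2 = pd.DataFrame({"A": [0.5, 3.5]})    # float64

out = df1.combine(df2, np.minimum)
assert out["A"].dtype == np.float64      # both sides cast before func runs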

8033 def combine_first(self, other: DataFrame) -> DataFrame: 

8034 """ 

8035 Update null elements with value in the same location in `other`. 

8036 

8037 Combine two DataFrame objects by filling null values in one DataFrame 

8038 with non-null values from other DataFrame. The row and column indexes 

8039 of the resulting DataFrame will be the union of the two. The resulting 

8040 dataframe keeps the values of the first dataframe and overrides the 

8041 values of the second one wherever both first.loc[index, col] and 

8042 second.loc[index, col] are non-missing, upon calling 

8043 first.combine_first(second). 

8044 

8045 Parameters 

8046 ---------- 

8047 other : DataFrame 

8048 Provided DataFrame to use to fill null values. 

8049 

8050 Returns 

8051 ------- 

8052 DataFrame 

8053 The result of combining the provided DataFrame with the other object. 

8054 

8055 See Also 

8056 -------- 

8057 DataFrame.combine : Perform series-wise operation on two DataFrames 

8058 using a given function. 

8059 

8060 Examples 

8061 -------- 

8062 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) 

8063 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

8064 >>> df1.combine_first(df2) 

8065 A B 

8066 0 1.0 3.0 

8067 1 0.0 4.0 

8068 

8069 Null values still persist if the location of that null value 

8070 does not exist in `other`. 

8071 

8072 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) 

8073 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) 

8074 >>> df1.combine_first(df2) 

8075 A B C 

8076 0 NaN 4.0 NaN 

8077 1 0.0 3.0 1.0 

8078 2 NaN 3.0 1.0 

8079 """ 

8080 import pandas.core.computation.expressions as expressions 

8081 

8082 def combiner(x, y): 

8083 mask = extract_array(isna(x)) 

8084 

8085 x_values = extract_array(x, extract_numpy=True) 

8086 y_values = extract_array(y, extract_numpy=True) 

8087 

8088 # If the column y in other DataFrame is not in first DataFrame, 

8089 # just return y_values. 

8090 if y.name not in self.columns: 

8091 return y_values 

8092 

8093 return expressions.where(mask, y_values, x_values) 

8094 

8095 combined = self.combine(other, combiner, overwrite=False) 

8096 

8097 dtypes = { 

8098 col: find_common_type([self.dtypes[col], other.dtypes[col]]) 

8099 for col in self.columns.intersection(other.columns) 

8100 if not is_dtype_equal(combined.dtypes[col], self.dtypes[col]) 

8101 } 

8102 

8103 if dtypes: 

8104 combined = combined.astype(dtypes) 

8105 

8106 return combined 

8107 
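# --- Editor's sketch (not pandas source): for identically-labeled frames the
# combiner above reduces to a where-mask on the caller's NA positions.
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"A": [np.nan, 2.0]})
df2 = pd.DataFrame({"A": [10.0, 20.0]})

expected = df1.where(df1.notna(), df2)   # take df2 only where df1 is NA
assert df1.combine_first(df2).equals(expected)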

8108 def update( 

8109 self, 

8110 other, 

8111 join: str = "left", 

8112 overwrite: bool = True, 

8113 filter_func=None, 

8114 errors: str = "ignore", 

8115 ) -> None: 

8116 """ 

8117 Modify in place using non-NA values from another DataFrame. 

8118 

8119 Aligns on indices. There is no return value. 

8120 

8121 Parameters 

8122 ---------- 

8123 other : DataFrame, or object coercible into a DataFrame 

8124 Should have at least one matching index/column label 

8125 with the original DataFrame. If a Series is passed, 

8126 its name attribute must be set, and that will be 

8127 used as the column name to align with the original DataFrame. 

8128 join : {'left'}, default 'left' 

8129 Only left join is implemented, keeping the index and columns of the 

8130 original object. 

8131 overwrite : bool, default True 

8132 How to handle non-NA values for overlapping keys: 

8133 

8134 * True: overwrite original DataFrame's values 

8135 with values from `other`. 

8136 * False: only update values that are NA in 

8137 the original DataFrame. 

8138 

8139 filter_func : callable(1d-array) -> bool 1d-array, optional 

8140 Can choose to replace values other than NA. Return True for values 

8141 that should be updated. 

8142 errors : {'raise', 'ignore'}, default 'ignore' 

8143 If 'raise', will raise a ValueError if the DataFrame and `other` 

8144 both contain non-NA data in the same place. 

8145 

8146 Returns 

8147 ------- 

8148 None : method directly changes calling object 

8149 

8150 Raises 

8151 ------ 

8152 ValueError 

8153 * When `errors='raise'` and there's overlapping non-NA data. 

8154 * When `errors` is not either `'ignore'` or `'raise'` 

8155 NotImplementedError 

8156 * If `join != 'left'` 

8157 

8158 See Also 

8159 -------- 

8160 dict.update : Similar method for dictionaries. 

8161 DataFrame.merge : For column(s)-on-column(s) operations. 

8162 

8163 Examples 

8164 -------- 

8165 >>> df = pd.DataFrame({'A': [1, 2, 3], 

8166 ... 'B': [400, 500, 600]}) 

8167 >>> new_df = pd.DataFrame({'B': [4, 5, 6], 

8168 ... 'C': [7, 8, 9]}) 

8169 >>> df.update(new_df) 

8170 >>> df 

8171 A B 

8172 0 1 4 

8173 1 2 5 

8174 2 3 6 

8175 

8176 The DataFrame's length does not increase as a result of the update; 

8177 only values at matching index/column labels are updated. 

8178 

8179 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8180 ... 'B': ['x', 'y', 'z']}) 

8181 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) 

8182 >>> df.update(new_df) 

8183 >>> df 

8184 A B 

8185 0 a d 

8186 1 b e 

8187 2 c f 

8188 

8189 For Series, its name attribute must be set. 

8190 

8191 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8192 ... 'B': ['x', 'y', 'z']}) 

8193 >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) 

8194 >>> df.update(new_column) 

8195 >>> df 

8196 A B 

8197 0 a d 

8198 1 b y 

8199 2 c e 

8200 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8201 ... 'B': ['x', 'y', 'z']}) 

8202 >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) 

8203 >>> df.update(new_df) 

8204 >>> df 

8205 A B 

8206 0 a x 

8207 1 b d 

8208 2 c e 

8209 

8210 If `other` contains NaNs, the corresponding values are not updated 

8211 in the original dataframe. 

8212 

8213 >>> df = pd.DataFrame({'A': [1, 2, 3], 

8214 ... 'B': [400, 500, 600]}) 

8215 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) 

8216 >>> df.update(new_df) 

8217 >>> df 

8218 A B 

8219 0 1 4.0 

8220 1 2 500.0 

8221 2 3 6.0 

8222 """ 

8223 import pandas.core.computation.expressions as expressions 

8224 

8225 # TODO: Support other joins 

8226 if join != "left": # pragma: no cover 

8227 raise NotImplementedError("Only left join is supported") 

8228 if errors not in ["ignore", "raise"]: 

8229 raise ValueError("The parameter errors must be either 'ignore' or 'raise'") 

8230 

8231 if not isinstance(other, DataFrame): 

8232 other = DataFrame(other) 

8233 

8234 other = other.reindex_like(self) 

8235 

8236 for col in self.columns: 

8237 this = self[col]._values 

8238 that = other[col]._values 

8239 if filter_func is not None: 

8240 with np.errstate(all="ignore"): 

8241 mask = ~filter_func(this) | isna(that) 

8242 else: 

8243 if errors == "raise": 

8244 mask_this = notna(that) 

8245 mask_that = notna(this) 

8246 if any(mask_this & mask_that): 

8247 raise ValueError("Data overlaps.") 

8248 

8249 if overwrite: 

8250 mask = isna(that) 

8251 else: 

8252 mask = notna(this) 

8253 

8254 # don't overwrite columns unnecessarily 

8255 if mask.all(): 

8256 continue 

8257 

8258 with warnings.catch_warnings(): 

8259 warnings.filterwarnings("ignore", "In a future version, `df.iloc") 

8260 self.loc[:, col] = expressions.where(mask, this, that) 

8261 
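# --- Editor's sketch (not pandas source): exercising the mask logic above via
# filter_func. mask = ~filter_func(this) | isna(that), so only positions where
# filter_func is True and `other` is non-NA get replaced. Hypothetical data.
import pandas as pd

df = pd.DataFrame({"A": [1, -2, 3]})
new = pd.DataFrame({"A": [10, 20, 30]})

df.update(new, filter_func=lambda a: a < 0)  # update only negative entries
assert df["A"].tolist() == [1, 20, 3]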

8262 # ---------------------------------------------------------------------- 

8263 # Data reshaping 

8264 @Appender( 

8265 """ 

8266Examples 

8267-------- 

8268>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', 

8269... 'Parrot', 'Parrot'], 

8270... 'Max Speed': [380., 370., 24., 26.]}) 

8271>>> df 

8272 Animal Max Speed 

82730 Falcon 380.0 

82741 Falcon 370.0 

82752 Parrot 24.0 

82763 Parrot 26.0 

8277>>> df.groupby(['Animal']).mean() 

8278 Max Speed 

8279Animal 

8280Falcon 375.0 

8281Parrot 25.0 

8282 

8283**Hierarchical Indexes** 

8284 

8285We can groupby different levels of a hierarchical index 

8286using the `level` parameter: 

8287 

8288>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], 

8289... ['Captive', 'Wild', 'Captive', 'Wild']] 

8290>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) 

8291>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, 

8292... index=index) 

8293>>> df 

8294 Max Speed 

8295Animal Type 

8296Falcon Captive 390.0 

8297 Wild 350.0 

8298Parrot Captive 30.0 

8299 Wild 20.0 

8300>>> df.groupby(level=0).mean() 

8301 Max Speed 

8302Animal 

8303Falcon 370.0 

8304Parrot 25.0 

8305>>> df.groupby(level="Type").mean() 

8306 Max Speed 

8307Type 

8308Captive 210.0 

8309Wild 185.0 

8310 

8311We can also choose to include NA in group keys or not by setting 

8312 the `dropna` parameter; the default setting is `True`. 

8313 

8314>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] 

8315>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) 

8316 

8317>>> df.groupby(by=["b"]).sum() 

8318 a c 

8319b 

83201.0 2 3 

83212.0 2 5 

8322 

8323>>> df.groupby(by=["b"], dropna=False).sum() 

8324 a c 

8325b 

83261.0 2 3 

83272.0 2 5 

8328NaN 1 4 

8329 

8330>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] 

8331>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) 

8332 

8333>>> df.groupby(by="a").sum() 

8334 b c 

8335a 

8336a 13.0 13.0 

8337b 12.3 123.0 

8338 

8339>>> df.groupby(by="a", dropna=False).sum() 

8340 b c 

8341a 

8342a 13.0 13.0 

8343b 12.3 123.0 

8344NaN 12.3 33.0 

8345 

8346When using ``.apply()``, use ``group_keys`` to include or exclude the group keys. 

8347The ``group_keys`` argument defaults to ``True`` (include). 

8348 

8349>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', 

8350... 'Parrot', 'Parrot'], 

8351... 'Max Speed': [380., 370., 24., 26.]}) 

8352>>> df.groupby("Animal", group_keys=True).apply(lambda x: x) 

8353 Animal Max Speed 

8354Animal 

8355Falcon 0 Falcon 380.0 

8356 1 Falcon 370.0 

8357Parrot 2 Parrot 24.0 

8358 3 Parrot 26.0 

8359 

8360>>> df.groupby("Animal", group_keys=False).apply(lambda x: x) 

8361 Animal Max Speed 

83620 Falcon 380.0 

83631 Falcon 370.0 

83642 Parrot 24.0 

83653 Parrot 26.0 

8366""" 

8367 ) 

8368 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) 

8369 def groupby( 

8370 self, 

8371 by=None, 

8372 axis: Axis = 0, 

8373 level: IndexLabel | None = None, 

8374 as_index: bool = True, 

8375 sort: bool = True, 

8376 group_keys: bool | lib.NoDefault = no_default, 

8377 squeeze: bool | lib.NoDefault = no_default, 

8378 observed: bool = False, 

8379 dropna: bool = True, 

8380 ) -> DataFrameGroupBy: 

8381 from pandas.core.groupby.generic import DataFrameGroupBy 

8382 

8383 if squeeze is not no_default: 

8384 warnings.warn( 

8385 ( 

8386 "The `squeeze` parameter is deprecated and " 

8387 "will be removed in a future version." 

8388 ), 

8389 FutureWarning, 

8390 stacklevel=find_stack_level(), 

8391 ) 

8392 else: 

8393 squeeze = False 

8394 

8395 if level is None and by is None: 

8396 raise TypeError("You have to supply one of 'by' and 'level'") 

8397 axis = self._get_axis_number(axis) 

8398 

8399 return DataFrameGroupBy( 

8400 obj=self, 

8401 keys=by, 

8402 axis=axis, 

8403 level=level, 

8404 as_index=as_index, 

8405 sort=sort, 

8406 group_keys=group_keys, 

8407 squeeze=squeeze, 

8408 observed=observed, 

8409 dropna=dropna, 

8410 ) 

8411 

8412 _shared_docs[ 

8413 "pivot" 

8414 ] = """ 

8415 Return reshaped DataFrame organized by given index / column values. 

8416 

8417 Reshape data (produce a "pivot" table) based on column values. Uses 

8418 unique values from specified `index` / `columns` to form axes of the 

8419 resulting DataFrame. This function does not support data 

8420 aggregation; multiple values will result in a MultiIndex in the 

8421 columns. See the :ref:`User Guide <reshaping>` for more on reshaping. 

8422 

8423 Parameters 

8424 ----------%s 

8425 index : str or object or a list of str, optional 

8426 Column to use to make new frame's index. If None, uses 

8427 existing index. 

8428 

8429 .. versionchanged:: 1.1.0 

8430 Also accept list of index names. 

8431 

8432 columns : str or object or a list of str 

8433 Column to use to make new frame's columns. 

8434 

8435 .. versionchanged:: 1.1.0 

8436 Also accept list of columns names. 

8437 

8438 values : str, object or a list of the previous, optional 

8439 Column(s) to use for populating new frame's values. If not 

8440 specified, all remaining columns will be used and the result will 

8441 have hierarchically indexed columns. 

8442 

8443 Returns 

8444 ------- 

8445 DataFrame 

8446 Returns reshaped DataFrame. 

8447 

8448 Raises 

8449 ------ 

8450 ValueError: 

8451 When there are any `index`, `columns` combinations with multiple 

8452 values. Use `DataFrame.pivot_table` when you need to aggregate. 

8453 

8454 See Also 

8455 -------- 

8456 DataFrame.pivot_table : Generalization of pivot that can handle 

8457 duplicate values for one index/column pair. 

8458 DataFrame.unstack : Pivot based on the index values instead of a 

8459 column. 

8460 wide_to_long : Wide panel to long format. Less flexible but more 

8461 user-friendly than melt. 

8462 

8463 Notes 

8464 ----- 

8465 For finer-tuned control, see hierarchical indexing documentation along 

8466 with the related stack/unstack methods. 

8467 

8468 Reference :ref:`the user guide <reshaping.pivot>` for more examples. 

8469 

8470 Examples 

8471 -------- 

8472 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 

8473 ... 'two'], 

8474 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 

8475 ... 'baz': [1, 2, 3, 4, 5, 6], 

8476 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) 

8477 >>> df 

8478 foo bar baz zoo 

8479 0 one A 1 x 

8480 1 one B 2 y 

8481 2 one C 3 z 

8482 3 two A 4 q 

8483 4 two B 5 w 

8484 5 two C 6 t 

8485 

8486 >>> df.pivot(index='foo', columns='bar', values='baz') 

8487 bar A B C 

8488 foo 

8489 one 1 2 3 

8490 two 4 5 6 

8491 

8492 >>> df.pivot(index='foo', columns='bar')['baz'] 

8493 bar A B C 

8494 foo 

8495 one 1 2 3 

8496 two 4 5 6 

8497 

8498 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) 

8499 baz zoo 

8500 bar A B C A B C 

8501 foo 

8502 one 1 2 3 x y z 

8503 two 4 5 6 q w t 

8504 

8505 You could also assign a list of column names or a list of index names. 

8506 

8507 >>> df = pd.DataFrame({ 

8508 ... "lev1": [1, 1, 1, 2, 2, 2], 

8509 ... "lev2": [1, 1, 2, 1, 1, 2], 

8510 ... "lev3": [1, 2, 1, 2, 1, 2], 

8511 ... "lev4": [1, 2, 3, 4, 5, 6], 

8512 ... "values": [0, 1, 2, 3, 4, 5]}) 

8513 >>> df 

8514 lev1 lev2 lev3 lev4 values 

8515 0 1 1 1 1 0 

8516 1 1 1 2 2 1 

8517 2 1 2 1 3 2 

8518 3 2 1 2 4 3 

8519 4 2 1 1 5 4 

8520 5 2 2 2 6 5 

8521 

8522 >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") 

8523 lev2 1 2 

8524 lev3 1 2 1 2 

8525 lev1 

8526 1 0.0 1.0 2.0 NaN 

8527 2 4.0 3.0 NaN 5.0 

8528 

8529 >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") 

8530 lev3 1 2 

8531 lev1 lev2 

8532 1 1 0.0 1.0 

8533 2 2.0 NaN 

8534 2 1 4.0 3.0 

8535 2 NaN 5.0 

8536 

8537 A ValueError is raised if there are any duplicates. 

8538 

8539 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], 

8540 ... "bar": ['A', 'A', 'B', 'C'], 

8541 ... "baz": [1, 2, 3, 4]}) 

8542 >>> df 

8543 foo bar baz 

8544 0 one A 1 

8545 1 one A 2 

8546 2 two B 3 

8547 3 two C 4 

8548 

8549 Notice that the first two rows are the same for our `index` 

8550 and `columns` arguments. 

8551 

8552 >>> df.pivot(index='foo', columns='bar', values='baz') 

8553 Traceback (most recent call last): 

8554 ... 

8555 ValueError: Index contains duplicate entries, cannot reshape 

8556 """ 

8557 

8558 @Substitution("") 

8559 @Appender(_shared_docs["pivot"]) 

8560 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) 

8561 def pivot(self, index=None, columns=None, values=None) -> DataFrame: 

8562 from pandas.core.reshape.pivot import pivot 

8563 

8564 return pivot(self, index=index, columns=columns, values=values) 

8565 

8566 _shared_docs[ 

8567 "pivot_table" 

8568 ] = """ 

8569 Create a spreadsheet-style pivot table as a DataFrame. 

8570 

8571 The levels in the pivot table will be stored in MultiIndex objects 

8572 (hierarchical indexes) on the index and columns of the result DataFrame. 

8573 

8574 Parameters 

8575 ----------%s 

8576 values : column to aggregate, optional 

8577 index : column, Grouper, array, or list of the previous 

8578 If an array is passed, it must be the same length as the data. The 

8579 list can contain any of the other types (except list). 

8580 Keys to group by on the pivot table index. If an array is passed, 

8581 it is used in the same manner as column values. 

8582 columns : column, Grouper, array, or list of the previous 

8583 If an array is passed, it must be the same length as the data. The 

8584 list can contain any of the other types (except list). 

8585 Keys to group by on the pivot table column. If an array is passed, 

8586 it is used in the same manner as column values. 

8587 aggfunc : function, list of functions, dict, default numpy.mean 

8588 If a list of functions is passed, the resulting pivot table will have 

8589 hierarchical columns whose top level are the function names 

8590 (inferred from the function objects themselves). 

8591 If a dict is passed, the key is the column to aggregate and the value 

8592 is a function or a list of functions. 

8593 fill_value : scalar, default None 

8594 Value to replace missing values with (in the resulting pivot table, 

8595 after aggregation). 

8596 margins : bool, default False 

8597 Add row/column margins (e.g. subtotals and grand totals). 

8598 dropna : bool, default True 

8599 Do not include columns whose entries are all NaN. If True, 

8600 rows with a NaN value in any column will be omitted before 

8601 computing margins. 

8602 margins_name : str, default 'All' 

8603 Name of the row / column that will contain the totals 

8604 when margins is True. 

8605 observed : bool, default False 

8606 This only applies if any of the groupers are Categoricals. 

8607 If True: only show observed values for categorical groupers. 

8608 If False: show all values for categorical groupers. 

8609 

8610 .. versionchanged:: 0.25.0 

8611 

8612 sort : bool, default True 

8613 Specifies if the result should be sorted. 

8614 

8615 .. versionadded:: 1.3.0 

8616 

8617 Returns 

8618 ------- 

8619 DataFrame 

8620 An Excel style pivot table. 

8621 

8622 See Also 

8623 -------- 

8624 DataFrame.pivot : Pivot without aggregation that can handle 

8625 non-numeric data. 

8626 DataFrame.melt: Unpivot a DataFrame from wide to long format, 

8627 optionally leaving identifiers set. 

8628 wide_to_long : Wide panel to long format. Less flexible but more 

8629 user-friendly than melt. 

8630 

8631 Notes 

8632 ----- 

8633 Reference :ref:`the user guide <reshaping.pivot>` for more examples. 

8634 

8635 Examples 

8636 -------- 

8637 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", 

8638 ... "bar", "bar", "bar", "bar"], 

8639 ... "B": ["one", "one", "one", "two", "two", 

8640 ... "one", "one", "two", "two"], 

8641 ... "C": ["small", "large", "large", "small", 

8642 ... "small", "large", "small", "small", 

8643 ... "large"], 

8644 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], 

8645 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) 

8646 >>> df 

8647 A B C D E 

8648 0 foo one small 1 2 

8649 1 foo one large 2 4 

8650 2 foo one large 2 5 

8651 3 foo two small 3 5 

8652 4 foo two small 3 6 

8653 5 bar one large 4 6 

8654 6 bar one small 5 8 

8655 7 bar two small 6 9 

8656 8 bar two large 7 9 

8657 

8658 This first example aggregates values by taking the sum. 

8659 

8660 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], 

8661 ... columns=['C'], aggfunc=np.sum) 

8662 >>> table 

8663 C large small 

8664 A B 

8665 bar one 4.0 5.0 

8666 two 7.0 6.0 

8667 foo one 4.0 1.0 

8668 two NaN 6.0 

8669 

8670 We can also fill missing values using the `fill_value` parameter. 

8671 

8672 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], 

8673 ... columns=['C'], aggfunc=np.sum, fill_value=0) 

8674 >>> table 

8675 C large small 

8676 A B 

8677 bar one 4 5 

8678 two 7 6 

8679 foo one 4 1 

8680 two 0 6 

8681 

8682 The next example aggregates by taking the mean across multiple columns. 

8683 

8684 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], 

8685 ... aggfunc={'D': np.mean, 

8686 ... 'E': np.mean}) 

8687 >>> table 

8688 D E 

8689 A C 

8690 bar large 5.500000 7.500000 

8691 small 5.500000 8.500000 

8692 foo large 2.000000 4.500000 

8693 small 2.333333 4.333333 

8694 

8695 We can also calculate multiple types of aggregations for any given 

8696 value column. 

8697 

8698 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], 

8699 ... aggfunc={'D': np.mean, 

8700 ... 'E': [min, max, np.mean]}) 

8701 >>> table 

8702 D E 

8703 mean max mean min 

8704 A C 

8705 bar large 5.500000 9 7.500000 6 

8706 small 5.500000 9 8.500000 8 

8707 foo large 2.000000 5 4.500000 4 

8708 small 2.333333 6 4.333333 2 

8709 """ 

8710 

8711 @Substitution("") 

8712 @Appender(_shared_docs["pivot_table"]) 

8713 def pivot_table( 

8714 self, 

8715 values=None, 

8716 index=None, 

8717 columns=None, 

8718 aggfunc="mean", 

8719 fill_value=None, 

8720 margins=False, 

8721 dropna=True, 

8722 margins_name="All", 

8723 observed=False, 

8724 sort=True, 

8725 ) -> DataFrame: 

8726 from pandas.core.reshape.pivot import pivot_table 

8727 

8728 return pivot_table( 

8729 self, 

8730 values=values, 

8731 index=index, 

8732 columns=columns, 

8733 aggfunc=aggfunc, 

8734 fill_value=fill_value, 

8735 margins=margins, 

8736 dropna=dropna, 

8737 margins_name=margins_name, 

8738 observed=observed, 

8739 sort=sort, 

8740 ) 

8741 
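# --- Editor's sketch (not pandas source): contrasting the two methods above.
# pivot raises on duplicate index/column pairs; pivot_table aggregates them.
import pandas as pd

df = pd.DataFrame({"foo": ["one", "one"], "bar": ["A", "A"], "baz": [1, 2]})

try:
    df.pivot(index="foo", columns="bar", values="baz")
except ValueError:
    pass  # duplicate ("one", "A") entries cannot be reshaped

table = df.pivot_table(index="foo", columns="bar", values="baz",
                       aggfunc="mean")  # the ("one", "A") cell becomes 1.5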

8742 def stack(self, level: Level = -1, dropna: bool = True): 

8743 """ 

8744 Stack the prescribed level(s) from columns to index. 

8745 

8746 Return a reshaped DataFrame or Series having a multi-level 

8747 index with one or more new inner-most levels compared to the current 

8748 DataFrame. The new inner-most levels are created by pivoting the 

8749 columns of the current dataframe: 

8750 

8751 - if the columns have a single level, the output is a Series; 

8752 - if the columns have multiple levels, the new index 

8753 level(s) is (are) taken from the prescribed level(s) and 

8754 the output is a DataFrame. 

8755 

8756 Parameters 

8757 ---------- 

8758 level : int, str, list, default -1 

8759 Level(s) to stack from the column axis onto the index 

8760 axis, defined as one index or label, or a list of indices 

8761 or labels. 

8762 dropna : bool, default True 

8763 Whether to drop rows in the resulting Frame/Series with 

8764 missing values. Stacking a column level onto the index 

8765 axis can create combinations of index and column values 

8766 that are missing from the original dataframe. See Examples 

8767 section. 

8768 

8769 Returns 

8770 ------- 

8771 DataFrame or Series 

8772 Stacked dataframe or series. 

8773 

8774 See Also 

8775 -------- 

8776 DataFrame.unstack : Unstack prescribed level(s) from index axis 

8777 onto column axis. 

8778 DataFrame.pivot : Reshape dataframe from long format to wide 

8779 format. 

8780 DataFrame.pivot_table : Create a spreadsheet-style pivot table 

8781 as a DataFrame. 

8782 

8783 Notes 

8784 ----- 

8785 The function is named by analogy with a collection of books 

8786 being reorganized from being side by side on a horizontal 

8787 position (the columns of the dataframe) to being stacked 

8788 vertically on top of each other (in the index of the 

8789 dataframe). 

8790 

8791 Reference :ref:`the user guide <reshaping.stacking>` for more examples. 

8792 

8793 Examples 

8794 -------- 

8795 **Single level columns** 

8796 

8797 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], 

8798 ... index=['cat', 'dog'], 

8799 ... columns=['weight', 'height']) 

8800 

8801 Stacking a dataframe with a single level column axis returns a Series: 

8802 

8803 >>> df_single_level_cols 

8804 weight height 

8805 cat 0 1 

8806 dog 2 3 

8807 >>> df_single_level_cols.stack() 

8808 cat weight 0 

8809 height 1 

8810 dog weight 2 

8811 height 3 

8812 dtype: int64 

8813 

8814 **Multi level columns: simple case** 

8815 

8816 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), 

8817 ... ('weight', 'pounds')]) 

8818 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], 

8819 ... index=['cat', 'dog'], 

8820 ... columns=multicol1) 

8821 

8822 Stacking a dataframe with a multi-level column axis: 

8823 

8824 >>> df_multi_level_cols1 

8825 weight 

8826 kg pounds 

8827 cat 1 2 

8828 dog 2 4 

8829 >>> df_multi_level_cols1.stack() 

8830 weight 

8831 cat kg 1 

8832 pounds 2 

8833 dog kg 2 

8834 pounds 4 

8835 

8836 **Missing values** 

8837 

8838 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), 

8839 ... ('height', 'm')]) 

8840 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], 

8841 ... index=['cat', 'dog'], 

8842 ... columns=multicol2) 

8843 

8844 It is common to have missing values when stacking a dataframe 

8845 with multi-level columns, as the stacked dataframe typically 

8846 has more values than the original dataframe. Missing values 

8847 are filled with NaNs: 

8848 

8849 >>> df_multi_level_cols2 

8850 weight height 

8851 kg m 

8852 cat 1.0 2.0 

8853 dog 3.0 4.0 

8854 >>> df_multi_level_cols2.stack() 

8855 height weight 

8856 cat kg NaN 1.0 

8857 m 2.0 NaN 

8858 dog kg NaN 3.0 

8859 m 4.0 NaN 

8860 

8861 **Prescribing the level(s) to be stacked** 

8862 

8863 The first parameter controls which level or levels are stacked: 

8864 

8865 >>> df_multi_level_cols2.stack(0) 

8866 kg m 

8867 cat height NaN 2.0 

8868 weight 1.0 NaN 

8869 dog height NaN 4.0 

8870 weight 3.0 NaN 

8871 >>> df_multi_level_cols2.stack([0, 1]) 

8872 cat height m 2.0 

8873 weight kg 1.0 

8874 dog height m 4.0 

8875 weight kg 3.0 

8876 dtype: float64 

8877 

8878 **Dropping missing values** 

8879 

8880 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]], 

8881 ... index=['cat', 'dog'], 

8882 ... columns=multicol2) 

8883 

8884 Note that rows where all values are missing are dropped by 

8885 default but this behaviour can be controlled via the dropna 

8886 keyword parameter: 

8887 

8888 >>> df_multi_level_cols3 

8889 weight height 

8890 kg m 

8891 cat NaN 1.0 

8892 dog 2.0 3.0 

8893 >>> df_multi_level_cols3.stack(dropna=False) 

8894 height weight 

8895 cat kg NaN NaN 

8896 m 1.0 NaN 

8897 dog kg NaN 2.0 

8898 m 3.0 NaN 

8899 >>> df_multi_level_cols3.stack(dropna=True) 

8900 height weight 

8901 cat m 1.0 NaN 

8902 dog kg NaN 2.0 

8903 m 3.0 NaN 

8904 """ 

8905 from pandas.core.reshape.reshape import ( 

8906 stack, 

8907 stack_multiple, 

8908 ) 

8909 

8910 if isinstance(level, (tuple, list)): 

8911 result = stack_multiple(self, level, dropna=dropna) 

8912 else: 

8913 result = stack(self, level, dropna=dropna) 

8914 

8915 return result.__finalize__(self, method="stack") 

8916 

8917 def explode( 

8918 self, 

8919 column: IndexLabel, 

8920 ignore_index: bool = False, 

8921 ) -> DataFrame: 

8922 """ 

8923 Transform each element of a list-like to a row, replicating index values. 

8924 

8925 .. versionadded:: 0.25.0 

8926 

8927 Parameters 

8928 ---------- 

8929 column : IndexLabel 

8930 Column(s) to explode. 

8931 For multiple columns, specify a non-empty list in which each

8932 element is a str or tuple; the list-like data in all specified

8933 columns must have matching lengths on the same row of the frame.

8934 

8935 .. versionadded:: 1.3.0 

8936 Multi-column explode 

8937 

8938 ignore_index : bool, default False 

8939 If True, the resulting index will be labeled 0, 1, …, n - 1. 

8940 

8941 .. versionadded:: 1.1.0 

8942 

8943 Returns 

8944 ------- 

8945 DataFrame 

8946 Exploded lists to rows of the subset columns; 

8947 index will be duplicated for these rows. 

8948 

8949 Raises 

8950 ------ 

8951 ValueError : 

8952 * If columns of the frame are not unique. 

8953 * If the specified columns to explode form an empty list.

8954 * If the specified columns to explode do not have matching counts

8955 of elements rowwise in the frame.

8956 

8957 See Also 

8958 -------- 

8959 DataFrame.unstack : Pivot a level of the (necessarily hierarchical) 

8960 index labels. 

8961 DataFrame.melt : Unpivot a DataFrame from wide format to long format. 

8962 Series.explode : Transform each element of a list-like to a row.

8963 

8964 Notes 

8965 ----- 

8966 This routine will explode list-likes including lists, tuples, sets, 

8967 Series, and np.ndarray. The result dtype of the subset rows will 

8968 be object. Scalars will be returned unchanged, and empty list-likes will 

8969 result in a np.nan for that row. In addition, the ordering of rows in the 

8970 output will be non-deterministic when exploding sets. 

8971 

8972 Reference :ref:`the user guide <reshaping.explode>` for more examples. 

8973 

8974 Examples 

8975 -------- 

8976 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], 

8977 ... 'B': 1, 

8978 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) 

8979 >>> df 

8980 A B C 

8981 0 [0, 1, 2] 1 [a, b, c] 

8982 1 foo 1 NaN 

8983 2 [] 1 [] 

8984 3 [3, 4] 1 [d, e] 

8985 

8986 Single-column explode. 

8987 

8988 >>> df.explode('A') 

8989 A B C 

8990 0 0 1 [a, b, c] 

8991 0 1 1 [a, b, c] 

8992 0 2 1 [a, b, c] 

8993 1 foo 1 NaN 

8994 2 NaN 1 [] 

8995 3 3 1 [d, e] 

8996 3 4 1 [d, e] 

8997 

8998 Multi-column explode. 

8999 

9000 >>> df.explode(list('AC')) 

9001 A B C 

9002 0 0 1 a 

9003 0 1 1 b 

9004 0 2 1 c 

9005 1 foo 1 NaN 

9006 2 NaN 1 NaN 

9007 3 3 1 d 

9008 3 4 1 e 
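
With ``ignore_index=True`` the exploded rows are relabeled 0, 1, …, n - 1
(a minimal sketch reusing the frame above):

>>> df.explode('A', ignore_index=True)
     A  B          C
0    0  1  [a, b, c]
1    1  1  [a, b, c]
2    2  1  [a, b, c]
3  foo  1        NaN
4  NaN  1         []
5    3  1     [d, e]
6    4  1     [d, e]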

9009 """ 

9010 if not self.columns.is_unique: 

9011 raise ValueError("columns must be unique") 

9012 

9013 columns: list[Hashable] 

9014 if is_scalar(column) or isinstance(column, tuple): 

9015 columns = [column] 

9016 elif isinstance(column, list) and all( 

9017 is_scalar(c) or isinstance(c, tuple) for c in column 

9018 ): 

9019 if not column: 

9020 raise ValueError("column must be nonempty") 

9021 if len(column) > len(set(column)): 

9022 raise ValueError("column must be unique") 

9023 columns = column 

9024 else: 

9025 raise ValueError("column must be a scalar, tuple, or list thereof") 

9026 

9027 df = self.reset_index(drop=True) 

9028 if len(columns) == 1: 

9029 result = df[columns[0]].explode() 

9030 else: 

9031 mylen = lambda x: len(x) if is_list_like(x) else -1 
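# -1 marks scalar/NaN entries, which occupy a single row after
# exploding, so rows that are scalar in every column still match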

9032 counts0 = self[columns[0]].apply(mylen) 

9033 for c in columns[1:]: 

9034 if not all(counts0 == self[c].apply(mylen)): 

9035 raise ValueError("columns must have matching element counts") 

9036 result = DataFrame({c: df[c].explode() for c in columns}) 

9037 result = df.drop(columns, axis=1).join(result) 

9038 if ignore_index: 

9039 result.index = default_index(len(result)) 

9040 else: 

9041 result.index = self.index.take(result.index) 

9042 result = result.reindex(columns=self.columns, copy=False) 

9043 

9044 return result.__finalize__(self, method="explode") 

9045 

9046 def unstack(self, level: Level = -1, fill_value=None): 

9047 """ 

9048 Pivot a level of the (necessarily hierarchical) index labels. 

9049 

9050 Returns a DataFrame having a new level of column labels whose inner-most level 

9051 consists of the pivoted index labels. 

9052 

9053 If the index is not a MultiIndex, the output will be a Series 

9054 (the analogue of stack when the columns are not a MultiIndex). 

9055 

9056 Parameters 

9057 ---------- 

9058 level : int, str, or list of these, default -1 (last level) 

9059 Level(s) of index to unstack, can pass level name. 

9060 fill_value : int, str or dict 

9061 Replace NaN with this value if the unstack produces missing values. 

9062 

9063 Returns 

9064 ------- 

9065 Series or DataFrame 

9066 

9067 See Also 

9068 -------- 

9069 DataFrame.pivot : Pivot a table based on column values. 

9070 DataFrame.stack : Pivot a level of the column labels (inverse operation 

9071 from `unstack`). 

9072 

9073 Notes 

9074 ----- 

9075 Reference :ref:`the user guide <reshaping.stacking>` for more examples. 

9076 

9077 Examples 

9078 -------- 

9079 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), 

9080 ... ('two', 'a'), ('two', 'b')]) 

9081 >>> s = pd.Series(np.arange(1.0, 5.0), index=index) 

9082 >>> s 

9083 one a 1.0 

9084 b 2.0 

9085 two a 3.0 

9086 b 4.0 

9087 dtype: float64 

9088 

9089 >>> s.unstack(level=-1) 

9090 a b 

9091 one 1.0 2.0 

9092 two 3.0 4.0 

9093 

9094 >>> s.unstack(level=0) 

9095 one two 

9096 a 1.0 3.0 

9097 b 2.0 4.0 

9098 

9099 >>> df = s.unstack(level=0) 

9100 >>> df.unstack() 

9101 one a 1.0 

9102 b 2.0 

9103 two a 3.0 

9104 b 4.0 

9105 dtype: float64 
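
``fill_value`` replaces the holes that unstacking introduces; the slice
below just creates a missing (two, b) combination (a minimal sketch):

>>> s[:-1].unstack(fill_value=0)
       a    b
one  1.0  2.0
two  3.0  0.0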

9106 """ 

9107 from pandas.core.reshape.reshape import unstack 

9108 

9109 result = unstack(self, level, fill_value) 

9110 

9111 return result.__finalize__(self, method="unstack") 

9112 

9113 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) 

9114 def melt( 

9115 self, 

9116 id_vars=None, 

9117 value_vars=None, 

9118 var_name=None, 

9119 value_name="value", 

9120 col_level: Level = None, 

9121 ignore_index: bool = True, 

9122 ) -> DataFrame: 

9123 

9124 return melt( 

9125 self, 

9126 id_vars=id_vars, 

9127 value_vars=value_vars, 

9128 var_name=var_name, 

9129 value_name=value_name, 

9130 col_level=col_level, 

9131 ignore_index=ignore_index, 

9132 ).__finalize__(self, method="melt") 

9133 

9134 # ---------------------------------------------------------------------- 

9135 # Time series-related 

9136 

9137 @doc( 

9138 Series.diff, 

9139 klass="DataFrame", 

9140 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " 

9141 "Take difference over rows (0) or columns (1).\n", 

9142 other_klass="Series", 

9143 examples=dedent( 

9144 """ 

9145 Difference with previous row 

9146 

9147 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], 

9148 ... 'b': [1, 1, 2, 3, 5, 8], 

9149 ... 'c': [1, 4, 9, 16, 25, 36]}) 

9150 >>> df 

9151 a b c 

9152 0 1 1 1 

9153 1 2 1 4 

9154 2 3 2 9 

9155 3 4 3 16 

9156 4 5 5 25 

9157 5 6 8 36 

9158 

9159 >>> df.diff() 

9160 a b c 

9161 0 NaN NaN NaN 

9162 1 1.0 0.0 3.0 

9163 2 1.0 1.0 5.0 

9164 3 1.0 1.0 7.0 

9165 4 1.0 2.0 9.0 

9166 5 1.0 3.0 11.0 

9167 

9168 Difference with previous column 

9169 

9170 >>> df.diff(axis=1) 

9171 a b c 

9172 0 NaN 0 0 

9173 1 NaN -1 3 

9174 2 NaN -1 7 

9175 3 NaN -1 13 

9176 4 NaN 0 20 

9177 5 NaN 2 28 

9178 

9179 Difference with 3rd previous row 

9180 

9181 >>> df.diff(periods=3) 

9182 a b c 

9183 0 NaN NaN NaN 

9184 1 NaN NaN NaN 

9185 2 NaN NaN NaN 

9186 3 3.0 2.0 15.0 

9187 4 3.0 4.0 21.0 

9188 5 3.0 6.0 27.0 

9189 

9190 Difference with following row 

9191 

9192 >>> df.diff(periods=-1) 

9193 a b c 

9194 0 -1.0 0.0 -3.0 

9195 1 -1.0 -1.0 -5.0 

9196 2 -1.0 -1.0 -7.0 

9197 3 -1.0 -2.0 -9.0 

9198 4 -1.0 -3.0 -11.0 

9199 5 NaN NaN NaN 

9200 

9201 Overflow in input dtype 

9202 

9203 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8) 

9204 >>> df.diff() 

9205 a 

9206 0 NaN 

9207 1 255.0""" 

9208 ), 

9209 ) 

9210 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: 

9211 if not lib.is_integer(periods): 

9212 if not ( 

9213 is_float(periods) 

9214 # error: "int" has no attribute "is_integer" 

9215 and periods.is_integer() # type: ignore[attr-defined] 

9216 ): 

9217 raise ValueError("periods must be an integer") 

9218 periods = int(periods) 

9219 

9220 axis = self._get_axis_number(axis) 

9221 if axis == 1 and periods != 0: 

9222 return self - self.shift(periods, axis=axis) 

9223 

9224 new_data = self._mgr.diff(n=periods, axis=axis) 

9225 return self._constructor(new_data).__finalize__(self, "diff") 

9226 

9227 # ---------------------------------------------------------------------- 

9228 # Function application 

9229 

9230 def _gotitem( 

9231 self, 

9232 key: IndexLabel, 

9233 ndim: int, 

9234 subset: DataFrame | Series | None = None, 

9235 ) -> DataFrame | Series: 

9236 """ 

9237 Sub-classes to define. Return a sliced object. 

9238 

9239 Parameters 

9240 ---------- 

9241 key : string / list of selections 

9242 ndim : {1, 2} 

9243 requested ndim of result 

9244 subset : object, default None 

9245 subset to act on 

9246 """ 

9247 if subset is None: 

9248 subset = self 

9249 elif subset.ndim == 1: # is Series 

9250 return subset 

9251 

9252 # TODO: _shallow_copy(subset)? 

9253 return subset[key] 

9254 

9255 _agg_summary_and_see_also_doc = dedent( 

9256 """ 

9257 The aggregation operations are always performed over an axis, either the 

9258 index (default) or the column axis. This behavior is different from 

9259 `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, 

9260 `var`), where the default is to compute the aggregation of the flattened 

9261 array, e.g., ``numpy.mean(arr_2d)`` as opposed to 

9262 ``numpy.mean(arr_2d, axis=0)``. 

9263 

9264 `agg` is an alias for `aggregate`. Use the alias. 

9265 

9266 See Also 

9267 -------- 

9268 DataFrame.apply : Perform any type of operation.

9269 DataFrame.transform : Perform transformation type operations. 

9270 core.groupby.GroupBy : Perform operations over groups. 

9271 core.resample.Resampler : Perform operations over resampled bins. 

9272 core.window.Rolling : Perform operations over rolling window. 

9273 core.window.Expanding : Perform operations over expanding window. 

9274 core.window.ExponentialMovingWindow : Perform operation over exponential weighted 

9275 window. 

9276 """ 

9277 ) 
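
# Illustrative contrast for the note above (a sketch, not part of the
# docstring template): np.mean reduces the flattened array to a single
# scalar, while DataFrame.agg reduces per column:
#
#   >>> df = pd.DataFrame([[1, 2], [3, 4]])
#   >>> np.mean(df.to_numpy())   # 2.5
#   >>> df.agg("mean")           # 0 -> 2.0, 1 -> 3.0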

9278 

9279 _agg_examples_doc = dedent( 

9280 """ 

9281 Examples 

9282 -------- 

9283 >>> df = pd.DataFrame([[1, 2, 3], 

9284 ... [4, 5, 6], 

9285 ... [7, 8, 9], 

9286 ... [np.nan, np.nan, np.nan]], 

9287 ... columns=['A', 'B', 'C']) 

9288 

9289 Aggregate these functions over the rows. 

9290 

9291 >>> df.agg(['sum', 'min']) 

9292 A B C 

9293 sum 12.0 15.0 18.0 

9294 min 1.0 2.0 3.0 

9295 

9296 Different aggregations per column. 

9297 

9298 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) 

9299 A B 

9300 sum 12.0 NaN 

9301 min 1.0 2.0 

9302 max NaN 8.0 

9303 

9304 Aggregate different functions over the columns and rename the index of the resulting 

9305 DataFrame. 

9306 

9307 >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) 

9308 A B C 

9309 x 7.0 NaN NaN 

9310 y NaN 2.0 NaN 

9311 z NaN NaN 6.0 

9312 

9313 Aggregate over the columns. 

9314 

9315 >>> df.agg("mean", axis="columns") 

9316 0 2.0 

9317 1 5.0 

9318 2 8.0 

9319 3 NaN 

9320 dtype: float64 

9321 """ 

9322 ) 

9323 

9324 @doc( 

9325 _shared_docs["aggregate"], 

9326 klass=_shared_doc_kwargs["klass"], 

9327 axis=_shared_doc_kwargs["axis"], 

9328 see_also=_agg_summary_and_see_also_doc, 

9329 examples=_agg_examples_doc, 

9330 ) 

9331 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): 

9332 from pandas.core.apply import frame_apply 

9333 

9334 axis = self._get_axis_number(axis) 

9335 

9336 relabeling, func, columns, order = reconstruct_func(func, **kwargs) 

9337 

9338 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) 

9339 result = op.agg() 

9340 

9341 if relabeling: 

9342 # This is to keep the order of column occurrences unchanged, and also

9343 # to keep the order of new column occurrences unchanged

9344 

9345 # For the return values of reconstruct_func, if relabeling is 

9346 # False, columns and order will be None. 

9347 assert columns is not None 

9348 assert order is not None 

9349 

9350 result_in_dict = relabel_result(result, func, columns, order) 

9351 result = DataFrame(result_in_dict, index=columns) 

9352 

9353 return result 

9354 

9355 agg = aggregate 

9356 

9357 # error: Signature of "any" incompatible with supertype "NDFrame" [override] 

9358 @overload # type: ignore[override] 

9359 def any( 

9360 self, 

9361 *, 

9362 axis: Axis = ..., 

9363 bool_only: bool | None = ..., 

9364 skipna: bool = ..., 

9365 level: None = ..., 

9366 **kwargs, 

9367 ) -> Series: 

9368 ... 

9369 

9370 @overload 

9371 def any( 

9372 self, 

9373 *, 

9374 axis: Axis = ..., 

9375 bool_only: bool | None = ..., 

9376 skipna: bool = ..., 

9377 level: Level, 

9378 **kwargs, 

9379 ) -> DataFrame | Series: 

9380 ... 

9381 

9382 @doc(NDFrame.any, **_shared_doc_kwargs) 

9383 def any( 

9384 self, 

9385 axis: Axis = 0, 

9386 bool_only: bool | None = None, 

9387 skipna: bool = True, 

9388 level: Level = None, 

9389 **kwargs, 

9390 ) -> DataFrame | Series: 

9391 ... 

9392 

9393 @doc( 

9394 _shared_docs["transform"], 

9395 klass=_shared_doc_kwargs["klass"], 

9396 axis=_shared_doc_kwargs["axis"], 

9397 ) 

9398 def transform( 

9399 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs 

9400 ) -> DataFrame: 

9401 from pandas.core.apply import frame_apply 

9402 

9403 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) 

9404 result = op.transform() 

9405 assert isinstance(result, DataFrame) 

9406 return result 

9407 

9408 def apply( 

9409 self, 

9410 func: AggFuncType, 

9411 axis: Axis = 0, 

9412 raw: bool = False, 

9413 result_type: Literal["expand", "reduce", "broadcast"] | None = None, 

9414 args=(), 

9415 **kwargs, 

9416 ): 

9417 """ 

9418 Apply a function along an axis of the DataFrame. 

9419 

9420 Objects passed to the function are Series objects whose index is 

9421 either the DataFrame's index (``axis=0``) or the DataFrame's columns 

9422 (``axis=1``). By default (``result_type=None``), the final return type 

9423 is inferred from the return type of the applied function. Otherwise, 

9424 it depends on the `result_type` argument. 

9425 

9426 Parameters 

9427 ---------- 

9428 func : function 

9429 Function to apply to each column or row. 

9430 axis : {0 or 'index', 1 or 'columns'}, default 0 

9431 Axis along which the function is applied: 

9432 

9433 * 0 or 'index': apply function to each column. 

9434 * 1 or 'columns': apply function to each row. 

9435 

9436 raw : bool, default False 

9437 Determines if row or column is passed as a Series or ndarray object: 

9438 

9439 * ``False`` : passes each row or column as a Series to the 

9440 function. 

9441 * ``True`` : the passed function will receive ndarray objects 

9442 instead. 

9443 If you are just applying a NumPy reduction function this will 

9444 achieve much better performance. 

9445 

9446 result_type : {'expand', 'reduce', 'broadcast', None}, default None 

9447 These only act when ``axis=1`` (columns): 

9448 

9449 * 'expand' : list-like results will be turned into columns. 

9450 * 'reduce' : returns a Series if possible rather than expanding 

9451 list-like results. This is the opposite of 'expand'. 

9452 * 'broadcast' : results will be broadcast to the original shape 

9453 of the DataFrame, the original index and columns will be 

9454 retained. 

9455 

9456 The default behaviour (None) depends on the return value of the 

9457 applied function: list-like results will be returned as a Series 

9458 of those. However if the apply function returns a Series these 

9459 are expanded to columns. 

9460 args : tuple 

9461 Positional arguments to pass to `func` in addition to the 

9462 array/series. 

9463 **kwargs 

9464 Additional keyword arguments to pass as keyword arguments to

9465 `func`. 

9466 

9467 Returns 

9468 ------- 

9469 Series or DataFrame 

9470 Result of applying ``func`` along the given axis of the 

9471 DataFrame. 

9472 

9473 See Also 

9474 -------- 

9475 DataFrame.applymap: For elementwise operations. 

9476 DataFrame.aggregate: Only perform aggregating type operations. 

9477 DataFrame.transform: Only perform transforming type operations. 

9478 

9479 Notes 

9480 ----- 

9481 Functions that mutate the passed object can produce unexpected 

9482 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` 

9483 for more details. 

9484 

9485 Examples 

9486 -------- 

9487 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) 

9488 >>> df 

9489 A B 

9490 0 4 9 

9491 1 4 9 

9492 2 4 9 

9493 

9494 Using a numpy universal function (in this case the same as 

9495 ``np.sqrt(df)``): 

9496 

9497 >>> df.apply(np.sqrt) 

9498 A B 

9499 0 2.0 3.0 

9500 1 2.0 3.0 

9501 2 2.0 3.0 

9502 

9503 Using a reducing function on either axis 

9504 

9505 >>> df.apply(np.sum, axis=0) 

9506 A 12 

9507 B 27 

9508 dtype: int64 

9509 

9510 >>> df.apply(np.sum, axis=1) 

9511 0 13 

9512 1 13 

9513 2 13 

9514 dtype: int64 

9515 

9516 Returning a list-like will result in a Series 

9517 

9518 >>> df.apply(lambda x: [1, 2], axis=1) 

9519 0 [1, 2] 

9520 1 [1, 2] 

9521 2 [1, 2] 

9522 dtype: object 

9523 

9524 Passing ``result_type='expand'`` will expand list-like results 

9525 to columns of a Dataframe 

9526 

9527 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') 

9528 0 1 

9529 0 1 2 

9530 1 1 2 

9531 2 1 2 

9532 

9533 Returning a Series inside the function is similar to passing 

9534 ``result_type='expand'``. The resulting column names 

9535 will be the Series index. 

9536 

9537 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) 

9538 foo bar 

9539 0 1 2 

9540 1 1 2 

9541 2 1 2 

9542 

9543 Passing ``result_type='broadcast'`` will ensure the same shape 

9544 result, whether list-like or scalar is returned by the function, 

9545 and broadcast it along the axis. The resulting column names will 

9546 be the originals. 

9547 

9548 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') 

9549 A B 

9550 0 1 2 

9551 1 1 2 

9552 2 1 2 
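
Passing ``raw=True`` hands NumPy ndarrays rather than Series to the
function, which can be much faster for NumPy reductions (a minimal
sketch; the result matches the Series-based reduction above):

>>> df.apply(np.sum, axis=0, raw=True)
A    12
B    27
dtype: int64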

9553 """ 

9554 from pandas.core.apply import frame_apply 

9555 

9556 op = frame_apply( 

9557 self, 

9558 func=func, 

9559 axis=axis, 

9560 raw=raw, 

9561 result_type=result_type, 

9562 args=args, 

9563 kwargs=kwargs, 

9564 ) 

9565 return op.apply().__finalize__(self, method="apply") 

9566 

9567 def applymap( 

9568 self, func: PythonFuncType, na_action: str | None = None, **kwargs 

9569 ) -> DataFrame: 

9570 """ 

9571 Apply a function to a Dataframe elementwise. 

9572 

9573 This method applies a function that accepts and returns a scalar 

9574 to every element of a DataFrame. 

9575 

9576 Parameters 

9577 ---------- 

9578 func : callable 

9579 Python function, returns a single value from a single value. 

9580 na_action : {None, 'ignore'}, default None 

9581 If 'ignore', propagate NaN values without passing them to func.

9582 

9583 .. versionadded:: 1.2 

9584 

9585 **kwargs 

9586 Additional keyword arguments to pass as keyword arguments to

9587 `func`. 

9588 

9589 .. versionadded:: 1.3.0 

9590 

9591 Returns 

9592 ------- 

9593 DataFrame 

9594 Transformed DataFrame. 

9595 

9596 See Also 

9597 -------- 

9598 DataFrame.apply : Apply a function along input axis of DataFrame. 

9599 

9600 Examples 

9601 -------- 

9602 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) 

9603 >>> df 

9604 0 1 

9605 0 1.000 2.120 

9606 1 3.356 4.567 

9607 

9608 >>> df.applymap(lambda x: len(str(x))) 

9609 0 1 

9610 0 3 4 

9611 1 5 5 

9612 

9613 Like Series.map, NA values can be ignored: 

9614 

9615 >>> df_copy = df.copy() 

9616 >>> df_copy.iloc[0, 0] = pd.NA 

9617 >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore') 

9618 0 1 

9619 0 NaN 4 

9620 1 5.0 5 

9621 

9622 Note that a vectorized version of `func` often exists, which will 

9623 be much faster. You could square each number elementwise. 

9624 

9625 >>> df.applymap(lambda x: x**2) 

9626 0 1 

9627 0 1.000000 4.494400 

9628 1 11.262736 20.857489 

9629 

9630 But it's better to avoid applymap in that case. 

9631 

9632 >>> df ** 2 

9633 0 1 

9634 0 1.000000 4.494400 

9635 1 11.262736 20.857489 

9636 """ 

9637 if na_action not in {"ignore", None}: 

9638 raise ValueError( 

9639 f"na_action must be 'ignore' or None. Got {repr(na_action)}" 

9640 ) 

9641 ignore_na = na_action == "ignore" 

9642 func = functools.partial(func, **kwargs) 

9643 

9644 # if we have a dtype == 'M8[ns]', provide boxed values 

9645 def infer(x): 

9646 if x.empty: 

9647 return lib.map_infer(x, func, ignore_na=ignore_na) 

9648 return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) 

9649 

9650 return self.apply(infer).__finalize__(self, "applymap") 

9651 

9652 # ---------------------------------------------------------------------- 

9653 # Merging / joining methods 

9654 

9655 def append( 

9656 self, 

9657 other, 

9658 ignore_index: bool = False, 

9659 verify_integrity: bool = False, 

9660 sort: bool = False, 

9661 ) -> DataFrame: 

9662 """ 

9663 Append rows of `other` to the end of caller, returning a new object. 

9664 

9665 .. deprecated:: 1.4.0 

9666 Use :func:`concat` instead. For further details see 

9667 :ref:`whatsnew_140.deprecations.frame_series_append` 

9668 

9669 Columns in `other` that are not in the caller are added as new columns. 

9670 

9671 Parameters 

9672 ---------- 

9673 other : DataFrame or Series/dict-like object, or list of these 

9674 The data to append. 

9675 ignore_index : bool, default False 

9676 If True, the resulting axis will be labeled 0, 1, …, n - 1. 

9677 verify_integrity : bool, default False 

9678 If True, raise ValueError on creating index with duplicates. 

9679 sort : bool, default False 

9680 Sort columns if the columns of `self` and `other` are not aligned. 

9681 

9682 .. versionchanged:: 1.0.0 

9683 

9684 Changed to not sort by default. 

9685 

9686 Returns 

9687 ------- 

9688 DataFrame 

9689 A new DataFrame consisting of the rows of caller and the rows of `other`. 

9690 

9691 See Also 

9692 -------- 

9693 concat : General function to concatenate DataFrame or Series objects. 

9694 

9695 Notes 

9696 ----- 

9697 If a list of dict/series is passed and the keys are all contained in 

9698 the DataFrame's index, the order of the columns in the resulting 

9699 DataFrame will be unchanged. 

9700 

9701 Iteratively appending rows to a DataFrame can be more computationally 

9702 intensive than a single concatenate. A better solution is to append 

9703 those rows to a list and then concatenate the list with the original 

9704 DataFrame all at once. 

9705 

9706 Examples 

9707 -------- 

9708 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), index=['x', 'y']) 

9709 >>> df 

9710 A B 

9711 x 1 2 

9712 y 3 4 

9713 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), index=['x', 'y']) 

9714 >>> df.append(df2) 

9715 A B 

9716 x 1 2 

9717 y 3 4 

9718 x 5 6 

9719 y 7 8 

9720 

9721 With `ignore_index` set to True: 

9722 

9723 >>> df.append(df2, ignore_index=True) 

9724 A B 

9725 0 1 2 

9726 1 3 4 

9727 2 5 6 

9728 3 7 8 

9729 

9730 The following examples, while not recommended ways of generating

9731 DataFrames, show two approaches to building one from multiple data sources.

9732 

9733 Less efficient: 

9734 

9735 >>> df = pd.DataFrame(columns=['A']) 

9736 >>> for i in range(5): 

9737 ... df = df.append({'A': i}, ignore_index=True) 

9738 >>> df 

9739 A 

9740 0 0 

9741 1 1 

9742 2 2 

9743 3 3 

9744 4 4 

9745 

9746 More efficient: 

9747 

9748 >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)], 

9749 ... ignore_index=True) 

9750 A 

9751 0 0 

9752 1 1 

9753 2 2 

9754 3 3 

9755 4 4 

9756 """ 

9757 warnings.warn( 

9758 "The frame.append method is deprecated " 

9759 "and will be removed from pandas in a future version. " 

9760 "Use pandas.concat instead.", 

9761 FutureWarning, 

9762 stacklevel=find_stack_level(), 

9763 ) 

9764 

9765 return self._append(other, ignore_index, verify_integrity, sort) 

9766 

9767 def _append( 

9768 self, 

9769 other, 

9770 ignore_index: bool = False, 

9771 verify_integrity: bool = False, 

9772 sort: bool = False, 

9773 ) -> DataFrame: 

9774 if isinstance(other, (Series, dict)): 

9775 if isinstance(other, dict): 

9776 if not ignore_index: 

9777 raise TypeError("Can only append a dict if ignore_index=True") 

9778 other = Series(other) 

9779 if other.name is None and not ignore_index: 

9780 raise TypeError( 

9781 "Can only append a Series if ignore_index=True " 

9782 "or if the Series has a name" 

9783 ) 

9784 

9785 index = Index([other.name], name=self.index.name) 

9786 row_df = other.to_frame().T 

9787 # infer_objects is needed for 

9788 # test_append_empty_frame_to_series_with_dateutil_tz 

9789 other = row_df.infer_objects().rename_axis(index.names, copy=False) 

9790 elif isinstance(other, list): 

9791 if not other: 

9792 pass 

9793 elif not isinstance(other[0], DataFrame): 

9794 other = DataFrame(other) 

9795 if self.index.name is not None and not ignore_index: 

9796 other.index.name = self.index.name 

9797 

9798 from pandas.core.reshape.concat import concat 

9799 

9800 if isinstance(other, (list, tuple)): 

9801 to_concat = [self, *other] 

9802 else: 

9803 to_concat = [self, other] 

9804 

9805 result = concat( 

9806 to_concat, 

9807 ignore_index=ignore_index, 

9808 verify_integrity=verify_integrity, 

9809 sort=sort, 

9810 ) 

9811 return result.__finalize__(self, method="append") 

9812 

9813 def join( 

9814 self, 

9815 other: DataFrame | Series | list[DataFrame | Series], 

9816 on: IndexLabel | None = None, 

9817 how: str = "left", 

9818 lsuffix: str = "", 

9819 rsuffix: str = "", 

9820 sort: bool = False, 

9821 validate: str | None = None, 

9822 ) -> DataFrame: 

9823 """ 

9824 Join columns of another DataFrame. 

9825 

9826 Join columns with `other` DataFrame either on index or on a key 

9827 column. Efficiently join multiple DataFrame objects by index at once by 

9828 passing a list. 

9829 

9830 Parameters 

9831 ---------- 

9832 other : DataFrame, Series, or a list containing any combination of them 

9833 Index should be similar to one of the columns in this one. If a 

9834 Series is passed, its name attribute must be set, and that will be 

9835 used as the column name in the resulting joined DataFrame. 

9836 on : str, list of str, or array-like, optional 

9837 Column or index level name(s) in the caller to join on the index 

9838 in `other`, otherwise joins index-on-index. If multiple 

9839 values given, the `other` DataFrame must have a MultiIndex. Can 

9840 pass an array as the join key if it is not already contained in 

9841 the calling DataFrame. Like an Excel VLOOKUP operation. 

9842 how : {'left', 'right', 'outer', 'inner'}, default 'left' 

9843 How to handle the operation of the two objects. 

9844 

9845 * left: use calling frame's index (or column if on is specified) 

9846 * right: use `other`'s index. 

9847 * outer: form union of calling frame's index (or column if on is 

9848 specified) with `other`'s index, and sort it

9849 lexicographically.

9850 * inner: form intersection of calling frame's index (or column if 

9851 on is specified) with `other`'s index, preserving the order 

9852 of the calling's one. 

9853 * cross: creates the cartesian product from both frames, preserves the order 

9854 of the left keys. 

9855 

9856 .. versionadded:: 1.2.0 

9857 

9858 lsuffix : str, default '' 

9859 Suffix to use from left frame's overlapping columns. 

9860 rsuffix : str, default '' 

9861 Suffix to use from right frame's overlapping columns. 

9862 sort : bool, default False 

9863 Order result DataFrame lexicographically by the join key. If False, 

9864 the order of the join key depends on the join type (how keyword). 

9865 validate : str, optional 

9866 If specified, checks if join is of specified type. 

9867 * "one_to_one" or "1:1": check if join keys are unique in both left 

9868 and right datasets. 

9869 * "one_to_many" or "1:m": check if join keys are unique in left dataset. 

9870 * "many_to_one" or "m:1": check if join keys are unique in right dataset. 

9871 * "many_to_many" or "m:m": allowed, but does not result in checks. 

9872 .. versionadded:: 1.5.0 

9873 

9874 Returns 

9875 ------- 

9876 DataFrame 

9877 A dataframe containing columns from both the caller and `other`. 

9878 

9879 See Also 

9880 -------- 

9881 DataFrame.merge : For column(s)-on-column(s) operations. 

9882 

9883 Notes 

9884 ----- 

9885 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when 

9886 passing a list of `DataFrame` objects. 

9887 

9888 Support for specifying index levels as the `on` parameter was added 

9889 in version 0.23.0. 

9890 

9891 Examples 

9892 -------- 

9893 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], 

9894 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) 

9895 

9896 >>> df 

9897 key A 

9898 0 K0 A0 

9899 1 K1 A1 

9900 2 K2 A2 

9901 3 K3 A3 

9902 4 K4 A4 

9903 5 K5 A5 

9904 

9905 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 

9906 ... 'B': ['B0', 'B1', 'B2']}) 

9907 

9908 >>> other 

9909 key B 

9910 0 K0 B0 

9911 1 K1 B1 

9912 2 K2 B2 

9913 

9914 Join DataFrames using their indexes. 

9915 

9916 >>> df.join(other, lsuffix='_caller', rsuffix='_other') 

9917 key_caller A key_other B 

9918 0 K0 A0 K0 B0 

9919 1 K1 A1 K1 B1 

9920 2 K2 A2 K2 B2 

9921 3 K3 A3 NaN NaN 

9922 4 K4 A4 NaN NaN 

9923 5 K5 A5 NaN NaN 

9924 

9925 If we want to join using the key columns, we need to set key to be 

9926 the index in both `df` and `other`. The joined DataFrame will have 

9927 key as its index. 

9928 

9929 >>> df.set_index('key').join(other.set_index('key')) 

9930 A B 

9931 key 

9932 K0 A0 B0 

9933 K1 A1 B1 

9934 K2 A2 B2 

9935 K3 A3 NaN 

9936 K4 A4 NaN 

9937 K5 A5 NaN 

9938 

9939 Another option to join using the key columns is to use the `on` 

9940 parameter. DataFrame.join always uses `other`'s index but we can use 

9941 any column in `df`. This method preserves the original DataFrame's 

9942 index in the result. 

9943 

9944 >>> df.join(other.set_index('key'), on='key') 

9945 key A B 

9946 0 K0 A0 B0 

9947 1 K1 A1 B1 

9948 2 K2 A2 B2 

9949 3 K3 A3 NaN 

9950 4 K4 A4 NaN 

9951 5 K5 A5 NaN 

9952 

9953 Using non-unique key values shows how they are matched. 

9954 

9955 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], 

9956 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) 

9957 

9958 >>> df 

9959 key A 

9960 0 K0 A0 

9961 1 K1 A1 

9962 2 K1 A2 

9963 3 K3 A3 

9964 4 K0 A4 

9965 5 K1 A5 

9966 

9967 >>> df.join(other.set_index('key'), on='key', validate='m:1') 

9968 key A B 

9969 0 K0 A0 B0 

9970 1 K1 A1 B1 

9971 2 K1 A2 B1 

9972 3 K3 A3 NaN 

9973 4 K0 A4 B0 

9974 5 K1 A5 B1 
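
A cross join forms the cartesian product of the rows of both frames
(a minimal sketch with two tiny frames):

>>> left = pd.DataFrame({'a': [1, 2]})
>>> right = pd.DataFrame({'b': ['x', 'y']})
>>> left.join(right, how='cross')
   a  b
0  1  x
1  1  y
2  2  x
3  2  y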

9975 """ 

9976 return self._join_compat( 

9977 other, 

9978 on=on, 

9979 how=how, 

9980 lsuffix=lsuffix, 

9981 rsuffix=rsuffix, 

9982 sort=sort, 

9983 validate=validate, 

9984 ) 

9985 

9986 def _join_compat( 

9987 self, 

9988 other: DataFrame | Series | Iterable[DataFrame | Series], 

9989 on: IndexLabel | None = None, 

9990 how: str = "left", 

9991 lsuffix: str = "", 

9992 rsuffix: str = "", 

9993 sort: bool = False, 

9994 validate: str | None = None, 

9995 ): 

9996 from pandas.core.reshape.concat import concat 

9997 from pandas.core.reshape.merge import merge 

9998 

9999 if isinstance(other, Series): 

10000 if other.name is None: 

10001 raise ValueError("Other Series must have a name") 

10002 other = DataFrame({other.name: other}) 

10003 

10004 if isinstance(other, DataFrame): 

10005 if how == "cross": 

10006 return merge( 

10007 self, 

10008 other, 

10009 how=how, 

10010 on=on, 

10011 suffixes=(lsuffix, rsuffix), 

10012 sort=sort, 

10013 validate=validate, 

10014 ) 

10015 return merge( 

10016 self, 

10017 other, 

10018 left_on=on, 

10019 how=how, 

10020 left_index=on is None, 

10021 right_index=True, 

10022 suffixes=(lsuffix, rsuffix), 

10023 sort=sort, 

10024 validate=validate, 

10025 ) 

10026 else: 

10027 if on is not None: 

10028 raise ValueError( 

10029 "Joining multiple DataFrames only supported for joining on index" 

10030 ) 

10031 

10032 if rsuffix or lsuffix: 

10033 raise ValueError( 

10034 "Suffixes not supported when joining multiple DataFrames" 

10035 ) 

10036 

10037 # Mypy thinks the RHS is a 

10038 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas 

10039 # the LHS is an "Iterable[DataFrame]", but in reality both types are 

10040 # "Iterable[Union[DataFrame, Series]]" due to the if statements 

10041 frames = [cast("DataFrame | Series", self)] + list(other) 

10042 

10043 can_concat = all(df.index.is_unique for df in frames) 
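# unique indexes let us align every frame in a single concat;
# otherwise fall back to pairwise index-on-index merges below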

10044 

10045 # join indexes only using concat 

10046 if can_concat: 

10047 if how == "left": 

10048 res = concat( 

10049 frames, axis=1, join="outer", verify_integrity=True, sort=sort 

10050 ) 

10051 return res.reindex(self.index, copy=False) 

10052 else: 

10053 return concat( 

10054 frames, axis=1, join=how, verify_integrity=True, sort=sort 

10055 ) 

10056 

10057 joined = frames[0] 

10058 

10059 for frame in frames[1:]: 

10060 joined = merge( 

10061 joined, 

10062 frame, 

10063 how=how, 

10064 left_index=True, 

10065 right_index=True, 

10066 validate=validate, 

10067 ) 

10068 

10069 return joined 

10070 

10071 @Substitution("") 

10072 @Appender(_merge_doc, indents=2) 

10073 def merge( 

10074 self, 

10075 right: DataFrame | Series, 

10076 how: str = "inner", 

10077 on: IndexLabel | None = None, 

10078 left_on: IndexLabel | None = None, 

10079 right_on: IndexLabel | None = None, 

10080 left_index: bool = False, 

10081 right_index: bool = False, 

10082 sort: bool = False, 

10083 suffixes: Suffixes = ("_x", "_y"), 

10084 copy: bool = True, 

10085 indicator: bool = False, 

10086 validate: str | None = None, 

10087 ) -> DataFrame: 

10088 from pandas.core.reshape.merge import merge 

10089 

10090 return merge( 

10091 self, 

10092 right, 

10093 how=how, 

10094 on=on, 

10095 left_on=left_on, 

10096 right_on=right_on, 

10097 left_index=left_index, 

10098 right_index=right_index, 

10099 sort=sort, 

10100 suffixes=suffixes, 

10101 copy=copy, 

10102 indicator=indicator, 

10103 validate=validate, 

10104 ) 

10105 

10106 def round( 

10107 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs 

10108 ) -> DataFrame: 

10109 """ 

10110 Round a DataFrame to a variable number of decimal places. 

10111 

10112 Parameters 

10113 ---------- 

10114 decimals : int, dict, Series 

10115 Number of decimal places to round each column to. If an int is 

10116 given, round each column to the same number of places. 

10117 Otherwise dict and Series round to variable numbers of places. 

10118 Column names should be in the keys if `decimals` is a 

10119 dict-like, or in the index if `decimals` is a Series. Any 

10120 columns not included in `decimals` will be left as is. Elements 

10121 of `decimals` which are not columns of the input will be 

10122 ignored. 

10123 *args 

10124 Additional keywords have no effect but might be accepted for 

10125 compatibility with numpy. 

10126 **kwargs 

10127 Additional keywords have no effect but might be accepted for 

10128 compatibility with numpy. 

10129 

10130 Returns 

10131 ------- 

10132 DataFrame 

10133 A DataFrame with the affected columns rounded to the specified 

10134 number of decimal places. 

10135 

10136 See Also 

10137 -------- 

10138 numpy.around : Round a numpy array to the given number of decimals. 

10139 Series.round : Round a Series to the given number of decimals. 

10140 

10141 Examples 

10142 -------- 

10143 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], 

10144 ... columns=['dogs', 'cats']) 

10145 >>> df 

10146 dogs cats 

10147 0 0.21 0.32 

10148 1 0.01 0.67 

10149 2 0.66 0.03 

10150 3 0.21 0.18 

10151 

10152 By providing an integer each column is rounded to the same number 

10153 of decimal places 

10154 

10155 >>> df.round(1) 

10156 dogs cats 

10157 0 0.2 0.3 

10158 1 0.0 0.7 

10159 2 0.7 0.0 

10160 3 0.2 0.2 

10161 

10162 With a dict, the number of places for specific columns can be 

10163 specified with the column names as key and the number of decimal 

10164 places as value 

10165 

10166 >>> df.round({'dogs': 1, 'cats': 0}) 

10167 dogs cats 

10168 0 0.2 0.0 

10169 1 0.0 1.0 

10170 2 0.7 0.0 

10171 3 0.2 0.0 

10172 

10173 Using a Series, the number of places for specific columns can be 

10174 specified with the column names as index and the number of 

10175 decimal places as value 

10176 

10177 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) 

10178 >>> df.round(decimals) 

10179 dogs cats 

10180 0 0.2 0.0 

10181 1 0.0 1.0 

10182 2 0.7 0.0 

10183 3 0.2 0.0 

10184 """ 

10185 from pandas.core.reshape.concat import concat 

10186 

10187 def _dict_round(df: DataFrame, decimals): 

10188 for col, vals in df.items(): 

10189 try: 

10190 yield _series_round(vals, decimals[col]) 

10191 except KeyError: 

10192 yield vals 

10193 

10194 def _series_round(ser: Series, decimals: int): 

10195 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): 

10196 return ser.round(decimals) 

10197 return ser 

10198 

10199 nv.validate_round(args, kwargs) 

10200 

10201 if isinstance(decimals, (dict, Series)): 

10202 if isinstance(decimals, Series) and not decimals.index.is_unique: 

10203 raise ValueError("Index of decimals must be unique") 

10204 if is_dict_like(decimals) and not all( 

10205 is_integer(value) for _, value in decimals.items() 

10206 ): 

10207 raise TypeError("Values in decimals must be integers") 

10208 new_cols = list(_dict_round(self, decimals)) 

10209 elif is_integer(decimals): 

10210 # Dispatch to Series.round 

10211 new_cols = [_series_round(v, decimals) for _, v in self.items()] 

10212 else: 

10213 raise TypeError("decimals must be an integer, a dict-like or a Series") 

10214 

10215 if len(new_cols) > 0: 

10216 return self._constructor( 

10217 concat(new_cols, axis=1), index=self.index, columns=self.columns 

10218 ).__finalize__(self, method="round") 

10219 else: 

10220 return self 

10221 

10222 # ---------------------------------------------------------------------- 

10223 # Statistical methods, etc. 

10224 

10225 def corr( 

10226 self, 

10227 method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", 

10228 min_periods: int = 1, 

10229 numeric_only: bool | lib.NoDefault = lib.no_default, 

10230 ) -> DataFrame: 

10231 """ 

10232 Compute pairwise correlation of columns, excluding NA/null values. 

10233 

10234 Parameters 

10235 ---------- 

10236 method : {'pearson', 'kendall', 'spearman'} or callable 

10237 Method of correlation: 

10238 

10239 * pearson : standard correlation coefficient 

10240 * kendall : Kendall Tau correlation coefficient 

10241 * spearman : Spearman rank correlation 

10242 * callable: callable with input two 1d ndarrays 

10243 and returning a float. Note that the returned matrix from corr 

10244 will have 1 along the diagonals and will be symmetric 

10245 regardless of the callable's behavior. 

10246 min_periods : int, optional 

10247 Minimum number of observations required per pair of columns 

10248 to have a valid result. Currently only available for Pearson 

10249 and Spearman correlation. 

10250 numeric_only : bool, default True 

10251 Include only `float`, `int` or `boolean` data. 

10252 

10253 .. versionadded:: 1.5.0 

10254 

10255 .. deprecated:: 1.5.0 

10256 The default value of ``numeric_only`` will be ``False`` in a future 

10257 version of pandas. 

10258 

10259 Returns 

10260 ------- 

10261 DataFrame 

10262 Correlation matrix. 

10263 

10264 See Also 

10265 -------- 

10266 DataFrame.corrwith : Compute pairwise correlation with another 

10267 DataFrame or Series. 

10268 Series.corr : Compute the correlation between two Series. 

10269 

10270 Notes 

10271 ----- 

10272 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. 

10273 

10274 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ 

10275 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ 

10276 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ 

10277 

10278 Examples 

10279 -------- 

10280 >>> def histogram_intersection(a, b): 

10281 ... v = np.minimum(a, b).sum().round(decimals=1) 

10282 ... return v 

10283 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], 

10284 ... columns=['dogs', 'cats']) 

10285 >>> df.corr(method=histogram_intersection) 

10286 dogs cats 

10287 dogs 1.0 0.3 

10288 cats 0.3 1.0 

10289 

10290 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], 

10291 ... columns=['dogs', 'cats']) 

10292 >>> df.corr(min_periods=3) 

10293 dogs cats 

10294 dogs 1.0 NaN 

10295 cats NaN 1.0 
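
Rank-based methods are selected by name; with perfectly anti-correlated
ranks the Spearman coefficient is -1 (a minimal sketch):

>>> df = pd.DataFrame({'dogs': [1, 2, 3], 'cats': [3, 2, 1]})
>>> df.corr(method='spearman')
      dogs  cats
dogs   1.0  -1.0
cats  -1.0   1.0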

10296 """ # noqa:E501 

10297 numeric_only_bool = com.resolve_numeric_only(numeric_only) 

10298 data = self._get_numeric_data() if numeric_only_bool else self 

10299 if numeric_only is lib.no_default and len(data.columns) < len(self.columns): 

10300 com.deprecate_numeric_only_default(type(self), "corr") 

10301 

10302 cols = data.columns 

10303 idx = cols.copy() 

10304 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 

10305 

10306 if method == "pearson": 

10307 correl = libalgos.nancorr(mat, minp=min_periods) 

10308 elif method == "spearman": 

10309 correl = libalgos.nancorr_spearman(mat, minp=min_periods) 

10310 elif method == "kendall" or callable(method): 

10311 if min_periods is None: 

10312 min_periods = 1 

10313 mat = mat.T 

10314 corrf = nanops.get_corr_func(method) 

10315 K = len(cols) 

10316 correl = np.empty((K, K), dtype=float) 

10317 mask = np.isfinite(mat) 

10318 for i, ac in enumerate(mat): 

10319 for j, bc in enumerate(mat): 

10320 if i > j: 

10321 continue 

10322 

10323 valid = mask[i] & mask[j] 

10324 if valid.sum() < min_periods: 

10325 c = np.nan 

10326 elif i == j: 

10327 c = 1.0 

10328 elif not valid.all(): 

10329 c = corrf(ac[valid], bc[valid]) 

10330 else: 

10331 c = corrf(ac, bc) 

10332 correl[i, j] = c 

10333 correl[j, i] = c 

10334 else: 

10335 raise ValueError( 

10336 "method must be either 'pearson', " 

10337 "'spearman', 'kendall', or a callable, " 

10338 f"'{method}' was supplied" 

10339 ) 

10340 

10341 return self._constructor(correl, index=idx, columns=cols) 

10342 

10343 def cov( 

10344 self, 

10345 min_periods: int | None = None, 

10346 ddof: int | None = 1, 

10347 numeric_only: bool | lib.NoDefault = lib.no_default, 

10348 ) -> DataFrame: 

10349 """ 

10350 Compute pairwise covariance of columns, excluding NA/null values. 

10351 

10352 Compute the pairwise covariance among the series of a DataFrame. 

10353 The returned data frame is the `covariance matrix 

10354 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns 

10355 of the DataFrame. 

10356 

10357 Both NA and null values are automatically excluded from the 

10358 calculation. (See the note below about bias from missing values.) 

10359 A threshold can be set for the minimum number of 

10360 observations for each value created. Comparisons with observations 

10361 below this threshold will be returned as ``NaN``. 

10362 

10363 This method is generally used for the analysis of time series data to 

10364 understand the relationship between different measures 

10365 across time. 

10366 

10367 Parameters 

10368 ---------- 

10369 min_periods : int, optional 

10370 Minimum number of observations required per pair of columns 

10371 to have a valid result. 

10372 

10373 ddof : int, default 1 

10374 Delta degrees of freedom. The divisor used in calculations 

10375 is ``N - ddof``, where ``N`` represents the number of elements. 

10376 

10377 .. versionadded:: 1.1.0 

10378 

10379 numeric_only : bool, default True 

10380 Include only `float`, `int` or `boolean` data. 

10381 

10382 .. versionadded:: 1.5.0 

10383 

10384 .. deprecated:: 1.5.0 

10385 The default value of ``numeric_only`` will be ``False`` in a future 

10386 version of pandas. 

10387 

10388 Returns 

10389 ------- 

10390 DataFrame 

10391 The covariance matrix of the series of the DataFrame. 

10392 

10393 See Also 

10394 -------- 

10395 Series.cov : Compute covariance with another Series. 

10396 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample 

10397 covariance. 

10398 core.window.expanding.Expanding.cov : Expanding sample covariance. 

10399 core.window.rolling.Rolling.cov : Rolling sample covariance. 

10400 

10401 Notes 

10402 ----- 

10403 Returns the covariance matrix of the DataFrame's time series. 

10404 The covariance is normalized by N-ddof. 

10405 

10406 For DataFrames that have Series that are missing data (assuming that 

10407 data is `missing at random 

10408 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__) 

10409 the returned covariance matrix will be an unbiased estimate 

10410 of the variance and covariance between the member Series. 

10411 

10412 However, for many applications this estimate may not be acceptable 

10413 because the estimated covariance matrix is not guaranteed to be positive

10414 semi-definite. This could lead to estimated correlations having

10415 absolute values which are greater than one, and/or a non-invertible 

10416 covariance matrix. See `Estimation of covariance matrices 

10417 <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_ 

10418 matrices>`__ for more details. 

10419 

10420 Examples 

10421 -------- 

10422 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], 

10423 ... columns=['dogs', 'cats']) 

10424 >>> df.cov() 

10425 dogs cats 

10426 dogs 0.666667 -1.000000 

10427 cats -1.000000 1.666667 

10428 

10429 >>> np.random.seed(42) 

10430 >>> df = pd.DataFrame(np.random.randn(1000, 5), 

10431 ... columns=['a', 'b', 'c', 'd', 'e']) 

10432 >>> df.cov() 

10433 a b c d e 

10434 a 0.998438 -0.020161 0.059277 -0.008943 0.014144 

10435 b -0.020161 1.059352 -0.008543 -0.024738 0.009826 

10436 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 

10437 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 

10438 e 0.014144 0.009826 -0.000271 -0.013692 0.977795 

10439 

10440 **Minimum number of periods** 

10441 

10442 This method also supports an optional ``min_periods`` keyword 

10443 that specifies the required minimum number of non-NA observations for 

10444 each column pair in order to have a valid result: 

10445 

10446 >>> np.random.seed(42) 

10447 >>> df = pd.DataFrame(np.random.randn(20, 3), 

10448 ... columns=['a', 'b', 'c']) 

10449 >>> df.loc[df.index[:5], 'a'] = np.nan 

10450 >>> df.loc[df.index[5:10], 'b'] = np.nan 

10451 >>> df.cov(min_periods=12) 

10452 a b c 

10453 a 0.316741 NaN -0.150812 

10454 b NaN 1.248003 0.191417 

10455 c -0.150812 0.191417 0.895202 
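
``ddof=0`` normalizes by N instead of N - 1 (a minimal sketch reusing
the first small frame above):

>>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
...                   columns=['dogs', 'cats'])
>>> df.cov(ddof=0)
      dogs  cats
dogs  0.50 -0.75
cats -0.75  1.25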

10456 """ 

10457 numeric_only_bool = com.resolve_numeric_only(numeric_only) 

10458 data = self._get_numeric_data() if numeric_only_bool else self 

10459 if numeric_only is lib.no_default and len(data.columns) < len(self.columns): 

10460 com.deprecate_numeric_only_default(type(self), "cov") 

10461 

10462 cols = data.columns 

10463 idx = cols.copy() 

10464 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 

10465 

10466 if notna(mat).all(): 

10467 if min_periods is not None and min_periods > len(mat): 

10468 base_cov = np.empty((mat.shape[1], mat.shape[1])) 

10469 base_cov.fill(np.nan) 

10470 else: 

10471 base_cov = np.cov(mat.T, ddof=ddof) 

10472 base_cov = base_cov.reshape((len(cols), len(cols))) 

10473 else: 

10474 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) 

10475 

10476 return self._constructor(base_cov, index=idx, columns=cols) 

10477 

10478 def corrwith( 

10479 self, 

10480 other: DataFrame | Series, 

10481 axis: Axis = 0, 

10482 drop: bool = False, 

10483 method: Literal["pearson", "kendall", "spearman"] 

10484 | Callable[[np.ndarray, np.ndarray], float] = "pearson", 

10485 numeric_only: bool | lib.NoDefault = lib.no_default, 

10486 ) -> Series: 

10487 """ 

10488 Compute pairwise correlation. 

10489 

10490 Pairwise correlation is computed between rows or columns of 

10491 DataFrame with rows or columns of Series or DataFrame. DataFrames 

10492 are first aligned along both axes before computing the 

10493 correlations. 

10494 

10495 Parameters 

10496 ---------- 

10497 other : DataFrame, Series 

10498 Object with which to compute correlations. 

10499 axis : {0 or 'index', 1 or 'columns'}, default 0 

10500 The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for 

10501 column-wise. 

10502 drop : bool, default False 

10503 Drop missing indices from result. 

10504 method : {'pearson', 'kendall', 'spearman'} or callable 

10505 Method of correlation: 

10506 

10507 * pearson : standard correlation coefficient 

10508 * kendall : Kendall Tau correlation coefficient 

10509 * spearman : Spearman rank correlation 

10510 * callable: callable with input two 1d ndarrays 

10511 and returning a float. 

10512 

10513 numeric_only : bool, default True 

10514 Include only `float`, `int` or `boolean` data. 

10515 

10516 .. versionadded:: 1.5.0 

10517 

10518 .. deprecated:: 1.5.0 

10519 The default value of ``numeric_only`` will be ``False`` in a future 

10520 version of pandas. 

10521 

10522 Returns 

10523 ------- 

10524 Series 

10525 Pairwise correlations. 

10526 

10527 See Also 

10528 -------- 

10529 DataFrame.corr : Compute pairwise correlation of columns. 

10530 

10531 Examples 

10532 -------- 

10533 >>> index = ["a", "b", "c", "d", "e"] 

10534 >>> columns = ["one", "two", "three", "four"] 

10535 >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) 

10536 >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) 

10537 >>> df1.corrwith(df2) 

10538 one 1.0 

10539 two 1.0 

10540 three 1.0 

10541 four 1.0 

10542 dtype: float64 

10543 

10544 >>> df2.corrwith(df1, axis=1) 

10545 a 1.0 

10546 b 1.0 

10547 c 1.0 

10548 d 1.0 

10549 e NaN 

10550 dtype: float64 
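
With ``drop=True`` the non-matching label is omitted rather than
reported as NaN (a sketch reusing the frames above):

>>> df2.corrwith(df1, axis=1, drop=True)
a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64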

10551 """ # noqa:E501 

10552 axis = self._get_axis_number(axis) 

10553 numeric_only_bool = com.resolve_numeric_only(numeric_only) 

10554 this = self._get_numeric_data() if numeric_only_bool else self 

10555 if numeric_only is lib.no_default and len(this.columns) < len(self.columns): 

10556 com.deprecate_numeric_only_default(type(self), "corrwith") 

10557 

10558 if isinstance(other, Series): 

10559 return this.apply(lambda x: other.corr(x, method=method), axis=axis) 

10560 

10561 if numeric_only_bool: 

10562 other = other._get_numeric_data() 

10563 left, right = this.align(other, join="inner", copy=False) 

10564 

10565 if axis == 1: 

10566 left = left.T 

10567 right = right.T 

10568 

10569 if method == "pearson": 

10570 # mask missing values 

10571 left = left + right * 0 

10572 right = right + left * 0 

10573 

10574 # demeaned data 

10575 ldem = left - left.mean(numeric_only=numeric_only_bool) 

10576 rdem = right - right.mean(numeric_only=numeric_only_bool) 

10577 

10578 num = (ldem * rdem).sum() 

10579 dom = ( 

10580 (left.count() - 1) 

10581 * left.std(numeric_only=numeric_only_bool) 

10582 * right.std(numeric_only=numeric_only_bool) 

10583 ) 

10584 

10585 correl = num / dom 

10586 

10587 elif method in ["kendall", "spearman"] or callable(method): 

10588 

10589 def c(x): 

10590 return nanops.nancorr(x[0], x[1], method=method) 

10591 

10592 correl = self._constructor_sliced( 

10593 map(c, zip(left.values.T, right.values.T)), index=left.columns 

10594 ) 

10595 

10596 else: 

10597 raise ValueError( 

10598 f"Invalid method {method} was passed, " 

10599 "valid methods are: 'pearson', 'kendall', " 

10600 "'spearman', or callable" 

10601 ) 

10602 

10603 if not drop: 

10604 # Find non-matching labels along the given axis 

10605 # and append missing correlations (GH 22375) 

10606 raxis = 1 if axis == 0 else 0 

10607 result_index = this._get_axis(raxis).union(other._get_axis(raxis)) 

10608 idx_diff = result_index.difference(correl.index) 

10609 

10610 if len(idx_diff) > 0: 

10611 correl = correl._append( 

10612 Series([np.nan] * len(idx_diff), index=idx_diff) 

10613 ) 

10614 

10615 return correl 

10616 
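# Standalone sketch of the Pearson branch above (illustrative only): for
# aligned frames, corrwith is equivalent to correlating each matching column
# pair by hand. The column names are hypothetical.
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "y": [4.0, 3.0, 2.0, 1.0]})
df2 = pd.DataFrame({"x": [1.0, 2.0, 4.0, 3.0], "y": [1.0, 2.0, 3.0, 4.0]})

fast = df1.corrwith(df2)
slow = pd.Series({col: df1[col].corr(df2[col]) for col in df1.columns})
assert np.allclose(fast, slow)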

10617 # ---------------------------------------------------------------------- 

10618 # ndarray-like stats methods 

10619 

10620 def count(self, axis: Axis = 0, level: Level = None, numeric_only: bool = False): 

10621 """ 

10622 Count non-NA cells for each column or row. 

10623 

10624 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending 

10625 on `pandas.options.mode.use_inf_as_na`) are considered NA. 

10626 

10627 Parameters 

10628 ---------- 

10629 axis : {0 or 'index', 1 or 'columns'}, default 0 

10630 If 0 or 'index' counts are generated for each column. 

10631 If 1 or 'columns' counts are generated for each row. 

10632 level : int or str, optional 

10633 If the axis is a `MultiIndex` (hierarchical), count along a 

10634 particular `level`, collapsing into a `DataFrame`. 

10635 A `str` specifies the level name. 

10636 numeric_only : bool, default False 

10637 Include only `float`, `int` or `boolean` data. 

10638 

10639 Returns 

10640 ------- 

10641 Series or DataFrame 

10642 For each column/row the number of non-NA/null entries. 

10643 If `level` is specified returns a `DataFrame`. 

10644 

10645 See Also 

10646 -------- 

10647 Series.count: Number of non-NA elements in a Series. 

10648 DataFrame.value_counts: Count unique combinations of columns. 

10649 DataFrame.shape: Number of DataFrame rows and columns (including NA 

10650 elements). 

10651 DataFrame.isna: Boolean same-sized DataFrame showing places of NA 

10652 elements. 

10653 

10654 Examples 

10655 -------- 

10656 Constructing DataFrame from a dictionary: 

10657 

10658 >>> df = pd.DataFrame({"Person": 

10659 ... ["John", "Myla", "Lewis", "John", "Myla"], 

10660 ... "Age": [24., np.nan, 21., 33, 26], 

10661 ... "Single": [False, True, True, True, False]}) 

10662 >>> df 

10663 Person Age Single 

10664 0 John 24.0 False 

10665 1 Myla NaN True 

10666 2 Lewis 21.0 True 

10667 3 John 33.0 True 

10668 4 Myla 26.0 False 

10669 

10670 Notice the uncounted NA values: 

10671 

10672 >>> df.count() 

10673 Person 5 

10674 Age 4 

10675 Single 5 

10676 dtype: int64 

10677 

10678 Counts for each **row**: 

10679 

10680 >>> df.count(axis='columns') 

10681 0 3 

10682 1 2 

10683 2 3 

10684 3 3 

10685 4 3 

10686 dtype: int64 

10687 """ 

10688 axis = self._get_axis_number(axis) 

10689 if level is not None: 

10690 warnings.warn( 

10691 "Using the level keyword in DataFrame and Series aggregations is " 

10692 "deprecated and will be removed in a future version. Use groupby " 

10693 "instead. df.count(level=1) should use df.groupby(level=1).count().", 

10694 FutureWarning, 

10695 stacklevel=find_stack_level(), 

10696 ) 

10697 res = self._count_level(level, axis=axis, numeric_only=numeric_only) 

10698 return res.__finalize__(self, method="count") 

10699 

10700 if numeric_only: 

10701 frame = self._get_numeric_data() 

10702 else: 

10703 frame = self 

10704 

10705 # GH #423 

10706 if len(frame._get_axis(axis)) == 0: 

10707 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) 

10708 else: 

10709 if frame._is_mixed_type or frame._mgr.any_extension_types: 

10710                # the any_extension_types check is really only hit for single- 

10711                # column frames with an extension array 

10712 result = notna(frame).sum(axis=axis) 

10713 else: 

10714 # GH13407 

10715 series_counts = notna(frame).sum(axis=axis) 

10716 counts = series_counts.values 

10717 result = self._constructor_sliced( 

10718 counts, index=frame._get_agg_axis(axis) 

10719 ) 

10720 

10721 return result.astype("int64").__finalize__(self, method="count") 

10722 
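# Standalone sketch of the deprecation path in count() above (illustrative
# only): the ``level`` keyword warns and should be replaced by the groupby
# spelling, which produces the same per-level non-NA counts.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1)], names=["outer", "inner"]
)
df = pd.DataFrame({"v": [1.0, np.nan, 3.0]}, index=idx)

# deprecated:  df.count(level="outer")
print(df.groupby(level="outer").count())  # outer "a" -> 1, "b" -> 1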

10723 def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False): 

10724 if numeric_only: 

10725 frame = self._get_numeric_data() 

10726 else: 

10727 frame = self 

10728 

10729 count_axis = frame._get_axis(axis) 

10730 agg_axis = frame._get_agg_axis(axis) 

10731 

10732 if not isinstance(count_axis, MultiIndex): 

10733 raise TypeError( 

10734 f"Can only count levels on hierarchical {self._get_axis_name(axis)}." 

10735 ) 

10736 

10737 # Mask NaNs: Mask rows or columns where the index level is NaN, and all 

10738 # values in the DataFrame that are NaN 

10739 if frame._is_mixed_type: 

10740 # Since we have mixed types, calling notna(frame.values) might 

10741 # upcast everything to object 

10742 values_mask = notna(frame).values 

10743 else: 

10744 # But use the speedup when we have homogeneous dtypes 

10745 values_mask = notna(frame.values) 

10746 

10747 index_mask = notna(count_axis.get_level_values(level=level)) 

10748 if axis == 1: 

10749 mask = index_mask & values_mask 

10750 else: 

10751 mask = index_mask.reshape(-1, 1) & values_mask 

10752 

10753 if isinstance(level, int): 

10754 level_number = level 

10755 else: 

10756 level_number = count_axis._get_level_number(level) 

10757 

10758 level_name = count_axis._names[level_number] 

10759 level_index = count_axis.levels[level_number]._rename(name=level_name) 

10760 level_codes = ensure_platform_int(count_axis.codes[level_number]) 

10761 counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) 

10762 

10763 if axis == 1: 

10764 result = self._constructor(counts, index=agg_axis, columns=level_index) 

10765 else: 

10766 result = self._constructor(counts, index=level_index, columns=agg_axis) 

10767 

10768 return result 

10769 
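# Standalone sketch of the masking logic in _count_level (illustrative only),
# restated with public API: counting per MultiIndex level is a sum of the
# notna mask grouped by that level.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_arrays(
    [["a", "a", "b"], [0, 1, 0]], names=["lvl0", "lvl1"]
)
df = pd.DataFrame({"x": [1.0, np.nan, 2.0], "y": [np.nan, np.nan, 3.0]},
                  index=idx)

print(df.notna().groupby(level="lvl0").sum())  # x: a=1, b=1; y: a=0, b=1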

10770 def _reduce( 

10771 self, 

10772 op, 

10773 name: str, 

10774 *, 

10775 axis: Axis = 0, 

10776 skipna: bool = True, 

10777 numeric_only: bool | None = None, 

10778 filter_type=None, 

10779 **kwds, 

10780 ): 

10781 assert filter_type is None or filter_type == "bool", filter_type 

10782 out_dtype = "bool" if filter_type == "bool" else None 

10783 

10784 if numeric_only is None and name in ["mean", "median"]: 

10785 own_dtypes = [arr.dtype for arr in self._mgr.arrays] 

10786 

10787 dtype_is_dt = np.array( 

10788 [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], 

10789 dtype=bool, 

10790 ) 

10791 if dtype_is_dt.any(): 

10792 warnings.warn( 

10793 "DataFrame.mean and DataFrame.median with numeric_only=None " 

10794 "will include datetime64 and datetime64tz columns in a " 

10795 "future version.", 

10796 FutureWarning, 

10797 stacklevel=find_stack_level(), 

10798 ) 

10799 # Non-copy equivalent to 

10800 # dt64_cols = self.dtypes.apply(is_datetime64_any_dtype) 

10801 # cols = self.columns[~dt64_cols] 

10802 # self = self[cols] 

10803 predicate = lambda x: not is_datetime64_any_dtype(x.dtype) 

10804 mgr = self._mgr._get_data_subset(predicate) 

10805 self = type(self)(mgr) 

10806 

10807 # TODO: Make other agg func handle axis=None properly GH#21597 

10808 axis = self._get_axis_number(axis) 

10809 labels = self._get_agg_axis(axis) 

10810 assert axis in [0, 1] 

10811 

10812 def func(values: np.ndarray): 

10813 # We only use this in the case that operates on self.values 

10814 return op(values, axis=axis, skipna=skipna, **kwds) 

10815 

10816 def blk_func(values, axis=1): 

10817 if isinstance(values, ExtensionArray): 

10818 if not is_1d_only_ea_dtype(values.dtype) and not isinstance( 

10819 self._mgr, ArrayManager 

10820 ): 

10821 return values._reduce(name, axis=1, skipna=skipna, **kwds) 

10822 return values._reduce(name, skipna=skipna, **kwds) 

10823 else: 

10824 return op(values, axis=axis, skipna=skipna, **kwds) 

10825 

10826 def _get_data() -> DataFrame: 

10827 if filter_type is None: 

10828 data = self._get_numeric_data() 

10829 else: 

10830 # GH#25101, GH#24434 

10831 assert filter_type == "bool" 

10832 data = self._get_bool_data() 

10833 return data 

10834 

10835 numeric_only_bool = com.resolve_numeric_only(numeric_only) 

10836 if numeric_only is not None or axis == 0: 

10837 # For numeric_only non-None and axis non-None, we know 

10838 # which blocks to use and no try/except is needed. 

10839            # For numeric_only=None, only the case with axis==0 and no object 

10840            # dtypes is unambiguous and can be handled with BlockManager.reduce 

10841 # Case with EAs see GH#35881 

10842 df = self 

10843 if numeric_only_bool: 

10844 df = _get_data() 

10845 if axis == 1: 

10846 df = df.T 

10847 axis = 0 

10848 

10849 ignore_failures = numeric_only is None 

10850 

10851 # After possibly _get_data and transposing, we are now in the 

10852 # simple case where we can use BlockManager.reduce 

10853 res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) 

10854 out = df._constructor(res).iloc[0] 

10855 if out_dtype is not None: 

10856 out = out.astype(out_dtype) 

10857 if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: 

10858 # Even if we are object dtype, follow numpy and return 

10859 # float64, see test_apply_funcs_over_empty 

10860 out = out.astype(np.float64) 

10861 

10862 if numeric_only is None and out.shape[0] != df.shape[1]: 

10863 # columns have been dropped GH#41480 

10864 com.deprecate_numeric_only_default( 

10865 type(self), name, deprecate_none=True 

10866 ) 

10867 

10868 return out 

10869 

10870 assert numeric_only is None 

10871 

10872 data = self 

10873 values = data.values 

10874 

10875 try: 

10876 result = func(values) 

10877 

10878 except TypeError: 

10879 # e.g. in nanops trying to convert strs to float 

10880 

10881 data = _get_data() 

10882 labels = data._get_agg_axis(axis) 

10883 

10884 values = data.values 

10885 with np.errstate(all="ignore"): 

10886 result = func(values) 

10887 

10888 # columns have been dropped GH#41480 

10889 arg_name = "numeric_only" 

10890 if name in ["all", "any"]: 

10891 arg_name = "bool_only" 

10892 warnings.warn( 

10893 "Dropping of nuisance columns in DataFrame reductions " 

10894 f"(with '{arg_name}=None') is deprecated; in a future " 

10895 "version this will raise TypeError. Select only valid " 

10896 "columns before calling the reduction.", 

10897 FutureWarning, 

10898 stacklevel=find_stack_level(), 

10899 ) 

10900 

10901 if hasattr(result, "dtype"): 

10902 if filter_type == "bool" and notna(result).all(): 

10903 result = result.astype(np.bool_) 

10904 elif filter_type is None and is_object_dtype(result.dtype): 

10905 try: 

10906 result = result.astype(np.float64) 

10907 except (ValueError, TypeError): 

10908 # try to coerce to the original dtypes item by item if we can 

10909 pass 

10910 

10911 result = self._constructor_sliced(result, index=labels) 

10912 return result 

10913 
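# Standalone sketch of the nuisance-column warning in _reduce (illustrative
# only): reducing a mixed-dtype frame without numeric_only silently drops
# non-numeric columns today; selecting valid columns first is the
# forward-compatible spelling.
import pandas as pd

df = pd.DataFrame({"n": [1, 2, 3], "s": ["a", "b", "c"]})

# deprecated behaviour: df.mean() drops the object column "s" with a warning
print(df.select_dtypes("number").mean())  # n    2.0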

10914 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: 

10915 """ 

10916 Special case for _reduce to try to avoid a potentially-expensive transpose. 

10917 

10918 Apply the reduction block-wise along axis=1 and then reduce the resulting 

10919 1D arrays. 

10920 """ 

10921 if name == "all": 

10922 result = np.ones(len(self), dtype=bool) 

10923 ufunc = np.logical_and 

10924 elif name == "any": 

10925 result = np.zeros(len(self), dtype=bool) 

10926 # error: Incompatible types in assignment 

10927 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'], 

10928 # Literal[20], Literal[False]]", variable has type 

10929 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20], 

10930 # Literal[True]]") 

10931 ufunc = np.logical_or # type: ignore[assignment] 

10932 else: 

10933 raise NotImplementedError(name) 

10934 

10935 for arr in self._mgr.arrays: 

10936 middle = func(arr, axis=0, skipna=skipna) 

10937 result = ufunc(result, middle) 

10938 

10939 res_ser = self._constructor_sliced(result, index=self.index) 

10940 return res_ser 

10941 
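# Standalone sketch of _reduce_axis1 (illustrative only): reducing "all"
# along axis=1 is a running logical_and over the per-column arrays, which
# avoids transposing the frame.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [True, True, False], "b": [True, False, False]})

acc = np.ones(len(df), dtype=bool)
for col in df.columns:
    acc = np.logical_and(acc, df[col].to_numpy())

assert (acc == df.all(axis=1).to_numpy()).all()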

10942 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: 

10943 """ 

10944 Count number of distinct elements in specified axis. 

10945 

10946 Return Series with number of distinct elements. Can ignore NaN 

10947 values. 

10948 

10949 Parameters 

10950 ---------- 

10951 axis : {0 or 'index', 1 or 'columns'}, default 0 

10952 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for 

10953 column-wise. 

10954 dropna : bool, default True 

10955 Don't include NaN in the counts. 

10956 

10957 Returns 

10958 ------- 

10959 Series 

10960 

10961 See Also 

10962 -------- 

10963 Series.nunique: Method nunique for Series. 

10964 DataFrame.count: Count non-NA cells for each column or row. 

10965 

10966 Examples 

10967 -------- 

10968 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) 

10969 >>> df.nunique() 

10970 A 3 

10971 B 2 

10972 dtype: int64 

10973 

10974 >>> df.nunique(axis=1) 

10975 0 1 

10976 1 2 

10977 2 2 

10978 dtype: int64 

10979 """ 

10980 return self.apply(Series.nunique, axis=axis, dropna=dropna) 

10981 

10982 @doc(_shared_docs["idxmin"], numeric_only_default="False") 

10983 def idxmin( 

10984 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False 

10985 ) -> Series: 

10986 axis = self._get_axis_number(axis) 

10987 if numeric_only: 

10988 data = self._get_numeric_data() 

10989 else: 

10990 data = self 

10991 

10992 res = data._reduce( 

10993 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False 

10994 ) 

10995 indices = res._values 

10996 

10997 # indices will always be np.ndarray since axis is not None and 

10998 # values is a 2d array for DataFrame 

10999 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" 

11000 assert isinstance(indices, np.ndarray) # for mypy 

11001 

11002 index = data._get_axis(axis) 

11003 result = [index[i] if i >= 0 else np.nan for i in indices] 

11004 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) 

11005 return final_result.__finalize__(self, method="idxmin") 

11006 

11007 @doc(_shared_docs["idxmax"], numeric_only_default="False") 

11008 def idxmax( 

11009 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False 

11010 ) -> Series: 

11011 

11012 axis = self._get_axis_number(axis) 

11013 if numeric_only: 

11014 data = self._get_numeric_data() 

11015 else: 

11016 data = self 

11017 

11018 res = data._reduce( 

11019 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False 

11020 ) 

11021 indices = res._values 

11022 

11023 # indices will always be np.ndarray since axis is not None and 

11024 # values is a 2d array for DataFrame 

11025 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" 

11026 assert isinstance(indices, np.ndarray) # for mypy 

11027 

11028 index = data._get_axis(axis) 

11029 result = [index[i] if i >= 0 else np.nan for i in indices] 

11030 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) 

11031 return final_result.__finalize__(self, method="idxmax") 

11032 
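# Standalone sketch of idxmin/idxmax (illustrative only): both reduce to a
# positional argmin/argmax and then translate positions back into axis
# labels, as in the ``index[i] if i >= 0 else np.nan`` line above.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [9.0, 2.0, 0.5]},
                  index=["r0", "r1", "r2"])

print(df.idxmax())        # a -> 'r2', b -> 'r0' (NaN skipped by default)
print(df.idxmin(axis=1))  # per-row label of the smallest value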

11033 def _get_agg_axis(self, axis_num: int) -> Index: 

11034 """ 

11035        Return the labels for aggregation results: columns for axis 0, index for axis 1. 

11036 """ 

11037 if axis_num == 0: 

11038 return self.columns 

11039 elif axis_num == 1: 

11040 return self.index 

11041 else: 

11042 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") 

11043 

11044 def mode( 

11045 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True 

11046 ) -> DataFrame: 

11047 """ 

11048 Get the mode(s) of each element along the selected axis. 

11049 

11050 The mode of a set of values is the value that appears most often. 

11051 It can be multiple values. 

11052 

11053 Parameters 

11054 ---------- 

11055 axis : {0 or 'index', 1 or 'columns'}, default 0 

11056 The axis to iterate over while searching for the mode: 

11057 

11058 * 0 or 'index' : get mode of each column 

11059 * 1 or 'columns' : get mode of each row. 

11060 

11061 numeric_only : bool, default False 

11062 If True, only apply to numeric columns. 

11063 dropna : bool, default True 

11064 Don't consider counts of NaN/NaT. 

11065 

11066 Returns 

11067 ------- 

11068 DataFrame 

11069 The modes of each column or row. 

11070 

11071 See Also 

11072 -------- 

11073 Series.mode : Return the highest frequency value in a Series. 

11074 Series.value_counts : Return the counts of values in a Series. 

11075 

11076 Examples 

11077 -------- 

11078 >>> df = pd.DataFrame([('bird', 2, 2), 

11079 ... ('mammal', 4, np.nan), 

11080 ... ('arthropod', 8, 0), 

11081 ... ('bird', 2, np.nan)], 

11082 ... index=('falcon', 'horse', 'spider', 'ostrich'), 

11083 ... columns=('species', 'legs', 'wings')) 

11084 >>> df 

11085 species legs wings 

11086 falcon bird 2 2.0 

11087 horse mammal 4 NaN 

11088 spider arthropod 8 0.0 

11089 ostrich bird 2 NaN 

11090 

11091        By default, missing values are not considered, and the modes of ``wings`` 

11092        are both 0 and 2. Because the resulting DataFrame has two rows, 

11093 the second row of ``species`` and ``legs`` contains ``NaN``. 

11094 

11095 >>> df.mode() 

11096 species legs wings 

11097 0 bird 2.0 0.0 

11098 1 NaN NaN 2.0 

11099 

11100        With ``dropna=False``, ``NaN`` values are considered, and they can be 

11101        the mode (as for ``wings``). 

11102 

11103 >>> df.mode(dropna=False) 

11104 species legs wings 

11105 0 bird 2 NaN 

11106 

11107 Setting ``numeric_only=True``, only the mode of numeric columns is 

11108 computed, and columns of other types are ignored. 

11109 

11110 >>> df.mode(numeric_only=True) 

11111 legs wings 

11112 0 2.0 0.0 

11113 1 NaN 2.0 

11114 

11115 To compute the mode over columns and not rows, use the axis parameter: 

11116 

11117 >>> df.mode(axis='columns', numeric_only=True) 

11118 0 1 

11119 falcon 2.0 NaN 

11120 horse 4.0 NaN 

11121 spider 0.0 8.0 

11122 ostrich 2.0 NaN 

11123 """ 

11124 data = self if not numeric_only else self._get_numeric_data() 

11125 

11126 def f(s): 

11127 return s.mode(dropna=dropna) 

11128 

11129 data = data.apply(f, axis=axis) 

11130 # Ensure index is type stable (should always use int index) 

11131 if data.empty: 

11132 data.index = default_index(0) 

11133 

11134 return data 

11135 
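# Standalone sketch of mode() (illustrative only): the result is a
# column-wise Series.mode, padded with NaN where a column has fewer modes
# than its siblings, exactly as the apply() above produces.
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
print(df.mode())
# "a" has a single mode (1); every value of "b" is a mode, so "a" is
# padded with NaN in the extra rows.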

11136 @overload 

11137 def quantile( 

11138 self, 

11139 q: float = ..., 

11140 axis: Axis = ..., 

11141 numeric_only: bool | lib.NoDefault = ..., 

11142 interpolation: QuantileInterpolation = ..., 

11143 ) -> Series: 

11144 ... 

11145 

11146 @overload 

11147 def quantile( 

11148 self, 

11149 q: AnyArrayLike | Sequence[float], 

11150 axis: Axis = ..., 

11151 numeric_only: bool | lib.NoDefault = ..., 

11152 interpolation: QuantileInterpolation = ..., 

11153 ) -> Series | DataFrame: 

11154 ... 

11155 

11156 @overload 

11157 def quantile( 

11158 self, 

11159 q: float | AnyArrayLike | Sequence[float] = ..., 

11160 axis: Axis = ..., 

11161 numeric_only: bool | lib.NoDefault = ..., 

11162 interpolation: QuantileInterpolation = ..., 

11163 ) -> Series | DataFrame: 

11164 ... 

11165 

11166 def quantile( 

11167 self, 

11168 q: float | AnyArrayLike | Sequence[float] = 0.5, 

11169 axis: Axis = 0, 

11170 numeric_only: bool | lib.NoDefault = no_default, 

11171 interpolation: QuantileInterpolation = "linear", 

11172 method: Literal["single", "table"] = "single", 

11173 ) -> Series | DataFrame: 

11174 """ 

11175 Return values at the given quantile over requested axis. 

11176 

11177 Parameters 

11178 ---------- 

11179 q : float or array-like, default 0.5 (50% quantile) 

11180            Value(s) between 0 and 1 inclusive, the quantile(s) to compute. 

11181 axis : {0 or 'index', 1 or 'columns'}, default 0 

11182 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

11183 numeric_only : bool, default True 

11184 If False, the quantile of datetime and timedelta data will be 

11185 computed as well. 

11186 

11187 .. deprecated:: 1.5.0 

11188 The default value of ``numeric_only`` will be ``False`` in a future 

11189 version of pandas. 

11190 

11191 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} 

11192 This optional parameter specifies the interpolation method to use, 

11193 when the desired quantile lies between two data points `i` and `j`: 

11194 

11195 * linear: `i + (j - i) * fraction`, where `fraction` is the 

11196 fractional part of the index surrounded by `i` and `j`. 

11197 * lower: `i`. 

11198 * higher: `j`. 

11199 * nearest: `i` or `j` whichever is nearest. 

11200 * midpoint: (`i` + `j`) / 2. 

11201 method : {'single', 'table'}, default 'single' 

11202 Whether to compute quantiles per-column ('single') or over all columns 

11203 ('table'). When 'table', the only allowed interpolation methods are 

11204 'nearest', 'lower', and 'higher'. 

11205 

11206 Returns 

11207 ------- 

11208 Series or DataFrame 

11209 

11210 If ``q`` is an array, a DataFrame will be returned where the 

11211 index is ``q``, the columns are the columns of self, and the 

11212 values are the quantiles. 

11213 If ``q`` is a float, a Series will be returned where the 

11214 index is the columns of self and the values are the quantiles. 

11215 

11216 See Also 

11217 -------- 

11218 core.window.rolling.Rolling.quantile: Rolling quantile. 

11219 numpy.percentile: Numpy function to compute the percentile. 

11220 

11221 Examples 

11222 -------- 

11223 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), 

11224 ... columns=['a', 'b']) 

11225 >>> df.quantile(.1) 

11226 a 1.3 

11227 b 3.7 

11228 Name: 0.1, dtype: float64 

11229 >>> df.quantile([.1, .5]) 

11230 a b 

11231 0.1 1.3 3.7 

11232 0.5 2.5 55.0 

11233 

11234 Specifying `method='table'` will compute the quantile over all columns. 

11235 

11236 >>> df.quantile(.1, method="table", interpolation="nearest") 

11237 a 1 

11238 b 1 

11239 Name: 0.1, dtype: int64 

11240 >>> df.quantile([.1, .5], method="table", interpolation="nearest") 

11241 a b 

11242 0.1 1 1 

11243 0.5 3 100 

11244 

11245 Specifying `numeric_only=False` will also compute the quantile of 

11246 datetime and timedelta data. 

11247 

11248 >>> df = pd.DataFrame({'A': [1, 2], 

11249 ... 'B': [pd.Timestamp('2010'), 

11250 ... pd.Timestamp('2011')], 

11251 ... 'C': [pd.Timedelta('1 days'), 

11252 ... pd.Timedelta('2 days')]}) 

11253 >>> df.quantile(0.5, numeric_only=False) 

11254 A 1.5 

11255 B 2010-07-02 12:00:00 

11256 C 1 days 12:00:00 

11257 Name: 0.5, dtype: object 

11258 """ 

11259 validate_percentile(q) 

11260 axis = self._get_axis_number(axis) 

11261 any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes) 

11262 if numeric_only is no_default and any_not_numeric: 

11263 com.deprecate_numeric_only_default(type(self), "quantile") 

11264 numeric_only = com.resolve_numeric_only(numeric_only) 

11265 

11266 if not is_list_like(q): 

11267 # BlockManager.quantile expects listlike, so we wrap and unwrap here 

11268 # error: List item 0 has incompatible type "Union[float, Union[Union[ 

11269 # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]"; 

11270 # expected "float" 

11271 res_df = self.quantile( # type: ignore[call-overload] 

11272 [q], 

11273 axis=axis, 

11274 numeric_only=numeric_only, 

11275 interpolation=interpolation, 

11276 method=method, 

11277 ) 

11278 if method == "single": 

11279 res = res_df.iloc[0] 

11280 else: 

11281 # cannot directly iloc over sparse arrays 

11282 res = res_df.T.iloc[:, 0] 

11283 if axis == 1 and len(self) == 0: 

11284 # GH#41544 try to get an appropriate dtype 

11285 dtype = find_common_type(list(self.dtypes)) 

11286 if needs_i8_conversion(dtype): 

11287 return res.astype(dtype) 

11288 return res 

11289 

11290 q = Index(q, dtype=np.float64) 

11291 data = self._get_numeric_data() if numeric_only else self 

11292 

11293 if axis == 1: 

11294 data = data.T 

11295 

11296 if len(data.columns) == 0: 

11297 # GH#23925 _get_numeric_data may have dropped all columns 

11298 cols = Index([], name=self.columns.name) 

11299 

11300 dtype = np.float64 

11301 if axis == 1: 

11302 # GH#41544 try to get an appropriate dtype 

11303 cdtype = find_common_type(list(self.dtypes)) 

11304 if needs_i8_conversion(cdtype): 

11305 dtype = cdtype 

11306 

11307 res = self._constructor([], index=q, columns=cols, dtype=dtype) 

11308 return res.__finalize__(self, method="quantile") 

11309 

11310 valid_method = {"single", "table"} 

11311 if method not in valid_method: 

11312 raise ValueError( 

11313 f"Invalid method: {method}. Method must be in {valid_method}." 

11314 ) 

11315 if method == "single": 

11316 # error: Argument "qs" to "quantile" of "BlockManager" has incompatible type 

11317 # "Index"; expected "Float64Index" 

11318 res = data._mgr.quantile( 

11319 qs=q, axis=1, interpolation=interpolation # type: ignore[arg-type] 

11320 ) 

11321 elif method == "table": 

11322 valid_interpolation = {"nearest", "lower", "higher"} 

11323 if interpolation not in valid_interpolation: 

11324 raise ValueError( 

11325 f"Invalid interpolation: {interpolation}. " 

11326 f"Interpolation must be in {valid_interpolation}" 

11327 ) 

11328 # handle degenerate case 

11329 if len(data) == 0: 

11330 if data.ndim == 2: 

11331 dtype = find_common_type(list(self.dtypes)) 

11332 else: 

11333 dtype = self.dtype 

11334 return self._constructor([], index=q, columns=data.columns, dtype=dtype) 

11335 

11336 q_idx = np.quantile( # type: ignore[call-overload] 

11337 np.arange(len(data)), q, **{np_percentile_argname: interpolation} 

11338 ) 

11339 

11340 by = data.columns 

11341 if len(by) > 1: 

11342 keys = [data._get_label_or_level_values(x) for x in by] 

11343 indexer = lexsort_indexer(keys) 

11344 else: 

11345 by = by[0] 

11346 k = data._get_label_or_level_values(by) # type: ignore[arg-type] 

11347 indexer = nargsort(k) 

11348 

11349 res = data._mgr.take(indexer[q_idx], verify=False) 

11350 res.axes[1] = q 

11351 

11352 result = self._constructor(res) 

11353 return result.__finalize__(self, method="quantile") 

11354 
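# Standalone sketch of method="table" (illustrative only): rows are ordered
# lexicographically by all columns and the quantile picks an existing row,
# so the result stays jointly consistent across columns.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 10, 100, 100]})

per_column = df.quantile(0.5)  # may mix values from different rows
whole_row = df.quantile(0.5, method="table", interpolation="lower")
print(per_column.tolist(), whole_row.tolist())  # [2.5, 55.0] vs [2, 10]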

11355 @doc(NDFrame.asfreq, **_shared_doc_kwargs) 

11356 def asfreq( 

11357 self, 

11358 freq: Frequency, 

11359 method: FillnaOptions | None = None, 

11360 how: str | None = None, 

11361 normalize: bool = False, 

11362 fill_value: Hashable = None, 

11363 ) -> DataFrame: 

11364 return super().asfreq( 

11365 freq=freq, 

11366 method=method, 

11367 how=how, 

11368 normalize=normalize, 

11369 fill_value=fill_value, 

11370 ) 

11371 

11372 @doc(NDFrame.resample, **_shared_doc_kwargs) 

11373 def resample( 

11374 self, 

11375 rule, 

11376 axis: Axis = 0, 

11377 closed: str | None = None, 

11378 label: str | None = None, 

11379 convention: str = "start", 

11380 kind: str | None = None, 

11381 loffset=None, 

11382 base: int | None = None, 

11383 on: Level = None, 

11384 level: Level = None, 

11385 origin: str | TimestampConvertibleTypes = "start_day", 

11386 offset: TimedeltaConvertibleTypes | None = None, 

11387 group_keys: bool | lib.NoDefault = no_default, 

11388 ) -> Resampler: 

11389 return super().resample( 

11390 rule=rule, 

11391 axis=axis, 

11392 closed=closed, 

11393 label=label, 

11394 convention=convention, 

11395 kind=kind, 

11396 loffset=loffset, 

11397 base=base, 

11398 on=on, 

11399 level=level, 

11400 origin=origin, 

11401 offset=offset, 

11402 group_keys=group_keys, 

11403 ) 

11404 

11405 def to_timestamp( 

11406 self, 

11407 freq: Frequency | None = None, 

11408 how: str = "start", 

11409 axis: Axis = 0, 

11410 copy: bool = True, 

11411 ) -> DataFrame: 

11412 """ 

11413 Cast to DatetimeIndex of timestamps, at *beginning* of period. 

11414 

11415 Parameters 

11416 ---------- 

11417 freq : str, default frequency of PeriodIndex 

11418 Desired frequency. 

11419 how : {'s', 'e', 'start', 'end'} 

11420 Convention for converting period to timestamp; start of period 

11421 vs. end. 

11422 axis : {0 or 'index', 1 or 'columns'}, default 0 

11423 The axis to convert (the index by default). 

11424 copy : bool, default True 

11425 If False then underlying input data is not copied. 

11426 

11427 Returns 

11428 ------- 

11429 DataFrame with DatetimeIndex 

11430 """ 

11431 new_obj = self.copy(deep=copy) 

11432 

11433 axis_name = self._get_axis_name(axis) 

11434 old_ax = getattr(self, axis_name) 

11435 if not isinstance(old_ax, PeriodIndex): 

11436 raise TypeError(f"unsupported Type {type(old_ax).__name__}") 

11437 

11438 new_ax = old_ax.to_timestamp(freq=freq, how=how) 

11439 

11440 setattr(new_obj, axis_name, new_ax) 

11441 return new_obj 

11442 

11443 def to_period( 

11444 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool = True 

11445 ) -> DataFrame: 

11446 """ 

11447 Convert DataFrame from DatetimeIndex to PeriodIndex. 

11448 

11449 Convert DataFrame from DatetimeIndex to PeriodIndex with desired 

11450 frequency (inferred from index if not passed). 

11451 

11452 Parameters 

11453 ---------- 

11454        freq : str, optional 

11455            Frequency of the PeriodIndex; inferred from the index if not given. 

11456 axis : {0 or 'index', 1 or 'columns'}, default 0 

11457 The axis to convert (the index by default). 

11458 copy : bool, default True 

11459 If False then underlying input data is not copied. 

11460 

11461 Returns 

11462 ------- 

11463 DataFrame with PeriodIndex 

11464 

11465 Examples 

11466 -------- 

11467 >>> idx = pd.to_datetime( 

11468 ... [ 

11469 ... "2001-03-31 00:00:00", 

11470 ... "2002-05-31 00:00:00", 

11471 ... "2003-08-31 00:00:00", 

11472 ... ] 

11473 ... ) 

11474 

11475 >>> idx 

11476 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], 

11477 dtype='datetime64[ns]', freq=None) 

11478 

11479 >>> idx.to_period("M") 

11480 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') 

11481 

11482 For the yearly frequency 

11483 

11484 >>> idx.to_period("Y") 

11485 PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]') 

11486 """ 

11487 new_obj = self.copy(deep=copy) 

11488 

11489 axis_name = self._get_axis_name(axis) 

11490 old_ax = getattr(self, axis_name) 

11491 if not isinstance(old_ax, DatetimeIndex): 

11492 raise TypeError(f"unsupported Type {type(old_ax).__name__}") 

11493 

11494 new_ax = old_ax.to_period(freq=freq) 

11495 

11496 setattr(new_obj, axis_name, new_ax) 

11497 return new_obj 

11498 
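# Standalone sketch of the axis-conversion pair above (illustrative only):
# to_period and to_timestamp round-trip between a DatetimeIndex and a
# PeriodIndex on the chosen axis.
import pandas as pd

df = pd.DataFrame({"v": [1, 2, 3]},
                  index=pd.date_range("2001-01-31", periods=3, freq="M"))

monthly = df.to_period("M")               # DatetimeIndex -> PeriodIndex
back = monthly.to_timestamp(how="start")  # periods -> month-start stamps
print(monthly.index)
print(back.index)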

11499 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: 

11500 """ 

11501 Whether each element in the DataFrame is contained in values. 

11502 

11503 Parameters 

11504 ---------- 

11505 values : iterable, Series, DataFrame or dict 

11506 The result will only be true at a location if all the 

11507 labels match. If `values` is a Series, that's the index. If 

11508 `values` is a dict, the keys must be the column names, 

11509 which must match. If `values` is a DataFrame, 

11510 then both the index and column labels must match. 

11511 

11512 Returns 

11513 ------- 

11514 DataFrame 

11515 DataFrame of booleans showing whether each element in the DataFrame 

11516 is contained in values. 

11517 

11518 See Also 

11519 -------- 

11520 DataFrame.eq: Equality test for DataFrame. 

11521 Series.isin: Equivalent method on Series. 

11522 Series.str.contains: Test if pattern or regex is contained within a 

11523 string of a Series or Index. 

11524 

11525 Examples 

11526 -------- 

11527 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, 

11528 ... index=['falcon', 'dog']) 

11529 >>> df 

11530 num_legs num_wings 

11531 falcon 2 2 

11532 dog 4 0 

11533 

11534        When ``values`` is a list, check whether every value in the DataFrame 

11535 is present in the list (which animals have 0 or 2 legs or wings) 

11536 

11537 >>> df.isin([0, 2]) 

11538 num_legs num_wings 

11539 falcon True True 

11540 dog False True 

11541 

11542 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator: 

11543 

11544 >>> ~df.isin([0, 2]) 

11545 num_legs num_wings 

11546 falcon False False 

11547 dog True False 

11548 

11549 When ``values`` is a dict, we can pass values to check for each 

11550 column separately: 

11551 

11552 >>> df.isin({'num_wings': [0, 3]}) 

11553 num_legs num_wings 

11554 falcon False False 

11555 dog False True 

11556 

11557        When ``values`` is a Series or DataFrame, the index and columns must 

11558 match. Note that 'falcon' does not match based on the number of legs 

11559 in other. 

11560 

11561 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, 

11562 ... index=['spider', 'falcon']) 

11563 >>> df.isin(other) 

11564 num_legs num_wings 

11565 falcon False True 

11566 dog False False 

11567 """ 

11568 if isinstance(values, dict): 

11569 from pandas.core.reshape.concat import concat 

11570 

11571 values = collections.defaultdict(list, values) 

11572 result = concat( 

11573 ( 

11574 self.iloc[:, [i]].isin(values[col]) 

11575 for i, col in enumerate(self.columns) 

11576 ), 

11577 axis=1, 

11578 ) 

11579 elif isinstance(values, Series): 

11580 if not values.index.is_unique: 

11581 raise ValueError("cannot compute isin with a duplicate axis.") 

11582 result = self.eq(values.reindex_like(self), axis="index") 

11583 elif isinstance(values, DataFrame): 

11584 if not (values.columns.is_unique and values.index.is_unique): 

11585 raise ValueError("cannot compute isin with a duplicate axis.") 

11586 result = self.eq(values.reindex_like(self)) 

11587 else: 

11588 if not is_list_like(values): 

11589 raise TypeError( 

11590 "only list-like or dict-like objects are allowed " 

11591 "to be passed to DataFrame.isin(), " 

11592 f"you passed a '{type(values).__name__}'" 

11593 ) 

11594 # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any], 

11595 # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray, 

11596 # ndarray[Any, Any]], Index, Series]" 

11597 result = self._constructor( 

11598 algorithms.isin( 

11599 self.values.ravel(), values # type: ignore[arg-type] 

11600 ).reshape(self.shape), 

11601 self.index, 

11602 self.columns, 

11603 ) 

11604 return result.__finalize__(self, method="isin") 

11605 
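# Standalone sketch of the dict branch of isin (illustrative only): each
# column is checked against its own value list, and columns without an
# entry come back all-False thanks to the defaultdict(list) wrapping.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

by_dict = df.isin({"a": [1, 3]})
manual = pd.concat(
    [df[[c]].isin({"a": [1, 3]}.get(c, [])) for c in df.columns], axis=1
)
assert by_dict.equals(manual)  # column "b" is all-False in both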

11606 # ---------------------------------------------------------------------- 

11607 # Add index and columns 

11608 _AXIS_ORDERS = ["index", "columns"] 

11609 _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { 

11610 **NDFrame._AXIS_TO_AXIS_NUMBER, 

11611 1: 1, 

11612 "columns": 1, 

11613 } 

11614 _AXIS_LEN = len(_AXIS_ORDERS) 

11615 _info_axis_number = 1 

11616 _info_axis_name = "columns" 

11617 

11618 index = properties.AxisProperty( 

11619 axis=1, doc="The index (row labels) of the DataFrame." 

11620 ) 

11621 columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.") 

11622 

11623 @property 

11624 def _AXIS_NUMBERS(self) -> dict[str, int]: 

11625 """.. deprecated:: 1.1.0""" 

11626 super()._AXIS_NUMBERS 

11627 return {"index": 0, "columns": 1} 

11628 

11629 @property 

11630 def _AXIS_NAMES(self) -> dict[int, str]: 

11631 """.. deprecated:: 1.1.0""" 

11632 super()._AXIS_NAMES 

11633 return {0: "index", 1: "columns"} 

11634 

11635 # ---------------------------------------------------------------------- 

11636 # Add plotting methods to DataFrame 

11637 plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) 

11638 hist = pandas.plotting.hist_frame 

11639 boxplot = pandas.plotting.boxplot_frame 

11640 sparse = CachedAccessor("sparse", SparseFrameAccessor) 

11641 

11642 # ---------------------------------------------------------------------- 

11643 # Internal Interface Methods 

11644 

11645 def _to_dict_of_blocks(self, copy: bool = True): 

11646 """ 

11647        Return a dict mapping each dtype to a DataFrame 

11648        holding only the columns of that (homogeneous) dtype. 

11649 

11650 Internal ONLY - only works for BlockManager 

11651 """ 

11652 mgr = self._mgr 

11653 # convert to BlockManager if needed -> this way support ArrayManager as well 

11654 mgr = mgr_to_mgr(mgr, "block") 

11655 mgr = cast(BlockManager, mgr) 

11656 return { 

11657 k: self._constructor(v).__finalize__(self) 

11658            for k, v in mgr.to_dict(copy=copy).items() 

11659 } 

11660 

11661 @property 

11662 def values(self) -> np.ndarray: 

11663 """ 

11664 Return a Numpy representation of the DataFrame. 

11665 

11666 .. warning:: 

11667 

11668 We recommend using :meth:`DataFrame.to_numpy` instead. 

11669 

11670 Only the values in the DataFrame will be returned, the axes labels 

11671 will be removed. 

11672 

11673 Returns 

11674 ------- 

11675 numpy.ndarray 

11676 The values of the DataFrame. 

11677 

11678 See Also 

11679 -------- 

11680 DataFrame.to_numpy : Recommended alternative to this method. 

11681 DataFrame.index : Retrieve the index labels. 

11682        DataFrame.columns : Retrieve the column names. 

11683 

11684 Notes 

11685 ----- 

11686 The dtype will be a lower-common-denominator dtype (implicit 

11687 upcasting); that is to say if the dtypes (even of numeric types) 

11688 are mixed, the one that accommodates all will be chosen. Use this 

11689 with care if you are not dealing with the blocks. 

11690 

11691 e.g. If the dtypes are float16 and float32, dtype will be upcast to 

11692 float32. If dtypes are int32 and uint8, dtype will be upcast to 

11693 int32. By :func:`numpy.find_common_type` convention, mixing int64 

11694 and uint64 will result in a float64 dtype. 

11695 

11696 Examples 

11697 -------- 

11698 A DataFrame where all columns are the same type (e.g., int64) results 

11699 in an array of the same type. 

11700 

11701 >>> df = pd.DataFrame({'age': [ 3, 29], 

11702 ... 'height': [94, 170], 

11703 ... 'weight': [31, 115]}) 

11704 >>> df 

11705 age height weight 

11706 0 3 94 31 

11707 1 29 170 115 

11708 >>> df.dtypes 

11709 age int64 

11710 height int64 

11711 weight int64 

11712 dtype: object 

11713 >>> df.values 

11714 array([[ 3, 94, 31], 

11715 [ 29, 170, 115]]) 

11716 

11717        A DataFrame with mixed-type columns (e.g., str/object, int64, float32) 

11718 results in an ndarray of the broadest type that accommodates these 

11719 mixed types (e.g., object). 

11720 

11721 >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), 

11722 ... ('lion', 80.5, 1), 

11723 ... ('monkey', np.nan, None)], 

11724 ... columns=('name', 'max_speed', 'rank')) 

11725 >>> df2.dtypes 

11726 name object 

11727 max_speed float64 

11728 rank object 

11729 dtype: object 

11730 >>> df2.values 

11731 array([['parrot', 24.0, 'second'], 

11732 ['lion', 80.5, 1], 

11733 ['monkey', nan, None]], dtype=object) 

11734 """ 

11735 self._consolidate_inplace() 

11736 return self._mgr.as_array() 

11737 

11738 @overload 

11739 def ffill( 

11740 self, 

11741 *, 

11742 axis: None | Axis = ..., 

11743 inplace: Literal[False] = ..., 

11744 limit: None | int = ..., 

11745 downcast: dict | None = ..., 

11746 ) -> DataFrame: 

11747 ... 

11748 

11749 @overload 

11750 def ffill( 

11751 self, 

11752 *, 

11753 axis: None | Axis = ..., 

11754 inplace: Literal[True], 

11755 limit: None | int = ..., 

11756 downcast: dict | None = ..., 

11757 ) -> None: 

11758 ... 

11759 

11760 @overload 

11761 def ffill( 

11762 self, 

11763 *, 

11764 axis: None | Axis = ..., 

11765 inplace: bool = ..., 

11766 limit: None | int = ..., 

11767 downcast: dict | None = ..., 

11768 ) -> DataFrame | None: 

11769 ... 

11770 

11771 # error: Signature of "ffill" incompatible with supertype "NDFrame" 

11772 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) 

11773 def ffill( # type: ignore[override] 

11774 self, 

11775 axis: None | Axis = None, 

11776 inplace: bool = False, 

11777 limit: None | int = None, 

11778 downcast: dict | None = None, 

11779 ) -> DataFrame | None: 

11780 return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast) 

11781 

11782 @overload 

11783 def bfill( 

11784 self, 

11785 *, 

11786 axis: None | Axis = ..., 

11787 inplace: Literal[False] = ..., 

11788 limit: None | int = ..., 

11789 downcast=..., 

11790 ) -> DataFrame: 

11791 ... 

11792 

11793 @overload 

11794 def bfill( 

11795 self, 

11796 *, 

11797 axis: None | Axis = ..., 

11798 inplace: Literal[True], 

11799 limit: None | int = ..., 

11800 downcast=..., 

11801 ) -> None: 

11802 ... 

11803 

11804 @overload 

11805 def bfill( 

11806 self, 

11807 *, 

11808 axis: None | Axis = ..., 

11809 inplace: bool = ..., 

11810 limit: None | int = ..., 

11811 downcast=..., 

11812 ) -> DataFrame | None: 

11813 ... 

11814 

11815 # error: Signature of "bfill" incompatible with supertype "NDFrame" 

11816 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) 

11817 def bfill( # type: ignore[override] 

11818 self, 

11819 axis: None | Axis = None, 

11820 inplace: bool = False, 

11821 limit: None | int = None, 

11822 downcast=None, 

11823 ) -> DataFrame | None: 

11824 return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast) 

11825 
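# Standalone usage sketch of the ffill/bfill wrappers above (illustrative
# only): both delegate to NDFrame with the same axis/limit semantics.
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [np.nan, 1.0, np.nan, np.nan, 4.0]})
print(df.ffill(limit=1))  # carry the last valid value forward one step
print(df.bfill())         # fill from the next valid value instead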

11826 @deprecate_nonkeyword_arguments( 

11827 version=None, allowed_args=["self", "lower", "upper"] 

11828 ) 

11829 def clip( 

11830 self: DataFrame, 

11831 lower: float | None = None, 

11832 upper: float | None = None, 

11833 axis: Axis | None = None, 

11834 inplace: bool = False, 

11835 *args, 

11836 **kwargs, 

11837 ) -> DataFrame | None: 

11838 return super().clip(lower, upper, axis, inplace, *args, **kwargs) 

11839 

11840 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) 

11841 def interpolate( 

11842 self: DataFrame, 

11843 method: str = "linear", 

11844 axis: Axis = 0, 

11845 limit: int | None = None, 

11846 inplace: bool = False, 

11847 limit_direction: str | None = None, 

11848 limit_area: str | None = None, 

11849 downcast: str | None = None, 

11850 **kwargs, 

11851 ) -> DataFrame | None: 

11852 return super().interpolate( 

11853 method, 

11854 axis, 

11855 limit, 

11856 inplace, 

11857 limit_direction, 

11858 limit_area, 

11859 downcast, 

11860 **kwargs, 

11861 ) 

11862 

11863 @overload 

11864 def where( 

11865 self, 

11866 cond, 

11867 other=..., 

11868 *, 

11869 inplace: Literal[False] = ..., 

11870 axis: Axis | None = ..., 

11871 level: Level = ..., 

11872 errors: IgnoreRaise | lib.NoDefault = ..., 

11873 try_cast: bool | lib.NoDefault = ..., 

11874 ) -> DataFrame: 

11875 ... 

11876 

11877 @overload 

11878 def where( 

11879 self, 

11880 cond, 

11881 other=..., 

11882 *, 

11883 inplace: Literal[True], 

11884 axis: Axis | None = ..., 

11885 level: Level = ..., 

11886 errors: IgnoreRaise | lib.NoDefault = ..., 

11887 try_cast: bool | lib.NoDefault = ..., 

11888 ) -> None: 

11889 ... 

11890 

11891 @overload 

11892 def where( 

11893 self, 

11894 cond, 

11895 other=..., 

11896 *, 

11897 inplace: bool = ..., 

11898 axis: Axis | None = ..., 

11899 level: Level = ..., 

11900 errors: IgnoreRaise | lib.NoDefault = ..., 

11901 try_cast: bool | lib.NoDefault = ..., 

11902 ) -> DataFrame | None: 

11903 ... 

11904 

11905 # error: Signature of "where" incompatible with supertype "NDFrame" 

11906 @deprecate_kwarg(old_arg_name="errors", new_arg_name=None) 

11907 @deprecate_nonkeyword_arguments( 

11908 version=None, allowed_args=["self", "cond", "other"] 

11909 ) 

11910 def where( # type: ignore[override] 

11911 self, 

11912 cond, 

11913 other=lib.no_default, 

11914 inplace: bool = False, 

11915 axis: Axis | None = None, 

11916 level: Level = None, 

11917 errors: IgnoreRaise | lib.NoDefault = "raise", 

11918 try_cast: bool | lib.NoDefault = lib.no_default, 

11919 ) -> DataFrame | None: 

11920 return super().where( 

11921 cond, 

11922 other, 

11923 inplace=inplace, 

11924 axis=axis, 

11925 level=level, 

11926 try_cast=try_cast, 

11927 ) 

11928 

11929 @overload 

11930 def mask( 

11931 self, 

11932 cond, 

11933 other=..., 

11934 *, 

11935 inplace: Literal[False] = ..., 

11936 axis: Axis | None = ..., 

11937 level: Level = ..., 

11938 errors: IgnoreRaise | lib.NoDefault = ..., 

11939 try_cast: bool | lib.NoDefault = ..., 

11940 ) -> DataFrame: 

11941 ... 

11942 

11943 @overload 

11944 def mask( 

11945 self, 

11946 cond, 

11947 other=..., 

11948 *, 

11949 inplace: Literal[True], 

11950 axis: Axis | None = ..., 

11951 level: Level = ..., 

11952 errors: IgnoreRaise | lib.NoDefault = ..., 

11953 try_cast: bool | lib.NoDefault = ..., 

11954 ) -> None: 

11955 ... 

11956 

11957 @overload 

11958 def mask( 

11959 self, 

11960 cond, 

11961 other=..., 

11962 *, 

11963 inplace: bool = ..., 

11964 axis: Axis | None = ..., 

11965 level: Level = ..., 

11966 errors: IgnoreRaise | lib.NoDefault = ..., 

11967 try_cast: bool | lib.NoDefault = ..., 

11968 ) -> DataFrame | None: 

11969 ... 

11970 

11971 # error: Signature of "mask" incompatible with supertype "NDFrame" 

11972 @deprecate_kwarg(old_arg_name="errors", new_arg_name=None) 

11973 @deprecate_nonkeyword_arguments( 

11974 version=None, allowed_args=["self", "cond", "other"] 

11975 ) 

11976 def mask( # type: ignore[override] 

11977 self, 

11978 cond, 

11979 other=np.nan, 

11980 inplace: bool = False, 

11981 axis: Axis | None = None, 

11982 level: Level = None, 

11983 errors: IgnoreRaise | lib.NoDefault = "raise", 

11984 try_cast: bool | lib.NoDefault = lib.no_default, 

11985 ) -> DataFrame | None: 

11986 return super().mask( 

11987 cond, 

11988 other, 

11989 inplace=inplace, 

11990 axis=axis, 

11991 level=level, 

11992 try_cast=try_cast, 

11993 ) 

11994 
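# Standalone sketch of the where/mask pair above (illustrative only):
# mask is the complement of where, i.e. df.mask(cond) == df.where(~cond).
import pandas as pd

df = pd.DataFrame({"x": [1, -2, 3, -4]})
cond = df > 0

assert df.where(cond, 0).equals(df.mask(~cond, 0))
print(df.where(cond, 0)["x"].tolist())  # [1, 0, 3, 0]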

11995 

11996DataFrame._add_numeric_operations() 

11997 

11998ops.add_flex_arithmetic_methods(DataFrame) 

11999 

12000 

12001def _from_nested_dict(data) -> collections.defaultdict: 

12002 new_data: collections.defaultdict = collections.defaultdict(dict) 

12003 for index, s in data.items(): 

12004 for col, v in s.items(): 

12005 new_data[col][index] = v 

12006 return new_data 

12007 
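# Standalone sketch of _from_nested_dict (illustrative only): it flips a
# row-oriented nested mapping into the column-oriented layout the frame
# constructor expects, which is what from_dict(..., orient="index") uses.
import pandas as pd

nested = {"r0": {"a": 1, "b": 2}, "r1": {"a": 3, "b": 4}}
print(pd.DataFrame.from_dict(nested, orient="index"))
#     a  b
# r0  1  2
# r1  3  4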

12008 

12009def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: 

12010 # reindex if necessary 

12011 

12012 if value.index.equals(index) or not len(index): 

12013 return value._values.copy() 

12014 

12015 # GH#4107 

12016 try: 

12017 reindexed_value = value.reindex(index)._values 

12018 except ValueError as err: 

12019 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs 

12020 if not value.index.is_unique: 

12021 # duplicate axis 

12022 raise err 

12023 

12024 raise TypeError( 

12025 "incompatible index of inserted column with frame index" 

12026 ) from err 

12027 return reindexed_value
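# Standalone sketch of the alignment performed by _reindex_for_setitem
# (illustrative only): assigning a Series to a column matches on index
# labels, inserting NaN where labels are missing, rather than by position.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])
df["b"] = pd.Series([10, 30], index=["x", "z"])  # reindexed to df.index
print(df["b"].tolist())  # [10.0, nan, 30.0]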