Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/groupby.py: 18%
1236 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Provide the groupby split-apply-combine paradigm. Define the GroupBy
3class providing the base-class of operations.
5The SeriesGroupBy and DataFrameGroupBy sub-class
6(defined in pandas.core.groupby.generic)
7expose these user-facing objects to provide specific functionality.
8"""
9from __future__ import annotations
11from contextlib import contextmanager
12import datetime
13from functools import (
14 partial,
15 wraps,
16)
17import inspect
18from textwrap import dedent
19import types
20from typing import (
21 TYPE_CHECKING,
22 Callable,
23 Hashable,
24 Iterable,
25 Iterator,
26 List,
27 Literal,
28 Mapping,
29 Sequence,
30 TypeVar,
31 Union,
32 cast,
33 final,
34)
35import warnings
37import numpy as np
39from pandas._config.config import option_context
41from pandas._libs import (
42 Timestamp,
43 lib,
44)
45import pandas._libs.groupby as libgroupby
46from pandas._typing import (
47 ArrayLike,
48 IndexLabel,
49 NDFrameT,
50 PositionalIndexer,
51 RandomState,
52 Scalar,
53 T,
54 npt,
55)
56from pandas.compat.numpy import function as nv
57from pandas.errors import (
58 AbstractMethodError,
59 DataError,
60)
61from pandas.util._decorators import (
62 Appender,
63 Substitution,
64 cache_readonly,
65 doc,
66)
67from pandas.util._exceptions import find_stack_level
69from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
70from pandas.core.dtypes.common import (
71 is_bool_dtype,
72 is_datetime64_dtype,
73 is_float_dtype,
74 is_integer,
75 is_integer_dtype,
76 is_numeric_dtype,
77 is_object_dtype,
78 is_scalar,
79 is_timedelta64_dtype,
80)
81from pandas.core.dtypes.missing import (
82 isna,
83 notna,
84)
86from pandas.core import nanops
87from pandas.core._numba import executor
88import pandas.core.algorithms as algorithms
89from pandas.core.arrays import (
90 BaseMaskedArray,
91 BooleanArray,
92 Categorical,
93 ExtensionArray,
94)
95from pandas.core.base import (
96 PandasObject,
97 SelectionMixin,
98)
99import pandas.core.common as com
100from pandas.core.frame import DataFrame
101from pandas.core.generic import NDFrame
102from pandas.core.groupby import (
103 base,
104 numba_,
105 ops,
106)
107from pandas.core.groupby.indexing import (
108 GroupByIndexingMixin,
109 GroupByNthSelector,
110)
111from pandas.core.indexes.api import (
112 CategoricalIndex,
113 Index,
114 MultiIndex,
115 RangeIndex,
116)
117from pandas.core.internals.blocks import ensure_block_shape
118import pandas.core.sample as sample
119from pandas.core.series import Series
120from pandas.core.sorting import get_group_index_sorter
121from pandas.core.util.numba_ import (
122 get_jit_arguments,
123 maybe_use_numba,
124)
if TYPE_CHECKING:
    # Imported for static type checking only: importing pandas.core.window at
    # runtime would create a circular import with pandas.core.groupby.
    from pandas.core.window import (
        ExpandingGroupby,
        ExponentialMovingWindowGroupby,
        RollingGroupby,
    )
# Shared "See Also" docstring fragment; interpolated into method docstrings
# via %-substitution, where %(name)s is the method name (e.g. "max").
_common_see_also = """
See Also
--------
Series.%(name)s : Apply a function %(name)s to a Series.
DataFrame.%(name)s : Apply a function %(name)s
    to each row or column of a DataFrame.
"""
# Docstring building blocks for GroupBy.apply: "template" is the shared body
# (with {input}/{examples} placeholders), and the two "*_examples" entries are
# substituted in by SeriesGroupBy.apply / DataFrameGroupBy.apply respectively.
_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of method that will
    be much faster than using ``apply`` for their specific purposes, so try to
    use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    applied : Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1,2,3],
    ...                    'C': [4,6,5]})
    >>> g1 = df.groupby('A', group_keys=False)
    >>> g2 = df.groupby('A', group_keys=True)

    Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: below the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series.  `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64""",
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g1 = s.groupby(s.index, group_keys=False)
    >>> g2 = s.groupby(s.index, group_keys=True)

    From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
    Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}
# Docstring template for simple reduction methods (sum/prod/min/max/...);
# formatted with fname (method name), no (numeric_only default), mc
# (min_count default).
_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns. If None, will attempt to use
    everything, then use only numeric data.
min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.
"""
# Docstring template for GroupBy.pipe; %(klass)s and %(examples)s are filled
# in via the Substitution decorator at each use site.
_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
       Positional arguments passed into `func`.
kwargs : dict, optional
         A dictionary of keyword arguments passed into `func`.

Returns
-------
object : the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe: Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""
# Docstring template for GroupBy.transform; %(klass)s is filled in via the
# Substitution decorator (Series or DataFrame).
_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function
    Function to apply to each group. See the Notes section below for requirements.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. deprecated:: 1.5.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, currently pandas does not align the result's index
    with the input's index. This behavior is deprecated and alignment will
    be performed in a future version of pandas. You can apply ``.to_numpy()`` to the
    result of the transformation function to avoid alignment.

Examples
--------

>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
...                           'foo', 'bar'],
...                    'B' : ['one', 'one', 'two', 'three',
...                           'two', 'two'],
...                    'C' : [1, 5, 5, 2, 5, 5],
...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
>>> grouped = df.groupby('A')[['C', 'D']]
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
          C         D
0 -1.154701 -0.577350
1  0.577350  0.000000
2  0.577350  1.154701
3 -1.154701 -1.000000
4  0.577350 -0.577350
5  0.577350  1.000000

Broadcast result of the transformation

>>> grouped.transform(lambda x: x.max() - x.min())
     C    D
0  4.0  6.0
1  3.0  8.0
2  4.0  6.0
3  3.0  8.0
4  4.0  6.0
5  3.0  8.0

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    for example:

>>> grouped.transform(lambda x: x.astype(int).max())
   C  D
0  5  8
1  5  9
2  5  8
3  5  9
4  5  8
5  5  9
"""
# Docstring template for GroupBy.aggregate; formatted with {klass} and
# {examples}. Literal braces in the string are doubled ({{...}}) so that
# str.format leaves them intact.
_agg_template = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list or dict
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Aggregate using one or more
    operations over the specified axis.
{klass}.aggregate : Transforms the Series on each group
    based on the given function.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""
@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.

    Both ``grouped.plot(...)`` and ``grouped.plot.<kind>(...)`` are supported;
    each call is dispatched to every group via ``GroupBy.apply``.
    """

    def __init__(self, groupby: GroupBy) -> None:
        # The GroupBy instance whose groups will be plotted.
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        # Plot each group by applying ``obj.plot(*args, **kwargs)`` group-wise.
        def f(self):
            return self.plot(*args, **kwargs)

        # Name the callable "plot" so downstream machinery can recognize it.
        f.__name__ = "plot"
        return self._groupby.apply(f)

    def __getattr__(self, name: str):
        # Forward sub-methods (e.g. ``.plot.hist``) to each group's plot
        # accessor, again via GroupBy.apply.
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr
# Acceptable types for the ``keys``/``by`` argument of groupby: a single
# label, a list of labels, a mapping of labels, or callable(s) applied to
# the axis labels.
_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]
class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    # Labels selected implicitly by excluding in-axis grouping keys;
    # None until _set_group_selection (on GroupBy) computes it.
    _group_selection: IndexLabel | None = None
    # Method names that may be dispatched to the grouped object via apply.
    _apply_allowlist: frozenset[str] = frozenset()
    # Attributes hidden from tab-completion / __dir__.
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "mutated",
        "obj",
        "observed",
        "sort",
        "squeeze",
    }

    axis: int
    grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    group_keys: bool | lib.NoDefault

    @final
    def __len__(self) -> int:
        # Number of groups formed by the grouper.
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.
        """
        return self.grouper.groups

    @final
    @property
    def ngroups(self) -> int:
        # Number of distinct groups.
        return self.grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.
        """
        return self.grouper.indices

    @final
    def _get_indices(self, names):
        """
        Safe get multiple indices, translate keys for
        datelike to underlying repr.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            # Multiple grouping keys: each requested name must be a tuple of
            # the same length as the index keys.
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if not len(name_sample) == len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError as err:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg) from err

            # Convert each element of each name tuple to the key type used
            # in the indices dict (e.g. datetime -> Timestamp).
            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        # Missing names map to an empty list of positions.
        return [self.indices.get(name, []) for name in names]

    @final
    def _get_index(self, name):
        """
        Safe get index, translate keys for datelike to underlying repr.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy

        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                # Restrict to the columns computed by _set_group_selection
                # (i.e. everything except in-axis grouping keys).
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    @final
    def _dir_additions(self) -> set[str]:
        # Expose the grouped object's attributes plus dispatchable methods.
        return self.obj._dir_additions() | self._apply_allowlist

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each groups maximum and minimum value in one
        pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    # ``grouped.plot`` accessor; GroupByPlot dispatches plotting per group.
    plot = property(GroupByPlot)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the DataFrame out of.  If
            it is None, the object groupby was called on will
            be used.

        Returns
        -------
        group : same type as obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        if not len(inds):
            # Unknown group name.
            raise KeyError(name)

        return obj._take_with_is_copy(inds, axis=self.axis)

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        keys = self.keys
        if isinstance(keys, list) and len(keys) == 1:
            # Deprecation: iterating a groupby keyed by a 1-element list will
            # yield length-1 tuples as names in a future pandas version.
            warnings.warn(
                (
                    "In a future version of pandas, a length 1 "
                    "tuple will be returned when iterating over a "
                    "groupby with a grouper equal to a list of "
                    "length 1. Don't supply a list with a single grouper "
                    "to avoid this warning."
                ),
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        return self.grouper.get_iterator(self._selected_obj, axis=self.axis)
# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)
845class GroupBy(BaseGroupBy[NDFrameT]):
846 """
847 Class for grouping and aggregating relational data.
849 See aggregate, transform, and apply functions on this object.
851 It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
853 ::
855 grouped = groupby(obj, ...)
857 Parameters
858 ----------
859 obj : pandas object
860 axis : int, default 0
861 level : int, default None
862 Level of MultiIndex
863 groupings : list of Grouping objects
864 Most users should ignore this
865 exclusions : array-like, optional
866 List of columns to exclude
867 name : str
868 Most users should ignore this
870 Returns
871 -------
872 **Attributes**
873 groups : dict
874 {group name -> group labels}
875 len(grouped) : int
876 Number of groups
878 Notes
879 -----
880 After grouping, see aggregate, apply, and transform functions. Here are
881 some other brief notes about usage. When grouping by multiple groups, the
882 result index will be a MultiIndex (hierarchical) by default.
884 Iteration produces (key, group) tuples, i.e. chunking the data by group. So
885 you can write code like:
887 ::
889 grouped = obj.groupby(keys, axis=axis)
890 for key, group in grouped:
891 # do something with the data
893 Function calls on GroupBy, if not specially implemented, "dispatch" to the
894 grouped data. So if you group a DataFrame and wish to invoke the std()
895 method on each group, you can simply do:
897 ::
899 df.groupby(mapper).std()
901 rather than
903 ::
905 df.groupby(mapper).aggregate(np.std)
907 You can pass arguments to these "wrapped" functions, too.
909 See the online documentation for full exposition on these topics and much
910 more
911 """
913 grouper: ops.BaseGrouper
914 as_index: bool
    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: int = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool | lib.NoDefault = True,
        squeeze: bool = False,
        observed: bool = False,
        mutated: bool = False,
        dropna: bool = True,
    ) -> None:
        # Set up grouping state; ``grouper`` is resolved from ``keys``/``level``
        # via get_grouper when not supplied by the caller.

        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            # as_index=False is only meaningful for frame-like, axis=0 grouping.
            if not isinstance(obj, DataFrame):
                raise TypeError("as_index=False only valid with DataFrame")
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.observed = observed
        self.mutated = mutated
        self.dropna = dropna

        if grouper is None:
            # Local import to avoid a circular dependency at module load time.
            from pandas.core.groupby.grouper import get_grouper

            # get_grouper may also prune ``obj`` and report excluded columns.
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                mutated=self.mutated,
                dropna=self.dropna,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()
    def __getattr__(self, attr: str):
        # Internal names must be fetched via object.__getattribute__ first to
        # avoid recursing back into __getattr__ during construction.
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        # Otherwise treat an attribute matching a column/label of the grouped
        # object as a selection, e.g. ``grouped.B`` -> ``grouped["B"]``.
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )
    @final
    def _make_wrapper(self, name: str) -> Callable:
        """
        Build a callable that dispatches method ``name`` (from the
        allowlist) to each group of the underlying object.
        """
        assert name in self._apply_allowlist

        with self._group_selection_context():
            # need to setup the selection
            # as are not passed directly but in the grouper
            f = getattr(self._obj_with_exclusions, name)
            if not isinstance(f, types.MethodType):
                # ``name`` is a plain attribute/property, not a method: fetch
                # it per group via apply instead of wrapping a call.
                # error: Incompatible return value type
                # (got "NDFrameT", expected "Callable[..., Any]")  [return-value]
                return cast(Callable, self.apply(lambda self: getattr(self, name)))

        # Unbound method from the class so the signature can be inspected.
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            if "axis" in sig.parameters:
                if kwargs.get("axis", None) is None:
                    kwargs["axis"] = self.axis

            numeric_only = kwargs.get("numeric_only", lib.no_default)

            def curried(x):
                with warnings.catch_warnings():
                    # Catch any warnings from dispatch to DataFrame; we'll emit
                    # a warning for groupby below
                    match = "The default value of numeric_only "
                    warnings.filterwarnings("ignore", match, FutureWarning)
                    return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in base.plotting_methods:
                return self.apply(curried)

            is_transform = name in base.transformation_kernels

            # Transform needs to keep the same schema, including when empty
            if is_transform and self._obj_with_exclusions.empty:
                return self._obj_with_exclusions

            result = self._python_apply_general(
                curried,
                self._obj_with_exclusions,
                is_transform=is_transform,
                not_indexed_same=not is_transform,
            )

            if self._selected_obj.ndim != 1 and self.axis != 1 and result.ndim != 1:
                # Columns silently dropped from the result indicate nuisance
                # columns; emit the deprecation warning for that behavior.
                missing = self._obj_with_exclusions.columns.difference(result.columns)
                if len(missing) > 0:
                    warn_dropping_nuisance_columns_deprecated(
                        type(self), name, numeric_only
                    )

            if self.grouper.has_dropped_na and is_transform:
                # result will have dropped rows due to nans, fill with null
                # and ensure index is ordered same as the input
                result = self._set_result_index_ordered(result)
            return result

        wrapper.__name__ = name
        return wrapper
1056 # -----------------------------------------------------------------
1057 # Selection
    @final
    def _set_group_selection(self) -> None:
        """
        Create group based selection.

        Used when selection is not passed directly but instead via a grouper.

        NOTE: this should be paired with a call to _reset_group_selection
        """
        # This is a no-op for SeriesGroupBy
        grp = self.grouper
        # Only applies for as_index frame groupbys with no selection yet.
        if not (
            self.as_index
            and grp.groupings is not None
            and self.obj.ndim > 1
            and self._group_selection is None
        ):
            return

        # Names of in-axis groupers (columns used as keys, not index levels).
        groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]

        if len(groupers):
            # GH12839 clear selected obj cache when group selection changes
            ax = self.obj._info_axis
            self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
            self._reset_cache("_selected_obj")
1086 @final
1087 def _reset_group_selection(self) -> None:
1088 """
1089 Clear group based selection.
1091 Used for methods needing to return info on each group regardless of
1092 whether a group selection was previously set.
1093 """
1094 if self._group_selection is not None:
1095 # GH12839 clear cached selection too when changing group selection
1096 self._group_selection = None
1097 self._reset_cache("_selected_obj")
    @contextmanager
    def _group_selection_context(self) -> Iterator[GroupBy]:
        """
        Set / reset the _group_selection_context.

        Ensures _reset_group_selection runs even if the body raises.
        """
        self._set_group_selection()
        try:
            yield self
        finally:
            self._reset_group_selection()
    def _iterate_slices(self) -> Iterable[Series]:
        # Abstract hook: subclasses yield each 1-dimensional slice (column)
        # of the grouped object as a Series.
        raise AbstractMethodError(self)
1113 # -----------------------------------------------------------------
1114 # Dispatch/Wrapping
    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ):
        """
        Concatenate per-group results back into a single Series/DataFrame,
        optionally keyed by group (group_keys) or realigned to the original
        index (when results are like-indexed).
        """
        from pandas.core.reshape.concat import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in com.not_none(*values):
                ax = v._get_axis(self.axis)
                ax._reset_identity()
            return values

        if self.group_keys and not override_group_keys:

            values = reset_identity(values)
            if self.as_index:

                # possible MI return case
                group_keys = self.grouper.result_index
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            # Like-indexed results: restore the original row order.
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                # Exclude rows whose group key was NA (label -1).
                labels = self.grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        # Propagate the selected name onto a Series result.
        name = self.obj.name if self.obj.ndim == 1 else self._selection
        if isinstance(result, Series) and name is not None:

            result.name = name

        return result
    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        """
        Set the result's axis back to the input object's axis, restoring
        the original row order for transform-like results (xref GH#8046).
        """
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self.grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self.grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result
    def _indexed_output_to_ndframe(
        self, result: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        """Convert an OutputKey->array mapping to a Series/DataFrame; abstract."""
        raise AbstractMethodError(self)
    @final
    def _wrap_aggregated_output(
        self,
        output: Series | DataFrame | Mapping[base.OutputKey, ArrayLike],
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        output : Series, DataFrame, or Mapping[base.OutputKey, ArrayLike]
            Data to wrap.
        qs : np.ndarray[float64] or None, default None
            Quantile levels; when given, an extra index level holding the
            quantiles is inserted into the result index.

        Returns
        -------
        Series or DataFrame
        """

        if isinstance(output, (Series, DataFrame)):
            # We get here (for DataFrameGroupBy) if we used Manager.grouped_reduce,
            # in which case our columns are already set correctly.
            # ATM we do not get here for SeriesGroupBy; when we do, we will
            # need to require that result.name already match self.obj.name
            result = output
        else:
            result = self._indexed_output_to_ndframe(output)

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            self._insert_inaxis_grouper_inplace(result)
            result = result._consolidate()
            index = Index(range(self.grouper.ngroups))

        else:
            index = self.grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                result.index = self.obj.index.copy()
                # TODO: Do this more systematically

        return self._reindex_output(result, qs=qs)
    @final
    def _wrap_transformed_output(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        """
        Wraps the output of GroupBy transformations into the expected result.

        Parameters
        ----------
        output : Mapping[base.OutputKey, ArrayLike]
            Data to wrap.

        Returns
        -------
        Series or DataFrame
            Series for SeriesGroupBy, DataFrame for DataFrameGroupBy.
            The result always carries the original object's index (transforms
            are length-preserving).
        """
        if isinstance(output, (Series, DataFrame)):
            result = output
        else:
            result = self._indexed_output_to_ndframe(output)

        if self.axis == 1:
            # Only relevant for DataFrameGroupBy
            result = result.T
            result.columns = self.obj.columns

        result.index = self.obj.index
        return result
    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ):
        """Wrap the list of per-group apply results; abstract, see subclasses."""
        raise AbstractMethodError(self)
    def _resolve_numeric_only(
        self, how: str, numeric_only: bool | lib.NoDefault, axis: int
    ) -> bool:
        """
        Determine subclass-specific default value for 'numeric_only'.

        For SeriesGroupBy we want the default to be False (to match Series behavior).
        For DataFrameGroupBy we want it to be True (for backwards-compat).

        Parameters
        ----------
        how : str
            Groupby kernel name (used only for warning/error messages).
        numeric_only : bool or lib.no_default
        axis : int
            Axis passed to the groupby op (not self.axis).

        Returns
        -------
        bool

        Raises
        ------
        NotImplementedError
            If ``numeric_only=True`` was requested on a non-numeric Series
            (GH#47500); a FutureWarning is emitted first, since this is
            slated to become a TypeError.
        """
        # GH#41291
        if numeric_only is lib.no_default:
            # i.e. not explicitly passed by user
            if self.obj.ndim == 2:
                # i.e. DataFrameGroupBy
                numeric_only = axis != 1
                # GH#42395 GH#43108 GH#43154
                # Regression from 1.2.5 to 1.3 caused object columns to be dropped
                if self.axis:
                    obj = self._obj_with_exclusions.T
                else:
                    obj = self._obj_with_exclusions
                check = obj._get_numeric_data()
                if len(obj.columns) and not len(check.columns) and not obj.empty:
                    numeric_only = False

            else:
                numeric_only = False

        if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
            # GH#47500: warn about the upcoming TypeError, then raise now
            warnings.warn(
                f"{type(self).__name__}.{how} called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will "
                "raise a TypeError in a future version of pandas",
                category=FutureWarning,
                stacklevel=find_stack_level(),
            )
            raise NotImplementedError(
                f"{type(self).__name__}.{how} does not implement numeric_only"
            )

        return numeric_only
1368 def _maybe_warn_numeric_only_depr(
1369 self, how: str, result: DataFrame | Series, numeric_only: bool | lib.NoDefault
1370 ) -> None:
1371 """Emit warning on numeric_only behavior deprecation when appropriate.
1373 Parameters
1374 ----------
1375 how : str
1376 Groupby kernel name.
1377 result :
1378 Result of the groupby operation.
1379 numeric_only : bool or lib.no_default
1380 Argument as passed by user.
1381 """
1382 if (
1383 self._obj_with_exclusions.ndim != 1
1384 and result.ndim > 1
1385 and len(result.columns) < len(self._obj_with_exclusions.columns)
1386 ):
1387 warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
1389 # -----------------------------------------------------------------
1390 # numba
1392 @final
1393 def _numba_prep(self, data):
1394 ids, _, ngroups = self.grouper.group_info
1395 sorted_index = get_group_index_sorter(ids, ngroups)
1396 sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)
1398 sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
1399 if len(self.grouper.groupings) > 1:
1400 raise NotImplementedError(
1401 "More than 1 grouping labels are not supported with engine='numba'"
1402 )
1403 # GH 46867
1404 index_data = data.index
1405 if isinstance(index_data, MultiIndex):
1406 group_key = self.grouper.groupings[0].name
1407 index_data = index_data.get_level_values(group_key)
1408 sorted_index_data = index_data.take(sorted_index).to_numpy()
1410 starts, ends = lib.generate_slices(sorted_ids, ngroups)
1411 return (
1412 starts,
1413 ends,
1414 sorted_index_data,
1415 sorted_data,
1416 )
    def _numba_agg_general(
        self,
        func: Callable,
        engine_kwargs: dict[str, bool] | None,
        *aggregator_args,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.

        Parameters
        ----------
        func : Callable
            Numba-jittable aggregation kernel.
        engine_kwargs : dict or None
            ``nopython``/``nogil``/``parallel`` flags forwarded to numba.
        *aggregator_args
            Extra positional arguments passed through to the kernel.

        Raises
        ------
        NotImplementedError
            If ``as_index=False`` or ``axis=1`` (unsupported with numba).
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        with self._group_selection_context():
            data = self._selected_obj
            # kernels operate on 2D data; lift a Series to a 1-column frame
            df = data if data.ndim == 2 else data.to_frame()
            starts, ends, sorted_index, sorted_data = self._numba_prep(df)
            aggregator = executor.generate_shared_aggregator(
                func, **get_jit_arguments(engine_kwargs)
            )
            result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)

            index = self.grouper.result_index
            if data.ndim == 1:
                # restore a 1D result (and name) for Series input
                result_kwargs = {"name": data.name}
                result = result.ravel()
            else:
                result_kwargs = {"columns": data.columns}
            return data._constructor(result, index=index, **result_kwargs)
    @final
    def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.

        Returns a 2D ndarray in the original (unsorted) row order.
        """
        starts, ends, sorted_index, sorted_data = self._numba_prep(data)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(data.columns),
            *args,
        )
        # result values needs to be resorted to their original positions since we
        # evaluated the data sorted by group
        return result.take(np.argsort(sorted_index), axis=0)
    @final
    def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.

        Returns one row per group; unlike the transform path, no re-sorting
        back to the original row order is needed.
        """
        starts, ends, sorted_index, sorted_data = self._numba_prep(data)
        numba_.validate_udf(func)
        numba_agg_func = numba_.generate_numba_agg_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_agg_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(data.columns),
            *args,
        )
        return result
1502 # -----------------------------------------------------------------
1503 # apply/agg/transform
    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> NDFrameT:

        func = com.is_builtin_func(func)

        if isinstance(func, str):
            # a string resolves to an attribute of this GroupBy
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    # properties accept no arguments
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res

            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    # suppress numpy floating-point warnings inside the UDF
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            elif hasattr(nanops, "nan" + func):
                # TODO: should we wrap this in to e.g. _is_builtin_func?
                f = getattr(nanops, "nan" + func)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:

            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                with self._group_selection_context():
                    return self._python_apply_general(f, self._selected_obj)

        return result
    @final
    def _python_apply_general(
        self,
        f: Callable,
        data: DataFrame | Series,
        not_indexed_same: bool | None = None,
        is_transform: bool = False,
        is_agg: bool = False,
    ) -> NDFrameT:
        """
        Apply function f in python space

        Parameters
        ----------
        f : callable
            Function to apply
        data : Series or DataFrame
            Data to apply f to
        not_indexed_same: bool, optional
            When specified, overrides the value of not_indexed_same. Apply behaves
            differently when the result index is equal to the input index, but
            this can be coincidental leading to value-dependent behavior.
        is_transform : bool, default False
            Indicator for whether the function is actually a transform
            and should not have group keys prepended. This is used
            in _make_wrapper which generates both transforms (e.g. diff)
            and non-transforms (e.g. corr)
        is_agg : bool, default False
            Indicator for whether the function is an aggregation. When the
            result is empty, we don't want to warn for this case.
            See _GroupBy._python_agg_general.

        Returns
        -------
        Series or DataFrame
            data after applying f
        """
        values, mutated = self.grouper.apply(f, data, self.axis)
        if not_indexed_same is None:
            not_indexed_same = mutated or self.mutated
        override_group_keys = False

        is_empty_agg = is_agg and len(values) == 0
        if (not not_indexed_same and self.group_keys is lib.no_default) and not (
            is_transform or is_empty_agg
        ):
            # We've detected value-dependent behavior: the result's index depends on
            # whether the user's function `f` returned the same index or not.
            msg = (
                "Not prepending group keys to the result index of "
                "transform-like apply. In the future, the group keys "
                "will be included in the index, regardless of whether "
                "the applied function returns a like-indexed object.\n"
                "To preserve the previous behavior, use\n\n\t"
                ">>> .groupby(..., group_keys=False)\n\n"
                "To adopt the future behavior and silence this warning, use "
                "\n\n\t>>> .groupby(..., group_keys=True)"
            )
            warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
            # We want to behave as if `self.group_keys=False` when reconstructing
            # the object. However, we don't want to mutate the stateful GroupBy
            # object, so we just override it.
            # When this deprecation is enforced then override_group_keys
            # may be removed.
            override_group_keys = True

        return self._wrap_applied_output(
            data,
            values,
            not_indexed_same,
            override_group_keys=is_transform or override_group_keys,
        )
    @final
    def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs):
        """
        Aggregate each slice with ``func`` in python space.

        Slices on which ``func`` raises TypeError are dropped with a
        deprecation warning unless ``raise_on_typeerror`` is True.
        """
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # agg_series below assumes ngroups > 0
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name

            try:
                # if this function is invalid for this dtype, we will ignore it.
                result = self.grouper.agg_series(obj, f)
            except TypeError:
                if raise_on_typeerror:
                    raise
                warn_dropping_nuisance_columns_deprecated(
                    type(self), "agg", numeric_only=False
                )
                continue

            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            # every slice failed; fall back to the apply path
            return self._python_apply_general(f, self._selected_obj)

        return self._wrap_aggregated_output(output)
1671 @final
1672 def _agg_general(
1673 self,
1674 numeric_only: bool | lib.NoDefault = True,
1675 min_count: int = -1,
1676 *,
1677 alias: str,
1678 npfunc: Callable,
1679 ):
1681 with self._group_selection_context():
1682 # try a cython aggregation if we can
1683 result = self._cython_agg_general(
1684 how=alias,
1685 alt=npfunc,
1686 numeric_only=numeric_only,
1687 min_count=min_count,
1688 )
1689 return result.__finalize__(self.obj, method="groupby")
    def _agg_py_fallback(
        self, values: ArrayLike, ndim: int, alt: Callable
    ) -> ArrayLike:
        """
        Fallback to pure-python aggregation if _cython_operation raises
        NotImplementedError.

        Parameters
        ----------
        values : ArrayLike
            Values for one block; either an ExtensionArray (1D) or an
            object-dtype ndarray.
        ndim : int
            Dimensionality the result block must have.
        alt : Callable
            Pure-python aggregation applied per group via ``agg_series``.
        """
        # We get here with a) EADtypes and b) object dtype

        if values.ndim == 1:
            # For DataFrameGroupBy we only get here with ExtensionArray
            ser = Series(values)
        else:
            # We only get here with values.dtype == object
            # TODO: special case not needed with ArrayManager
            df = DataFrame(values.T)
            # bc we split object blocks in grouped_reduce, we have only 1 col
            # otherwise we'd have to worry about block-splitting GH#39329
            assert df.shape[1] == 1
            # Avoid call to self.values that can occur in DataFrame
            #  reductions; see GH#28949
            ser = df.iloc[:, 0]

        # We do not get here with UDFs, so we know that our dtype
        #  should always be preserved by the implemented aggregations
        # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
        res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)

        if isinstance(values, Categorical):
            # Because we only get here with known dtype-preserving
            #  reductions, we cast back to Categorical.
            # TODO: if we ever get "rank" working, exclude it here.
            res_values = type(values)._from_sequence(res_values, dtype=values.dtype)

        # If we are DataFrameGroupBy and went through a SeriesGroupByPath
        # then we need to reshape
        # GH#32223 includes case with IntegerArray values, ndarray res_values
        # test_groupby_duplicate_columns with object dtype values
        return ensure_block_shape(res_values, ndim=ndim)
    @final
    def _cython_agg_general(
        self,
        how: str,
        alt: Callable,
        numeric_only: bool | lib.NoDefault,
        min_count: int = -1,
        ignore_failures: bool = True,
        **kwargs,
    ):
        """
        Run the cython aggregation kernel ``how`` over each block, falling
        back to the pure-python ``alt`` where the kernel is unimplemented.

        Parameters
        ----------
        how : str
            Cython kernel name.
        alt : Callable
            Per-group fallback used via ``_agg_py_fallback``.
        numeric_only : bool or lib.no_default
            Resolved via ``_resolve_numeric_only``.
        min_count : int, default -1
            Minimum number of valid values per group.
        ignore_failures : bool, default True
            Whether ``grouped_reduce`` drops blocks that raise.
        """
        # Note: we never get here with how="ohlc" for DataFrameGroupBy;
        #  that goes through SeriesGroupBy
        numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)

        data = self._get_data_to_aggregate()
        is_ser = data.ndim == 1

        # remember pre-filter width to detect dropped (nuisance) columns below
        orig_len = len(data)
        if numeric_only_bool:
            if is_ser and not is_numeric_dtype(self._selected_obj.dtype):
                # GH#41291 match Series behavior
                kwd_name = "numeric_only"
                if how in ["any", "all"]:
                    kwd_name = "bool_only"
                raise NotImplementedError(
                    f"{type(self).__name__}.{how} does not implement {kwd_name}."
                )
            elif not is_ser:
                data = data.get_numeric_data(copy=False)

        def array_func(values: ArrayLike) -> ArrayLike:
            try:
                result = self.grouper._cython_operation(
                    "aggregate",
                    values,
                    how,
                    axis=data.ndim - 1,
                    min_count=min_count,
                    **kwargs,
                )
            except NotImplementedError:
                # generally if we have numeric_only=False
                # and non-applicable functions
                # try to python agg
                # TODO: shouldn't min_count matter?
                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

            return result

        # TypeError -> we may have an exception in trying to aggregate
        # continue and exclude the block
        new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)

        if not is_ser and len(new_mgr) < orig_len:
            # some columns were dropped as nuisance; warn (deprecated behavior)
            warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

        res = self._wrap_agged_manager(new_mgr)
        if is_ser:
            res.index = self.grouper.result_index
            return self._reindex_output(res)
        else:
            return res
    def _cython_transform(
        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
    ):
        """Run the cython transform kernel ``how``; abstract, see subclasses."""
        raise AbstractMethodError(self)
    @final
    def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        """
        Shared implementation of ``transform``.

        Dispatches, in order: to the numba engine when requested; to the
        general python path for non-string ``func``; to a cythonized kernel
        for known transform names; otherwise treats ``func`` as a reduction
        and broadcasts its result back to the original shape.
        """

        if maybe_use_numba(engine):
            # TODO: tests with self._selected_obj.ndim == 1 on DataFrameGroupBy
            with self._group_selection_context():
                data = self._selected_obj
                df = data if data.ndim == 2 else data.to_frame()
                result = self._transform_with_numba(
                    df, func, *args, engine_kwargs=engine_kwargs, **kwargs
                )
                if self.obj.ndim == 2:
                    return cast(DataFrame, self.obj)._constructor(
                        result, index=data.index, columns=data.columns
                    )
                else:
                    return cast(Series, self.obj)._constructor(
                        result.ravel(), index=data.index, name=data.name
                    )

        # optimized transforms
        func = com.get_cython_func(func) or func

        if not isinstance(func, str):
            return self._transform_general(func, *args, **kwargs)

        elif func not in base.transform_kernel_allowlist:
            msg = f"'{func}' is not a valid function name for transform(name)"
            raise ValueError(msg)
        elif func in base.cythonized_kernels or func in base.transformation_kernels:
            # cythonized transform or canned "agg+broadcast"
            return getattr(self, func)(*args, **kwargs)

        else:
            # i.e. func in base.reduction_kernels

            # GH#30918 Use _transform_fast only when we know func is an aggregation
            # If func is a reduction, we need to broadcast the
            # result to the whole group. Compute func result
            # and deal with possible broadcasting below.
            # Temporarily set observed for dealing with categoricals.
            with com.temp_setattr(self, "observed", True):
                result = getattr(self, func)(*args, **kwargs)

            return self._wrap_transform_fast_result(result)
    @final
    def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
        """
        Fast transform path for aggregations.

        Broadcasts the one-row-per-group ``result`` back to the shape of the
        original object by taking each input row's group result.
        """
        obj = self._obj_with_exclusions

        # for each col, reshape to size of original frame by take operation
        ids, _, _ = self.grouper.group_info
        result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)

        if self.obj.ndim == 1:
            # i.e. SeriesGroupBy
            out = algorithms.take_nd(result._values, ids)
            output = obj._constructor(out, index=obj.index, name=obj.name)
        else:
            # `.size()` gives Series output on DataFrame input, need axis 0
            axis = 0 if result.ndim == 1 else self.axis
            # GH#46209
            # Don't convert indices: negative indices need to give rise
            # to null values in the result
            output = result._take(ids, axis=axis, convert_indices=False)
            output = output.set_axis(obj._get_axis(self.axis), axis=axis)
        return output
1870 # -----------------------------------------------------------------
1871 # Utilities
1873 @final
1874 def _apply_filter(self, indices, dropna):
1875 if len(indices) == 0:
1876 indices = np.array([], dtype="int64")
1877 else:
1878 indices = np.sort(np.concatenate(indices))
1879 if dropna:
1880 filtered = self._selected_obj.take(indices, axis=self.axis)
1881 else:
1882 mask = np.empty(len(self._selected_obj.index), dtype=bool)
1883 mask.fill(False)
1884 mask[indices.astype(int)] = True
1885 # mask fails to broadcast when passed to where; broadcast manually.
1886 mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
1887 filtered = self._selected_obj.where(mask) # Fill with NaNs.
1888 return filtered
    @final
    def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
        """
        Number the rows within each group, in original row order.

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Returns
        -------
        np.ndarray
            int64 counts, or float64 with NaN at dropped-NA positions when
            the grouper has dropped NA groups.

        Notes
        -----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        # run marks the first position of each group in the sorted ids;
        # rep is each group's length; out is a running within-group counter
        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        if self.grouper.has_dropped_na:
            # positions with group label -1 get NaN (requires float dtype)
            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
        else:
            out = out.astype(np.int64, copy=False)

        # invert the sort to report counts in original row order
        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev]
1928 # -----------------------------------------------------------------
1930 @final
1931 @property
1932 def _obj_1d_constructor(self) -> Callable:
1933 # GH28330 preserve subclassed Series/DataFrames
1934 if isinstance(self.obj, DataFrame):
1935 return self.obj._constructor_sliced
1936 assert isinstance(self.obj, Series)
1937 return self.obj._constructor
    @final
    def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool):
        """
        Shared func to call any / all Cython GroupBy implementations.

        Parameters
        ----------
        val_test : {"any", "all"}
            Which reduction the cython kernel performs.
        skipna : bool
            Whether NA values are ignored during truth testing.
        """

        def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
            # pre-processing: coerce values to an int8 view of booleans
            if is_object_dtype(vals.dtype):
                # GH#37501: don't raise on pd.NA when skipna=True
                if skipna:
                    func = np.vectorize(
                        lambda x: bool(x) if not isna(x) else True, otypes=[bool]
                    )
                    vals = func(vals)
                else:
                    vals = vals.astype(bool, copy=False)

                vals = cast(np.ndarray, vals)
            elif isinstance(vals, BaseMaskedArray):
                vals = vals._data.astype(bool, copy=False)
            else:
                vals = vals.astype(bool, copy=False)

            return vals.view(np.int8), bool

        def result_to_bool(
            result: np.ndarray,
            inference: type,
            nullable: bool = False,
        ) -> ArrayLike:
            # post-processing: -1 marks missing for nullable results
            if nullable:
                return BooleanArray(result.astype(bool, copy=False), result == -1)
            else:
                return result.astype(inference, copy=False)

        return self._get_cythonized_result(
            libgroupby.group_any_all,
            numeric_only=False,
            cython_dtype=np.dtype(np.int8),
            needs_mask=True,
            needs_nullable=True,
            pre_processing=objs_to_bool,
            post_processing=result_to_bool,
            val_test=val_test,
            skipna=skipna,
        )
1986 @final
1987 @Substitution(name="groupby")
1988 @Appender(_common_see_also)
1989 def any(self, skipna: bool = True):
1990 """
1991 Return True if any value in the group is truthful, else False.
1993 Parameters
1994 ----------
1995 skipna : bool, default True
1996 Flag to ignore nan values during truth testing.
1998 Returns
1999 -------
2000 Series or DataFrame
2001 DataFrame or Series of boolean values, where a value is True if any element
2002 is True within its respective group, False otherwise.
2003 """
2004 return self._bool_agg("any", skipna)
2006 @final
2007 @Substitution(name="groupby")
2008 @Appender(_common_see_also)
2009 def all(self, skipna: bool = True):
2010 """
2011 Return True if all values in the group are truthful, else False.
2013 Parameters
2014 ----------
2015 skipna : bool, default True
2016 Flag to ignore nan values during truth testing.
2018 Returns
2019 -------
2020 Series or DataFrame
2021 DataFrame or Series of boolean values, where a value is True if all elements
2022 are True within its respective group, False otherwise.
2023 """
2024 return self._bool_agg("all", skipna)
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self) -> NDFrameT:
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """
        data = self._get_data_to_aggregate()
        ids, _, ngroups = self.grouper.group_info
        # mask out rows belonging to dropped (NA) groups
        mask = ids != -1

        is_series = data.ndim == 1

        def hfunc(bvalues: ArrayLike) -> ArrayLike:
            # TODO(EA2D): reshape would not be necessary with 2D EAs
            if bvalues.ndim == 1:
                # EA
                masked = mask & ~isna(bvalues).reshape(1, -1)
            else:
                masked = mask & ~isna(bvalues)

            counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1)
            if is_series:
                assert counted.ndim == 2
                assert counted.shape[0] == 1
                return counted[0]
            return counted

        new_mgr = data.grouped_reduce(hfunc)

        # If we are grouping on categoricals we want unobserved categories to
        # return zero, rather than the default of NaN which the reindexing in
        # _wrap_agged_manager() returns. GH 35028
        with com.temp_setattr(self, "observed", True):
            result = self._wrap_agged_manager(new_mgr)

        if result.ndim == 1:
            result.index = self.grouper.result_index

        return self._reindex_output(result, fill_value=0)
    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def mean(
        self,
        numeric_only: bool | lib.NoDefault = lib.no_default,
        engine: str = "cython",
        engine_kwargs: dict[str, bool] | None = None,
    ):
        """
        Compute mean of groups, excluding missing values.

        Parameters
        ----------
        numeric_only : bool, default True
            Include only float, int, boolean columns. If None, will attempt to use
            everything, then use only numeric data.

        engine : str, default "cython"
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``

            .. versionadded:: 1.4.0

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
                 C
        A B
        1 2.0  2.0
          4.0  1.0
        2 3.0  1.0
          5.0  2.0

        Groupby one column and return the mean of only particular column in
        the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """
        numeric_only_bool = self._resolve_numeric_only("mean", numeric_only, axis=0)

        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_mean

            return self._numba_agg_general(sliding_mean, engine_kwargs)
        else:
            result = self._cython_agg_general(
                "mean",
                alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
                numeric_only=numeric_only,
            )
            return result.__finalize__(self.obj, method="groupby")
2159 @final
2160 @Substitution(name="groupby")
2161 @Appender(_common_see_also)
2162 def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
2163 """
2164 Compute median of groups, excluding missing values.
2166 For multiple groupings, the result index will be a MultiIndex
2168 Parameters
2169 ----------
2170 numeric_only : bool, default True
2171 Include only float, int, boolean columns. If None, will attempt to use
2172 everything, then use only numeric data.
2174 Returns
2175 -------
2176 Series or DataFrame
2177 Median of values within each group.
2178 """
2179 numeric_only_bool = self._resolve_numeric_only("median", numeric_only, axis=0)
2181 result = self._cython_agg_general(
2182 "median",
2183 alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
2184 numeric_only=numeric_only,
2185 )
2186 return result.__finalize__(self.obj, method="groupby")
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def std(
        self,
        ddof: int = 1,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ):
        """
        Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        engine : str, default None
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or globally setting
              ``compute.use_numba``

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``

            .. versionadded:: 1.4.0

        numeric_only : bool, default True
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series or DataFrame
            Standard deviation of values within each group.
        """
        if maybe_use_numba(engine):
            # Numba path: compute the per-group variance with the JIT kernel
            # and take the square root of the whole result.
            from pandas.core._numba.kernels import sliding_var

            return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
        else:
            # Resolve numeric_only so that var doesn't warn
            numeric_only_bool = self._resolve_numeric_only("std", numeric_only, axis=0)
            # A Series groupby over non-numeric data has nothing to compute
            # std on; raise instead of silently dropping the only column.
            if (
                numeric_only_bool
                and self.obj.ndim == 1
                and not is_numeric_dtype(self.obj.dtype)
            ):
                raise TypeError(
                    f"{type(self).__name__}.std called with "
                    f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
                )
            # Cython path: run group_var and post-process with sqrt so the
            # kernel only has to implement variance.
            result = self._get_cythonized_result(
                libgroupby.group_var,
                cython_dtype=np.dtype(np.float64),
                numeric_only=numeric_only_bool,
                needs_counts=True,
                post_processing=lambda vals, inference: np.sqrt(vals),
                ddof=ddof,
            )
            # Warn about the numeric_only deprecation only after the result is
            # known (the warning depends on whether columns were dropped).
            self._maybe_warn_numeric_only_depr("std", result, numeric_only)
            return result
2262 @final
2263 @Substitution(name="groupby")
2264 @Appender(_common_see_also)
2265 def var(
2266 self,
2267 ddof: int = 1,
2268 engine: str | None = None,
2269 engine_kwargs: dict[str, bool] | None = None,
2270 numeric_only: bool | lib.NoDefault = lib.no_default,
2271 ):
2272 """
2273 Compute variance of groups, excluding missing values.
2275 For multiple groupings, the result index will be a MultiIndex.
2277 Parameters
2278 ----------
2279 ddof : int, default 1
2280 Degrees of freedom.
2282 engine : str, default None
2283 * ``'cython'`` : Runs the operation through C-extensions from cython.
2284 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
2285 * ``None`` : Defaults to ``'cython'`` or globally setting
2286 ``compute.use_numba``
2288 .. versionadded:: 1.4.0
2290 engine_kwargs : dict, default None
2291 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2292 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2293 and ``parallel`` dictionary keys. The values must either be ``True`` or
2294 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
2295 ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
2297 .. versionadded:: 1.4.0
2299 numeric_only : bool, default True
2300 Include only `float`, `int` or `boolean` data.
2302 .. versionadded:: 1.5.0
2304 Returns
2305 -------
2306 Series or DataFrame
2307 Variance of values within each group.
2308 """
2309 if maybe_use_numba(engine):
2310 from pandas.core._numba.kernels import sliding_var
2312 return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
2313 else:
2314 return self._cython_agg_general(
2315 "var",
2316 alt=lambda x: Series(x).var(ddof=ddof),
2317 numeric_only=numeric_only,
2318 ignore_failures=numeric_only is lib.no_default,
2319 ddof=ddof,
2320 )
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
        """
        Compute standard error of the mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        numeric_only : bool, default True
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series or DataFrame
            Standard error of the mean of values within each group.
        """
        # Resolve numeric_only so that std doesn't warn
        numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0)
        # A Series groupby over non-numeric data cannot compute sem; raise
        # rather than silently dropping the only column.
        if (
            numeric_only_bool
            and self.obj.ndim == 1
            and not is_numeric_dtype(self.obj.dtype)
        ):
            raise TypeError(
                f"{type(self).__name__}.sem called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
            )
        result = self.std(ddof=ddof, numeric_only=numeric_only_bool)
        self._maybe_warn_numeric_only_depr("sem", result, numeric_only)

        if result.ndim == 1:
            # Series result: sem = std / sqrt(count), per group.
            result /= np.sqrt(self.count())
        else:
            # DataFrame result: divide only the aggregated (non-grouping)
            # columns by the square root of their per-group counts.
            cols = result.columns.difference(self.exclusions).unique()
            counts = self.count()
            result_ilocs = result.columns.get_indexer_for(cols)
            count_ilocs = counts.columns.get_indexer_for(cols)
            with warnings.catch_warnings():
                # TODO(2.0): once iloc[:, foo] = bar deprecation is enforced,
                # this catching will be unnecessary
                warnings.filterwarnings(
                    "ignore", ".*will attempt to set the values inplace.*"
                )
                result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs])
        return result
2376 @final
2377 @Substitution(name="groupby")
2378 @Appender(_common_see_also)
2379 def size(self) -> DataFrame | Series:
2380 """
2381 Compute group sizes.
2383 Returns
2384 -------
2385 DataFrame or Series
2386 Number of rows in each group as a Series if as_index is True
2387 or a DataFrame if as_index is False.
2388 """
2389 result = self.grouper.size()
2391 # GH28330 preserve subclassed Series/DataFrames through calls
2392 if isinstance(self.obj, Series):
2393 result = self._obj_1d_constructor(result, name=self.obj.name)
2394 else:
2395 result = self._obj_1d_constructor(result)
2397 if not self.as_index:
2398 # error: Incompatible types in assignment (expression has
2399 # type "DataFrame", variable has type "Series")
2400 result = result.rename("size").reset_index() # type: ignore[assignment]
2402 return self._reindex_output(result, fill_value=0)
2404 @final
2405 @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
2406 def sum(
2407 self,
2408 numeric_only: bool | lib.NoDefault = lib.no_default,
2409 min_count: int = 0,
2410 engine: str | None = None,
2411 engine_kwargs: dict[str, bool] | None = None,
2412 ):
2413 if maybe_use_numba(engine):
2414 from pandas.core._numba.kernels import sliding_sum
2416 return self._numba_agg_general(
2417 sliding_sum,
2418 engine_kwargs,
2419 )
2420 else:
2421 # If we are grouping on categoricals we want unobserved categories to
2422 # return zero, rather than the default of NaN which the reindexing in
2423 # _agg_general() returns. GH #31422
2424 with com.temp_setattr(self, "observed", True):
2425 result = self._agg_general(
2426 numeric_only=numeric_only,
2427 min_count=min_count,
2428 alias="sum",
2429 npfunc=np.sum,
2430 )
2432 return self._reindex_output(result, fill_value=0)
2434 @final
2435 @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
2436 def prod(
2437 self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
2438 ):
2439 return self._agg_general(
2440 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
2441 )
2443 @final
2444 @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
2445 def min(
2446 self,
2447 numeric_only: bool = False,
2448 min_count: int = -1,
2449 engine: str | None = None,
2450 engine_kwargs: dict[str, bool] | None = None,
2451 ):
2452 if maybe_use_numba(engine):
2453 from pandas.core._numba.kernels import sliding_min_max
2455 return self._numba_agg_general(sliding_min_max, engine_kwargs, False)
2456 else:
2457 return self._agg_general(
2458 numeric_only=numeric_only,
2459 min_count=min_count,
2460 alias="min",
2461 npfunc=np.min,
2462 )
2464 @final
2465 @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
2466 def max(
2467 self,
2468 numeric_only: bool = False,
2469 min_count: int = -1,
2470 engine: str | None = None,
2471 engine_kwargs: dict[str, bool] | None = None,
2472 ):
2473 if maybe_use_numba(engine):
2474 from pandas.core._numba.kernels import sliding_min_max
2476 return self._numba_agg_general(sliding_min_max, engine_kwargs, True)
2477 else:
2478 return self._agg_general(
2479 numeric_only=numeric_only,
2480 min_count=min_count,
2481 alias="max",
2482 npfunc=np.max,
2483 )
2485 @final
2486 @Substitution(name="groupby")
2487 def first(self, numeric_only: bool = False, min_count: int = -1):
2488 """
2489 Compute the first non-null entry of each column.
2491 Parameters
2492 ----------
2493 numeric_only : bool, default False
2494 Include only float, int, boolean columns.
2495 min_count : int, default -1
2496 The required number of valid values to perform the operation. If fewer
2497 than ``min_count`` non-NA values are present the result will be NA.
2499 Returns
2500 -------
2501 Series or DataFrame
2502 First non-null of values within each group.
2504 See Also
2505 --------
2506 DataFrame.groupby : Apply a function groupby to each row or column of a
2507 DataFrame.
2508 DataFrame.core.groupby.GroupBy.last : Compute the last non-null entry of each
2509 column.
2510 DataFrame.core.groupby.GroupBy.nth : Take the nth row from each group.
2512 Examples
2513 --------
2514 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
2515 ... D=['3/11/2000', '3/12/2000', '3/13/2000']))
2516 >>> df['D'] = pd.to_datetime(df['D'])
2517 >>> df.groupby("A").first()
2518 B C D
2519 A
2520 1 5.0 1 2000-03-11
2521 3 6.0 3 2000-03-13
2522 >>> df.groupby("A").first(min_count=2)
2523 B C D
2524 A
2525 1 NaN 1.0 2000-03-11
2526 3 NaN NaN NaT
2527 >>> df.groupby("A").first(numeric_only=True)
2528 B C
2529 A
2530 1 5.0 1
2531 3 6.0 3
2532 """
2534 def first_compat(obj: NDFrameT, axis: int = 0):
2535 def first(x: Series):
2536 """Helper function for first item that isn't NA."""
2537 arr = x.array[notna(x.array)]
2538 if not len(arr):
2539 return np.nan
2540 return arr[0]
2542 if isinstance(obj, DataFrame):
2543 return obj.apply(first, axis=axis)
2544 elif isinstance(obj, Series):
2545 return first(obj)
2546 else: # pragma: no cover
2547 raise TypeError(type(obj))
2549 return self._agg_general(
2550 numeric_only=numeric_only,
2551 min_count=min_count,
2552 alias="first",
2553 npfunc=first_compat,
2554 )
2556 @final
2557 @Substitution(name="groupby")
2558 def last(self, numeric_only: bool = False, min_count: int = -1):
2559 """
2560 Compute the last non-null entry of each column.
2562 Parameters
2563 ----------
2564 numeric_only : bool, default False
2565 Include only float, int, boolean columns. If None, will attempt to use
2566 everything, then use only numeric data.
2567 min_count : int, default -1
2568 The required number of valid values to perform the operation. If fewer
2569 than ``min_count`` non-NA values are present the result will be NA.
2571 Returns
2572 -------
2573 Series or DataFrame
2574 Last non-null of values within each group.
2576 See Also
2577 --------
2578 DataFrame.groupby : Apply a function groupby to each row or column of a
2579 DataFrame.
2580 DataFrame.core.groupby.GroupBy.first : Compute the first non-null entry of each
2581 column.
2582 DataFrame.core.groupby.GroupBy.nth : Take the nth row from each group.
2584 Examples
2585 --------
2586 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
2587 >>> df.groupby("A").last()
2588 B C
2589 A
2590 1 5.0 2
2591 3 6.0 3
2592 """
2594 def last_compat(obj: NDFrameT, axis: int = 0):
2595 def last(x: Series):
2596 """Helper function for last item that isn't NA."""
2597 arr = x.array[notna(x.array)]
2598 if not len(arr):
2599 return np.nan
2600 return arr[-1]
2602 if isinstance(obj, DataFrame):
2603 return obj.apply(last, axis=axis)
2604 elif isinstance(obj, Series):
2605 return last(obj)
2606 else: # pragma: no cover
2607 raise TypeError(type(obj))
2609 return self._agg_general(
2610 numeric_only=numeric_only,
2611 min_count=min_count,
2612 alias="last",
2613 npfunc=last_compat,
2614 )
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def ohlc(self) -> DataFrame:
        """
        Compute open, high, low and close values of a group, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex

        Returns
        -------
        DataFrame
            Open, high, low and close values within each group.
        """
        if self.obj.ndim == 1:
            # self._iterate_slices() yields only self._selected_obj
            obj = self._selected_obj

            # ohlc only makes sense on numeric data; refuse anything else.
            is_numeric = is_numeric_dtype(obj.dtype)
            if not is_numeric:
                raise DataError("No numeric types to aggregate")

            # Single cython call produces a (ngroups, 4) array of
            # open/high/low/close per group.
            res_values = self.grouper._cython_operation(
                "aggregate", obj._values, "ohlc", axis=0, min_count=-1
            )

            agg_names = ["open", "high", "low", "close"]
            result = self.obj._constructor_expanddim(
                res_values, index=self.grouper.result_index, columns=agg_names
            )
            return self._reindex_output(result)

        # DataFrame input: apply the Series path column by column; the result
        # gets a column MultiIndex of (column, open/high/low/close).
        return self._apply_to_column_groupbys(
            lambda x: x.ohlc(), self._obj_with_exclusions
        )
    @doc(DataFrame.describe)
    def describe(self, **kwargs):
        # Delegates to Series/DataFrame.describe per group via apply.
        with self._group_selection_context():
            if len(self._selected_obj) == 0:
                # Empty input: run describe() on the empty object so the
                # result carries the right statistic labels/columns, then
                # slice to zero rows.
                described = self._selected_obj.describe(**kwargs)
                if self._selected_obj.ndim == 1:
                    result = described
                else:
                    result = described.unstack()
                return result.to_frame().T.iloc[:0]

            result = self._python_apply_general(
                lambda x: x.describe(**kwargs),
                self._selected_obj,
                not_indexed_same=True,
            )
            # axis=1 groupbys come back transposed relative to the expected
            # layout; otherwise unstack the per-group describe blocks.
            if self.axis == 1:
                return result.T
            return result.unstack()
    @final
    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper.

        Given a grouper, the function resamples it according to a string
        "string" -> "frequency".

        See the :ref:`frequency aliases <timeseries.offset_aliases>`
        documentation for more details.

        Parameters
        ----------
        rule : str or DateOffset
            The offset string or object representing target grouper conversion.
        *args, **kwargs
            Possible arguments are `how`, `fill_method`, `limit`, `kind` and
            `on`, and other arguments of `TimeGrouper`.

        Returns
        -------
        Grouper
            Return a new grouper with our resampler appended.

        See Also
        --------
        Grouper : Specify a frequency to resample with when
            grouping by a key.
        DatetimeIndex.resample : Frequency conversion and resampling of
            time series.

        Examples
        --------
        >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
        >>> df = pd.DataFrame(data=4 * [range(2)],
        ...                   index=idx,
        ...                   columns=['a', 'b'])
        >>> df.iloc[2, 0] = 5
        >>> df
                            a  b
        2000-01-01 00:00:00  0  1
        2000-01-01 00:01:00  0  1
        2000-01-01 00:02:00  5  1
        2000-01-01 00:03:00  0  1

        Downsample the DataFrame into 3 minute bins and sum the values of
        the timestamps falling into a bin.

        >>> df.groupby('a').resample('3T').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  2
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:00:00  5  1

        Upsample the series into 30 second bins.

        >>> df.groupby('a').resample('30S').sum()
                            a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:00:30  0  0
            2000-01-01 00:01:00  0  1
            2000-01-01 00:01:30  0  0
            2000-01-01 00:02:00  0  0
            2000-01-01 00:02:30  0  0
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:02:00  5  1

        Resample by month. Values are assigned to the month of the period.

        >>> df.groupby('a').resample('M').sum()
                    a  b
        a
        0   2000-01-31  0  3
        5   2000-01-31  5  1

        Downsample the series into 3 minute bins as above, but close the right
        side of the bin interval.

        >>> df.groupby('a').resample('3T', closed='right').sum()
                                 a  b
        a
        0   1999-12-31 23:57:00  0  1
            2000-01-01 00:00:00  0  2
        5   2000-01-01 00:00:00  5  1

        Downsample the series into 3 minute bins and close the right side of
        the bin interval, but label each bin using the right edge instead of
        the left.

        >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:03:00  0  2
        5   2000-01-01 00:03:00  5  1
        """
        # Imported here rather than at module top, presumably to avoid an
        # import cycle between groupby and resample — confirm before moving.
        from pandas.core.resample import get_resampler_for_grouping

        return get_resampler_for_grouping(self, rule, *args, **kwargs)
2774 @final
2775 @Substitution(name="groupby")
2776 @Appender(_common_see_also)
2777 def rolling(self, *args, **kwargs) -> RollingGroupby:
2778 """
2779 Return a rolling grouper, providing rolling functionality per group.
2780 """
2781 from pandas.core.window import RollingGroupby
2783 return RollingGroupby(
2784 self._selected_obj,
2785 *args,
2786 _grouper=self.grouper,
2787 _as_index=self.as_index,
2788 **kwargs,
2789 )
2791 @final
2792 @Substitution(name="groupby")
2793 @Appender(_common_see_also)
2794 def expanding(self, *args, **kwargs) -> ExpandingGroupby:
2795 """
2796 Return an expanding grouper, providing expanding
2797 functionality per group.
2798 """
2799 from pandas.core.window import ExpandingGroupby
2801 return ExpandingGroupby(
2802 self._selected_obj,
2803 *args,
2804 _grouper=self.grouper,
2805 **kwargs,
2806 )
2808 @final
2809 @Substitution(name="groupby")
2810 @Appender(_common_see_also)
2811 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
2812 """
2813 Return an ewm grouper, providing ewm functionality per group.
2814 """
2815 from pandas.core.window import ExponentialMovingWindowGroupby
2817 return ExponentialMovingWindowGroupby(
2818 self._selected_obj,
2819 *args,
2820 _grouper=self.grouper,
2821 **kwargs,
2822 )
    @final
    def _fill(self, direction: Literal["ffill", "bfill"], limit=None):
        """
        Shared function for `pad` and `backfill` to call Cython method.

        Parameters
        ----------
        direction : {'ffill', 'bfill'}
            Direction passed to underlying Cython function. `bfill` will cause
            values to be filled backwards. `ffill` and any other values will
            default to a forward fill
        limit : int, default None
            Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython

        Returns
        -------
        `Series` or `DataFrame` with filled values

        See Also
        --------
        pad : Returns Series with minimum number of char in object.
        backfill : Backward fill the missing values in the dataset.
        """
        # Need int value for Cython
        if limit is None:
            limit = -1

        ids, _, _ = self.grouper.group_info
        # Stable sort by group label; reversed for bfill so the cython
        # indexer walks each group back-to-front.
        sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
        if direction == "bfill":
            sorted_labels = sorted_labels[::-1]

        # Per-column call into the cython fill-indexer; only ``out``/``mask``
        # vary between calls.
        col_func = partial(
            libgroupby.group_fillna_indexer,
            labels=ids,
            sorted_labels=sorted_labels,
            direction=direction,
            limit=limit,
            dropna=self.dropna,
        )

        def blk_func(values: ArrayLike) -> ArrayLike:
            # Applied per block by the manager; returns the filled values.
            mask = isna(values)
            if values.ndim == 1:
                indexer = np.empty(values.shape, dtype=np.intp)
                col_func(out=indexer, mask=mask)
                return algorithms.take_nd(values, indexer)

            else:
                # We broadcast algorithms.take_nd analogous to
                # np.take_along_axis

                # Note: we only get here with backfill/pad,
                # so if we have a dtype that cannot hold NAs,
                # then there will be no -1s in indexer, so we can use
                # the original dtype (no need to ensure_dtype_can_hold_na)
                if isinstance(values, np.ndarray):
                    dtype = values.dtype
                    if self.grouper.has_dropped_na:
                        # dropped null groups give rise to nan in the result
                        dtype = ensure_dtype_can_hold_na(values.dtype)
                    out = np.empty(values.shape, dtype=dtype)
                else:
                    out = type(values)._empty(values.shape, dtype=values.dtype)

                for i in range(len(values)):
                    # call group_fillna_indexer column-wise
                    indexer = np.empty(values.shape[1], dtype=np.intp)
                    col_func(out=indexer, mask=mask[i])
                    out[i, :] = algorithms.take_nd(values[i], indexer)
                return out

        obj = self._obj_with_exclusions
        # axis=1: fill along columns by transposing first.
        if self.axis == 1:
            obj = obj.T
        mgr = obj._mgr
        res_mgr = mgr.apply(blk_func)

        new_obj = obj._constructor(res_mgr)
        if isinstance(new_obj, Series):
            new_obj.name = obj.name

        return self._wrap_transformed_output(new_obj)
2909 @final
2910 @Substitution(name="groupby")
2911 def ffill(self, limit=None):
2912 """
2913 Forward fill the values.
2915 Parameters
2916 ----------
2917 limit : int, optional
2918 Limit of how many values to fill.
2920 Returns
2921 -------
2922 Series or DataFrame
2923 Object with missing values filled.
2925 See Also
2926 --------
2927 Series.ffill: Returns Series with minimum number of char in object.
2928 DataFrame.ffill: Object with missing values filled or None if inplace=True.
2929 Series.fillna: Fill NaN values of a Series.
2930 DataFrame.fillna: Fill NaN values of a DataFrame.
2931 """
2932 return self._fill("ffill", limit=limit)
2934 def pad(self, limit=None):
2935 """
2936 Forward fill the values.
2938 .. deprecated:: 1.4
2939 Use ffill instead.
2941 Parameters
2942 ----------
2943 limit : int, optional
2944 Limit of how many values to fill.
2946 Returns
2947 -------
2948 Series or DataFrame
2949 Object with missing values filled.
2950 """
2951 warnings.warn(
2952 "pad is deprecated and will be removed in a future version. "
2953 "Use ffill instead.",
2954 FutureWarning,
2955 stacklevel=find_stack_level(),
2956 )
2957 return self.ffill(limit=limit)
2959 @final
2960 @Substitution(name="groupby")
2961 def bfill(self, limit=None):
2962 """
2963 Backward fill the values.
2965 Parameters
2966 ----------
2967 limit : int, optional
2968 Limit of how many values to fill.
2970 Returns
2971 -------
2972 Series or DataFrame
2973 Object with missing values filled.
2975 See Also
2976 --------
2977 Series.bfill : Backward fill the missing values in the dataset.
2978 DataFrame.bfill: Backward fill the missing values in the dataset.
2979 Series.fillna: Fill NaN values of a Series.
2980 DataFrame.fillna: Fill NaN values of a DataFrame.
2981 """
2982 return self._fill("bfill", limit=limit)
2984 def backfill(self, limit=None):
2985 """
2986 Backward fill the values.
2988 .. deprecated:: 1.4
2989 Use bfill instead.
2991 Parameters
2992 ----------
2993 limit : int, optional
2994 Limit of how many values to fill.
2996 Returns
2997 -------
2998 Series or DataFrame
2999 Object with missing values filled.
3000 """
3001 warnings.warn(
3002 "backfill is deprecated and will be removed in a future version. "
3003 "Use bfill instead.",
3004 FutureWarning,
3005 stacklevel=find_stack_level(),
3006 )
3007 return self.bfill(limit=limit)
    # https://github.com/python/mypy/issues/1362
    # Mypy does not support decorated properties
    @final  # type: ignore[misc]
    @property
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def nth(self) -> GroupByNthSelector:
        """
        Take the nth row from each group if n is an int, otherwise a subset of rows.

        Can be either a call or an index. dropna is not available with index notation.
        Index notation accepts a comma separated list of integers and slices.

        If dropna, will take the nth non-null row, dropna is either
        'all' or 'any'; this is equivalent to calling dropna(how=dropna)
        before the groupby.

        Parameters
        ----------
        n : int, slice or list of ints and slices
            A single nth value for the row or a list of nth values or slices.

            .. versionchanged:: 1.4.0
                Added slice and lists containing slices.
                Added index notation.

        dropna : {'any', 'all', None}, default None
            Apply the specified dropna operation before counting which row is
            the nth row. Only supported if n is an int.

        Returns
        -------
        Series or DataFrame
            N-th value within each group.
        %(see_also)s
        Examples
        --------

        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
             B
        A
        1  NaN
        2  3.0
        >>> g.nth(1)
             B
        A
        1  2.0
        2  5.0
        >>> g.nth(-1)
             B
        A
        1  4.0
        2  5.0
        >>> g.nth([0, 1])
             B
        A
        1  NaN
        1  2.0
        2  3.0
        2  5.0
        >>> g.nth(slice(None, -1))
             B
        A
        1  NaN
        1  2.0
        2  3.0

        Index notation may also be used

        >>> g.nth[0, 1]
             B
        A
        1  NaN
        1  2.0
        2  3.0
        2  5.0
        >>> g.nth[:-1]
             B
        A
        1  NaN
        1  2.0
        2  3.0

        Specifying `dropna` allows count ignoring ``NaN``

        >>> g.nth(0, dropna='any')
             B
        A
        1  2.0
        2  3.0

        NaNs denote group exhausted when using dropna

        >>> g.nth(3, dropna='any')
            B
        A
        1 NaN
        2 NaN

        Specifying `as_index=False` in `groupby` keeps the original index.

        >>> df.groupby('A', as_index=False).nth(1)
           A    B
        1  1  2.0
        4  2  5.0
        """
        # Selector object supporting both call (g.nth(0)) and index (g.nth[0])
        # notation, as described in the docstring above; the actual work is
        # done by _nth.
        return GroupByNthSelector(self)
    def _nth(
        self,
        n: PositionalIndexer | tuple,
        dropna: Literal["any", "all", None] = None,
    ) -> NDFrameT:
        """
        Implementation backing the public ``nth`` selector: positional row
        selection per group, optionally after dropping NA rows.
        """
        if not dropna:
            # Fast path: build a positional mask and select rows directly.
            with self._group_selection_context():
                mask = self._make_mask_from_positional_indexer(n)

                ids, _, _ = self.grouper.group_info

                # Drop NA values in grouping
                mask = mask & (ids != -1)

                out = self._mask_selected_obj(mask)
                if not self.as_index:
                    return out

                # result index is set to the group labels of the kept rows
                result_index = self.grouper.result_index
                if self.axis == 0:
                    out.index = result_index[ids[mask]]
                    if not self.observed and isinstance(result_index, CategoricalIndex):
                        # keep unobserved categories in the result
                        out = out.reindex(result_index)

                    out = self._reindex_output(out)
                else:
                    out.columns = result_index[ids[mask]]

                return out.sort_index(axis=self.axis) if self.sort else out

        # dropna is truthy
        if not is_integer(n):
            raise ValueError("dropna option only supported for an integer argument")

        if dropna not in ["any", "all"]:
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError(
                "For a DataFrame or Series groupby.nth, dropna must be "
                "either None, 'any' or 'all', "
                f"(was passed {dropna})."
            )

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        n = cast(int, n)
        # number of valid rows a group needs for its nth entry to exist
        max_len = n if n >= 0 else -1 - n
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:

            # we don't have the grouper info available
            # (e.g. we have selected out
            # a column that is not in the current object)
            axis = self.grouper.axis
            grouper = axis[axis.isin(dropped.index)]

        else:

            # create a grouper with the original parameters, but on dropped
            # object
            from pandas.core.groupby.grouper import get_grouper

            grouper, _, _ = get_grouper(
                dropped,
                key=self.keys,
                axis=self.axis,
                level=self.level,
                sort=self.sort,
                mutated=self.mutated,
            )

        grb = dropped.groupby(
            grouper, as_index=self.as_index, sort=self.sort, axis=self.axis
        )
        sizes, result = grb.size(), grb.nth(n)
        # groups too small to have an nth valid row are masked to NaN below
        mask = (sizes < max_len)._values

        # set the results which don't meet the criteria
        if len(result) and mask.any():
            result.loc[mask] = np.nan

        # reset/reindex to the original groups
        if len(self.obj) == len(dropped) or len(result) == len(
            self.grouper.result_index
        ):
            result.index = self.grouper.result_index
        else:
            result = result.reindex(self.grouper.result_index)

        return result
    @final
    def quantile(
        self,
        q=0.5,
        interpolation: str = "linear",
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ):
        """
        Return group values at the given quantile, a la numpy.percentile.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 providing the quantile(s) to compute.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            Method to use when the desired quantile falls between two points.
        numeric_only : bool, default True
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series or DataFrame
            Return type determined by caller of GroupBy object.

        See Also
        --------
        Series.quantile : Similar method for Series.
        DataFrame.quantile : Similar method for DataFrame.
        numpy.percentile : NumPy method to compute qth percentile.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     ['a', 1], ['a', 2], ['a', 3],
        ...     ['b', 1], ['b', 3], ['b', 5]
        ... ], columns=['key', 'val'])
        >>> df.groupby('key').quantile()
            val
        key
        a    2.0
        b    3.0
        """
        numeric_only_bool = self._resolve_numeric_only("quantile", numeric_only, axis=0)
        if (
            numeric_only_bool
            and self.obj.ndim == 1
            and not is_numeric_dtype(self.obj.dtype)
        ):
            # SeriesGroupBy over a non-numeric dtype with numeric_only in
            # effect has nothing left to compute on.
            raise TypeError(
                f"{type(self).__name__}.quantile called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
            )

        def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
            # Convert one column to an ndarray the cython kernel accepts and
            # remember (via `inference`) the dtype to restore afterwards.
            if is_object_dtype(vals):
                raise TypeError(
                    "'quantile' cannot be performed against 'object' dtypes!"
                )

            inference: np.dtype | None = None
            if is_integer_dtype(vals.dtype):
                if isinstance(vals, ExtensionArray):
                    out = vals.to_numpy(dtype=float, na_value=np.nan)
                else:
                    out = vals
                inference = np.dtype(np.int64)
            elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
                out = vals.to_numpy(dtype=float, na_value=np.nan)
            elif is_datetime64_dtype(vals.dtype):
                # Compute on the float view of the i8 values; restore dtype later.
                inference = np.dtype("datetime64[ns]")
                out = np.asarray(vals).astype(float)
            elif is_timedelta64_dtype(vals.dtype):
                inference = np.dtype("timedelta64[ns]")
                out = np.asarray(vals).astype(float)
            elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
                inference = np.dtype(np.float64)
                out = vals.to_numpy(dtype=float, na_value=np.nan)
            else:
                out = np.asarray(vals)

            return out, inference

        def post_processor(vals: np.ndarray, inference: np.dtype | None) -> np.ndarray:
            # Cast the float result back to the remembered dtype — except for
            # integer input with an interpolation that can yield fractional
            # values (linear/midpoint), which must stay float.
            if inference:
                # Check for edge case
                if not (
                    is_integer_dtype(inference)
                    and interpolation in {"linear", "midpoint"}
                ):
                    vals = vals.astype(inference)

            return vals

        orig_scalar = is_scalar(q)
        if orig_scalar:
            # Normalize scalar q to a 1-element list so one code path serves both.
            q = [q]

        qs = np.array(q, dtype=np.float64)
        ids, _, ngroups = self.grouper.group_info
        nqs = len(qs)

        func = partial(
            libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation
        )

        # Put '-1' (NaN) labels as the last group so it does not interfere
        # with the calculations. Note: length check avoids failure on empty
        # labels. In that case, the value doesn't matter
        na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0
        labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids)

        def blk_func(values: ArrayLike) -> ArrayLike:
            # Per-block driver: preprocess, lexsort values within groups, run
            # the cython kernel per column, then restore the original dtype.
            mask = isna(values)
            vals, inference = pre_processor(values)

            ncols = 1
            if vals.ndim == 2:
                ncols = vals.shape[0]
                shaped_labels = np.broadcast_to(
                    labels_for_lexsort, (ncols, len(labels_for_lexsort))
                )
            else:
                shaped_labels = labels_for_lexsort

            out = np.empty((ncols, ngroups, nqs), dtype=np.float64)

            # Get an index of values sorted by values and then labels
            order = (vals, shaped_labels)
            sort_arr = np.lexsort(order).astype(np.intp, copy=False)

            if vals.ndim == 1:
                func(out[0], values=vals, mask=mask, sort_indexer=sort_arr)
            else:
                for i in range(ncols):
                    func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i])

            if vals.ndim == 1:
                out = out.ravel("K")
            else:
                out = out.reshape(ncols, ngroups * nqs)
            return post_processor(out, inference)

        obj = self._obj_with_exclusions
        is_ser = obj.ndim == 1
        mgr = self._get_data_to_aggregate()
        data = mgr.get_numeric_data() if numeric_only_bool else mgr
        ignore_failures = numeric_only_bool
        res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)

        if (
            numeric_only is lib.no_default
            and not is_ser
            and len(res_mgr.items) != len(mgr.items)
        ):
            # Columns were silently dropped under the legacy default; warn.
            warn_dropping_nuisance_columns_deprecated(
                type(self), "quantile", numeric_only
            )

        if len(res_mgr.items) == 0:
            # re-call grouped_reduce to get the desired exception message
            mgr.grouped_reduce(blk_func, ignore_failures=False)
            # grouped_reduce _should_ raise, so this should not be reached
            raise TypeError(  # pragma: no cover
                "All columns were dropped in grouped_reduce"
            )

        if is_ser:
            res = self._wrap_agged_manager(res_mgr)
        else:
            res = obj._constructor(res_mgr)

        if orig_scalar:
            # Avoid expensive MultiIndex construction
            return self._wrap_aggregated_output(res)
        return self._wrap_aggregated_output(res, qs=qs)
3390 @final
3391 @Substitution(name="groupby")
3392 def ngroup(self, ascending: bool = True):
3393 """
3394 Number each group from 0 to the number of groups - 1.
3396 This is the enumerative complement of cumcount. Note that the
3397 numbers given to the groups match the order in which the groups
3398 would be seen when iterating over the groupby object, not the
3399 order they are first observed.
3401 Parameters
3402 ----------
3403 ascending : bool, default True
3404 If False, number in reverse, from number of group - 1 to 0.
3406 Returns
3407 -------
3408 Series
3409 Unique numbers for each group.
3411 See Also
3412 --------
3413 .cumcount : Number the rows in each group.
3415 Examples
3416 --------
3417 >>> df = pd.DataFrame({"A": list("aaabba")})
3418 >>> df
3419 A
3420 0 a
3421 1 a
3422 2 a
3423 3 b
3424 4 b
3425 5 a
3426 >>> df.groupby('A').ngroup()
3427 0 0
3428 1 0
3429 2 0
3430 3 1
3431 4 1
3432 5 0
3433 dtype: int64
3434 >>> df.groupby('A').ngroup(ascending=False)
3435 0 1
3436 1 1
3437 2 1
3438 3 0
3439 4 0
3440 5 1
3441 dtype: int64
3442 >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup()
3443 0 0
3444 1 0
3445 2 1
3446 3 3
3447 4 2
3448 5 0
3449 dtype: int64
3450 """
3451 with self._group_selection_context():
3452 index = self._selected_obj.index
3453 comp_ids = self.grouper.group_info[0]
3455 dtype: type
3456 if self.grouper.has_dropped_na:
3457 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
3458 dtype = np.float64
3459 else:
3460 dtype = np.int64
3462 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
3463 if not ascending:
3464 result = self.ngroups - 1 - result
3465 return result
3467 @final
3468 @Substitution(name="groupby")
3469 def cumcount(self, ascending: bool = True):
3470 """
3471 Number each item in each group from 0 to the length of that group - 1.
3473 Essentially this is equivalent to
3475 .. code-block:: python
3477 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
3479 Parameters
3480 ----------
3481 ascending : bool, default True
3482 If False, number in reverse, from length of group - 1 to 0.
3484 Returns
3485 -------
3486 Series
3487 Sequence number of each element within each group.
3489 See Also
3490 --------
3491 .ngroup : Number the groups themselves.
3493 Examples
3494 --------
3495 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
3496 ... columns=['A'])
3497 >>> df
3498 A
3499 0 a
3500 1 a
3501 2 a
3502 3 b
3503 4 b
3504 5 a
3505 >>> df.groupby('A').cumcount()
3506 0 0
3507 1 1
3508 2 2
3509 3 0
3510 4 1
3511 5 3
3512 dtype: int64
3513 >>> df.groupby('A').cumcount(ascending=False)
3514 0 3
3515 1 2
3516 2 1
3517 3 1
3518 4 0
3519 5 0
3520 dtype: int64
3521 """
3522 with self._group_selection_context():
3523 index = self._selected_obj._get_axis(self.axis)
3524 cumcounts = self._cumcount_array(ascending=ascending)
3525 return self._obj_1d_constructor(cumcounts, index)
3527 @final
3528 @Substitution(name="groupby")
3529 @Substitution(see_also=_common_see_also)
3530 def rank(
3531 self,
3532 method: str = "average",
3533 ascending: bool = True,
3534 na_option: str = "keep",
3535 pct: bool = False,
3536 axis: int = 0,
3537 ) -> NDFrameT:
3538 """
3539 Provide the rank of values within each group.
3541 Parameters
3542 ----------
3543 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
3544 * average: average rank of group.
3545 * min: lowest rank in group.
3546 * max: highest rank in group.
3547 * first: ranks assigned in order they appear in the array.
3548 * dense: like 'min', but rank always increases by 1 between groups.
3549 ascending : bool, default True
3550 False for ranks by high (1) to low (N).
3551 na_option : {'keep', 'top', 'bottom'}, default 'keep'
3552 * keep: leave NA values where they are.
3553 * top: smallest rank if ascending.
3554 * bottom: smallest rank if descending.
3555 pct : bool, default False
3556 Compute percentage rank of data within each group.
3557 axis : int, default 0
3558 The axis of the object over which to compute the rank.
3560 Returns
3561 -------
3562 DataFrame with ranking of values within each group
3563 %(see_also)s
3564 Examples
3565 --------
3566 >>> df = pd.DataFrame(
3567 ... {
3568 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
3569 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
3570 ... }
3571 ... )
3572 >>> df
3573 group value
3574 0 a 2
3575 1 a 4
3576 2 a 2
3577 3 a 3
3578 4 a 5
3579 5 b 1
3580 6 b 2
3581 7 b 4
3582 8 b 1
3583 9 b 5
3584 >>> for method in ['average', 'min', 'max', 'dense', 'first']:
3585 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
3586 >>> df
3587 group value average_rank min_rank max_rank dense_rank first_rank
3588 0 a 2 1.5 1.0 2.0 1.0 1.0
3589 1 a 4 4.0 4.0 4.0 3.0 4.0
3590 2 a 2 1.5 1.0 2.0 1.0 2.0
3591 3 a 3 3.0 3.0 3.0 2.0 3.0
3592 4 a 5 5.0 5.0 5.0 4.0 5.0
3593 5 b 1 1.5 1.0 2.0 1.0 1.0
3594 6 b 2 3.0 3.0 3.0 2.0 3.0
3595 7 b 4 4.0 4.0 4.0 3.0 4.0
3596 8 b 1 1.5 1.0 2.0 1.0 2.0
3597 9 b 5 5.0 5.0 5.0 4.0 5.0
3598 """
3599 if na_option not in {"keep", "top", "bottom"}:
3600 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
3601 raise ValueError(msg)
3603 kwargs = {
3604 "ties_method": method,
3605 "ascending": ascending,
3606 "na_option": na_option,
3607 "pct": pct,
3608 }
3609 if axis != 0:
3610 # DataFrame uses different keyword name
3611 kwargs["method"] = kwargs.pop("ties_method")
3612 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
3613 result = self._python_apply_general(
3614 f, self._selected_obj, is_transform=True
3615 )
3616 return result
3618 return self._cython_transform(
3619 "rank",
3620 numeric_only=False,
3621 axis=axis,
3622 **kwargs,
3623 )
3625 @final
3626 @Substitution(name="groupby")
3627 @Appender(_common_see_also)
3628 def cumprod(self, axis=0, *args, **kwargs) -> NDFrameT:
3629 """
3630 Cumulative product for each group.
3632 Returns
3633 -------
3634 Series or DataFrame
3635 """
3636 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
3637 if axis != 0:
3638 f = lambda x: x.cumprod(axis=axis, **kwargs)
3639 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3641 return self._cython_transform("cumprod", **kwargs)
3643 @final
3644 @Substitution(name="groupby")
3645 @Appender(_common_see_also)
3646 def cumsum(self, axis=0, *args, **kwargs) -> NDFrameT:
3647 """
3648 Cumulative sum for each group.
3650 Returns
3651 -------
3652 Series or DataFrame
3653 """
3654 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
3655 if axis != 0:
3656 f = lambda x: x.cumsum(axis=axis, **kwargs)
3657 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3659 return self._cython_transform("cumsum", **kwargs)
3661 @final
3662 @Substitution(name="groupby")
3663 @Appender(_common_see_also)
3664 def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
3665 """
3666 Cumulative min for each group.
3668 Returns
3669 -------
3670 Series or DataFrame
3671 """
3672 skipna = kwargs.get("skipna", True)
3673 if axis != 0:
3674 f = lambda x: np.minimum.accumulate(x, axis)
3675 numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
3676 obj = self._selected_obj
3677 if numeric_only_bool:
3678 obj = obj._get_numeric_data()
3679 return self._python_apply_general(f, obj, is_transform=True)
3681 return self._cython_transform(
3682 "cummin", numeric_only=numeric_only, skipna=skipna
3683 )
3685 @final
3686 @Substitution(name="groupby")
3687 @Appender(_common_see_also)
3688 def cummax(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
3689 """
3690 Cumulative max for each group.
3692 Returns
3693 -------
3694 Series or DataFrame
3695 """
3696 skipna = kwargs.get("skipna", True)
3697 if axis != 0:
3698 f = lambda x: np.maximum.accumulate(x, axis)
3699 numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
3700 obj = self._selected_obj
3701 if numeric_only_bool:
3702 obj = obj._get_numeric_data()
3703 return self._python_apply_general(f, obj, is_transform=True)
3705 return self._cython_transform(
3706 "cummax", numeric_only=numeric_only, skipna=skipna
3707 )
    @final
    def _get_cythonized_result(
        self,
        base_func: Callable,
        cython_dtype: np.dtype,
        numeric_only: bool | lib.NoDefault = lib.no_default,
        needs_counts: bool = False,
        needs_nullable: bool = False,
        needs_mask: bool = False,
        pre_processing=None,
        post_processing=None,
        **kwargs,
    ):
        """
        Get result for Cythonized functions.

        Parameters
        ----------
        base_func : callable, Cythonized function to be called
        cython_dtype : np.dtype
            Type of the array that will be modified by the Cython call.
        numeric_only : bool, default True
            Whether only numeric datatypes should be computed
        needs_counts : bool, default False
            Whether the counts should be a part of the Cython call
        needs_mask : bool, default False
            Whether boolean mask needs to be part of the Cython call
            signature
        needs_nullable : bool, default False
            Whether a bool specifying if the input is nullable is part
            of the Cython call signature
        pre_processing : function, default None
            Function to be applied to `values` prior to passing to Cython.
            Function should return a tuple where the first element is the
            values to be passed to Cython and the second element is an optional
            type which the values should be converted to after being returned
            by the Cython operation. This function is also responsible for
            raising a TypeError if the values have an invalid type. Raises
            if `needs_values` is False.
        post_processing : function, default None
            Function to be applied to result of Cython function. Should accept
            an array of values as the first argument and type inferences as its
            second argument, i.e. the signature should be
            (ndarray, Type). If `needs_nullable=True`, a third argument should be
            `nullable`, to allow for processing specific to nullable values.
        **kwargs : dict
            Extra arguments to be passed back to Cython funcs

        Returns
        -------
        `Series` or `DataFrame` with filled values
        """
        # The cython kernel name doubles as the op name for warnings.
        how = base_func.__name__
        numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)

        if post_processing and not callable(post_processing):
            raise ValueError("'post_processing' must be a callable!")
        if pre_processing and not callable(pre_processing):
            raise ValueError("'pre_processing' must be a callable!")

        grouper = self.grouper

        ids, _, ngroups = grouper.group_info

        base_func = partial(base_func, labels=ids)

        def blk_func(values: ArrayLike) -> ArrayLike:
            # Run the cython kernel on one block.  The kernel signature is
            # assembled incrementally with functools.partial as each optional
            # argument (counts/mask/nullable) is enabled.
            values = values.T
            ncols = 1 if values.ndim == 1 else values.shape[1]

            result: ArrayLike
            result = np.zeros(ngroups * ncols, dtype=cython_dtype)
            result = result.reshape((ngroups, ncols))

            func = partial(base_func, out=result)

            inferences = None

            if needs_counts:
                counts = np.zeros(self.ngroups, dtype=np.int64)
                func = partial(func, counts=counts)

            vals = values
            if pre_processing:
                vals, inferences = pre_processing(vals)

            vals = vals.astype(cython_dtype, copy=False)
            if vals.ndim == 1:
                # Kernels expect 2-D input.
                vals = vals.reshape((-1, 1))
            func = partial(func, values=vals)

            if needs_mask:
                mask = isna(values).view(np.uint8)
                if mask.ndim == 1:
                    mask = mask.reshape(-1, 1)
                func = partial(func, mask=mask)

            if needs_nullable:
                is_nullable = isinstance(values, BaseMaskedArray)
                func = partial(func, nullable=is_nullable)

            func(**kwargs)  # Call func to modify indexer values in place

            if values.ndim == 1:
                # Collapse the dummy column dimension for 1-D input.
                assert result.shape[1] == 1, result.shape
                result = result[:, 0]

            if post_processing:
                pp_kwargs = {}
                if needs_nullable:
                    pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)

                result = post_processing(result, inferences, **pp_kwargs)

            # Transpose back to the block's original orientation.
            return result.T

        obj = self._obj_with_exclusions

        # Operate block-wise instead of column-by-column
        is_ser = obj.ndim == 1
        mgr = self._get_data_to_aggregate()
        orig_mgr_len = len(mgr)

        if numeric_only_bool:
            mgr = mgr.get_numeric_data()

        res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)

        if not is_ser and len(res_mgr.items) != orig_mgr_len:
            # Columns were dropped; warn with the user-facing op name.
            howstr = how.replace("group_", "")
            warn_dropping_nuisance_columns_deprecated(type(self), howstr, numeric_only)

        if len(res_mgr.items) == 0:
            # We re-call grouped_reduce to get the right exception message
            mgr.grouped_reduce(blk_func, ignore_failures=False)
            # grouped_reduce _should_ raise, so this should not be reached
            raise TypeError(  # pragma: no cover
                "All columns were dropped in grouped_reduce"
            )

        if is_ser:
            out = self._wrap_agged_manager(res_mgr)
        else:
            out = obj._constructor(res_mgr)

        return self._wrap_aggregated_output(out)
3856 @final
3857 @Substitution(name="groupby")
3858 def shift(self, periods=1, freq=None, axis=0, fill_value=None):
3859 """
3860 Shift each group by periods observations.
3862 If freq is passed, the index will be increased using the periods and the freq.
3864 Parameters
3865 ----------
3866 periods : int, default 1
3867 Number of periods to shift.
3868 freq : str, optional
3869 Frequency string.
3870 axis : axis to shift, default 0
3871 Shift direction.
3872 fill_value : optional
3873 The scalar value to use for newly introduced missing values.
3875 Returns
3876 -------
3877 Series or DataFrame
3878 Object shifted within each group.
3880 See Also
3881 --------
3882 Index.shift : Shift values of Index.
3883 tshift : Shift the time index, using the index’s frequency
3884 if available.
3885 """
3886 if freq is not None or axis != 0:
3887 f = lambda x: x.shift(periods, freq, axis, fill_value)
3888 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3890 ids, _, ngroups = self.grouper.group_info
3891 res_indexer = np.zeros(len(ids), dtype=np.int64)
3893 libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)
3895 obj = self._obj_with_exclusions
3897 res = obj._reindex_with_indexers(
3898 {self.axis: (obj.axes[self.axis], res_indexer)},
3899 fill_value=fill_value,
3900 allow_dups=True,
3901 )
3902 return res
3904 @final
3905 @Substitution(name="groupby")
3906 @Appender(_common_see_also)
3907 def diff(self, periods: int = 1, axis: int = 0) -> NDFrameT:
3908 """
3909 First discrete difference of element.
3911 Calculates the difference of each element compared with another
3912 element in the group (default is element in previous row).
3914 Parameters
3915 ----------
3916 periods : int, default 1
3917 Periods to shift for calculating difference, accepts negative values.
3918 axis : axis to shift, default 0
3919 Take difference over rows (0) or columns (1).
3921 Returns
3922 -------
3923 Series or DataFrame
3924 First differences.
3925 """
3926 if axis != 0:
3927 return self.apply(lambda x: x.diff(periods=periods, axis=axis))
3929 obj = self._obj_with_exclusions
3930 shifted = self.shift(periods=periods, axis=axis)
3932 # GH45562 - to retain existing behavior and match behavior of Series.diff(),
3933 # int8 and int16 are coerced to float32 rather than float64.
3934 dtypes_to_f32 = ["int8", "int16"]
3935 if obj.ndim == 1:
3936 if obj.dtype in dtypes_to_f32:
3937 shifted = shifted.astype("float32")
3938 else:
3939 to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
3940 if len(to_coerce):
3941 shifted = shifted.astype({c: "float32" for c in to_coerce})
3943 return obj - shifted
3945 @final
3946 @Substitution(name="groupby")
3947 @Appender(_common_see_also)
3948 def pct_change(self, periods=1, fill_method="ffill", limit=None, freq=None, axis=0):
3949 """
3950 Calculate pct_change of each value to previous entry in group.
3952 Returns
3953 -------
3954 Series or DataFrame
3955 Percentage changes within each group.
3956 """
3957 # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
3958 # GH#23918 is fixed
3959 if freq is not None or axis != 0:
3960 f = lambda x: x.pct_change(
3961 periods=periods,
3962 fill_method=fill_method,
3963 limit=limit,
3964 freq=freq,
3965 axis=axis,
3966 )
3967 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3969 if fill_method is None: # GH30463
3970 fill_method = "ffill"
3971 limit = 0
3972 filled = getattr(self, fill_method)(limit=limit)
3973 fill_grp = filled.groupby(
3974 self.grouper.codes, axis=self.axis, group_keys=self.group_keys
3975 )
3976 shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
3977 return (filled / shifted) - 1
3979 @final
3980 @Substitution(name="groupby")
3981 @Substitution(see_also=_common_see_also)
3982 def head(self, n: int = 5) -> NDFrameT:
3983 """
3984 Return first n rows of each group.
3986 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
3987 from the original DataFrame with original index and order preserved
3988 (``as_index`` flag is ignored).
3990 Parameters
3991 ----------
3992 n : int
3993 If positive: number of entries to include from start of each group.
3994 If negative: number of entries to exclude from end of each group.
3996 Returns
3997 -------
3998 Series or DataFrame
3999 Subset of original Series or DataFrame as determined by n.
4000 %(see_also)s
4001 Examples
4002 --------
4004 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
4005 ... columns=['A', 'B'])
4006 >>> df.groupby('A').head(1)
4007 A B
4008 0 1 2
4009 2 5 6
4010 >>> df.groupby('A').head(-1)
4011 A B
4012 0 1 2
4013 """
4014 self._reset_group_selection()
4015 mask = self._make_mask_from_positional_indexer(slice(None, n))
4016 return self._mask_selected_obj(mask)
4018 @final
4019 @Substitution(name="groupby")
4020 @Substitution(see_also=_common_see_also)
4021 def tail(self, n: int = 5) -> NDFrameT:
4022 """
4023 Return last n rows of each group.
4025 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
4026 from the original DataFrame with original index and order preserved
4027 (``as_index`` flag is ignored).
4029 Parameters
4030 ----------
4031 n : int
4032 If positive: number of entries to include from end of each group.
4033 If negative: number of entries to exclude from start of each group.
4035 Returns
4036 -------
4037 Series or DataFrame
4038 Subset of original Series or DataFrame as determined by n.
4039 %(see_also)s
4040 Examples
4041 --------
4043 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
4044 ... columns=['A', 'B'])
4045 >>> df.groupby('A').tail(1)
4046 A B
4047 1 a 2
4048 3 b 2
4049 >>> df.groupby('A').tail(-1)
4050 A B
4051 1 a 2
4052 3 b 2
4053 """
4054 self._reset_group_selection()
4055 if n:
4056 mask = self._make_mask_from_positional_indexer(slice(-n, None))
4057 else:
4058 mask = self._make_mask_from_positional_indexer([])
4060 return self._mask_selected_obj(mask)
4062 @final
4063 def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
4064 """
4065 Return _selected_obj with mask applied to the correct axis.
4067 Parameters
4068 ----------
4069 mask : np.ndarray[bool]
4070 Boolean mask to apply.
4072 Returns
4073 -------
4074 Series or DataFrame
4075 Filtered _selected_obj.
4076 """
4077 ids = self.grouper.group_info[0]
4078 mask = mask & (ids != -1)
4080 if self.axis == 0:
4081 return self._selected_obj[mask]
4082 else:
4083 return self._selected_obj.iloc[:, mask]
    @final
    def _reindex_output(
        self,
        output: OutputFrameOrSeries,
        fill_value: Scalar = np.NaN,
        qs: npt.NDArray[np.float64] | None = None,
    ) -> OutputFrameOrSeries:
        """
        If we have categorical groupers, then we might want to make sure that
        we have a fully re-indexed output to the levels. This means expanding
        the output space to accommodate all values in the cartesian product of
        our groups, regardless of whether they were observed in the data or
        not. This will expand the output space if there are missing groups.

        The method returns early without modifying the input if the number of
        groupings is less than 2, self.observed == True or none of the groupers
        are categorical.

        Parameters
        ----------
        output : Series or DataFrame
            Object resulting from grouping and applying an operation.
        fill_value : scalar, default np.NaN
            Value to use for unobserved categories if self.observed is False.
        qs : np.ndarray[float64] or None, default None
            quantile values, only relevant for quantile.

        Returns
        -------
        Series or DataFrame
            Object (potentially) re-indexed to include all possible groups.
        """
        groupings = self.grouper.groupings
        # A single grouping never needs cartesian expansion.
        if len(groupings) == 1:
            return output

        # if we only care about the observed values
        # we are done
        elif self.observed:
            return output

        # reindexing only applies to a Categorical grouper
        elif not any(
            isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
            for ping in groupings
        ):
            return output

        # Build the full cartesian product of group levels (plus the quantile
        # level when called from quantile()).
        levels_list = [ping.group_index for ping in groupings]
        names = self.grouper.names
        if qs is not None:
            # error: Argument 1 to "append" of "list" has incompatible type
            # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
            levels_list.append(qs)  # type: ignore[arg-type]
            names = names + [None]
        index, _ = MultiIndex.from_product(levels_list, names=names).sortlevel()

        if self.as_index:
            # Always holds for SeriesGroupBy unless GH#36507 is implemented
            d = {
                self.obj._get_axis_name(self.axis): index,
                "copy": False,
                "fill_value": fill_value,
            }
            return output.reindex(**d)

        # GH 13204
        # Here, the categorical in-axis groupers, which need to be fully
        # expanded, are columns in `output`. An idea is to do:
        # output = output.set_index(self.grouper.names)
        #                .reindex(index).reset_index()
        # but special care has to be taken because of possible not-in-axis
        # groupers.
        # So, we manually select and drop the in-axis grouper columns,
        # reindex `output`, and then reset the in-axis grouper columns.

        # Select in-axis groupers
        in_axis_grps = (
            (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
        )
        g_nums, g_names = zip(*in_axis_grps)

        output = output.drop(labels=list(g_names), axis=1)

        # Set a temp index and reindex (possibly expanding)
        output = output.set_index(self.grouper.result_index).reindex(
            index, copy=False, fill_value=fill_value
        )

        # Reset in-axis grouper columns
        # (using level numbers `g_nums` because level names may not be unique)
        output = output.reset_index(level=g_nums)

        return output.reset_index(drop=True)
    @final
    def sample(
        self,
        n: int | None = None,
        frac: float | None = None,
        replace: bool = False,
        weights: Sequence | Series | None = None,
        random_state: RandomState | None = None,
    ):
        """
        Return a random sample of items from each group.

        You can use `random_state` for reproducibility.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        n : int, optional
            Number of items to return for each group. Cannot be used with
            `frac` and must be no larger than the smallest group unless
            `replace` is True. Default is one if `frac` is None.
        frac : float, optional
            Fraction of items to return. Cannot be used with `n`.
        replace : bool, default False
            Allow or disallow sampling of the same row more than once.
        weights : list-like, optional
            Default None results in equal probability weighting.
            If passed a list-like then values must have the same length as
            the underlying DataFrame or Series object and will be used as
            sampling probabilities after normalization within each group.
            Values must be non-negative with at least one positive element
            within each group.
        random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
            If int, array-like, or BitGenerator, seed for random number generator.
            If np.random.RandomState or np.random.Generator, use as given.

            .. versionchanged:: 1.4.0

                np.random.Generator objects now accepted

        Returns
        -------
        Series or DataFrame
            A new object of same type as caller containing items randomly
            sampled within each group from the caller object.

        See Also
        --------
        DataFrame.sample: Generate random samples from a DataFrame object.
        numpy.random.choice: Generate a random sample from a given 1-D numpy
            array.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
        ... )
        >>> df
               a  b
        0    red  0
        1    red  1
        2   blue  2
        3   blue  3
        4  black  4
        5  black  5

        Select one row at random for each distinct value in column a. The
        `random_state` argument can be used to guarantee reproducibility:

        >>> df.groupby("a").sample(n=1, random_state=1)
               a  b
        4  black  4
        2   blue  2
        1    red  1

        Set `frac` to sample fixed proportions rather than counts:

        >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
        5    5
        2    2
        0    0
        Name: b, dtype: int64

        Control sample probabilities within groups by setting weights:

        >>> df.groupby("a").sample(
        ...     n=1,
        ...     weights=[1, 1, 1, 0, 0, 1],
        ...     random_state=1,
        ... )
               a  b
        5  black  5
        2   blue  2
        0    red  0
        """  # noqa:E501
        # NOTE: `sample` here is the pandas.core.sample module (imported at
        # file top), not this method — the names shadow each other.
        size = sample.process_sampling_size(n, frac, replace)
        if weights is not None:
            # Validate/align the user weights against the whole object once,
            # then slice per group below.
            weights_arr = sample.preprocess_weights(
                self._selected_obj, weights, axis=self.axis
            )

        random_state = com.random_state(random_state)

        group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)

        sampled_indices = []
        for labels, obj in group_iterator:
            grp_indices = self.indices[labels]
            group_size = len(grp_indices)
            if size is not None:
                sample_size = size
            else:
                # frac-based sampling: round the per-group count.
                assert frac is not None
                sample_size = round(frac * group_size)

            # Positions are sampled within the group, then mapped back to
            # positions in the original object via grp_indices.
            grp_sample = sample.sample(
                group_size,
                size=sample_size,
                replace=replace,
                weights=None if weights is None else weights_arr[grp_indices],
                random_state=random_state,
            )
            sampled_indices.append(grp_indices[grp_sample])

        sampled_indices = np.concatenate(sampled_indices)
        return self._selected_obj.take(sampled_indices, axis=self.axis)
@doc(GroupBy)
def get_groupby(
    obj: NDFrame,
    by: _KeysArgType | None = None,
    axis: int = 0,
    level=None,
    grouper: ops.BaseGrouper | None = None,
    exclusions=None,
    selection=None,
    as_index: bool = True,
    sort: bool = True,
    group_keys: bool | lib.NoDefault = True,
    squeeze: bool = False,
    observed: bool = False,
    mutated: bool = False,
    dropna: bool = True,
) -> GroupBy:
    # Dispatch on the input type; the concrete groupby classes are imported
    # locally to avoid a circular import at module load time.
    if isinstance(obj, Series):
        from pandas.core.groupby.generic import SeriesGroupBy as groupby_cls
    elif isinstance(obj, DataFrame):
        from pandas.core.groupby.generic import DataFrameGroupBy as groupby_cls
    else:  # pragma: no cover
        raise TypeError(f"invalid type: {obj}")

    return groupby_cls(
        obj=obj,
        keys=by,
        axis=axis,
        level=level,
        grouper=grouper,
        exclusions=exclusions,
        selection=selection,
        as_index=as_index,
        sort=sort,
        group_keys=group_keys,
        squeeze=squeeze,
        observed=observed,
        mutated=mutated,
        dropna=dropna,
    )
4357def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
4358 """
4359 Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.
4361 The quantile level in the MultiIndex is a repeated copy of 'qs'.
4363 Parameters
4364 ----------
4365 idx : Index
4366 qs : np.ndarray[float64]
4368 Returns
4369 -------
4370 MultiIndex
4371 """
4372 nqs = len(qs)
4374 if idx._is_multi:
4375 idx = cast(MultiIndex, idx)
4376 lev_codes, lev = Index(qs).factorize()
4377 levels = list(idx.levels) + [lev]
4378 codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
4379 mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
4380 else:
4381 mi = MultiIndex.from_product([idx, qs])
4382 return mi
def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None:
    """
    Emit the FutureWarning for nuisance columns dropped by a groupby op.

    Parameters
    ----------
    cls : type
        The GroupBy subclass performing the operation (used in the message).
    how : str
        Name of the operation, e.g. "quantile".
    numeric_only : bool or lib.no_default
        The numeric_only argument as passed by the user; which warning fires
        depends on whether it was left as the no_default sentinel.
    """
    if numeric_only is lib.no_default:
        # User relied on the legacy default, which silently drops columns.
        warnings.warn(
            f"The default value of numeric_only in {cls.__name__}.{how} "
            "is deprecated. "
            "In a future version, numeric_only will default to False. "
            "Either specify numeric_only or select only columns which "
            "should be valid for the function.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    elif not numeric_only:
        # numeric_only was specified and falsey but still dropped nuisance columns
        warnings.warn(
            f"Dropping invalid columns in {cls.__name__}.{how} is deprecated. "
            "In a future version, a TypeError will be raised. "
            f"Before calling .{how}, select only columns which "
            "should be valid for the function.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )