1""" 

2Provide the groupby split-apply-combine paradigm. Define the GroupBy 

3class providing the base-class of operations. 

4 

5The SeriesGroupBy and DataFrameGroupBy sub-class 

6(defined in pandas.core.groupby.generic) 

7expose these user-facing objects to provide specific functionality. 

8""" 

from __future__ import annotations

from contextlib import contextmanager
import datetime
from functools import (
    partial,
    wraps,
)
import inspect
from textwrap import dedent
import types
from typing import (
    TYPE_CHECKING,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    List,
    Literal,
    Mapping,
    Sequence,
    TypeVar,
    Union,
    cast,
    final,
)
import warnings

import numpy as np

from pandas._config.config import option_context

from pandas._libs import (
    Timestamp,
    lib,
)
import pandas._libs.groupby as libgroupby
from pandas._typing import (
    ArrayLike,
    IndexLabel,
    NDFrameT,
    PositionalIndexer,
    RandomState,
    Scalar,
    T,
    npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    DataError,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    cache_readonly,
    doc,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_datetime64_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    is_timedelta64_dtype,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import nanops
from pandas.core._numba import executor
import pandas.core.algorithms as algorithms
from pandas.core.arrays import (
    BaseMaskedArray,
    BooleanArray,
    Categorical,
    ExtensionArray,
)
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import (
    base,
    numba_,
    ops,
)
from pandas.core.groupby.indexing import (
    GroupByIndexingMixin,
    GroupByNthSelector,
)
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
    RangeIndex,
)
from pandas.core.internals.blocks import ensure_block_shape
import pandas.core.sample as sample
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
from pandas.core.util.numba_ import (
    get_jit_arguments,
    maybe_use_numba,
)

if TYPE_CHECKING:
    from pandas.core.window import (
        ExpandingGroupby,
        ExponentialMovingWindowGroupby,
        RollingGroupby,
    )

_common_see_also = """
        See Also
        --------
        Series.%(name)s : Apply a function %(name)s to a Series.
        DataFrame.%(name)s : Apply a function %(name)s
            to each row or column of a DataFrame.
"""

_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of methods that will
    be much faster than using ``apply`` for their specific purposes, so try to
    use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    applied : Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]})
    >>> g1 = df.groupby('A', group_keys=False)
    >>> g2 = df.groupby('A', group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: below the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64""",
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g1 = s.groupby(s.index, group_keys=False)
    >>> g2 = s.groupby(s.index, group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}

_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns. If None, will attempt to use
    everything, then use only numeric data.
min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.
"""
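
# Illustrative sketch (editor's addition, not part of the original module):
# how ``min_count`` in the template above interacts with ``sum`` when a
# group holds fewer valid values than required.
#
#   >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, 2.0, np.nan]})
#   >>> df.groupby("key")["val"].sum(min_count=2)
#   key
#   a    3.0
#   b    NaN
#   Name: val, dtype: float64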

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
object : the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""
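
# Illustrative sketch (editor's addition): the ``(callable, data_keyword)``
# tuple form of ``pipe`` described above, for functions that do not take the
# GroupBy object as their first argument. ``summarize`` is a hypothetical
# helper, not part of pandas.
#
#   >>> def summarize(label, gb):
#   ...     return gb.sum().add_prefix(label)
#   >>> df.groupby("group").pipe((summarize, "gb"), "total_")  # doctest: +SKIP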

_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function
    Function to apply to each group. See the Notes section below for requirements.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed with the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. deprecated:: 1.5.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, currently pandas does not align the result's index
    with the input's index. This behavior is deprecated and alignment will
    be performed in a future version of pandas. You can apply ``.to_numpy()`` to the
    result of the transformation function to avoid alignment.

Examples
--------

>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
...                           'foo', 'bar'],
...                    'B' : ['one', 'one', 'two', 'three',
...                           'two', 'two'],
...                    'C' : [1, 5, 5, 2, 5, 5],
...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
>>> grouped = df.groupby('A')[['C', 'D']]
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
          C         D
0 -1.154701 -0.577350
1  0.577350  0.000000
2  0.577350  1.154701
3 -1.154701 -1.000000
4  0.577350 -0.577350
5  0.577350  1.000000

Broadcast result of the transformation

>>> grouped.transform(lambda x: x.max() - x.min())
     C    D
0  4.0  6.0
1  3.0  8.0
2  4.0  6.0
3  3.0  8.0
4  4.0  6.0
5  3.0  8.0

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    for example:

>>> grouped.transform(lambda x: x.astype(int).max())
   C  D
0  5  8
1  5  9
2  5  8
3  5  9
4  5  8
5  5  9
"""

_agg_template = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list or dict
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""
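
# Illustrative sketch (editor's addition): a minimal user defined function
# matching the ``engine='numba'`` signature required by the templates above,
# with ``values`` and ``index`` as the first and second arguments. Names are
# hypothetical.
#
#   >>> def udf_mean(values, index):
#   ...     return values.mean()
#   >>> df.groupby("key")["val"].agg(udf_mean, engine="numba")  # doctest: +SKIP
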

@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby: GroupBy) -> None:
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby.apply(f)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr
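
# Illustrative sketch (editor's addition): both call styles below route
# through GroupByPlot, so the plotting call is applied group-wise via
# ``apply``.
#
#   >>> df.groupby("A").plot()       # handled by __call__     # doctest: +SKIP
#   >>> df.groupby("A").plot.hist()  # handled by __getattr__  # doctest: +SKIP
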

_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]


class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    _group_selection: IndexLabel | None = None
    _apply_allowlist: frozenset[str] = frozenset()
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "mutated",
        "obj",
        "observed",
        "sort",
        "squeeze",
    }

    axis: int
    grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    group_keys: bool | lib.NoDefault

    @final
    def __len__(self) -> int:
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.
        """
        return self.grouper.groups

    @final
    @property
    def ngroups(self) -> int:
        return self.grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.
        """
        return self.grouper.indices

    @final
    def _get_indices(self, names):
        """
        Safely get multiple indices, translating keys for
        datelike values to the underlying repr.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if not len(name_sample) == len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError as err:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg) from err

            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        return [self.indices.get(name, []) for name in names]
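
    # Illustrative sketch (editor's addition): the converter above lets a
    # stdlib ``datetime.datetime`` key match group labels that are stored
    # as Timestamps.
    #
    #   >>> ts = pd.Timestamp("2022-01-01")
    #   >>> gb = pd.DataFrame({"t": [ts], "v": [1]}).groupby("t")
    #   >>> gb._get_index(datetime.datetime(2022, 1, 1))  # doctest: +SKIP
    #   array([0])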

    @final
    def _get_index(self, name):
        """
        Safely get index, translating keys for datelike values to the
        underlying repr.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy

        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    @final
    def _dir_additions(self) -> set[str]:
        return self.obj._dir_additions() | self._apply_allowlist

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each group's maximum and minimum value in one
        pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    plot = property(GroupByPlot)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the group from. If it is None,
            the object on which groupby was called will be used.

        Returns
        -------
        group : same type as obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        return obj._take_with_is_copy(inds, axis=self.axis)
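
    # Illustrative sketch (editor's addition): pulling one group out as a
    # frame with ``get_group``.
    #
    #   >>> df = pd.DataFrame({"A": ["a", "b", "a"], "B": [1, 2, 3]})
    #   >>> df.groupby("A").get_group("a")
    #      A  B
    #   0  a  1
    #   2  a  3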

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        keys = self.keys
        if isinstance(keys, list) and len(keys) == 1:
            warnings.warn(
                (
                    "In a future version of pandas, a length 1 "
                    "tuple will be returned when iterating over a "
                    "groupby with a grouper equal to a list of "
                    "length 1. Don't supply a list with a single grouper "
                    "to avoid this warning."
                ),
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        return self.grouper.get_iterator(self._selected_obj, axis=self.axis)


# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)


class GroupBy(BaseGroupBy[NDFrameT]):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more.
    """

    grouper: ops.BaseGrouper
    as_index: bool

    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: int = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool | lib.NoDefault = True,
        squeeze: bool = False,
        observed: bool = False,
        mutated: bool = False,
        dropna: bool = True,
    ) -> None:

        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError("as_index=False only valid with DataFrame")
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.observed = observed
        self.mutated = mutated
        self.dropna = dropna

        if grouper is None:
            from pandas.core.groupby.grouper import get_grouper

            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                mutated=self.mutated,
                dropna=self.dropna,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @final
    def _make_wrapper(self, name: str) -> Callable:
        assert name in self._apply_allowlist

        with self._group_selection_context():
            # need to set up the selection here,
            # as it is not passed directly but via the grouper
            f = getattr(self._obj_with_exclusions, name)
            if not isinstance(f, types.MethodType):
                # error: Incompatible return value type
                # (got "NDFrameT", expected "Callable[..., Any]") [return-value]
                return cast(Callable, self.apply(lambda self: getattr(self, name)))

        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            if "axis" in sig.parameters:
                if kwargs.get("axis", None) is None:
                    kwargs["axis"] = self.axis

            numeric_only = kwargs.get("numeric_only", lib.no_default)

            def curried(x):
                with warnings.catch_warnings():
                    # Catch any warnings from dispatch to DataFrame; we'll emit
                    # a warning for groupby below
                    match = "The default value of numeric_only "
                    warnings.filterwarnings("ignore", match, FutureWarning)
                    return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in base.plotting_methods:
                return self.apply(curried)

            is_transform = name in base.transformation_kernels

            # Transform needs to keep the same schema, including when empty
            if is_transform and self._obj_with_exclusions.empty:
                return self._obj_with_exclusions

            result = self._python_apply_general(
                curried,
                self._obj_with_exclusions,
                is_transform=is_transform,
                not_indexed_same=not is_transform,
            )

            if self._selected_obj.ndim != 1 and self.axis != 1 and result.ndim != 1:
                missing = self._obj_with_exclusions.columns.difference(result.columns)
                if len(missing) > 0:
                    warn_dropping_nuisance_columns_deprecated(
                        type(self), name, numeric_only
                    )

            if self.grouper.has_dropped_na and is_transform:
                # result will have dropped rows due to nans, fill with null
                # and ensure index is ordered same as the input
                result = self._set_result_index_ordered(result)
            return result

        wrapper.__name__ = name
        return wrapper

    # -----------------------------------------------------------------
    # Selection

    @final
    def _set_group_selection(self) -> None:
        """
        Create group based selection.

        Used when selection is not passed directly but instead via a grouper.

        NOTE: this should be paired with a call to _reset_group_selection
        """
        # This is a no-op for SeriesGroupBy
        grp = self.grouper
        if not (
            self.as_index
            and grp.groupings is not None
            and self.obj.ndim > 1
            and self._group_selection is None
        ):
            return

        groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]

        if len(groupers):
            # GH12839 clear selected obj cache when group selection changes
            ax = self.obj._info_axis
            self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
            self._reset_cache("_selected_obj")

    @final
    def _reset_group_selection(self) -> None:
        """
        Clear group based selection.

        Used for methods needing to return info on each group regardless of
        whether a group selection was previously set.
        """
        if self._group_selection is not None:
            # GH12839 clear cached selection too when changing group selection
            self._group_selection = None
            self._reset_cache("_selected_obj")

    @contextmanager
    def _group_selection_context(self) -> Iterator[GroupBy]:
        """
        Set / reset the _group_selection_context.
        """
        self._set_group_selection()
        try:
            yield self
        finally:
            self._reset_group_selection()
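
    # Illustrative sketch (editor's addition): internal callers wrap work in
    # the context manager above so the temporary group-based selection is
    # always undone, even on error.
    #
    #   with self._group_selection_context():
    #       ...  # _selected_obj now excludes in-axis grouping columns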

    def _iterate_slices(self) -> Iterable[Series]:
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # Dispatch/Wrapping

    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ):
        from pandas.core.reshape.concat import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in com.not_none(*values):
                ax = v._get_axis(self.axis)
                ax._reset_identity()
            return values

        if self.group_keys and not override_group_keys:
            values = reset_identity(values)
            if self.as_index:
                # possible MI return case
                group_keys = self.grouper.result_index
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                labels = self.grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        name = self.obj.name if self.obj.ndim == 1 else self._selection
        if isinstance(result, Series) and name is not None:
            result.name = name

        return result

    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self.grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self.grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result

    def _indexed_output_to_ndframe(
        self, result: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        raise AbstractMethodError(self)

    @final
    def _wrap_aggregated_output(
        self,
        output: Series | DataFrame | Mapping[base.OutputKey, ArrayLike],
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        output : Series, DataFrame, or Mapping[base.OutputKey, ArrayLike]
            Data to wrap.

        Returns
        -------
        Series or DataFrame
        """

        if isinstance(output, (Series, DataFrame)):
            # We get here (for DataFrameGroupBy) if we used Manager.grouped_reduce,
            # in which case our columns are already set correctly.
            # ATM we do not get here for SeriesGroupBy; when we do, we will
            # need to require that result.name already match self.obj.name
            result = output
        else:
            result = self._indexed_output_to_ndframe(output)

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            self._insert_inaxis_grouper_inplace(result)
            result = result._consolidate()
            index = Index(range(self.grouper.ngroups))

        else:
            index = self.grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                result.index = self.obj.index.copy()
                # TODO: Do this more systematically

        return self._reindex_output(result, qs=qs)

    @final
    def _wrap_transformed_output(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        """
        Wraps the output of GroupBy transformations into the expected result.

        Parameters
        ----------
        output : Mapping[base.OutputKey, ArrayLike]
            Data to wrap.

        Returns
        -------
        Series or DataFrame
            Series for SeriesGroupBy, DataFrame for DataFrameGroupBy
        """
        if isinstance(output, (Series, DataFrame)):
            result = output
        else:
            result = self._indexed_output_to_ndframe(output)

        if self.axis == 1:
            # Only relevant for DataFrameGroupBy
            result = result.T
            result.columns = self.obj.columns

        result.index = self.obj.index
        return result

    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ):
        raise AbstractMethodError(self)

    def _resolve_numeric_only(
        self, how: str, numeric_only: bool | lib.NoDefault, axis: int
    ) -> bool:
        """
        Determine subclass-specific default value for 'numeric_only'.

        For SeriesGroupBy we want the default to be False (to match Series behavior).
        For DataFrameGroupBy we want it to be True (for backwards-compat).

        Parameters
        ----------
        how : str
            Groupby kernel name.
        numeric_only : bool or lib.no_default
        axis : int
            Axis passed to the groupby op (not self.axis).

        Returns
        -------
        bool
        """
        # GH#41291
        if numeric_only is lib.no_default:
            # i.e. not explicitly passed by user
            if self.obj.ndim == 2:
                # i.e. DataFrameGroupBy
                numeric_only = axis != 1
                # GH#42395 GH#43108 GH#43154
                # Regression from 1.2.5 to 1.3 caused object columns to be dropped
                if self.axis:
                    obj = self._obj_with_exclusions.T
                else:
                    obj = self._obj_with_exclusions
                check = obj._get_numeric_data()
                if len(obj.columns) and not len(check.columns) and not obj.empty:
                    numeric_only = False

            else:
                numeric_only = False

        if (
            numeric_only
            and self.obj.ndim == 1
            and not is_numeric_dtype(self.obj.dtype)
        ):
            # GH#47500
            warnings.warn(
                f"{type(self).__name__}.{how} called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will "
                "raise a TypeError in a future version of pandas",
                category=FutureWarning,
                stacklevel=find_stack_level(),
            )
            raise NotImplementedError(
                f"{type(self).__name__}.{how} does not implement numeric_only"
            )

        return numeric_only

    def _maybe_warn_numeric_only_depr(
        self, how: str, result: DataFrame | Series, numeric_only: bool | lib.NoDefault
    ) -> None:
        """Emit warning on numeric_only behavior deprecation when appropriate.

        Parameters
        ----------
        how : str
            Groupby kernel name.
        result : DataFrame or Series
            Result of the groupby operation.
        numeric_only : bool or lib.no_default
            Argument as passed by user.
        """
        if (
            self._obj_with_exclusions.ndim != 1
            and result.ndim > 1
            and len(result.columns) < len(self._obj_with_exclusions.columns)
        ):
            warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

    # -----------------------------------------------------------------
    # numba

    @final
    def _numba_prep(self, data):
        ids, _, ngroups = self.grouper.group_info
        sorted_index = get_group_index_sorter(ids, ngroups)
        sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)

        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        if len(self.grouper.groupings) > 1:
            raise NotImplementedError(
                "More than 1 grouping labels are not supported with engine='numba'"
            )
        # GH 46867
        index_data = data.index
        if isinstance(index_data, MultiIndex):
            group_key = self.grouper.groupings[0].name
            index_data = index_data.get_level_values(group_key)
        sorted_index_data = index_data.take(sorted_index).to_numpy()

        starts, ends = lib.generate_slices(sorted_ids, ngroups)
        return (
            starts,
            ends,
            sorted_index_data,
            sorted_data,
        )
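
    # Illustrative sketch (editor's addition): ``starts``/``ends`` returned
    # above are positional slice boundaries into the group-sorted data, so
    # group ``i`` occupies ``sorted_data[starts[i]:ends[i]]``.
    #
    #   ids = [1, 0, 1, 0]  ->  sorted row order [1, 3, 0, 2]
    #   starts = [0, 2], ends = [2, 4]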

    def _numba_agg_general(
        self,
        func: Callable,
        engine_kwargs: dict[str, bool] | None,
        *aggregator_args,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        with self._group_selection_context():
            data = self._selected_obj
        df = data if data.ndim == 2 else data.to_frame()
        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        aggregator = executor.generate_shared_aggregator(
            func, **get_jit_arguments(engine_kwargs)
        )
        result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)

        index = self.grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        starts, ends, sorted_index, sorted_data = self._numba_prep(data)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(data.columns),
            *args,
        )
        # result values need to be resorted to their original positions since we
        # evaluated the data sorted by group
        return result.take(np.argsort(sorted_index), axis=0)

    @final
    def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        starts, ends, sorted_index, sorted_data = self._numba_prep(data)
        numba_.validate_udf(func)
        numba_agg_func = numba_.generate_numba_agg_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_agg_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(data.columns),
            *args,
        )
        return result

    # -----------------------------------------------------------------
    # apply/agg/transform

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> NDFrameT:

        func = com.is_builtin_func(func)

        if isinstance(func, str):
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res

            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            elif hasattr(nanops, "nan" + func):
                # TODO: should we wrap this in to e.g. _is_builtin_func?
                f = getattr(nanops, "nan" + func)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                with self._group_selection_context():
                    return self._python_apply_general(f, self._selected_obj)

        return result

    @final
    def _python_apply_general(
        self,
        f: Callable,
        data: DataFrame | Series,
        not_indexed_same: bool | None = None,
        is_transform: bool = False,
        is_agg: bool = False,
    ) -> NDFrameT:
        """
        Apply function f in python space

        Parameters
        ----------
        f : callable
            Function to apply
        data : Series or DataFrame
            Data to apply f to
        not_indexed_same : bool, optional
            When specified, overrides the value of not_indexed_same. Apply behaves
            differently when the result index is equal to the input index, but
            this can be coincidental leading to value-dependent behavior.
        is_transform : bool, default False
            Indicator for whether the function is actually a transform
            and should not have group keys prepended. This is used
            in _make_wrapper which generates both transforms (e.g. diff)
            and non-transforms (e.g. corr)
        is_agg : bool, default False
            Indicator for whether the function is an aggregation. When the
            result is empty, we don't want to warn for this case.
            See _GroupBy._python_agg_general.

        Returns
        -------
        Series or DataFrame
            data after applying f
        """
        values, mutated = self.grouper.apply(f, data, self.axis)
        if not_indexed_same is None:
            not_indexed_same = mutated or self.mutated
        override_group_keys = False

        is_empty_agg = is_agg and len(values) == 0
        if (not not_indexed_same and self.group_keys is lib.no_default) and not (
            is_transform or is_empty_agg
        ):
            # We've detected value-dependent behavior: the result's index depends on
            # whether the user's function `f` returned the same index or not.
            msg = (
                "Not prepending group keys to the result index of "
                "transform-like apply. In the future, the group keys "
                "will be included in the index, regardless of whether "
                "the applied function returns a like-indexed object.\n"
                "To preserve the previous behavior, use\n\n\t"
                ">>> .groupby(..., group_keys=False)\n\n"
                "To adopt the future behavior and silence this warning, use "
                "\n\n\t>>> .groupby(..., group_keys=True)"
            )
            warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
            # We want to behave as if `self.group_keys=False` when reconstructing
            # the object. However, we don't want to mutate the stateful GroupBy
            # object, so we just override it.
            # When this deprecation is enforced then override_group_keys
            # may be removed.
            override_group_keys = True

        return self._wrap_applied_output(
            data,
            values,
            not_indexed_same,
            override_group_keys=is_transform or override_group_keys,
        )

    @final
    def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" (excluding exclusions) to populate output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # agg_series below assumes ngroups > 0
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name

            try:
                # if this function is invalid for this dtype, we will ignore it.
                result = self.grouper.agg_series(obj, f)
            except TypeError:
                if raise_on_typeerror:
                    raise
                warn_dropping_nuisance_columns_deprecated(
                    type(self), "agg", numeric_only=False
                )
                continue

            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            return self._python_apply_general(f, self._selected_obj)

        return self._wrap_aggregated_output(output)

    @final
    def _agg_general(
        self,
        numeric_only: bool | lib.NoDefault = True,
        min_count: int = -1,
        *,
        alias: str,
        npfunc: Callable,
    ):

        with self._group_selection_context():
            # try a cython aggregation if we can
            result = self._cython_agg_general(
                how=alias,
                alt=npfunc,
                numeric_only=numeric_only,
                min_count=min_count,
            )
            return result.__finalize__(self.obj, method="groupby")

    def _agg_py_fallback(
        self, values: ArrayLike, ndim: int, alt: Callable
    ) -> ArrayLike:
        """
        Fallback to pure-python aggregation if _cython_operation raises
        NotImplementedError.
        """
        # We get here with a) EADtypes and b) object dtype

        if values.ndim == 1:
            # For DataFrameGroupBy we only get here with ExtensionArray
            ser = Series(values)
        else:
            # We only get here with values.dtype == object
            # TODO: special case not needed with ArrayManager
            df = DataFrame(values.T)
            # bc we split object blocks in grouped_reduce, we have only 1 col
            # otherwise we'd have to worry about block-splitting GH#39329
            assert df.shape[1] == 1
            # Avoid call to self.values that can occur in DataFrame
            # reductions; see GH#28949
            ser = df.iloc[:, 0]

        # We do not get here with UDFs, so we know that our dtype
        # should always be preserved by the implemented aggregations
        # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
        res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)

        if isinstance(values, Categorical):
            # Because we only get here with known dtype-preserving
            # reductions, we cast back to Categorical.
            # TODO: if we ever get "rank" working, exclude it here.
            res_values = type(values)._from_sequence(res_values, dtype=values.dtype)

        # If we are DataFrameGroupBy and went through a SeriesGroupByPath
        # then we need to reshape
        # GH#32223 includes case with IntegerArray values, ndarray res_values
        # test_groupby_duplicate_columns with object dtype values
        return ensure_block_shape(res_values, ndim=ndim)

    @final
    def _cython_agg_general(
        self,
        how: str,
        alt: Callable,
        numeric_only: bool | lib.NoDefault,
        min_count: int = -1,
        ignore_failures: bool = True,
        **kwargs,
    ):
        # Note: we never get here with how="ohlc" for DataFrameGroupBy;
        # that goes through SeriesGroupBy
        numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)

        data = self._get_data_to_aggregate()
        is_ser = data.ndim == 1

        orig_len = len(data)
        if numeric_only_bool:
            if is_ser and not is_numeric_dtype(self._selected_obj.dtype):
                # GH#41291 match Series behavior
                kwd_name = "numeric_only"
                if how in ["any", "all"]:
                    kwd_name = "bool_only"
                raise NotImplementedError(
                    f"{type(self).__name__}.{how} does not implement {kwd_name}."
                )
            elif not is_ser:
                data = data.get_numeric_data(copy=False)

        def array_func(values: ArrayLike) -> ArrayLike:
            try:
                result = self.grouper._cython_operation(
                    "aggregate",
                    values,
                    how,
                    axis=data.ndim - 1,
                    min_count=min_count,
                    **kwargs,
                )
            except NotImplementedError:
                # generally if we have numeric_only=False
                # and non-applicable functions
                # try to python agg
                # TODO: shouldn't min_count matter?
                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

            return result

        # TypeError -> we may have an exception in trying to aggregate
        # continue and exclude the block
        new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)

        if not is_ser and len(new_mgr) < orig_len:
            warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

        res = self._wrap_agged_manager(new_mgr)
        if is_ser:
            res.index = self.grouper.result_index
            return self._reindex_output(res)
        else:
            return res

    def _cython_transform(
        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
    ):
        raise AbstractMethodError(self)

1799 @final 

1800 def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): 

1801 

1802 if maybe_use_numba(engine): 

1803 # TODO: tests with self._selected_obj.ndim == 1 on DataFrameGroupBy 

1804 with self._group_selection_context(): 

1805 data = self._selected_obj 

1806 df = data if data.ndim == 2 else data.to_frame() 

1807 result = self._transform_with_numba( 

1808 df, func, *args, engine_kwargs=engine_kwargs, **kwargs 

1809 ) 

1810 if self.obj.ndim == 2: 

1811 return cast(DataFrame, self.obj)._constructor( 

1812 result, index=data.index, columns=data.columns 

1813 ) 

1814 else: 

1815 return cast(Series, self.obj)._constructor( 

1816 result.ravel(), index=data.index, name=data.name 

1817 ) 

1818 

1819 # optimized transforms 

1820 func = com.get_cython_func(func) or func 

1821 

1822 if not isinstance(func, str): 

1823 return self._transform_general(func, *args, **kwargs) 

1824 

1825 elif func not in base.transform_kernel_allowlist: 

1826 msg = f"'{func}' is not a valid function name for transform(name)" 

1827 raise ValueError(msg) 

1828 elif func in base.cythonized_kernels or func in base.transformation_kernels: 

1829 # cythonized transform or canned "agg+broadcast" 

1830 return getattr(self, func)(*args, **kwargs) 

1831 

1832 else: 

1833 # i.e. func in base.reduction_kernels 

1834 

1835 # GH#30918 Use _transform_fast only when we know func is an aggregation 

1836 # If func is a reduction, we need to broadcast the 

1837 # result to the whole group. Compute func result 

1838 # and deal with possible broadcasting below. 

1839 # Temporarily set observed for dealing with categoricals. 

1840 with com.temp_setattr(self, "observed", True): 

1841 result = getattr(self, func)(*args, **kwargs) 

1842 

1843 return self._wrap_transform_fast_result(result) 

1844 

1845 @final 

1846 def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: 

1847 """ 

1848 Fast transform path for aggregations. 

1849 """ 

1850 obj = self._obj_with_exclusions 

1851 

1852 # for each col, reshape to size of original frame by take operation 

1853 ids, _, _ = self.grouper.group_info 

1854 result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) 

1855 

1856 if self.obj.ndim == 1: 

1857 # i.e. SeriesGroupBy 

1858 out = algorithms.take_nd(result._values, ids) 

1859 output = obj._constructor(out, index=obj.index, name=obj.name) 

1860 else: 

1861 # `.size()` gives Series output on DataFrame input, need axis 0 

1862 axis = 0 if result.ndim == 1 else self.axis 

1863 # GH#46209 

1864 # Don't convert indices: negative indices need to give rise 

1865 # to null values in the result 

1866 output = result._take(ids, axis=axis, convert_indices=False) 

1867 output = output.set_axis(obj._get_axis(self.axis), axis=axis) 

1868 return output 

1869 
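    # [Editor's sketch, not pandas source] The essence of the fast path above:
    # aggregate once per group, then take() per-row group ids to broadcast the
    # result back to the original shape. The ids/values here are assumptions.
    # >>> import numpy as np, pandas as pd
    # >>> s = pd.Series([1, 2, 5])
    # >>> ids = np.array([0, 0, 1])          # group id for each row
    # >>> agg = s.groupby(ids).sum()         # one value per group: [3, 5]
    # >>> agg.to_numpy().take(ids)           # broadcast back to row shape
    # array([3, 3, 5])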

1870 # ----------------------------------------------------------------- 

1871 # Utilities 

1872 

1873 @final 

1874 def _apply_filter(self, indices, dropna): 

1875 if len(indices) == 0: 

1876 indices = np.array([], dtype="int64") 

1877 else: 

1878 indices = np.sort(np.concatenate(indices)) 

1879 if dropna: 

1880 filtered = self._selected_obj.take(indices, axis=self.axis) 

1881 else: 

1882 mask = np.empty(len(self._selected_obj.index), dtype=bool) 

1883 mask.fill(False) 

1884 mask[indices.astype(int)] = True 

1885 # mask fails to broadcast when passed to where; broadcast manually. 

1886 mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T 

1887 filtered = self._selected_obj.where(mask) # Fill with NaNs. 

1888 return filtered 

1889 
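    # [Editor's sketch, not pandas source] The dropna=False branch above is
    # what yields all-NaN rows for filtered-out groups; toy data assumed.
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 4]})
    # >>> df.groupby("g").filter(lambda grp: grp["x"].sum() > 3, dropna=False)
    #      g    x
    # 0  NaN  NaN
    # 1  NaN  NaN
    # 2    b  4.0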

1890 @final 

1891 def _cumcount_array(self, ascending: bool = True) -> np.ndarray: 

1892 """ 

1893 Parameters 

1894 ---------- 

1895 ascending : bool, default True 

1896 If False, number in reverse, from length of group - 1 to 0. 

1897 

1898 Notes 

1899 ----- 

1900 this is currently implementing sort=False 

1901 (though the default is sort=True) for groupby in general 

1902 """ 

1903 ids, _, ngroups = self.grouper.group_info 

1904 sorter = get_group_index_sorter(ids, ngroups) 

1905 ids, count = ids[sorter], len(ids) 

1906 

1907 if count == 0: 

1908 return np.empty(0, dtype=np.int64) 

1909 

1910 run = np.r_[True, ids[:-1] != ids[1:]] 

1911 rep = np.diff(np.r_[np.nonzero(run)[0], count]) 

1912 out = (~run).cumsum() 

1913 

1914 if ascending: 

1915 out -= np.repeat(out[run], rep) 

1916 else: 

1917 out = np.repeat(out[np.r_[run[1:], True]], rep) - out 

1918 

1919 if self.grouper.has_dropped_na: 

1920 out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False)) 

1921 else: 

1922 out = out.astype(np.int64, copy=False) 

1923 

1924 rev = np.empty(count, dtype=np.intp) 

1925 rev[sorter] = np.arange(count, dtype=np.intp) 

1926 return out[rev] 

1927 
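    # [Editor's sketch, not pandas source] The run/rep trick above, isolated
    # on already-sorted toy ids.
    # >>> import numpy as np
    # >>> ids = np.array([0, 0, 1, 1, 1])
    # >>> run = np.r_[True, ids[:-1] != ids[1:]]              # group starts
    # >>> rep = np.diff(np.r_[np.nonzero(run)[0], len(ids)])  # group lengths
    # >>> out = (~run).cumsum()
    # >>> out - np.repeat(out[run], rep)                      # ascending count
    # array([0, 1, 0, 1, 2])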

1928 # ----------------------------------------------------------------- 

1929 

1930 @final 

1931 @property 

1932 def _obj_1d_constructor(self) -> Callable: 

1933 # GH28330 preserve subclassed Series/DataFrames 

1934 if isinstance(self.obj, DataFrame): 

1935 return self.obj._constructor_sliced 

1936 assert isinstance(self.obj, Series) 

1937 return self.obj._constructor 

1938 

1939 @final 

1940 def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool): 

1941 """ 

1942 Shared func to call any / all Cython GroupBy implementations. 

1943 """ 

1944 

1945 def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]: 

1946 if is_object_dtype(vals.dtype): 

1947 # GH#37501: don't raise on pd.NA when skipna=True 

1948 if skipna: 

1949 func = np.vectorize( 

1950 lambda x: bool(x) if not isna(x) else True, otypes=[bool] 

1951 ) 

1952 vals = func(vals) 

1953 else: 

1954 vals = vals.astype(bool, copy=False) 

1955 

1956 vals = cast(np.ndarray, vals) 

1957 elif isinstance(vals, BaseMaskedArray): 

1958 vals = vals._data.astype(bool, copy=False) 

1959 else: 

1960 vals = vals.astype(bool, copy=False) 

1961 

1962 return vals.view(np.int8), bool 

1963 

1964 def result_to_bool( 

1965 result: np.ndarray, 

1966 inference: type, 

1967 nullable: bool = False, 

1968 ) -> ArrayLike: 

1969 if nullable: 

1970 return BooleanArray(result.astype(bool, copy=False), result == -1) 

1971 else: 

1972 return result.astype(inference, copy=False) 

1973 

1974 return self._get_cythonized_result( 

1975 libgroupby.group_any_all, 

1976 numeric_only=False, 

1977 cython_dtype=np.dtype(np.int8), 

1978 needs_mask=True, 

1979 needs_nullable=True, 

1980 pre_processing=objs_to_bool, 

1981 post_processing=result_to_bool, 

1982 val_test=val_test, 

1983 skipna=skipna, 

1984 ) 

1985 

1986 @final 

1987 @Substitution(name="groupby") 

1988 @Appender(_common_see_also) 

1989 def any(self, skipna: bool = True): 

1990 """ 

1991 Return True if any value in the group is truthy, else False. 

1992 

1993 Parameters 

1994 ---------- 

1995 skipna : bool, default True 

1996 Flag to ignore nan values during truth testing. 

1997 

1998 Returns 

1999 ------- 

2000 Series or DataFrame 

2001 DataFrame or Series of boolean values, where a value is True if any element 

2002 is True within its respective group, False otherwise. 

2003 """ 

2004 return self._bool_agg("any", skipna) 

2005 
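    # [Editor's sketch, not pandas source] With skipna=True the isna mask
    # passed to the cython kernel makes NaN entries inert; toy data assumed.
    # >>> import numpy as np, pandas as pd
    # >>> df = pd.DataFrame({"g": ["a", "a", "b"], "x": [0, np.nan, 1]})
    # >>> df.groupby("g")["x"].any()
    # g
    # a    False
    # b     True
    # Name: x, dtype: bool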

2006 @final 

2007 @Substitution(name="groupby") 

2008 @Appender(_common_see_also) 

2009 def all(self, skipna: bool = True): 

2010 """ 

2011 Return True if all values in the group are truthy, else False. 

2012 

2013 Parameters 

2014 ---------- 

2015 skipna : bool, default True 

2016 Flag to ignore nan values during truth testing. 

2017 

2018 Returns 

2019 ------- 

2020 Series or DataFrame 

2021 DataFrame or Series of boolean values, where a value is True if all elements 

2022 are True within its respective group, False otherwise. 

2023 """ 

2024 return self._bool_agg("all", skipna) 

2025 

2026 @final 

2027 @Substitution(name="groupby") 

2028 @Appender(_common_see_also) 

2029 def count(self) -> NDFrameT: 

2030 """ 

2031 Compute count of group, excluding missing values. 

2032 

2033 Returns 

2034 ------- 

2035 Series or DataFrame 

2036 Count of values within each group. 

2037 """ 

2038 data = self._get_data_to_aggregate() 

2039 ids, _, ngroups = self.grouper.group_info 

2040 mask = ids != -1 

2041 

2042 is_series = data.ndim == 1 

2043 

2044 def hfunc(bvalues: ArrayLike) -> ArrayLike: 

2045 # TODO(EA2D): reshape would not be necessary with 2D EAs 

2046 if bvalues.ndim == 1: 

2047 # EA 

2048 masked = mask & ~isna(bvalues).reshape(1, -1) 

2049 else: 

2050 masked = mask & ~isna(bvalues) 

2051 

2052 counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) 

2053 if is_series: 

2054 assert counted.ndim == 2 

2055 assert counted.shape[0] == 1 

2056 return counted[0] 

2057 return counted 

2058 

2059 new_mgr = data.grouped_reduce(hfunc) 

2060 

2061 # If we are grouping on categoricals we want unobserved categories to 

2062 # return zero, rather than the default of NaN which the reindexing in 

2063 # _wrap_agged_manager() returns. GH 35028 

2064 with com.temp_setattr(self, "observed", True): 

2065 result = self._wrap_agged_manager(new_mgr) 

2066 

2067 if result.ndim == 1: 

2068 result.index = self.grouper.result_index 

2069 

2070 return self._reindex_output(result, fill_value=0) 

2071 
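    # [Editor's sketch, not pandas source] count tallies non-NA values per
    # group; toy data assumed.
    # >>> import numpy as np, pandas as pd
    # >>> df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1.0, np.nan, 3.0]})
    # >>> df.groupby("g")["x"].count()
    # g
    # a    1
    # b    1
    # Name: x, dtype: int64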

2072 @final 

2073 @Substitution(name="groupby") 

2074 @Substitution(see_also=_common_see_also) 

2075 def mean( 

2076 self, 

2077 numeric_only: bool | lib.NoDefault = lib.no_default, 

2078 engine: str = "cython", 

2079 engine_kwargs: dict[str, bool] | None = None, 

2080 ): 

2081 """ 

2082 Compute mean of groups, excluding missing values. 

2083 

2084 Parameters 

2085 ---------- 

2086 numeric_only : bool, default True 

2087 Include only float, int, boolean columns. If None, will attempt to use 

2088 everything, then use only numeric data. 

2089 

2090 engine : str, default 'cython' 

2091 * ``'cython'`` : Runs the operation through C-extensions from cython. 

2092 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

2093 * ``None`` : Defaults to ``'cython'`` or globally setting 

2094 ``compute.use_numba`` 

2095 

2096 .. versionadded:: 1.4.0 

2097 

2098 engine_kwargs : dict, default None 

2099 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

2100 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

2101 and ``parallel`` dictionary keys. The values must either be ``True`` or 

2102 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

2103 ``{'nopython': True, 'nogil': False, 'parallel': False}`` 

2104 

2105 .. versionadded:: 1.4.0 

2106 

2107 Returns 

2108 ------- 

2109 pandas.Series or pandas.DataFrame 

2110 %(see_also)s 

2111 Examples 

2112 -------- 

2113 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], 

2114 ... 'B': [np.nan, 2, 3, 4, 5], 

2115 ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) 

2116 

2117 Groupby one column and return the mean of the remaining columns in 

2118 each group. 

2119 

2120 >>> df.groupby('A').mean() 

2121 B C 

2122 A 

2123 1 3.0 1.333333 

2124 2 4.0 1.500000 

2125 

2126 Groupby two columns and return the mean of the remaining column. 

2127 

2128 >>> df.groupby(['A', 'B']).mean() 

2129 C 

2130 A B 

2131 1 2.0 2.0 

2132 4.0 1.0 

2133 2 3.0 1.0 

2134 5.0 2.0 

2135 

2136 Groupby one column and return the mean of only particular column in 

2137 the group. 

2138 

2139 >>> df.groupby('A')['B'].mean() 

2140 A 

2141 1 3.0 

2142 2 4.0 

2143 Name: B, dtype: float64 

2144 """ 

2145 numeric_only_bool = self._resolve_numeric_only("mean", numeric_only, axis=0) 

2146 

2147 if maybe_use_numba(engine): 

2148 from pandas.core._numba.kernels import sliding_mean 

2149 

2150 return self._numba_agg_general(sliding_mean, engine_kwargs) 

2151 else: 

2152 result = self._cython_agg_general( 

2153 "mean", 

2154 alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool), 

2155 numeric_only=numeric_only, 

2156 ) 

2157 return result.__finalize__(self.obj, method="groupby") 

2158 

2159 @final 

2160 @Substitution(name="groupby") 

2161 @Appender(_common_see_also) 

2162 def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): 

2163 """ 

2164 Compute median of groups, excluding missing values. 

2165 

2166 For multiple groupings, the result index will be a MultiIndex 

2167 

2168 Parameters 

2169 ---------- 

2170 numeric_only : bool, default True 

2171 Include only float, int, boolean columns. If None, will attempt to use 

2172 everything, then use only numeric data. 

2173 

2174 Returns 

2175 ------- 

2176 Series or DataFrame 

2177 Median of values within each group. 

2178 """ 

2179 numeric_only_bool = self._resolve_numeric_only("median", numeric_only, axis=0) 

2180 

2181 result = self._cython_agg_general( 

2182 "median", 

2183 alt=lambda x: Series(x).median(numeric_only=numeric_only_bool), 

2184 numeric_only=numeric_only, 

2185 ) 

2186 return result.__finalize__(self.obj, method="groupby") 

2187 

2188 @final 

2189 @Substitution(name="groupby") 

2190 @Appender(_common_see_also) 

2191 def std( 

2192 self, 

2193 ddof: int = 1, 

2194 engine: str | None = None, 

2195 engine_kwargs: dict[str, bool] | None = None, 

2196 numeric_only: bool | lib.NoDefault = lib.no_default, 

2197 ): 

2198 """ 

2199 Compute standard deviation of groups, excluding missing values. 

2200 

2201 For multiple groupings, the result index will be a MultiIndex. 

2202 

2203 Parameters 

2204 ---------- 

2205 ddof : int, default 1 

2206 Degrees of freedom. 

2207 

2208 engine : str, default None 

2209 * ``'cython'`` : Runs the operation through C-extensions from cython. 

2210 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

2211 * ``None`` : Defaults to ``'cython'`` or globally setting 

2212 ``compute.use_numba`` 

2213 

2214 .. versionadded:: 1.4.0 

2215 

2216 engine_kwargs : dict, default None 

2217 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

2218 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

2219 and ``parallel`` dictionary keys. The values must either be ``True`` or 

2220 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

2221 ``{'nopython': True, 'nogil': False, 'parallel': False}`` 

2222 

2223 .. versionadded:: 1.4.0 

2224 

2225 numeric_only : bool, default True 

2226 Include only `float`, `int` or `boolean` data. 

2227 

2228 .. versionadded:: 1.5.0 

2229 

2230 Returns 

2231 ------- 

2232 Series or DataFrame 

2233 Standard deviation of values within each group. 

2234 """ 

2235 if maybe_use_numba(engine): 

2236 from pandas.core._numba.kernels import sliding_var 

2237 

2238 return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) 

2239 else: 

2240 # Resolve numeric_only so that var doesn't warn 

2241 numeric_only_bool = self._resolve_numeric_only("std", numeric_only, axis=0) 

2242 if ( 

2243 numeric_only_bool 

2244 and self.obj.ndim == 1 

2245 and not is_numeric_dtype(self.obj.dtype) 

2246 ): 

2247 raise TypeError( 

2248 f"{type(self).__name__}.std called with " 

2249 f"numeric_only={numeric_only} and dtype {self.obj.dtype}" 

2250 ) 

2251 result = self._get_cythonized_result( 

2252 libgroupby.group_var, 

2253 cython_dtype=np.dtype(np.float64), 

2254 numeric_only=numeric_only_bool, 

2255 needs_counts=True, 

2256 post_processing=lambda vals, inference: np.sqrt(vals), 

2257 ddof=ddof, 

2258 ) 

2259 self._maybe_warn_numeric_only_depr("std", result, numeric_only) 

2260 return result 

2261 
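    # [Editor's sketch, not pandas source] std with the default ddof=1,
    # matching Series.std per group; toy data assumed.
    # >>> import pandas as pd
    # >>> s = pd.Series([1.0, 3.0, 2.0, 4.0], index=["a", "a", "b", "b"])
    # >>> s.groupby(level=0).std()
    # a    1.414214
    # b    1.414214
    # dtype: float64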

2262 @final 

2263 @Substitution(name="groupby") 

2264 @Appender(_common_see_also) 

2265 def var( 

2266 self, 

2267 ddof: int = 1, 

2268 engine: str | None = None, 

2269 engine_kwargs: dict[str, bool] | None = None, 

2270 numeric_only: bool | lib.NoDefault = lib.no_default, 

2271 ): 

2272 """ 

2273 Compute variance of groups, excluding missing values. 

2274 

2275 For multiple groupings, the result index will be a MultiIndex. 

2276 

2277 Parameters 

2278 ---------- 

2279 ddof : int, default 1 

2280 Degrees of freedom. 

2281 

2282 engine : str, default None 

2283 * ``'cython'`` : Runs the operation through C-extensions from cython. 

2284 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

2285 * ``None`` : Defaults to ``'cython'`` or globally setting 

2286 ``compute.use_numba`` 

2287 

2288 .. versionadded:: 1.4.0 

2289 

2290 engine_kwargs : dict, default None 

2291 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

2292 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

2293 and ``parallel`` dictionary keys. The values must either be ``True`` or 

2294 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

2295 ``{'nopython': True, 'nogil': False, 'parallel': False}`` 

2296 

2297 .. versionadded:: 1.4.0 

2298 

2299 numeric_only : bool, default True 

2300 Include only `float`, `int` or `boolean` data. 

2301 

2302 .. versionadded:: 1.5.0 

2303 

2304 Returns 

2305 ------- 

2306 Series or DataFrame 

2307 Variance of values within each group. 

2308 """ 

2309 if maybe_use_numba(engine): 

2310 from pandas.core._numba.kernels import sliding_var 

2311 

2312 return self._numba_agg_general(sliding_var, engine_kwargs, ddof) 

2313 else: 

2314 return self._cython_agg_general( 

2315 "var", 

2316 alt=lambda x: Series(x).var(ddof=ddof), 

2317 numeric_only=numeric_only, 

2318 ignore_failures=numeric_only is lib.no_default, 

2319 ddof=ddof, 

2320 ) 

2321 

2322 @final 

2323 @Substitution(name="groupby") 

2324 @Appender(_common_see_also) 

2325 def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default): 

2326 """ 

2327 Compute standard error of the mean of groups, excluding missing values. 

2328 

2329 For multiple groupings, the result index will be a MultiIndex. 

2330 

2331 Parameters 

2332 ---------- 

2333 ddof : int, default 1 

2334 Degrees of freedom. 

2335 

2336 numeric_only : bool, default True 

2337 Include only `float`, `int` or `boolean` data. 

2338 

2339 .. versionadded:: 1.5.0 

2340 

2341 Returns 

2342 ------- 

2343 Series or DataFrame 

2344 Standard error of the mean of values within each group. 

2345 """ 

2346 # Resolve numeric_only so that std doesn't warn 

2347 numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0) 

2348 if ( 

2349 numeric_only_bool 

2350 and self.obj.ndim == 1 

2351 and not is_numeric_dtype(self.obj.dtype) 

2352 ): 

2353 raise TypeError( 

2354 f"{type(self).__name__}.sem called with " 

2355 f"numeric_only={numeric_only} and dtype {self.obj.dtype}" 

2356 ) 

2357 result = self.std(ddof=ddof, numeric_only=numeric_only_bool) 

2358 self._maybe_warn_numeric_only_depr("sem", result, numeric_only) 

2359 

2360 if result.ndim == 1: 

2361 result /= np.sqrt(self.count()) 

2362 else: 

2363 cols = result.columns.difference(self.exclusions).unique() 

2364 counts = self.count() 

2365 result_ilocs = result.columns.get_indexer_for(cols) 

2366 count_ilocs = counts.columns.get_indexer_for(cols) 

2367 with warnings.catch_warnings(): 

2368 # TODO(2.0): once iloc[:, foo] = bar deprecation is enforced, 

2369 # this catching will be unnecessary 

2370 warnings.filterwarnings( 

2371 "ignore", ".*will attempt to set the values inplace.*" 

2372 ) 

2373 result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs]) 

2374 return result 

2375 
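    # [Editor's sketch, not pandas source] The division above implements
    # sem = std / sqrt(count); a quick check on assumed data:
    # >>> import pandas as pd
    # >>> s = pd.Series([1.0, 3.0], index=["a", "a"])
    # >>> s.groupby(level=0).sem()     # std 1.414214 / sqrt(2) -> 1.0
    # a    1.0
    # dtype: float64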

2376 @final 

2377 @Substitution(name="groupby") 

2378 @Appender(_common_see_also) 

2379 def size(self) -> DataFrame | Series: 

2380 """ 

2381 Compute group sizes. 

2382 

2383 Returns 

2384 ------- 

2385 DataFrame or Series 

2386 Number of rows in each group as a Series if as_index is True 

2387 or a DataFrame if as_index is False. 

2388 """ 

2389 result = self.grouper.size() 

2390 

2391 # GH28330 preserve subclassed Series/DataFrames through calls 

2392 if isinstance(self.obj, Series): 

2393 result = self._obj_1d_constructor(result, name=self.obj.name) 

2394 else: 

2395 result = self._obj_1d_constructor(result) 

2396 

2397 if not self.as_index: 

2398 # error: Incompatible types in assignment (expression has 

2399 # type "DataFrame", variable has type "Series") 

2400 result = result.rename("size").reset_index() # type: ignore[assignment] 

2401 

2402 return self._reindex_output(result, fill_value=0) 

2403 
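    # [Editor's sketch, not pandas source] size counts rows, NaN included,
    # unlike count, which tallies only non-NA values; toy data assumed.
    # >>> import numpy as np, pandas as pd
    # >>> df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1.0, np.nan, 3.0]})
    # >>> df.groupby("g").size()
    # g
    # a    2
    # b    1
    # dtype: int64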

2404 @final 

2405 @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) 

2406 def sum( 

2407 self, 

2408 numeric_only: bool | lib.NoDefault = lib.no_default, 

2409 min_count: int = 0, 

2410 engine: str | None = None, 

2411 engine_kwargs: dict[str, bool] | None = None, 

2412 ): 

2413 if maybe_use_numba(engine): 

2414 from pandas.core._numba.kernels import sliding_sum 

2415 

2416 return self._numba_agg_general( 

2417 sliding_sum, 

2418 engine_kwargs, 

2419 ) 

2420 else: 

2421 # If we are grouping on categoricals we want unobserved categories to 

2422 # return zero, rather than the default of NaN which the reindexing in 

2423 # _agg_general() returns. GH #31422 

2424 with com.temp_setattr(self, "observed", True): 

2425 result = self._agg_general( 

2426 numeric_only=numeric_only, 

2427 min_count=min_count, 

2428 alias="sum", 

2429 npfunc=np.sum, 

2430 ) 

2431 

2432 return self._reindex_output(result, fill_value=0) 

2433 
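    # [Editor's sketch, not pandas source] min_count in action on assumed
    # data: a group with fewer valid values than min_count yields NA.
    # >>> import numpy as np, pandas as pd
    # >>> s = pd.Series([1.0, np.nan, 2.0, 3.0], index=["a", "a", "b", "b"])
    # >>> s.groupby(level=0).sum()
    # a    1.0
    # b    5.0
    # dtype: float64
    # >>> s.groupby(level=0).sum(min_count=2)   # group "a" has 1 valid value
    # a    NaN
    # b    5.0
    # dtype: float64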

2434 @final 

2435 @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) 

2436 def prod( 

2437 self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 

2438 ): 

2439 return self._agg_general( 

2440 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod 

2441 ) 

2442 

2443 @final 

2444 @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) 

2445 def min( 

2446 self, 

2447 numeric_only: bool = False, 

2448 min_count: int = -1, 

2449 engine: str | None = None, 

2450 engine_kwargs: dict[str, bool] | None = None, 

2451 ): 

2452 if maybe_use_numba(engine): 

2453 from pandas.core._numba.kernels import sliding_min_max 

2454 

2455 return self._numba_agg_general(sliding_min_max, engine_kwargs, False) 

2456 else: 

2457 return self._agg_general( 

2458 numeric_only=numeric_only, 

2459 min_count=min_count, 

2460 alias="min", 

2461 npfunc=np.min, 

2462 ) 

2463 

2464 @final 

2465 @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) 

2466 def max( 

2467 self, 

2468 numeric_only: bool = False, 

2469 min_count: int = -1, 

2470 engine: str | None = None, 

2471 engine_kwargs: dict[str, bool] | None = None, 

2472 ): 

2473 if maybe_use_numba(engine): 

2474 from pandas.core._numba.kernels import sliding_min_max 

2475 

2476 return self._numba_agg_general(sliding_min_max, engine_kwargs, True) 

2477 else: 

2478 return self._agg_general( 

2479 numeric_only=numeric_only, 

2480 min_count=min_count, 

2481 alias="max", 

2482 npfunc=np.max, 

2483 ) 

2484 

2485 @final 

2486 @Substitution(name="groupby") 

2487 def first(self, numeric_only: bool = False, min_count: int = -1): 

2488 """ 

2489 Compute the first non-null entry of each column. 

2490 

2491 Parameters 

2492 ---------- 

2493 numeric_only : bool, default False 

2494 Include only float, int, boolean columns. 

2495 min_count : int, default -1 

2496 The required number of valid values to perform the operation. If fewer 

2497 than ``min_count`` non-NA values are present the result will be NA. 

2498 

2499 Returns 

2500 ------- 

2501 Series or DataFrame 

2502 First non-null of values within each group. 

2503 

2504 See Also 

2505 -------- 

2506 DataFrame.groupby : Apply a function groupby to each row or column of a 

2507 DataFrame. 

2508 pandas.core.groupby.GroupBy.last : Compute the last non-null entry of each 

2509 column. 

2510 pandas.core.groupby.GroupBy.nth : Take the nth row from each group. 

2511 

2512 Examples 

2513 -------- 

2514 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3], 

2515 ... D=['3/11/2000', '3/12/2000', '3/13/2000'])) 

2516 >>> df['D'] = pd.to_datetime(df['D']) 

2517 >>> df.groupby("A").first() 

2518 B C D 

2519 A 

2520 1 5.0 1 2000-03-11 

2521 3 6.0 3 2000-03-13 

2522 >>> df.groupby("A").first(min_count=2) 

2523 B C D 

2524 A 

2525 1 NaN 1.0 2000-03-11 

2526 3 NaN NaN NaT 

2527 >>> df.groupby("A").first(numeric_only=True) 

2528 B C 

2529 A 

2530 1 5.0 1 

2531 3 6.0 3 

2532 """ 

2533 

2534 def first_compat(obj: NDFrameT, axis: int = 0): 

2535 def first(x: Series): 

2536 """Helper function for first item that isn't NA.""" 

2537 arr = x.array[notna(x.array)] 

2538 if not len(arr): 

2539 return np.nan 

2540 return arr[0] 

2541 

2542 if isinstance(obj, DataFrame): 

2543 return obj.apply(first, axis=axis) 

2544 elif isinstance(obj, Series): 

2545 return first(obj) 

2546 else: # pragma: no cover 

2547 raise TypeError(type(obj)) 

2548 

2549 return self._agg_general( 

2550 numeric_only=numeric_only, 

2551 min_count=min_count, 

2552 alias="first", 

2553 npfunc=first_compat, 

2554 ) 

2555 

2556 @final 

2557 @Substitution(name="groupby") 

2558 def last(self, numeric_only: bool = False, min_count: int = -1): 

2559 """ 

2560 Compute the last non-null entry of each column. 

2561 

2562 Parameters 

2563 ---------- 

2564 numeric_only : bool, default False 

2565 Include only float, int, boolean columns. 

2567 min_count : int, default -1 

2568 The required number of valid values to perform the operation. If fewer 

2569 than ``min_count`` non-NA values are present the result will be NA. 

2570 

2571 Returns 

2572 ------- 

2573 Series or DataFrame 

2574 Last non-null of values within each group. 

2575 

2576 See Also 

2577 -------- 

2578 DataFrame.groupby : Apply a function groupby to each row or column of a 

2579 DataFrame. 

2580 pandas.core.groupby.GroupBy.first : Compute the first non-null entry of each 

2581 column. 

2582 pandas.core.groupby.GroupBy.nth : Take the nth row from each group. 

2583 

2584 Examples 

2585 -------- 

2586 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) 

2587 >>> df.groupby("A").last() 

2588 B C 

2589 A 

2590 1 5.0 2 

2591 3 6.0 3 

2592 """ 

2593 

2594 def last_compat(obj: NDFrameT, axis: int = 0): 

2595 def last(x: Series): 

2596 """Helper function for last item that isn't NA.""" 

2597 arr = x.array[notna(x.array)] 

2598 if not len(arr): 

2599 return np.nan 

2600 return arr[-1] 

2601 

2602 if isinstance(obj, DataFrame): 

2603 return obj.apply(last, axis=axis) 

2604 elif isinstance(obj, Series): 

2605 return last(obj) 

2606 else: # pragma: no cover 

2607 raise TypeError(type(obj)) 

2608 

2609 return self._agg_general( 

2610 numeric_only=numeric_only, 

2611 min_count=min_count, 

2612 alias="last", 

2613 npfunc=last_compat, 

2614 ) 

2615 

2616 @final 

2617 @Substitution(name="groupby") 

2618 @Appender(_common_see_also) 

2619 def ohlc(self) -> DataFrame: 

2620 """ 

2621 Compute open, high, low and close values of a group, excluding missing values. 

2622 

2623 For multiple groupings, the result index will be a MultiIndex 

2624 

2625 Returns 

2626 ------- 

2627 DataFrame 

2628 Open, high, low and close values within each group. 

2629 """ 

2630 if self.obj.ndim == 1: 

2631 # self._iterate_slices() yields only self._selected_obj 

2632 obj = self._selected_obj 

2633 

2634 is_numeric = is_numeric_dtype(obj.dtype) 

2635 if not is_numeric: 

2636 raise DataError("No numeric types to aggregate") 

2637 

2638 res_values = self.grouper._cython_operation( 

2639 "aggregate", obj._values, "ohlc", axis=0, min_count=-1 

2640 ) 

2641 

2642 agg_names = ["open", "high", "low", "close"] 

2643 result = self.obj._constructor_expanddim( 

2644 res_values, index=self.grouper.result_index, columns=agg_names 

2645 ) 

2646 return self._reindex_output(result) 

2647 

2648 return self._apply_to_column_groupbys( 

2649 lambda x: x.ohlc(), self._obj_with_exclusions 

2650 ) 

2651 
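    # [Editor's sketch, not pandas source] ohlc on a numeric series; the four
    # columns come from the cython "ohlc" kernel above. Toy data assumed.
    # >>> import pandas as pd
    # >>> s = pd.Series([1, 3, 2, 5, 4], index=["a", "a", "a", "b", "b"])
    # >>> s.groupby(level=0).ohlc()
    #    open  high  low  close
    # a     1     3    1      2
    # b     5     5    4      4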

2652 @doc(DataFrame.describe) 

2653 def describe(self, **kwargs): 

2654 with self._group_selection_context(): 

2655 if len(self._selected_obj) == 0: 

2656 described = self._selected_obj.describe(**kwargs) 

2657 if self._selected_obj.ndim == 1: 

2658 result = described 

2659 else: 

2660 result = described.unstack() 

2661 return result.to_frame().T.iloc[:0] 

2662 

2663 result = self._python_apply_general( 

2664 lambda x: x.describe(**kwargs), 

2665 self._selected_obj, 

2666 not_indexed_same=True, 

2667 ) 

2668 if self.axis == 1: 

2669 return result.T 

2670 return result.unstack() 

2671 

2672 @final 

2673 def resample(self, rule, *args, **kwargs): 

2674 """ 

2675 Provide resampling when using a TimeGrouper. 

2676 

2677 Given a grouper, the function resamples it according to a 

2678 frequency string. 

2679 

2680 See the :ref:`frequency aliases <timeseries.offset_aliases>` 

2681 documentation for more details. 

2682 

2683 Parameters 

2684 ---------- 

2685 rule : str or DateOffset 

2686 The offset string or object representing target grouper conversion. 

2687 *args, **kwargs 

2688 Possible arguments are `how`, `fill_method`, `limit`, `kind` and 

2689 `on`, and other arguments of `TimeGrouper`. 

2690 

2691 Returns 

2692 ------- 

2693 Grouper 

2694 Return a new grouper with our resampler appended. 

2695 

2696 See Also 

2697 -------- 

2698 Grouper : Specify a frequency to resample with when 

2699 grouping by a key. 

2700 DatetimeIndex.resample : Frequency conversion and resampling of 

2701 time series. 

2702 

2703 Examples 

2704 -------- 

2705 >>> idx = pd.date_range('1/1/2000', periods=4, freq='T') 

2706 >>> df = pd.DataFrame(data=4 * [range(2)], 

2707 ... index=idx, 

2708 ... columns=['a', 'b']) 

2709 >>> df.iloc[2, 0] = 5 

2710 >>> df 

2711 a b 

2712 2000-01-01 00:00:00 0 1 

2713 2000-01-01 00:01:00 0 1 

2714 2000-01-01 00:02:00 5 1 

2715 2000-01-01 00:03:00 0 1 

2716 

2717 Downsample the DataFrame into 3 minute bins and sum the values of 

2718 the timestamps falling into a bin. 

2719 

2720 >>> df.groupby('a').resample('3T').sum() 

2721 a b 

2722 a 

2723 0 2000-01-01 00:00:00 0 2 

2724 2000-01-01 00:03:00 0 1 

2725 5 2000-01-01 00:00:00 5 1 

2726 

2727 Upsample the series into 30 second bins. 

2728 

2729 >>> df.groupby('a').resample('30S').sum() 

2730 a b 

2731 a 

2732 0 2000-01-01 00:00:00 0 1 

2733 2000-01-01 00:00:30 0 0 

2734 2000-01-01 00:01:00 0 1 

2735 2000-01-01 00:01:30 0 0 

2736 2000-01-01 00:02:00 0 0 

2737 2000-01-01 00:02:30 0 0 

2738 2000-01-01 00:03:00 0 1 

2739 5 2000-01-01 00:02:00 5 1 

2740 

2741 Resample by month. Values are assigned to the month of the period. 

2742 

2743 >>> df.groupby('a').resample('M').sum() 

2744 a b 

2745 a 

2746 0 2000-01-31 0 3 

2747 5 2000-01-31 5 1 

2748 

2749 Downsample the series into 3 minute bins as above, but close the right 

2750 side of the bin interval. 

2751 

2752 >>> df.groupby('a').resample('3T', closed='right').sum() 

2753 a b 

2754 a 

2755 0 1999-12-31 23:57:00 0 1 

2756 2000-01-01 00:00:00 0 2 

2757 5 2000-01-01 00:00:00 5 1 

2758 

2759 Downsample the series into 3 minute bins and close the right side of 

2760 the bin interval, but label each bin using the right edge instead of 

2761 the left. 

2762 

2763 >>> df.groupby('a').resample('3T', closed='right', label='right').sum() 

2764 a b 

2765 a 

2766 0 2000-01-01 00:00:00 0 1 

2767 2000-01-01 00:03:00 0 2 

2768 5 2000-01-01 00:03:00 5 1 

2769 """ 

2770 from pandas.core.resample import get_resampler_for_grouping 

2771 

2772 return get_resampler_for_grouping(self, rule, *args, **kwargs) 

2773 

2774 @final 

2775 @Substitution(name="groupby") 

2776 @Appender(_common_see_also) 

2777 def rolling(self, *args, **kwargs) -> RollingGroupby: 

2778 """ 

2779 Return a rolling grouper, providing rolling functionality per group. 

2780 """ 

2781 from pandas.core.window import RollingGroupby 

2782 

2783 return RollingGroupby( 

2784 self._selected_obj, 

2785 *args, 

2786 _grouper=self.grouper, 

2787 _as_index=self.as_index, 

2788 **kwargs, 

2789 ) 

2790 

2791 @final 

2792 @Substitution(name="groupby") 

2793 @Appender(_common_see_also) 

2794 def expanding(self, *args, **kwargs) -> ExpandingGroupby: 

2795 """ 

2796 Return an expanding grouper, providing expanding 

2797 functionality per group. 

2798 """ 

2799 from pandas.core.window import ExpandingGroupby 

2800 

2801 return ExpandingGroupby( 

2802 self._selected_obj, 

2803 *args, 

2804 _grouper=self.grouper, 

2805 **kwargs, 

2806 ) 

2807 

2808 @final 

2809 @Substitution(name="groupby") 

2810 @Appender(_common_see_also) 

2811 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: 

2812 """ 

2813 Return an ewm grouper, providing ewm functionality per group. 

2814 """ 

2815 from pandas.core.window import ExponentialMovingWindowGroupby 

2816 

2817 return ExponentialMovingWindowGroupby( 

2818 self._selected_obj, 

2819 *args, 

2820 _grouper=self.grouper, 

2821 **kwargs, 

2822 ) 

2823 

2824 @final 

2825 def _fill(self, direction: Literal["ffill", "bfill"], limit=None): 

2826 """ 

2827 Shared function for `pad` and `backfill` to call Cython method. 

2828 

2829 Parameters 

2830 ---------- 

2831 direction : {'ffill', 'bfill'} 

2832 Direction passed to underlying Cython function. `bfill` will cause 

2833 values to be filled backwards. `ffill` and any other values will 

2834 default to a forward fill 

2835 limit : int, default None 

2836 Maximum number of consecutive values to fill. If `None`, this 

2837 method will convert to -1 prior to passing to Cython 

2838 

2839 Returns 

2840 ------- 

2841 `Series` or `DataFrame` with filled values 

2842 

2843 See Also 

2844 -------- 

2845 pad : Forward fill the missing values in the dataset. 

2846 backfill : Backward fill the missing values in the dataset. 

2847 """ 

2848 # Need int value for Cython 

2849 if limit is None: 

2850 limit = -1 

2851 

2852 ids, _, _ = self.grouper.group_info 

2853 sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) 

2854 if direction == "bfill": 

2855 sorted_labels = sorted_labels[::-1] 

2856 

2857 col_func = partial( 

2858 libgroupby.group_fillna_indexer, 

2859 labels=ids, 

2860 sorted_labels=sorted_labels, 

2861 direction=direction, 

2862 limit=limit, 

2863 dropna=self.dropna, 

2864 ) 

2865 

2866 def blk_func(values: ArrayLike) -> ArrayLike: 

2867 mask = isna(values) 

2868 if values.ndim == 1: 

2869 indexer = np.empty(values.shape, dtype=np.intp) 

2870 col_func(out=indexer, mask=mask) 

2871 return algorithms.take_nd(values, indexer) 

2872 

2873 else: 

2874 # We broadcast algorithms.take_nd analogous to 

2875 # np.take_along_axis 

2876 

2877 # Note: we only get here with backfill/pad, 

2878 # so if we have a dtype that cannot hold NAs, 

2879 # then there will be no -1s in indexer, so we can use 

2880 # the original dtype (no need to ensure_dtype_can_hold_na) 

2881 if isinstance(values, np.ndarray): 

2882 dtype = values.dtype 

2883 if self.grouper.has_dropped_na: 

2884 # dropped null groups give rise to nan in the result 

2885 dtype = ensure_dtype_can_hold_na(values.dtype) 

2886 out = np.empty(values.shape, dtype=dtype) 

2887 else: 

2888 out = type(values)._empty(values.shape, dtype=values.dtype) 

2889 

2890 for i in range(len(values)): 

2891 # call group_fillna_indexer column-wise 

2892 indexer = np.empty(values.shape[1], dtype=np.intp) 

2893 col_func(out=indexer, mask=mask[i]) 

2894 out[i, :] = algorithms.take_nd(values[i], indexer) 

2895 return out 

2896 

2897 obj = self._obj_with_exclusions 

2898 if self.axis == 1: 

2899 obj = obj.T 

2900 mgr = obj._mgr 

2901 res_mgr = mgr.apply(blk_func) 

2902 

2903 new_obj = obj._constructor(res_mgr) 

2904 if isinstance(new_obj, Series): 

2905 new_obj.name = obj.name 

2906 

2907 return self._wrap_transformed_output(new_obj) 

2908 
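    # [Editor's sketch, not pandas source] What _fill computes, seen from user
    # code: a groupwise forward fill that never crosses group boundaries.
    # Toy data assumed.
    # >>> import numpy as np, pandas as pd
    # >>> df = pd.DataFrame({"g": ["a", "a", "a", "b", "b"],
    # ...                    "x": [1.0, np.nan, np.nan, np.nan, 2.0]})
    # >>> df.groupby("g")["x"].ffill(limit=1)
    # 0    1.0
    # 1    1.0
    # 2    NaN
    # 3    NaN
    # 4    2.0
    # Name: x, dtype: float64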

2909 @final 

2910 @Substitution(name="groupby") 

2911 def ffill(self, limit=None): 

2912 """ 

2913 Forward fill the values. 

2914 

2915 Parameters 

2916 ---------- 

2917 limit : int, optional 

2918 Limit of how many values to fill. 

2919 

2920 Returns 

2921 ------- 

2922 Series or DataFrame 

2923 Object with missing values filled. 

2924 

2925 See Also 

2926 -------- 

2927 Series.ffill: Forward fill the missing values in the dataset. 

2928 DataFrame.ffill: Object with missing values filled or None if inplace=True. 

2929 Series.fillna: Fill NaN values of a Series. 

2930 DataFrame.fillna: Fill NaN values of a DataFrame. 

2931 """ 

2932 return self._fill("ffill", limit=limit) 

2933 

2934 def pad(self, limit=None): 

2935 """ 

2936 Forward fill the values. 

2937 

2938 .. deprecated:: 1.4 

2939 Use ffill instead. 

2940 

2941 Parameters 

2942 ---------- 

2943 limit : int, optional 

2944 Limit of how many values to fill. 

2945 

2946 Returns 

2947 ------- 

2948 Series or DataFrame 

2949 Object with missing values filled. 

2950 """ 

2951 warnings.warn( 

2952 "pad is deprecated and will be removed in a future version. " 

2953 "Use ffill instead.", 

2954 FutureWarning, 

2955 stacklevel=find_stack_level(), 

2956 ) 

2957 return self.ffill(limit=limit) 

2958 

2959 @final 

2960 @Substitution(name="groupby") 

2961 def bfill(self, limit=None): 

2962 """ 

2963 Backward fill the values. 

2964 

2965 Parameters 

2966 ---------- 

2967 limit : int, optional 

2968 Limit of how many values to fill. 

2969 

2970 Returns 

2971 ------- 

2972 Series or DataFrame 

2973 Object with missing values filled. 

2974 

2975 See Also 

2976 -------- 

2977 Series.bfill : Backward fill the missing values in the dataset. 

2978 DataFrame.bfill: Backward fill the missing values in the dataset. 

2979 Series.fillna: Fill NaN values of a Series. 

2980 DataFrame.fillna: Fill NaN values of a DataFrame. 

2981 """ 

2982 return self._fill("bfill", limit=limit) 

2983 

2984 def backfill(self, limit=None): 

2985 """ 

2986 Backward fill the values. 

2987 

2988 .. deprecated:: 1.4 

2989 Use bfill instead. 

2990 

2991 Parameters 

2992 ---------- 

2993 limit : int, optional 

2994 Limit of how many values to fill. 

2995 

2996 Returns 

2997 ------- 

2998 Series or DataFrame 

2999 Object with missing values filled. 

3000 """ 

3001 warnings.warn( 

3002 "backfill is deprecated and will be removed in a future version. " 

3003 "Use bfill instead.", 

3004 FutureWarning, 

3005 stacklevel=find_stack_level(), 

3006 ) 

3007 return self.bfill(limit=limit) 

3008 

3009 # https://github.com/python/mypy/issues/1362 

3010 # Mypy does not support decorated properties 

3011 @final # type: ignore[misc] 

3012 @property 

3013 @Substitution(name="groupby") 

3014 @Substitution(see_also=_common_see_also) 

3015 def nth(self) -> GroupByNthSelector: 

3016 """ 

3017 Take the nth row from each group if n is an int, otherwise a subset of rows. 

3018 

3019 Can be either a call or an index. dropna is not available with index notation. 

3020 Index notation accepts a comma separated list of integers and slices. 

3021 

3022 If dropna is passed, take the nth non-null row; dropna is either 

3023 'all' or 'any', which is equivalent to calling dropna(how=dropna) 

3024 before the groupby. 

3025 

3026 Parameters 

3027 ---------- 

3028 n : int, slice or list of ints and slices 

3029 A single nth value for the row or a list of nth values or slices. 

3030 

3031 .. versionchanged:: 1.4.0 

3032 Added slice and lists containing slices. 

3033 Added index notation. 

3034 

3035 dropna : {'any', 'all', None}, default None 

3036 Apply the specified dropna operation before counting which row is 

3037 the nth row. Only supported if n is an int. 

3038 

3039 Returns 

3040 ------- 

3041 Series or DataFrame 

3042 N-th value within each group. 

3043 %(see_also)s 

3044 Examples 

3045 -------- 

3046 

3047 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], 

3048 ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) 

3049 >>> g = df.groupby('A') 

3050 >>> g.nth(0) 

3051 B 

3052 A 

3053 1 NaN 

3054 2 3.0 

3055 >>> g.nth(1) 

3056 B 

3057 A 

3058 1 2.0 

3059 2 5.0 

3060 >>> g.nth(-1) 

3061 B 

3062 A 

3063 1 4.0 

3064 2 5.0 

3065 >>> g.nth([0, 1]) 

3066 B 

3067 A 

3068 1 NaN 

3069 1 2.0 

3070 2 3.0 

3071 2 5.0 

3072 >>> g.nth(slice(None, -1)) 

3073 B 

3074 A 

3075 1 NaN 

3076 1 2.0 

3077 2 3.0 

3078 

3079 Index notation may also be used 

3080 

3081 >>> g.nth[0, 1] 

3082 B 

3083 A 

3084 1 NaN 

3085 1 2.0 

3086 2 3.0 

3087 2 5.0 

3088 >>> g.nth[:-1] 

3089 B 

3090 A 

3091 1 NaN 

3092 1 2.0 

3093 2 3.0 

3094 

3095 Specifying `dropna` allows the count to ignore ``NaN`` values 

3096 

3097 >>> g.nth(0, dropna='any') 

3098 B 

3099 A 

3100 1 2.0 

3101 2 3.0 

3102 

3103 NaNs denote that a group was exhausted when using dropna 

3104 

3105 >>> g.nth(3, dropna='any') 

3106 B 

3107 A 

3108 1 NaN 

3109 2 NaN 

3110 

3111 Specifying `as_index=False` in `groupby` keeps the original index. 

3112 

3113 >>> df.groupby('A', as_index=False).nth(1) 

3114 A B 

3115 1 1 2.0 

3116 4 2 5.0 

3117 """ 

3118 return GroupByNthSelector(self) 

3119 

3120 def _nth( 

3121 self, 

3122 n: PositionalIndexer | tuple, 

3123 dropna: Literal["any", "all", None] = None, 

3124 ) -> NDFrameT: 

3125 if not dropna: 

3126 with self._group_selection_context(): 

3127 mask = self._make_mask_from_positional_indexer(n) 

3128 

3129 ids, _, _ = self.grouper.group_info 

3130 

3131 # Drop NA values in grouping 

3132 mask = mask & (ids != -1) 

3133 

3134 out = self._mask_selected_obj(mask) 

3135 if not self.as_index: 

3136 return out 

3137 

3138 result_index = self.grouper.result_index 

3139 if self.axis == 0: 

3140 out.index = result_index[ids[mask]] 

3141 if not self.observed and isinstance(result_index, CategoricalIndex): 

3142 out = out.reindex(result_index) 

3143 

3144 out = self._reindex_output(out) 

3145 else: 

3146 out.columns = result_index[ids[mask]] 

3147 

3148 return out.sort_index(axis=self.axis) if self.sort else out 

3149 

3150 # dropna is truthy 

3151 if not is_integer(n): 

3152 raise ValueError("dropna option only supported for an integer argument") 

3153 

3154 if dropna not in ["any", "all"]: 

3155 # Note: when agg-ing picker doesn't raise this, just returns NaN 

3156 raise ValueError( 

3157 "For a DataFrame or Series groupby.nth, dropna must be " 

3158 "either None, 'any' or 'all', " 

3159 f"(was passed {dropna})." 

3160 ) 

3161 

3162 # old behaviour, but with all and any support for DataFrames. 

3163 # modified in GH 7559 to have better perf 

3164 n = cast(int, n) 

3165 max_len = n if n >= 0 else -1 - n 

3166 dropped = self.obj.dropna(how=dropna, axis=self.axis) 

3167 

3168 # get a new grouper for our dropped obj 

3169 if self.keys is None and self.level is None: 

3170 

3171 # we don't have the grouper info available 

3172 # (e.g. we have selected out 

3173 # a column that is not in the current object) 

3174 axis = self.grouper.axis 

3175 grouper = axis[axis.isin(dropped.index)] 

3176 

3177 else: 

3178 

3179 # create a grouper with the original parameters, but on dropped 

3180 # object 

3181 from pandas.core.groupby.grouper import get_grouper 

3182 

3183 grouper, _, _ = get_grouper( 

3184 dropped, 

3185 key=self.keys, 

3186 axis=self.axis, 

3187 level=self.level, 

3188 sort=self.sort, 

3189 mutated=self.mutated, 

3190 ) 

3191 

3192 grb = dropped.groupby( 

3193 grouper, as_index=self.as_index, sort=self.sort, axis=self.axis 

3194 ) 

3195 sizes, result = grb.size(), grb.nth(n) 

3196 mask = (sizes < max_len)._values 

3197 

3198 # set the results which don't meet the criteria 

3199 if len(result) and mask.any(): 

3200 result.loc[mask] = np.nan 

3201 

3202 # reset/reindex to the original groups 

3203 if len(self.obj) == len(dropped) or len(result) == len( 

3204 self.grouper.result_index 

3205 ): 

3206 result.index = self.grouper.result_index 

3207 else: 

3208 result = result.reindex(self.grouper.result_index) 

3209 

3210 return result 

3211 

3212 @final 

3213 def quantile( 

3214 self, 

3215 q=0.5, 

3216 interpolation: str = "linear", 

3217 numeric_only: bool | lib.NoDefault = lib.no_default, 

3218 ): 

3219 """ 

3220 Return group values at the given quantile, a la numpy.percentile. 

3221 

3222 Parameters 

3223 ---------- 

3224 q : float or array-like, default 0.5 (50% quantile) 

3225 Value(s) between 0 and 1 providing the quantile(s) to compute. 

3226 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} 

3227 Method to use when the desired quantile falls between two points. 

3228 numeric_only : bool, default True 

3229 Include only `float`, `int` or `boolean` data. 

3230 

3231 .. versionadded:: 1.5.0 

3232 

3233 Returns 

3234 ------- 

3235 Series or DataFrame 

3236 Return type determined by caller of GroupBy object. 

3237 

3238 See Also 

3239 -------- 

3240 Series.quantile : Similar method for Series. 

3241 DataFrame.quantile : Similar method for DataFrame. 

3242 numpy.percentile : NumPy method to compute qth percentile. 

3243 

3244 Examples 

3245 -------- 

3246 >>> df = pd.DataFrame([ 

3247 ... ['a', 1], ['a', 2], ['a', 3], 

3248 ... ['b', 1], ['b', 3], ['b', 5] 

3249 ... ], columns=['key', 'val']) 

3250 >>> df.groupby('key').quantile() 

3251 val 

3252 key 

3253 a 2.0 

3254 b 3.0 

3255 """ 

3256 numeric_only_bool = self._resolve_numeric_only("quantile", numeric_only, axis=0) 

3257 if ( 

3258 numeric_only_bool 

3259 and self.obj.ndim == 1 

3260 and not is_numeric_dtype(self.obj.dtype) 

3261 ): 

3262 raise TypeError( 

3263 f"{type(self).__name__}.quantile called with " 

3264 f"numeric_only={numeric_only} and dtype {self.obj.dtype}" 

3265 ) 

3266 

3267 def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: 

3268 if is_object_dtype(vals): 

3269 raise TypeError( 

3270 "'quantile' cannot be performed against 'object' dtypes!" 

3271 ) 

3272 

3273 inference: np.dtype | None = None 

3274 if is_integer_dtype(vals.dtype): 

3275 if isinstance(vals, ExtensionArray): 

3276 out = vals.to_numpy(dtype=float, na_value=np.nan) 

3277 else: 

3278 out = vals 

3279 inference = np.dtype(np.int64) 

3280 elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): 

3281 out = vals.to_numpy(dtype=float, na_value=np.nan) 

3282 elif is_datetime64_dtype(vals.dtype): 

3283 inference = np.dtype("datetime64[ns]") 

3284 out = np.asarray(vals).astype(float) 

3285 elif is_timedelta64_dtype(vals.dtype): 

3286 inference = np.dtype("timedelta64[ns]") 

3287 out = np.asarray(vals).astype(float) 

3288 elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): 

3289 inference = np.dtype(np.float64) 

3290 out = vals.to_numpy(dtype=float, na_value=np.nan) 

3291 else: 

3292 out = np.asarray(vals) 

3293 

3294 return out, inference 

3295 

3296 def post_processor(vals: np.ndarray, inference: np.dtype | None) -> np.ndarray: 

3297 if inference: 

3298 # Check for edge case 

3299 if not ( 

3300 is_integer_dtype(inference) 

3301 and interpolation in {"linear", "midpoint"} 

3302 ): 

3303 vals = vals.astype(inference) 

3304 

3305 return vals 

3306 

3307 orig_scalar = is_scalar(q) 

3308 if orig_scalar: 

3309 q = [q] 

3310 

3311 qs = np.array(q, dtype=np.float64) 

3312 ids, _, ngroups = self.grouper.group_info 

3313 nqs = len(qs) 

3314 

3315 func = partial( 

3316 libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation 

3317 ) 

3318 

3319 # Put '-1' (NaN) labels as the last group so it does not interfere 

3320 # with the calculations. Note: length check avoids failure on empty 

3321 # labels. In that case, the value doesn't matter 

3322 na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0 

3323 labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids) 

3324 

3325 def blk_func(values: ArrayLike) -> ArrayLike: 

3326 mask = isna(values) 

3327 vals, inference = pre_processor(values) 

3328 

3329 ncols = 1 

3330 if vals.ndim == 2: 

3331 ncols = vals.shape[0] 

3332 shaped_labels = np.broadcast_to( 

3333 labels_for_lexsort, (ncols, len(labels_for_lexsort)) 

3334 ) 

3335 else: 

3336 shaped_labels = labels_for_lexsort 

3337 

3338 out = np.empty((ncols, ngroups, nqs), dtype=np.float64) 

3339 

3340 # Get an index of values sorted by values and then labels 

3341 order = (vals, shaped_labels) 

3342 sort_arr = np.lexsort(order).astype(np.intp, copy=False) 

3343 

3344 if vals.ndim == 1: 

3345 func(out[0], values=vals, mask=mask, sort_indexer=sort_arr) 

3346 else: 

3347 for i in range(ncols): 

3348 func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i]) 

3349 

3350 if vals.ndim == 1: 

3351 out = out.ravel("K") 

3352 else: 

3353 out = out.reshape(ncols, ngroups * nqs) 

3354 return post_processor(out, inference) 

3355 

3356 obj = self._obj_with_exclusions 

3357 is_ser = obj.ndim == 1 

3358 mgr = self._get_data_to_aggregate() 

3359 data = mgr.get_numeric_data() if numeric_only_bool else mgr 

3360 ignore_failures = numeric_only_bool 

3361 res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures) 

3362 

3363 if ( 

3364 numeric_only is lib.no_default 

3365 and not is_ser 

3366 and len(res_mgr.items) != len(mgr.items) 

3367 ): 

3368 warn_dropping_nuisance_columns_deprecated( 

3369 type(self), "quantile", numeric_only 

3370 ) 

3371 

3372 if len(res_mgr.items) == 0: 

3373 # re-call grouped_reduce to get the desired exception message 

3374 mgr.grouped_reduce(blk_func, ignore_failures=False) 

3375 # grouped_reduce _should_ raise, so this should not be reached 

3376 raise TypeError( # pragma: no cover 

3377 "All columns were dropped in grouped_reduce" 

3378 ) 

3379 

3380 if is_ser: 

3381 res = self._wrap_agged_manager(res_mgr) 

3382 else: 

3383 res = obj._constructor(res_mgr) 

3384 

3385 if orig_scalar: 

3386 # Avoid expensive MultiIndex construction 

3387 return self._wrap_aggregated_output(res) 

3388 return self._wrap_aggregated_output(res, qs=qs) 

3389 
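    # [Editor's sketch, not pandas source] A list-valued q takes the qs= path
    # above and adds the quantile as an extra index level; toy data assumed.
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})
    # >>> df.groupby("key").quantile([0.25, 0.75])
    #            val
    # key
    # a   0.25  1.25
    #     0.75  1.75
    # b   0.25  3.25
    #     0.75  3.75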

3390 @final 

3391 @Substitution(name="groupby") 

3392 def ngroup(self, ascending: bool = True): 

3393 """ 

3394 Number each group from 0 to the number of groups - 1. 

3395 

3396 This is the enumerative complement of cumcount. Note that the 

3397 numbers given to the groups match the order in which the groups 

3398 would be seen when iterating over the groupby object, not the 

3399 order they are first observed. 

3400 

3401 Parameters 

3402 ---------- 

3403 ascending : bool, default True 

3404 If False, number in reverse, from number of group - 1 to 0. 

3405 

3406 Returns 

3407 ------- 

3408 Series 

3409 Unique numbers for each group. 

3410 

3411 See Also 

3412 -------- 

3413 .cumcount : Number the rows in each group. 

3414 

3415 Examples 

3416 -------- 

3417 >>> df = pd.DataFrame({"A": list("aaabba")}) 

3418 >>> df 

3419 A 

3420 0 a 

3421 1 a 

3422 2 a 

3423 3 b 

3424 4 b 

3425 5 a 

3426 >>> df.groupby('A').ngroup() 

3427 0 0 

3428 1 0 

3429 2 0 

3430 3 1 

3431 4 1 

3432 5 0 

3433 dtype: int64 

3434 >>> df.groupby('A').ngroup(ascending=False) 

3435 0 1 

3436 1 1 

3437 2 1 

3438 3 0 

3439 4 0 

3440 5 1 

3441 dtype: int64 

3442 >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() 

3443 0 0 

3444 1 0 

3445 2 1 

3446 3 3 

3447 4 2 

3448 5 0 

3449 dtype: int64 

3450 """ 

3451 with self._group_selection_context(): 

3452 index = self._selected_obj.index 

3453 comp_ids = self.grouper.group_info[0] 

3454 

3455 dtype: type 

3456 if self.grouper.has_dropped_na: 

3457 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) 

3458 dtype = np.float64 

3459 else: 

3460 dtype = np.int64 

3461 

3462 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) 

3463 if not ascending: 

3464 result = self.ngroups - 1 - result 

3465 return result 

3466 

3467 @final 

3468 @Substitution(name="groupby") 

3469 def cumcount(self, ascending: bool = True): 

3470 """ 

3471 Number each item in each group from 0 to the length of that group - 1. 

3472 

3473 Essentially this is equivalent to 

3474 

3475 .. code-block:: python 

3476 

3477 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) 

3478 

3479 Parameters 

3480 ---------- 

3481 ascending : bool, default True 

3482 If False, number in reverse, from length of group - 1 to 0. 

3483 

3484 Returns 

3485 ------- 

3486 Series 

3487 Sequence number of each element within each group. 

3488 

3489 See Also 

3490 -------- 

3491 .ngroup : Number the groups themselves. 

3492 

3493 Examples 

3494 -------- 

3495 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], 

3496 ... columns=['A']) 

3497 >>> df 

3498 A 

3499 0 a 

3500 1 a 

3501 2 a 

3502 3 b 

3503 4 b 

3504 5 a 

3505 >>> df.groupby('A').cumcount() 

3506 0 0 

3507 1 1 

3508 2 2 

3509 3 0 

3510 4 1 

3511 5 3 

3512 dtype: int64 

3513 >>> df.groupby('A').cumcount(ascending=False) 

3514 0 3 

3515 1 2 

3516 2 1 

3517 3 1 

3518 4 0 

3519 5 0 

3520 dtype: int64 

3521 """ 

3522 with self._group_selection_context(): 

3523 index = self._selected_obj._get_axis(self.axis) 

3524 cumcounts = self._cumcount_array(ascending=ascending) 

3525 return self._obj_1d_constructor(cumcounts, index) 

3526 

3527 @final 

3528 @Substitution(name="groupby") 

3529 @Substitution(see_also=_common_see_also) 

3530 def rank( 

3531 self, 

3532 method: str = "average", 

3533 ascending: bool = True, 

3534 na_option: str = "keep", 

3535 pct: bool = False, 

3536 axis: int = 0, 

3537 ) -> NDFrameT: 

3538 """ 

3539 Provide the rank of values within each group. 

3540 

3541 Parameters 

3542 ---------- 

3543 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' 

3544 * average: average rank of group. 

3545 * min: lowest rank in group. 

3546 * max: highest rank in group. 

3547 * first: ranks assigned in order they appear in the array. 

3548 * dense: like 'min', but rank always increases by 1 between groups. 

3549 ascending : bool, default True 

3550 False for ranks by high (1) to low (N). 

3551 na_option : {'keep', 'top', 'bottom'}, default 'keep' 

3552 * keep: leave NA values where they are. 

3553 * top: smallest rank if ascending. 

3554 * bottom: smallest rank if descending. 

3555 pct : bool, default False 

3556 Compute percentage rank of data within each group. 

3557 axis : int, default 0 

3558 The axis of the object over which to compute the rank. 

3559 

3560 Returns 

3561 ------- 

3562 DataFrame with ranking of values within each group 

3563 %(see_also)s 

3564 Examples 

3565 -------- 

3566 >>> df = pd.DataFrame( 

3567 ... { 

3568 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], 

3569 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], 

3570 ... } 

3571 ... ) 

3572 >>> df 

3573 group value 

3574 0 a 2 

3575 1 a 4 

3576 2 a 2 

3577 3 a 3 

3578 4 a 5 

3579 5 b 1 

3580 6 b 2 

3581 7 b 4 

3582 8 b 1 

3583 9 b 5 

3584 >>> for method in ['average', 'min', 'max', 'dense', 'first']: 

3585 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) 

3586 >>> df 

3587 group value average_rank min_rank max_rank dense_rank first_rank 

3588 0 a 2 1.5 1.0 2.0 1.0 1.0 

3589 1 a 4 4.0 4.0 4.0 3.0 4.0 

3590 2 a 2 1.5 1.0 2.0 1.0 2.0 

3591 3 a 3 3.0 3.0 3.0 2.0 3.0 

3592 4 a 5 5.0 5.0 5.0 4.0 5.0 

3593 5 b 1 1.5 1.0 2.0 1.0 1.0 

3594 6 b 2 3.0 3.0 3.0 2.0 3.0 

3595 7 b 4 4.0 4.0 4.0 3.0 4.0 

3596 8 b 1 1.5 1.0 2.0 1.0 2.0 

3597 9 b 5 5.0 5.0 5.0 4.0 5.0 

3598 """ 

3599 if na_option not in {"keep", "top", "bottom"}: 

3600 msg = "na_option must be one of 'keep', 'top', or 'bottom'" 

3601 raise ValueError(msg) 

3602 

3603 kwargs = { 

3604 "ties_method": method, 

3605 "ascending": ascending, 

3606 "na_option": na_option, 

3607 "pct": pct, 

3608 } 

3609 if axis != 0: 

3610 # DataFrame uses different keyword name 

3611 kwargs["method"] = kwargs.pop("ties_method") 

3612 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs) 

3613 result = self._python_apply_general( 

3614 f, self._selected_obj, is_transform=True 

3615 ) 

3616 return result 

3617 

3618 return self._cython_transform( 

3619 "rank", 

3620 numeric_only=False, 

3621 axis=axis, 

3622 **kwargs, 

3623 ) 

3624 

3625 @final 

3626 @Substitution(name="groupby") 

3627 @Appender(_common_see_also) 

3628 def cumprod(self, axis=0, *args, **kwargs) -> NDFrameT: 

3629 """ 

3630 Cumulative product for each group. 

3631 

3632 Returns 

3633 ------- 

3634 Series or DataFrame 
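
Examples

--------

For illustration, a small hypothetical Series grouped by its index:

>>> ser = pd.Series([2, 3, 4], index=["a", "a", "b"])

>>> ser.groupby(level=0).cumprod()

a    2

a    6

b    4

dtype: int64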

3635 """ 

3636 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) 

3637 if axis != 0: 

3638 f = lambda x: x.cumprod(axis=axis, **kwargs) 

3639 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

3640 

3641 return self._cython_transform("cumprod", **kwargs) 

3642 

3643 @final 

3644 @Substitution(name="groupby") 

3645 @Appender(_common_see_also) 

3646 def cumsum(self, axis=0, *args, **kwargs) -> NDFrameT: 

3647 """ 

3648 Cumulative sum for each group. 

3649 

3650 Returns 

3651 ------- 

3652 Series or DataFrame 
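
Examples

--------

A minimal sketch with hypothetical data:

>>> ser = pd.Series([6, 2, 0], index=["a", "a", "b"])

>>> ser.groupby(level=0).cumsum()

a    6

a    8

b    0

dtype: int64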

3653 """ 

3654 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) 

3655 if axis != 0: 

3656 f = lambda x: x.cumsum(axis=axis, **kwargs) 

3657 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

3658 

3659 return self._cython_transform("cumsum", **kwargs) 

3660 

3661 @final 

3662 @Substitution(name="groupby") 

3663 @Appender(_common_see_also) 

3664 def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT: 

3665 """ 

3666 Cumulative min for each group. 

3667 

3668 Returns 

3669 ------- 

3670 Series or DataFrame 
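
Examples

--------

Illustration with a hypothetical two-group Series:

>>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=["a", "a", "a", "b", "b", "b"])

>>> ser.groupby(level=0).cummin()

a    1

a    1

a    1

b    3

b    0

b    0

dtype: int64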

3671 """ 

3672 skipna = kwargs.get("skipna", True) 

3673 if axis != 0: 

3674 f = lambda x: np.minimum.accumulate(x, axis) 

3675 numeric_only_bool = self._resolve_numeric_only("cummin", numeric_only, axis) 

3676 obj = self._selected_obj 

3677 if numeric_only_bool: 

3678 obj = obj._get_numeric_data() 

3679 return self._python_apply_general(f, obj, is_transform=True) 

3680 

3681 return self._cython_transform( 

3682 "cummin", numeric_only=numeric_only, skipna=skipna 

3683 ) 

3684 

3685 @final 

3686 @Substitution(name="groupby") 

3687 @Appender(_common_see_also) 

3688 def cummax(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT: 

3689 """ 

3690 Cumulative max for each group. 

3691 

3692 Returns 

3693 ------- 

3694 Series or DataFrame 
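
Examples

--------

As a sketch, using hypothetical data:

>>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=["a", "a", "a", "b", "b", "b"])

>>> ser.groupby(level=0).cummax()

a    1

a    6

a    6

b    3

b    3

b    4

dtype: int64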

3695 """ 

3696 skipna = kwargs.get("skipna", True) 

3697 if axis != 0: 

3698 f = lambda x: np.maximum.accumulate(x, axis) 

3699 numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) 

3700 obj = self._selected_obj 

3701 if numeric_only_bool: 

3702 obj = obj._get_numeric_data() 

3703 return self._python_apply_general(f, obj, is_transform=True) 

3704 

3705 return self._cython_transform( 

3706 "cummax", numeric_only=numeric_only, skipna=skipna 

3707 ) 

3708 

3709 @final 

3710 def _get_cythonized_result( 

3711 self, 

3712 base_func: Callable, 

3713 cython_dtype: np.dtype, 

3714 numeric_only: bool | lib.NoDefault = lib.no_default, 

3715 needs_counts: bool = False, 

3716 needs_nullable: bool = False, 

3717 needs_mask: bool = False, 

3718 pre_processing=None, 

3719 post_processing=None, 

3720 **kwargs, 

3721 ): 

3722 """ 

3723 Get result for Cythonized functions. 

3724 

3725 Parameters 

3726 ---------- 

3727 base_func : callable, Cythonized function to be called 

3728 cython_dtype : np.dtype 

3729 Type of the array that will be modified by the Cython call. 

3730 numeric_only : bool or lib.no_default, default lib.no_default 

3731 Whether only numeric datatypes should be computed 

3732 needs_counts : bool, default False 

3733 Whether the counts should be a part of the Cython call 

3734 needs_mask : bool, default False 

3735 Whether boolean mask needs to be part of the Cython call 

3736 signature 

3737 needs_nullable : bool, default False 

3738 Whether a bool specifying if the input is nullable is part 

3739 of the Cython call signature 

3740 pre_processing : function, default None 

3741 Function to be applied to `values` prior to passing to Cython. 

3742 Function should return a tuple where the first element is the 

3743 values to be passed to Cython and the second element is an optional 

3744 type which the values should be converted to after being returned 

3745 by the Cython operation. This function is also 

3746 responsible for raising a TypeError if the values 

3747 have an invalid type. 

3748 post_processing : function, default None 

3749 Function to be applied to result of Cython function. Should accept 

3750 an array of values as the first argument and type inferences as its 

3751 second argument, i.e. the signature should be 

3752 (ndarray, Type). If `needs_nullable=True`, a third argument should be 

3753 `nullable`, to allow for processing specific to nullable values. 

3754 **kwargs : dict 

3755 Extra arguments to be passed back to Cython funcs 

3756 

3757 Returns 

3758 ------- 

3759 `Series` or `DataFrame` with filled values 
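
Examples

--------

A hypothetical ``pre_processing`` hook honoring the contract above: it

returns the values to hand to Cython plus an optional dtype to restore

afterwards, and raises for unsupported input.

>>> def pre_processing(values):

...     if not is_numeric_dtype(values):

...         raise TypeError("values must be numeric")

...     return values.astype(np.float64, copy=False), values.dtype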

3760 """ 

3761 how = base_func.__name__ 

3762 numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0) 

3763 

3764 if post_processing and not callable(post_processing): 

3765 raise ValueError("'post_processing' must be a callable!") 

3766 if pre_processing and not callable(pre_processing): 

3767 raise ValueError("'pre_processing' must be a callable!") 

3768 

3769 grouper = self.grouper 

3770 

3771 ids, _, ngroups = grouper.group_info 

3772 

3773 base_func = partial(base_func, labels=ids) 

3774 

3775 def blk_func(values: ArrayLike) -> ArrayLike: 

3776 values = values.T 

3777 ncols = 1 if values.ndim == 1 else values.shape[1] 

3778 

3779 result: ArrayLike 

3780 result = np.zeros(ngroups * ncols, dtype=cython_dtype) 

3781 result = result.reshape((ngroups, ncols)) 

3782 

3783 func = partial(base_func, out=result) 

3784 

3785 inferences = None 

3786 

3787 if needs_counts: 

3788 counts = np.zeros(self.ngroups, dtype=np.int64) 

3789 func = partial(func, counts=counts) 

3790 

3791 vals = values 

3792 if pre_processing: 

3793 vals, inferences = pre_processing(vals) 

3794 

3795 vals = vals.astype(cython_dtype, copy=False) 

3796 if vals.ndim == 1: 

3797 vals = vals.reshape((-1, 1)) 

3798 func = partial(func, values=vals) 

3799 

3800 if needs_mask: 

3801 mask = isna(values).view(np.uint8) 

3802 if mask.ndim == 1: 

3803 mask = mask.reshape(-1, 1) 

3804 func = partial(func, mask=mask) 

3805 

3806 if needs_nullable: 

3807 is_nullable = isinstance(values, BaseMaskedArray) 

3808 func = partial(func, nullable=is_nullable) 

3809 

3810 func(**kwargs) # Call func to modify indexer values in place 

3811 

3812 if values.ndim == 1: 

3813 assert result.shape[1] == 1, result.shape 

3814 result = result[:, 0] 

3815 

3816 if post_processing: 

3817 pp_kwargs = {} 

3818 if needs_nullable: 

3819 pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) 

3820 

3821 result = post_processing(result, inferences, **pp_kwargs) 

3822 

3823 return result.T 

3824 

3825 obj = self._obj_with_exclusions 

3826 

3827 # Operate block-wise instead of column-by-column 

3828 is_ser = obj.ndim == 1 

3829 mgr = self._get_data_to_aggregate() 

3830 orig_mgr_len = len(mgr) 

3831 

3832 if numeric_only_bool: 

3833 mgr = mgr.get_numeric_data() 

3834 

3835 res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) 

3836 

3837 if not is_ser and len(res_mgr.items) != orig_mgr_len: 

3838 howstr = how.replace("group_", "") 

3839 warn_dropping_nuisance_columns_deprecated(type(self), howstr, numeric_only) 

3840 

3841 if len(res_mgr.items) == 0: 

3842 # We re-call grouped_reduce to get the right exception message 

3843 mgr.grouped_reduce(blk_func, ignore_failures=False) 

3844 # grouped_reduce _should_ raise, so this should not be reached 

3845 raise TypeError( # pragma: no cover 

3846 "All columns were dropped in grouped_reduce" 

3847 ) 

3848 

3849 if is_ser: 

3850 out = self._wrap_agged_manager(res_mgr) 

3851 else: 

3852 out = obj._constructor(res_mgr) 

3853 

3854 return self._wrap_aggregated_output(out) 

3855 

3856 @final 

3857 @Substitution(name="groupby") 

3858 def shift(self, periods=1, freq=None, axis=0, fill_value=None): 

3859 """ 

3860 Shift each group by periods observations. 

3861 

3862 If freq is passed, the index will be increased using the periods and the freq. 

3863 

3864 Parameters 

3865 ---------- 

3866 periods : int, default 1 

3867 Number of periods to shift. 

3868 freq : str, optional 

3869 Frequency string. 

3870 axis : int, default 0 

3871 Axis along which to shift. 

3872 fill_value : optional 

3873 The scalar value to use for newly introduced missing values. 

3874 

3875 Returns 

3876 ------- 

3877 Series or DataFrame 

3878 Object shifted within each group. 

3879 

3880 See Also 

3881 -------- 

3882 Index.shift : Shift values of Index. 

3883 tshift : Shift the time index, using the index's frequency 

3884 if available. 
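
Examples

--------

A minimal sketch with a hypothetical two-group frame:

>>> df = pd.DataFrame({"A": ["a", "a", "b", "b"], "B": [1, 2, 3, 4]})

>>> df.groupby("A").shift(1)

     B

0  NaN

1  1.0

2  NaN

3  3.0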

3885 """ 

3886 if freq is not None or axis != 0: 

3887 f = lambda x: x.shift(periods, freq, axis, fill_value) 

3888 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

3889 

3890 ids, _, ngroups = self.grouper.group_info 

3891 res_indexer = np.zeros(len(ids), dtype=np.int64) 

3892 

3893 libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods) 

3894 

3895 obj = self._obj_with_exclusions 

3896 

3897 res = obj._reindex_with_indexers( 

3898 {self.axis: (obj.axes[self.axis], res_indexer)}, 

3899 fill_value=fill_value, 

3900 allow_dups=True, 

3901 ) 

3902 return res 

3903 

3904 @final 

3905 @Substitution(name="groupby") 

3906 @Appender(_common_see_also) 

3907 def diff(self, periods: int = 1, axis: int = 0) -> NDFrameT: 

3908 """ 

3909 First discrete difference of element. 

3910 

3911 Calculates the difference of each element compared with another 

3912 element in the group (default is element in previous row). 

3913 

3914 Parameters 

3915 ---------- 

3916 periods : int, default 1 

3917 Periods to shift for calculating difference, accepts negative values. 

3918 axis : int, default 0 

3919 Take difference over rows (0) or columns (1). 

3920 

3921 Returns 

3922 ------- 

3923 Series or DataFrame 

3924 First differences. 
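
Examples

--------

Illustration on a hypothetical Series grouped by its index:

>>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=["a", "a", "a", "b", "b", "b"])

>>> ser.groupby(level=0).diff()

a    NaN

a   -5.0

a    6.0

b    NaN

b   -1.0

b    0.0

dtype: float64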

3925 """ 

3926 if axis != 0: 

3927 return self.apply(lambda x: x.diff(periods=periods, axis=axis)) 

3928 

3929 obj = self._obj_with_exclusions 

3930 shifted = self.shift(periods=periods, axis=axis) 

3931 

3932 # GH45562 - to retain existing behavior and match behavior of Series.diff(), 

3933 # int8 and int16 are coerced to float32 rather than float64. 

3934 dtypes_to_f32 = ["int8", "int16"] 

3935 if obj.ndim == 1: 

3936 if obj.dtype in dtypes_to_f32: 

3937 shifted = shifted.astype("float32") 

3938 else: 

3939 to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] 

3940 if len(to_coerce): 

3941 shifted = shifted.astype({c: "float32" for c in to_coerce}) 

3942 

3943 return obj - shifted 

3944 

3945 @final 

3946 @Substitution(name="groupby") 

3947 @Appender(_common_see_also) 

3948 def pct_change(self, periods=1, fill_method="ffill", limit=None, freq=None, axis=0): 

3949 """ 

3950 Calculate pct_change of each value to previous entry in group. 

3951 

3952 Returns 

3953 ------- 

3954 Series or DataFrame 

3955 Percentage changes within each group. 
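
Examples

--------

A small sketch with hypothetical data:

>>> ser = pd.Series([1, 2, 4, 8], index=["a", "a", "b", "b"])

>>> ser.groupby(level=0).pct_change()

a    NaN

a    1.0

b    NaN

b    1.0

dtype: float64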

3956 """ 

3957 # TODO(GH#23918): Remove this conditional for SeriesGroupBy when 

3958 # GH#23918 is fixed 

3959 if freq is not None or axis != 0: 

3960 f = lambda x: x.pct_change( 

3961 periods=periods, 

3962 fill_method=fill_method, 

3963 limit=limit, 

3964 freq=freq, 

3965 axis=axis, 

3966 ) 

3967 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

3968 

3969 if fill_method is None: # GH30463 

3970 fill_method = "ffill" 

3971 limit = 0 

3972 filled = getattr(self, fill_method)(limit=limit) 

3973 fill_grp = filled.groupby( 

3974 self.grouper.codes, axis=self.axis, group_keys=self.group_keys 

3975 ) 

3976 shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis) 

3977 return (filled / shifted) - 1 

3978 

3979 @final 

3980 @Substitution(name="groupby") 

3981 @Substitution(see_also=_common_see_also) 

3982 def head(self, n: int = 5) -> NDFrameT: 

3983 """ 

3984 Return first n rows of each group. 

3985 

3986 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows 

3987 from the original DataFrame with original index and order preserved 

3988 (``as_index`` flag is ignored). 

3989 

3990 Parameters 

3991 ---------- 

3992 n : int 

3993 If positive: number of entries to include from start of each group. 

3994 If negative: number of entries to exclude from end of each group. 

3995 

3996 Returns 

3997 ------- 

3998 Series or DataFrame 

3999 Subset of original Series or DataFrame as determined by n. 

4000 %(see_also)s 

4001 Examples 

4002 -------- 

4003 

4004 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], 

4005 ... columns=['A', 'B']) 

4006 >>> df.groupby('A').head(1) 

4007 A B 

4008 0 1 2 

4009 2 5 6 

4010 >>> df.groupby('A').head(-1) 

4011 A B 

4012 0 1 2 

4013 """ 

4014 self._reset_group_selection() 

4015 mask = self._make_mask_from_positional_indexer(slice(None, n)) 

4016 return self._mask_selected_obj(mask) 

4017 

4018 @final 

4019 @Substitution(name="groupby") 

4020 @Substitution(see_also=_common_see_also) 

4021 def tail(self, n: int = 5) -> NDFrameT: 

4022 """ 

4023 Return last n rows of each group. 

4024 

4025 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows 

4026 from the original DataFrame with original index and order preserved 

4027 (``as_index`` flag is ignored). 

4028 

4029 Parameters 

4030 ---------- 

4031 n : int 

4032 If positive: number of entries to include from end of each group. 

4033 If negative: number of entries to exclude from start of each group. 

4034 

4035 Returns 

4036 ------- 

4037 Series or DataFrame 

4038 Subset of original Series or DataFrame as determined by n. 

4039 %(see_also)s 

4040 Examples 

4041 -------- 

4042 

4043 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], 

4044 ... columns=['A', 'B']) 

4045 >>> df.groupby('A').tail(1) 

4046 A B 

4047 1 a 2 

4048 3 b 2 

4049 >>> df.groupby('A').tail(-1) 

4050 A B 

4051 1 a 2 

4052 3 b 2 

4053 """ 

4054 self._reset_group_selection() 

4055 if n: 

4056 mask = self._make_mask_from_positional_indexer(slice(-n, None)) 

4057 else: 

4058 mask = self._make_mask_from_positional_indexer([]) 

4059 

4060 return self._mask_selected_obj(mask) 

4061 

4062 @final 

4063 def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: 

4064 """ 

4065 Return _selected_obj with mask applied to the correct axis. 

4066 

4067 Parameters 

4068 ---------- 

4069 mask : np.ndarray[bool] 

4070 Boolean mask to apply. 

4071 

4072 Returns 

4073 ------- 

4074 Series or DataFrame 

4075 Filtered _selected_obj. 

4076 """ 

4077 ids = self.grouper.group_info[0] 

4078 mask = mask & (ids != -1) 

4079 

4080 if self.axis == 0: 

4081 return self._selected_obj[mask] 

4082 else: 

4083 return self._selected_obj.iloc[:, mask] 

4084 

4085 @final 

4086 def _reindex_output( 

4087 self, 

4088 output: OutputFrameOrSeries, 

4089 fill_value: Scalar = np.NaN, 

4090 qs: npt.NDArray[np.float64] | None = None, 

4091 ) -> OutputFrameOrSeries: 

4092 """ 

4093 If we have categorical groupers, then we might want to make sure that 

4094 we have a fully re-indexed output to the levels. This means expanding 

4095 the output space to accommodate all values in the cartesian product of 

4096 our groups, regardless of whether they were observed in the data or 

4097 not. This will expand the output space if there are missing groups. 

4098 

4099 The method returns early without modifying the input if the number of 

4100 groupings is less than 2, ``self.observed`` is True, or none of the 

4101 groupers are categorical. 

4102 

4103 Parameters 

4104 ---------- 

4105 output : Series or DataFrame 

4106 Object resulting from grouping and applying an operation. 

4107 fill_value : scalar, default np.NaN 

4108 Value to use for unobserved categories if self.observed is False. 

4109 qs : np.ndarray[float64] or None, default None 

4110 quantile values, only relevant for quantile. 

4111 

4112 Returns 

4113 ------- 

4114 Series or DataFrame 

4115 Object (potentially) re-indexed to include all possible groups. 
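
Examples

--------

A sketch (hypothetical data) of the expansion performed when grouping by

a categorical and a second key with ``observed=False``:

>>> cat = pd.Categorical(["a", "a"], categories=["a", "b"])

>>> ser = pd.Series([1, 2])

>>> ser.groupby([cat, [1, 2]], observed=False).min()

a  1    1.0

   2    2.0

b  1    NaN

   2    NaN

dtype: float64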

4116 """ 

4117 groupings = self.grouper.groupings 

4118 if len(groupings) == 1: 

4119 return output 

4120 

4121 # if we only care about the observed values 

4122 # we are done 

4123 elif self.observed: 

4124 return output 

4125 

4126 # reindexing only applies to a Categorical grouper 

4127 elif not any( 

4128 isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) 

4129 for ping in groupings 

4130 ): 

4131 return output 

4132 

4133 levels_list = [ping.group_index for ping in groupings] 

4134 names = self.grouper.names 

4135 if qs is not None: 

4136 # error: Argument 1 to "append" of "list" has incompatible type 

4137 # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" 

4138 levels_list.append(qs) # type: ignore[arg-type] 

4139 names = names + [None] 

4140 index, _ = MultiIndex.from_product(levels_list, names=names).sortlevel() 

4141 

4142 if self.as_index: 

4143 # Always holds for SeriesGroupBy unless GH#36507 is implemented 

4144 d = { 

4145 self.obj._get_axis_name(self.axis): index, 

4146 "copy": False, 

4147 "fill_value": fill_value, 

4148 } 

4149 return output.reindex(**d) 

4150 

4151 # GH 13204 

4152 # Here, the categorical in-axis groupers, which need to be fully 

4153 # expanded, are columns in `output`. An idea is to do: 

4154 # output = output.set_index(self.grouper.names) 

4155 # .reindex(index).reset_index() 

4156 # but special care has to be taken because of possible not-in-axis 

4157 # groupers. 

4158 # So, we manually select and drop the in-axis grouper columns, 

4159 # reindex `output`, and then reset the in-axis grouper columns. 

4160 

4161 # Select in-axis groupers 

4162 in_axis_grps = ( 

4163 (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis 

4164 ) 

4165 g_nums, g_names = zip(*in_axis_grps) 

4166 

4167 output = output.drop(labels=list(g_names), axis=1) 

4168 

4169 # Set a temp index and reindex (possibly expanding) 

4170 output = output.set_index(self.grouper.result_index).reindex( 

4171 index, copy=False, fill_value=fill_value 

4172 ) 

4173 

4174 # Reset in-axis grouper columns 

4175 # (using level numbers `g_nums` because level names may not be unique) 

4176 output = output.reset_index(level=g_nums) 

4177 

4178 return output.reset_index(drop=True) 

4179 

4180 @final 

4181 def sample( 

4182 self, 

4183 n: int | None = None, 

4184 frac: float | None = None, 

4185 replace: bool = False, 

4186 weights: Sequence | Series | None = None, 

4187 random_state: RandomState | None = None, 

4188 ): 

4189 """ 

4190 Return a random sample of items from each group. 

4191 

4192 You can use `random_state` for reproducibility. 

4193 

4194 .. versionadded:: 1.1.0 

4195 

4196 Parameters 

4197 ---------- 

4198 n : int, optional 

4199 Number of items to return for each group. Cannot be used with 

4200 `frac` and must be no larger than the smallest group unless 

4201 `replace` is True. Default is one if `frac` is None. 

4202 frac : float, optional 

4203 Fraction of items to return. Cannot be used with `n`. 

4204 replace : bool, default False 

4205 Allow or disallow sampling of the same row more than once. 

4206 weights : list-like, optional 

4207 Default None results in equal probability weighting. 

4208 If passed a list-like then values must have the same length as 

4209 the underlying DataFrame or Series object and will be used as 

4210 sampling probabilities after normalization within each group. 

4211 Values must be non-negative with at least one positive element 

4212 within each group. 

4213 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional 

4214 If int, array-like, or BitGenerator, seed for random number generator. 

4215 If np.random.RandomState or np.random.Generator, use as given. 

4216 

4217 .. versionchanged:: 1.4.0 

4218 

4219 np.random.Generator objects now accepted 

4220 

4221 Returns 

4222 ------- 

4223 Series or DataFrame 

4224 A new object of same type as caller containing items randomly 

4225 sampled within each group from the caller object. 

4226 

4227 See Also 

4228 -------- 

4229 DataFrame.sample: Generate random samples from a DataFrame object. 

4230 numpy.random.choice: Generate a random sample from a given 1-D numpy 

4231 array. 

4232 

4233 Examples 

4234 -------- 

4235 >>> df = pd.DataFrame( 

4236 ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} 

4237 ... ) 

4238 >>> df 

4239 a b 

4240 0 red 0 

4241 1 red 1 

4242 2 blue 2 

4243 3 blue 3 

4244 4 black 4 

4245 5 black 5 

4246 

4247 Select one row at random for each distinct value in column a. The 

4248 `random_state` argument can be used to guarantee reproducibility: 

4249 

4250 >>> df.groupby("a").sample(n=1, random_state=1) 

4251 a b 

4252 4 black 4 

4253 2 blue 2 

4254 1 red 1 

4255 

4256 Set `frac` to sample fixed proportions rather than counts: 

4257 

4258 >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2) 

4259 5 5 

4260 2 2 

4261 0 0 

4262 Name: b, dtype: int64 

4263 

4264 Control sample probabilities within groups by setting weights: 

4265 

4266 >>> df.groupby("a").sample( 

4267 ... n=1, 

4268 ... weights=[1, 1, 1, 0, 0, 1], 

4269 ... random_state=1, 

4270 ... ) 

4271 a b 

4272 5 black 5 

4273 2 blue 2 

4274 0 red 0 

4275 """ # noqa:E501 

4276 size = sample.process_sampling_size(n, frac, replace) 

4277 if weights is not None: 

4278 weights_arr = sample.preprocess_weights( 

4279 self._selected_obj, weights, axis=self.axis 

4280 ) 

4281 

4282 random_state = com.random_state(random_state) 

4283 

4284 group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) 

4285 

4286 sampled_indices = [] 

4287 for labels, obj in group_iterator: 

4288 grp_indices = self.indices[labels] 

4289 group_size = len(grp_indices) 

4290 if size is not None: 

4291 sample_size = size 

4292 else: 

4293 assert frac is not None 

4294 sample_size = round(frac * group_size) 

4295 

4296 grp_sample = sample.sample( 

4297 group_size, 

4298 size=sample_size, 

4299 replace=replace, 

4300 weights=None if weights is None else weights_arr[grp_indices], 

4301 random_state=random_state, 

4302 ) 

4303 sampled_indices.append(grp_indices[grp_sample]) 

4304 

4305 sampled_indices = np.concatenate(sampled_indices) 

4306 return self._selected_obj.take(sampled_indices, axis=self.axis) 

4307 

4308 

4309@doc(GroupBy) 

4310def get_groupby( 

4311 obj: NDFrame, 

4312 by: _KeysArgType | None = None, 

4313 axis: int = 0, 

4314 level=None, 

4315 grouper: ops.BaseGrouper | None = None, 

4316 exclusions=None, 

4317 selection=None, 

4318 as_index: bool = True, 

4319 sort: bool = True, 

4320 group_keys: bool | lib.NoDefault = True, 

4321 squeeze: bool = False, 

4322 observed: bool = False, 

4323 mutated: bool = False, 

4324 dropna: bool = True, 

4325) -> GroupBy: 

4326 

4327 klass: type[GroupBy] 

4328 if isinstance(obj, Series): 

4329 from pandas.core.groupby.generic import SeriesGroupBy 

4330 

4331 klass = SeriesGroupBy 

4332 elif isinstance(obj, DataFrame): 

4333 from pandas.core.groupby.generic import DataFrameGroupBy 

4334 

4335 klass = DataFrameGroupBy 

4336 else: # pragma: no cover 

4337 raise TypeError(f"invalid type: {obj}") 

4338 

4339 return klass( 

4340 obj=obj, 

4341 keys=by, 

4342 axis=axis, 

4343 level=level, 

4344 grouper=grouper, 

4345 exclusions=exclusions, 

4346 selection=selection, 

4347 as_index=as_index, 

4348 sort=sort, 

4349 group_keys=group_keys, 

4350 squeeze=squeeze, 

4351 observed=observed, 

4352 mutated=mutated, 

4353 dropna=dropna, 

4354 ) 

4355 

4356 

4357def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex: 

4358 """ 

4359 Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex. 

4360 

4361 The quantile level in the MultiIndex is a repeated copy of 'qs'. 

4362 

4363 Parameters 

4364 ---------- 

4365 idx : Index 

4366 qs : np.ndarray[float64] 

4367 

4368 Returns 

4369 ------- 

4370 MultiIndex 
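
Examples

--------

A sketch of the non-MultiIndex path, with hypothetical inputs:

>>> idx = Index(["x", "y"], name="grp")

>>> _insert_quantile_level(idx, np.array([0.25, 0.75]))

MultiIndex([('x', 0.25),

            ('x', 0.75),

            ('y', 0.25),

            ('y', 0.75)],

           names=['grp', None])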

4371 """ 

4372 nqs = len(qs) 

4373 

4374 if idx._is_multi: 

4375 idx = cast(MultiIndex, idx) 

4376 lev_codes, lev = Index(qs).factorize() 

4377 levels = list(idx.levels) + [lev] 

4378 codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] 

4379 mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) 

4380 else: 

4381 mi = MultiIndex.from_product([idx, qs]) 

4382 return mi 

4383 

4384 

4385def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None: 

4386 if numeric_only is not lib.no_default and not numeric_only: 

4387 # numeric_only was specified and falsey but still dropped nuisance columns 

4388 warnings.warn( 

4389 "Dropping invalid columns in " 

4390 f"{cls.__name__}.{how} is deprecated. " 

4391 "In a future version, a TypeError will be raised. " 

4392 f"Before calling .{how}, select only columns which " 

4393 "should be valid for the function.", 

4394 FutureWarning, 

4395 stacklevel=find_stack_level(), 

4396 ) 

4397 elif numeric_only is lib.no_default: 

4398 warnings.warn( 

4399 "The default value of numeric_only in " 

4400 f"{cls.__name__}.{how} is deprecated. " 

4401 "In a future version, numeric_only will default to False. " 

4402 f"Either specify numeric_only or select only columns which " 

4403 "should be valid for the function.", 

4404 FutureWarning, 

4405 stacklevel=find_stack_level(), 

4406 )