Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/generic.py: 12%
742 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Define the SeriesGroupBy and DataFrameGroupBy
3classes that hold the groupby interfaces (and some implementations).
5These are user facing as the result of the ``df.groupby(...)`` operations,
6which here returns a DataFrameGroupBy object.
7"""
8from __future__ import annotations
10from collections import abc
11from functools import partial
12from textwrap import dedent
13from typing import (
14 TYPE_CHECKING,
15 Any,
16 Callable,
17 Hashable,
18 Iterable,
19 Mapping,
20 NamedTuple,
21 Sequence,
22 TypeVar,
23 Union,
24 cast,
25)
26import warnings
28import numpy as np
30from pandas._libs import (
31 Interval,
32 lib,
33 reduction as libreduction,
34)
35from pandas._typing import (
36 ArrayLike,
37 Manager,
38 Manager2D,
39 SingleManager,
40)
41from pandas.errors import SpecificationError
42from pandas.util._decorators import (
43 Appender,
44 Substitution,
45 doc,
46)
47from pandas.util._exceptions import find_stack_level
49from pandas.core.dtypes.common import (
50 ensure_int64,
51 is_bool,
52 is_categorical_dtype,
53 is_dict_like,
54 is_integer_dtype,
55 is_interval_dtype,
56 is_scalar,
57)
58from pandas.core.dtypes.missing import (
59 isna,
60 notna,
61)
63from pandas.core import (
64 algorithms,
65 nanops,
66)
67from pandas.core.apply import (
68 GroupByApply,
69 maybe_mangle_lambdas,
70 reconstruct_func,
71 validate_func_kwargs,
72)
73from pandas.core.arrays.categorical import Categorical
74import pandas.core.common as com
75from pandas.core.construction import create_series_with_explicit_dtype
76from pandas.core.frame import DataFrame
77from pandas.core.groupby import base
78from pandas.core.groupby.groupby import (
79 GroupBy,
80 _agg_template,
81 _apply_docs,
82 _transform_template,
83 warn_dropping_nuisance_columns_deprecated,
84)
85from pandas.core.groupby.grouper import get_grouper
86from pandas.core.indexes.api import (
87 Index,
88 MultiIndex,
89 all_indexes_same,
90)
91from pandas.core.indexes.category import CategoricalIndex
92from pandas.core.series import Series
93from pandas.core.shared_docs import _shared_docs
94from pandas.core.util.numba_ import maybe_use_numba
96from pandas.plotting import boxplot_frame_groupby
98if TYPE_CHECKING: 98 ↛ 99line 98 didn't jump to line 99, because the condition on line 98 was never true
99 from pandas.core.generic import NDFrame
101# TODO(typing) the return value on this callable should be any *scalar*.
102AggScalar = Union[str, Callable[..., Any]]
103# TODO: validate types on ScalarResult and move to _typing
104# Blocked from using by https://github.com/python/mypy/issues/1484
105# See note at _mangle_lambda_list
106ScalarResult = TypeVar("ScalarResult")
class NamedAgg(NamedTuple):
    """
    Helper for column-specific aggregation with control over output column
    names, e.g. ``df.groupby(...).agg(out=NamedAgg(column="B", aggfunc="min"))``.
    """

    # label of the input column to aggregate
    column: Hashable
    # aggregation to apply: a callable or a string alias (e.g. "min")
    aggfunc: AggScalar
def generate_property(name: str, klass: type[DataFrame | Series]):
    """
    Create a property for a GroupBy subclass to dispatch to DataFrame/Series.

    Parameters
    ----------
    name : str
        Name of the DataFrame/Series attribute being dispatched to.
    klass : {DataFrame, Series}
        Class whose attribute supplies the docstring for the property.

    Returns
    -------
    property
    """
    # Borrow the docstring from the parent DataFrame/Series method so the
    # pinned GroupBy attribute shows the same help text.
    parent = getattr(klass, name)

    def prop(self):
        # Dispatch happens lazily, at attribute access time.
        return self._make_wrapper(name)

    prop.__name__ = name
    prop.__doc__ = parent.__doc__ or ""
    return property(prop)
def pin_allowlisted_properties(
    klass: type[DataFrame | Series], allowlist: frozenset[str]
):
    """
    Create GroupBy member defs for DataFrame/Series names in a allowlist.

    Parameters
    ----------
    klass : DataFrame or Series class
        class where members are defined.
    allowlist : frozenset[str]
        Set of names of klass methods to be constructed

    Returns
    -------
    class decorator

    Notes
    -----
    Since we don't want to override methods explicitly defined in the
    base class, any such name is skipped.
    """

    def pinner(cls):
        # Only pin names the class does not already provide; anything
        # explicitly defined on the base class takes precedence.
        to_pin = (name for name in allowlist if not hasattr(cls, name))
        for name in to_pin:
            setattr(cls, name, generate_property(name, klass))
        return cls

    return pinner
@pin_allowlisted_properties(Series, base.series_apply_allowlist)
class SeriesGroupBy(GroupBy[Series]):
    # Series methods/properties pinned onto this class by the decorator
    # above; explicit definitions below take precedence over pinned ones.
    _apply_allowlist = base.series_apply_allowlist

    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        # Wrap an aggregated internal manager (1D, or 2D with a single
        # column) back into a Series.
        if mgr.ndim == 1:
            mgr = cast(SingleManager, mgr)
            single = mgr
        else:
            mgr = cast(Manager2D, mgr)
            single = mgr.iget(0)
        ser = self.obj._constructor(single, name=self.obj.name)
        # NB: caller is responsible for setting ser.index
        return ser

    def _get_data_to_aggregate(self) -> SingleManager:
        # Internal manager of the Series after applying exclusions.
        ser = self._obj_with_exclusions
        single = ser._mgr
        return single

    def _iterate_slices(self) -> Iterable[Series]:
        # A Series has exactly one "column"; yield it.
        yield self._selected_obj

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        # NOTE: docstring comes from the @doc decorator; adding one here
        # would alter the rendered documentation.

        if maybe_use_numba(engine):
            # numba fast-path: aggregate via the numba engine and rebuild
            # the result Series directly.
            with self._group_selection_context():
                data = self._selected_obj
            result = self._aggregate_with_numba(
                data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
            index = self.grouper.result_index
            return self.obj._constructor(result.ravel(), index=index, name=data.name)

        # func=None means named aggregation via keyword arguments,
        # e.g. .agg(minimum="min").
        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            ret = self._aggregate_multiple_funcs(func)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            return ret

        else:
            # Prefer the cython-implemented method when one exists and no
            # extra arguments were supplied.
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # TODO: KeyError is raised in _python_agg_general,
                # see test_groupby.test_basic
                result = self._aggregate_named(func, *args, **kwargs)

                # result is a dict whose keys are the elements of result_index
                index = self.grouper.result_index
                return create_series_with_explicit_dtype(
                    result, index=index, dtype_if_empty=object
                )

    agg = aggregate

    def _aggregate_multiple_funcs(self, arg) -> DataFrame:
        # Apply several aggregations, producing one output column per
        # function; `arg` may be a list of funcs or (name, func) pairs.
        if isinstance(arg, dict):

            # show the deprecation, but only if we
            # have not shown a higher level one
            # GH 15931
            raise SpecificationError("nested renamer is not supported")

        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]

            # indicated column order
            columns = next(zip(*arg))
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                columns.append(com.get_callable_name(f) or f)

            arg = zip(columns, arg)

        results: dict[base.OutputKey, DataFrame | Series] = {}
        for idx, (name, func) in enumerate(arg):

            key = base.OutputKey(label=name, position=idx)
            results[key] = self.aggregate(func)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            # some aggregations produced frames; concatenate side by side
            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results.keys()]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        output = self._reindex_output(output)
        return output

    def _indexed_output_to_ndframe(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> Series:
        """
        Wrap the dict result of a GroupBy aggregation into a Series.
        """
        assert len(output) == 1
        values = next(iter(output.values()))
        result = self.obj._constructor(values)
        result.name = self.obj.name
        return result

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.
        override_group_keys : bool, default False
            Passed through to ``_concat_objects``.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=self.grouper.result_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                override_group_keys=override_group_keys,
            )
            result.name = self.obj.name
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Apply `func` group by group with group.name pinned, returning a
        # dict keyed by group name.
        # Note: this is very similar to _aggregate_series_pure_python,
        # but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            # object.__setattr__ bypasses pandas' attribute machinery
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result

    @Substitution(klass="Series")
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
    ):
        # Transform via the cython grouper; raises TypeError for dtypes the
        # cython op does not support.
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self.grouper.get_iterator(
            self._selected_obj, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Return a copy of a Series excluding elements from groups that
        do not satisfy the boolean criterion specified by func.

        Parameters
        ----------
        func : function
            To apply to each group. Should return True or False.
        dropna : Drop groups that do not pass the filter. True by default;
            if False, groups that evaluate False are filled with NaNs.

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64

        Returns
        -------
        filtered : Series
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return b and notna(b)

        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.
        """
        ids, _, _ = self.grouper.group_info

        val = self.obj._values

        # factorize values, then sort codes within each group id
        codes, _ = algorithms.factorize(val, sort=False)
        sorter = np.lexsort((codes, ids))
        codes = codes[sorter]
        ids = ids[sorter]

        # group boundaries are where group ids change
        # unique observations are where sorted values change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
        inc = np.r_[1, codes[1:] != codes[:-1]]

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]

        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            res[ids[idx]] = out

        result = self.obj._constructor(res, index=ri, name=self.obj.name)
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series:
        # Per-group value counts; mirrors Series.value_counts but produces
        # a MultiIndex of (group keys..., value).

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self.grouper.group_info
        val = self.obj._values

        names = self.grouper.names + [self.obj.name]

        if is_categorical_dtype(val.dtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.index.names = names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:

            # lab is a Categorical with categories an IntervalIndex
            lab = cut(Series(val), bins, include_lowest=True)
            # error: "ndarray" has no attribute "cat"
            lev = lab.cat.categories  # type: ignore[attr-defined]
            # error: No overload variant of "take" of "_ArrayOrScalarCommon" matches
            # argument types "Any", "bool", "Union[Any, float]"
            lab = lev.take(  # type: ignore[call-overload]
                # error: "ndarray" has no attribute "cat"
                lab.cat.codes,  # type: ignore[attr-defined]
                allow_fill=True,
                # error: Item "ndarray" of "Union[ndarray, Index]" has no attribute
                # "_na_value"
                fill_value=lev._na_value,  # type: ignore[union-attr]
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab.dtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self.grouper.reconstructed_codes
        # error: Incompatible types in assignment (expression has type
        # "List[ndarray[Any, dtype[_SCT]]]",
        # variable has type "List[ndarray[Any, dtype[signedinteger[Any]]]]")
        codes = [  # type: ignore[assignment]
            rep(level_codes) for level_codes in codes
        ] + [llab(lab, inc)]
        # error: List item 0 has incompatible type "Union[ndarray[Any, Any], Index]";
        # expected "Index"
        levels = [ping.group_index for ping in self.grouper.groupings] + [
            lev  # type: ignore[list-item]
        ]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        return self.obj._constructor(out, index=mi, name=self.obj.name)

    @doc(Series.nlargest)
    def nlargest(self, n: int = 5, keep: str = "first") -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest)
    def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result
780@pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist)
781class DataFrameGroupBy(GroupBy[DataFrame]):
783 _apply_allowlist = base.dataframe_apply_allowlist
785 _agg_examples_doc = dedent(
786 """
787 Examples
788 --------
789 >>> df = pd.DataFrame(
790 ... {
791 ... "A": [1, 1, 2, 2],
792 ... "B": [1, 2, 3, 4],
793 ... "C": [0.362838, 0.227877, 1.267767, -0.562860],
794 ... }
795 ... )
797 >>> df
798 A B C
799 0 1 1 0.362838
800 1 1 2 0.227877
801 2 2 3 1.267767
802 3 2 4 -0.562860
804 The aggregation is for each column.
806 >>> df.groupby('A').agg('min')
807 B C
808 A
809 1 1 0.227877
810 2 3 -0.562860
812 Multiple aggregations
814 >>> df.groupby('A').agg(['min', 'max'])
815 B C
816 min max min max
817 A
818 1 1 2 0.227877 0.362838
819 2 3 4 -0.562860 1.267767
821 Select a column for aggregation
823 >>> df.groupby('A').B.agg(['min', 'max'])
824 min max
825 A
826 1 1 2
827 2 3 4
829 User-defined function for aggregation
831 >>> df.groupby('A').agg(lambda x: sum(x) + 2)
832 B C
833 A
834 1 5 2.590715
835 2 9 2.704907
837 Different aggregations per column
839 >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
840 B C
841 min max sum
842 A
843 1 1 2 0.590715
844 2 3 4 0.704907
846 To control the output names with different aggregations per column,
847 pandas supports "named aggregation"
849 >>> df.groupby("A").agg(
850 ... b_min=pd.NamedAgg(column="B", aggfunc="min"),
851 ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
852 b_min c_sum
853 A
854 1 1 0.590715
855 2 3 0.704907
857 - The keywords are the *output* column names
858 - The values are tuples whose first element is the column to select
859 and the second element is the aggregation to apply to that column.
860 Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
861 ``['column', 'aggfunc']`` to make it clearer what the arguments are.
862 As usual, the aggregation can be a callable or a string alias.
864 See :ref:`groupby.aggregate.named` for more.
866 .. versionchanged:: 1.3.0
868 The resulting dtype will reflect the return value of the aggregating function.
870 >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
871 B
872 A
873 1 1.0
874 2 3.0
875 """
876 )
    @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        # NOTE: docstring comes from the @doc decorator; adding one here
        # would alter the rendered documentation.

        if maybe_use_numba(engine):
            # numba fast-path
            with self._group_selection_context():
                data = self._selected_obj
            result = self._aggregate_with_numba(
                data, func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
            index = self.grouper.result_index
            return self.obj._constructor(result, index=index, columns=data.columns)

        # Normalize named-aggregation kwargs into (func, columns, order).
        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        op = GroupByApply(self, func, args, kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            return result
        elif relabeling and result is not None:
            # this should be the only (non-raising) case with relabeling
            # used reordered index of columns
            result = result.iloc[:, order]
            result.columns = columns

        if result is None:

            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:

                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "no results" not in str(err):
                        # raised directly by _aggregate_multiple_funcs
                        raise
                    result = self._aggregate_frame(func)

                else:
                    sobj = self._selected_obj

                    if isinstance(sobj, Series):
                        # GH#35246 test_groupby_as_index_select_column_sum_empty_df
                        result.columns = self._obj_with_exclusions.columns.copy()
                    else:
                        # Retain our column names
                        result.columns._set_names(
                            sobj.columns.names, level=list(range(sobj.columns.nlevels))
                        )
                        # select everything except for the last level, which is the one
                        # containing the name of the function(s), see GH#32040
                        result.columns = result.columns.droplevel(-1)

        if not self.as_index:
            # group keys become ordinary columns and the index is reset
            self._insert_inaxis_grouper_inplace(result)
            result.index = Index(range(len(result)))

        return result

    agg = aggregate
    def _iterate_slices(self) -> Iterable[Series]:
        # Yield one Series per non-excluded column (transposing first when
        # grouping along axis=1).
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T

        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    continue

                yield values
    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        # Apply `func` to each group's sub-frame as a whole (single grouping
        # key only) and assemble the per-group results into a DataFrame.
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        if self.axis == 0:
            # test_pass_args_kwargs_duplicate_columns gets here with non-unique columns
            for name, data in self.grouper.get_iterator(obj, self.axis):
                fres = func(data, *args, **kwargs)
                result[name] = fres
        else:
            # we get here in a number of test_multilevel tests
            for name in self.indices:
                grp_df = self.get_group(name, obj=obj)
                fres = func(grp_df, *args, **kwargs)
                result[name] = fres

        result_index = self.grouper.result_index
        # the non-grouped axis of the original object
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out
    def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
        # Aggregate column by column via per-column SeriesGroupBy objects.
        # only for axis==0
        # tests that get here with non-unique cols:
        #  test_resample_with_timedelta_yields_no_empty_groups,
        #  test_resample_apply_product
        obj = self._obj_with_exclusions
        result: dict[int, NDFrame] = {}

        # keys are positional to tolerate duplicate column labels
        for i, (item, sgb) in enumerate(self._iterate_column_groupbys(obj)):
            result[i] = sgb.aggregate(func, *args, **kwargs)

        res_df = self.obj._constructor(result)
        res_df.columns = obj.columns
        return res_df
    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ):
        # Wrap the per-group results of DataFrameGroupBy.apply into a
        # DataFrame or Series, dispatching on the type of the first
        # non-None applied value.

        if len(values) == 0:
            # no groups: empty frame with the original columns/dtypes
            result = self.obj._constructor(
                index=self.grouper.result_index, columns=data.columns
            )
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                override_group_keys=override_group_keys,
            )

        key_index = self.grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            #  fall through to the outer else clause
            # TODO: sure this is right?  we used to do this
            #  after raising AttributeError above
            return self.obj._constructor_sliced(
                values, index=key_index, name=self._selection
            )
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            #  result should not take the name of original selection
            #  of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                self._insert_inaxis_grouper_inplace(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                override_group_keys,
            )
    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index,
        override_group_keys: bool,
    ) -> DataFrame | Series:
        # Combine per-group Series results of apply into a DataFrame (or a
        # Series on the squeeze/concat paths).
        # this is to silence a DeprecationWarning
        # TODO(2.0): Remove when default dtype of empty Series is object
        kwargs = first_not_none._construct_axes_dict()
        backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        # GH3596
        # provide a reduction (Frame -> Series) if groups are
        # unique
        if self.squeeze:
            applied_index = self._selected_obj._get_axis(self.axis)
            singular_series = len(values) == 1 and applied_index.nlevels == 1

            if singular_series:
                # GH2893
                # we have series in the values array, we want to
                # produce a series:
                # if any of the sub-series are not indexed the same
                # OR we don't have a multi-index and we have only a
                # single values
                return self._concat_objects(
                    values,
                    not_indexed_same=not_indexed_same,
                    override_group_keys=override_group_keys,
                )

            # still a series
            # path added as of GH 5545
            elif all_indexed_same:
                from pandas.core.reshape.concat import concat

                return concat(values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                override_group_keys=override_group_keys,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = list(names)[0]
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            self._insert_inaxis_grouper_inplace(result)

        return self._reindex_output(result)
    def _cython_transform(
        self,
        how: str,
        numeric_only: bool | lib.NoDefault = lib.no_default,
        axis: int = 0,
        **kwargs,
    ) -> DataFrame:
        """
        Dispatch a transform kernel (e.g. "cumsum", "rank") to the cython
        grouped-operation machinery, operating on the internal manager.

        Parameters
        ----------
        how : str
            Name of the cython transform kernel.
        numeric_only : bool or lib.no_default
            Whether to restrict to numeric columns; resolved per-``how``.
        axis : int
            Must be 0 here; axis handling is done by the caller.
        **kwargs
            Extra keyword arguments forwarded to the kernel.
        """
        assert axis == 0  # handled by caller
        # TODO: no tests with self.ndim == 1 for DataFrameGroupBy
        numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis)

        # With self.axis == 0, we have multi-block tests
        # e.g. test_rank_min_int, test_cython_transform_frame
        # test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        # so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate()
        orig_mgr_len = len(mgr)
        if numeric_only_bool:
            mgr = mgr.get_numeric_data(copy=False)

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            # Run the cython transform kernel on one block's values.
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        # we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
        res_mgr.set_axis(1, mgr.axes[1])

        if len(res_mgr) < orig_mgr_len:
            # Some blocks failed and were dropped: emit the deprecation
            # warning for silent nuisance-column dropping.
            warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

        res_df = self.obj._constructor(res_mgr)
        if self.axis == 1:
            # Undo the transpose done by _get_data_to_aggregate.
            res_df = res_df.T
        return res_df
    def _transform_general(self, func, *args, **kwargs):
        """
        Fallback (pure-python) implementation of ``transform``: apply
        ``func`` group by group, then reassemble the pieces in the
        original row order.

        The first group is used to choose between a "fast path" (call the
        function on the whole group) and a "slow path" (``group.apply``);
        the chosen path is then reused for the remaining groups.
        """
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            # Expose the group key to the UDF via the `.name` attribute.
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except TypeError:
                # Frame-level call failed for a type reason; retry per-column.
                return self._transform_item_by_item(obj, fast_path)
            except ValueError as err:
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        emit_alignment_warning = False
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)
            if (
                not emit_alignment_warning
                and res.ndim == 2
                and not res.index.equals(group.index)
            ):
                # DataFrame result whose index does not match its group's
                # index; remember to warn once after the loop.
                emit_alignment_warning = True

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        if emit_alignment_warning:
            # GH#45648
            warnings.warn(
                "In a future version of pandas, returning a DataFrame in "
                "groupby.transform will align with the input's index. Apply "
                "`.to_numpy()` to the result in the transform function to keep "
                "the current behavior and silence this warning.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)
    @Substitution(klass="DataFrame")
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        # Thin wrapper: the shared GroupBy._transform implements the actual
        # dispatch (including engine selection).  The public docstring is
        # injected by the decorators above, so none is written here.
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )
1259 def _define_paths(self, func, *args, **kwargs):
1260 if isinstance(func, str):
1261 fast_path = lambda group: getattr(group, func)(*args, **kwargs)
1262 slow_path = lambda group: group.apply(
1263 lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
1264 )
1265 else:
1266 fast_path = lambda group: func(group, *args, **kwargs)
1267 slow_path = lambda group: group.apply(
1268 lambda x: func(x, *args, **kwargs), axis=self.axis
1269 )
1270 return fast_path, slow_path
1272 def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
1273 path = slow_path
1274 res = slow_path(group)
1276 if self.ngroups == 1:
1277 # no need to evaluate multiple paths when only
1278 # a single group exists
1279 return path, res
1281 # if we make it here, test if we can use the fast path
1282 try:
1283 res_fast = fast_path(group)
1284 except AssertionError:
1285 raise # pragma: no cover
1286 except Exception:
1287 # GH#29631 For user-defined function, we can't predict what may be
1288 # raised; see test_transform.test_transform_fastpath_raises
1289 return path, res
1291 # verify fast path returns either:
1292 # a DataFrame with columns equal to group.columns
1293 # OR a Series with index equal to group.columns
1294 if isinstance(res_fast, DataFrame):
1295 if not res_fast.columns.equals(group.columns):
1296 return path, res
1297 elif isinstance(res_fast, Series):
1298 if not res_fast.index.equals(group.columns):
1299 return path, res
1300 else:
1301 return path, res
1303 if res_fast.equals(res):
1304 path = fast_path
1306 return path, res
1308 def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
1309 # iterate through columns, see test_transform_exclude_nuisance
1310 # gets here with non-unique columns
1311 output = {}
1312 inds = []
1313 for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)):
1314 try:
1315 output[i] = sgb.transform(wrapper)
1316 except TypeError:
1317 # e.g. trying to call nanmean with string values
1318 warn_dropping_nuisance_columns_deprecated(
1319 type(self), "transform", numeric_only=False
1320 )
1321 else:
1322 inds.append(i)
1324 if not output:
1325 raise TypeError("Transform function invalid for data types")
1327 columns = obj.columns.take(inds)
1329 result = self.obj._constructor(output, index=obj.index)
1330 result.columns = columns
1331 return result
    def filter(self, func, dropna=True, *args, **kwargs):
        """
        Return a copy of a DataFrame excluding filtered elements.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Function to apply to each subframe. Should return True or False.
        dropna : Drop groups that do not pass the filter. True by default;
            If False, groups that evaluate False are filled with NaNs.

        Returns
        -------
        filtered : DataFrame

        Notes
        -----
        Each subframe is endowed the attribute 'name' in case you need to know
        which group you are working on.

        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """
        indices = []

        obj = self._selected_obj
        gen = self.grouper.get_iterator(obj, axis=self.axis)

        for name, group in gen:
            # Expose the group key to the UDF via the `.name` attribute.
            object.__setattr__(group, "name", name)

            res = func(group, *args, **kwargs)

            try:
                # Collapse a 1-element Series/DataFrame result to a scalar.
                res = res.squeeze()
            except AttributeError:  # allow e.g., scalars and frames to pass
                pass

            # interpret the result of the filter
            if is_bool(res) or (is_scalar(res) and isna(res)):
                # NA counts as "group filtered out" (notna check below).
                if res and notna(res):
                    indices.append(self._get_index(name))
            else:
                # non scalars aren't allowed
                raise TypeError(
                    f"filter function returned a {type(res).__name__}, "
                    "but expected a scalar bool"
                )

        return self._apply_filter(indices, dropna)
1401 def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
1402 if self.axis == 1:
1403 # GH 37725
1404 raise ValueError("Cannot subset columns when using axis=1")
1405 # per GH 23566
1406 if isinstance(key, tuple) and len(key) > 1:
1407 # if len == 1, then it becomes a SeriesGroupBy and this is actually
1408 # valid syntax, so don't raise warning
1409 warnings.warn(
1410 "Indexing with multiple keys (implicitly converted to a tuple "
1411 "of keys) will be deprecated, use a list instead.",
1412 FutureWarning,
1413 stacklevel=find_stack_level(),
1414 )
1415 return super().__getitem__(key)
1417 def _gotitem(self, key, ndim: int, subset=None):
1418 """
1419 sub-classes to define
1420 return a sliced object
1422 Parameters
1423 ----------
1424 key : string / list of selections
1425 ndim : {1, 2}
1426 requested ndim of result
1427 subset : object, default None
1428 subset to act on
1429 """
1430 if ndim == 2:
1431 if subset is None:
1432 subset = self.obj
1433 return DataFrameGroupBy(
1434 subset,
1435 self.grouper,
1436 axis=self.axis,
1437 level=self.level,
1438 grouper=self.grouper,
1439 exclusions=self.exclusions,
1440 selection=key,
1441 as_index=self.as_index,
1442 sort=self.sort,
1443 group_keys=self.group_keys,
1444 squeeze=self.squeeze,
1445 observed=self.observed,
1446 mutated=self.mutated,
1447 dropna=self.dropna,
1448 )
1449 elif ndim == 1:
1450 if subset is None:
1451 subset = self.obj[key]
1452 return SeriesGroupBy(
1453 subset,
1454 level=self.level,
1455 grouper=self.grouper,
1456 selection=key,
1457 sort=self.sort,
1458 group_keys=self.group_keys,
1459 squeeze=self.squeeze,
1460 observed=self.observed,
1461 dropna=self.dropna,
1462 )
1464 raise AssertionError("invalid ndim for _gotitem")
1466 def _get_data_to_aggregate(self) -> Manager2D:
1467 obj = self._obj_with_exclusions
1468 if self.axis == 1:
1469 return obj.T._mgr
1470 else:
1471 return obj._mgr
1473 def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None:
1474 # zip in reverse so we can always insert at loc 0
1475 columns = result.columns
1476 for name, lev, in_axis in zip(
1477 reversed(self.grouper.names),
1478 reversed(self.grouper.get_group_levels()),
1479 reversed([grp.in_axis for grp in self.grouper.groupings]),
1480 ):
1481 # GH #28549
1482 # When using .apply(-), name will be in columns already
1483 if in_axis and name not in columns:
1484 result.insert(0, name, lev)
1486 def _indexed_output_to_ndframe(
1487 self, output: Mapping[base.OutputKey, ArrayLike]
1488 ) -> DataFrame:
1489 """
1490 Wrap the dict result of a GroupBy aggregation into a DataFrame.
1491 """
1492 indexed_output = {key.position: val for key, val in output.items()}
1493 columns = Index([key.label for key in output])
1494 columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
1496 result = self.obj._constructor(indexed_output)
1497 result.columns = columns
1498 return result
    def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
        """
        Turn the aggregated 2D manager back into a DataFrame, attaching the
        appropriate result index: the group labels when ``as_index=True``,
        otherwise a positional range with the grouping columns re-inserted
        as regular columns.
        """
        if not self.as_index:
            # GH 41998 - empty mgr always gets index of length 0
            rows = mgr.shape[1] if mgr.shape[0] > 0 else 0
            index = Index(range(rows))
            mgr.set_axis(1, index)
            result = self.obj._constructor(mgr)

            self._insert_inaxis_grouper_inplace(result)
            result = result._consolidate()
        else:
            index = self.grouper.result_index
            mgr.set_axis(1, index)
            result = self.obj._constructor(mgr)

        if self.axis == 1:
            # Aggregation ran on transposed data; undo the transpose.
            result = result.T

        # Note: we only need to pass datetime=True in order to get numeric
        # values converted
        return self._reindex_output(result)._convert(datetime=True)
1522 def _iterate_column_groupbys(self, obj: DataFrame | Series):
1523 for i, colname in enumerate(obj.columns):
1524 yield colname, SeriesGroupBy(
1525 obj.iloc[:, i],
1526 selection=colname,
1527 grouper=self.grouper,
1528 exclusions=self.exclusions,
1529 observed=self.observed,
1530 )
1532 def _apply_to_column_groupbys(self, func, obj: DataFrame | Series) -> DataFrame:
1533 from pandas.core.reshape.concat import concat
1535 columns = obj.columns
1536 results = [
1537 func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
1538 ]
1540 if not len(results):
1541 # concat would raise
1542 return DataFrame([], columns=columns, index=self.grouper.result_index)
1543 else:
1544 return concat(results, keys=columns, axis=1)
    def nunique(self, dropna: bool = True) -> DataFrame:
        """
        Return DataFrame with counts of unique elements in each position.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique: DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1 value2
        0  spam       1      a
        1   egg       5      b
        2   egg       5      b
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y

        >>> df.groupby('id').nunique()
              value1  value2
        id
        egg        1       1
        ham        1       2
        spam       2       1

        Check for rows with the same id but conflicting values:

        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1 value2
        0  spam       1      a
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y
        """

        if self.axis != 0:
            # see test_groupby_crash_on_nunique
            # axis=1: fall back to a python-level per-group aggregation.
            return self._python_agg_general(lambda sgb: sgb.nunique(dropna))

        obj = self._obj_with_exclusions
        # Compute nunique column-by-column via each column's SeriesGroupBy.
        results = self._apply_to_column_groupbys(
            lambda sgb: sgb.nunique(dropna), obj=obj
        )

        if not self.as_index:
            # Flatten to a positional index and re-insert the grouping
            # columns as regular columns.
            results.index = Index(range(len(results)))
            self._insert_inaxis_grouper_inplace(results)

        return results
    @doc(
        _shared_docs["idxmax"],
        numeric_only_default="True for axis=0, False for axis=1",
    )
    def idxmax(
        self,
        axis=0,
        skipna: bool = True,
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ) -> DataFrame:
        # Public docstring comes from _shared_docs["idxmax"] via @doc above.
        axis = DataFrame._get_axis_number(axis)
        if numeric_only is lib.no_default:
            # Cannot use self._resolve_numeric_only; we must pass None to
            # DataFrame.idxmax for backwards compatibility
            numeric_only_arg = None if axis == 0 else False
        else:
            numeric_only_arg = numeric_only

        def func(df):
            # Per group: argmax positions -> labels along the chosen axis.
            with warnings.catch_warnings():
                # Suppress numeric_only warnings here, will warn below
                warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax")
                res = df._reduce(
                    nanops.nanargmax,
                    "argmax",
                    axis=axis,
                    skipna=skipna,
                    numeric_only=numeric_only_arg,
                )
                indices = res._values
                index = df._get_axis(axis)
                # A position of -1 signals all-NA; map it to NaN.
                result = [index[i] if i >= 0 else np.nan for i in indices]
                return df._constructor_sliced(result, index=res.index)

        func.__name__ = "idxmax"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only)
        return result
    @doc(
        _shared_docs["idxmin"],
        numeric_only_default="True for axis=0, False for axis=1",
    )
    def idxmin(
        self,
        axis=0,
        skipna: bool = True,
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ) -> DataFrame:
        # Public docstring comes from _shared_docs["idxmin"] via @doc above.
        axis = DataFrame._get_axis_number(axis)
        if numeric_only is lib.no_default:
            # Cannot use self._resolve_numeric_only; we must pass None to
            # DataFrame.idxmin for backwards compatibility
            numeric_only_arg = None if axis == 0 else False
        else:
            numeric_only_arg = numeric_only

        def func(df):
            # Per group: argmin positions -> labels along the chosen axis.
            with warnings.catch_warnings():
                # Suppress numeric_only warnings here, will warn below
                warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin")
                res = df._reduce(
                    nanops.nanargmin,
                    "argmin",
                    axis=axis,
                    skipna=skipna,
                    numeric_only=numeric_only_arg,
                )
                indices = res._values
                index = df._get_axis(axis)
                # A position of -1 signals all-NA; map it to NaN.
                result = [index[i] if i >= 0 else np.nan for i in indices]
                return df._constructor_sliced(result, index=res.index)

        func.__name__ = "idxmin"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only)
        return result
    # Bind the plotting helper as a method; `boxplot_frame_groupby` is
    # presumably imported at module level - defined outside this view.
    boxplot = boxplot_frame_groupby
    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts: Equivalent method on Series.
        DataFrame.value_counts: Equivalent method on DataFrame.
        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will have an
          additional column with the value_counts. The column is labelled 'count' or
          'proportion', depending on the ``normalize`` parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender  education country
        0    male        low      US
        1    male     medium      FR
        2  female       high      US
        3    male        low      FR
        4  female       high      FR
        5    male        low      FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender education country  count
        0  female      high      FR      1
        1  female      high      US      1
        2    male       low      FR      2
        3    male       low      US      1
        4    male    medium      FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender education country  proportion
        0  female      high      FR        0.50
        1  female      high      US        0.50
        2    male       low      FR        0.50
        3    male       low      US        0.25
        4    male    medium      FR        0.25
        """
        if self.axis == 1:
            raise NotImplementedError(
                "DataFrameGroupBy.value_counts only handles axis=0"
            )

        with self._group_selection_context():
            df = self.obj

            # Labels of groupers that live in the frame itself; these must
            # not be counted a second time below.
            in_axis_names = {
                grouping.name for grouping in self.grouper.groupings if grouping.in_axis
            }
            if isinstance(self._selected_obj, Series):
                name = self._selected_obj.name
                keys = [] if name in in_axis_names else [self._selected_obj]
            else:
                unique_cols = set(self._selected_obj.columns)
                if subset is not None:
                    # Validate `subset`: it may neither overlap the grouping
                    # keys nor reference columns absent from the frame.
                    subsetted = set(subset)
                    clashing = subsetted & set(in_axis_names)
                    if clashing:
                        raise ValueError(
                            f"Keys {clashing} in subset cannot be in "
                            "the groupby column keys."
                        )
                    doesnt_exist = subsetted - unique_cols
                    if doesnt_exist:
                        raise ValueError(
                            f"Keys {doesnt_exist} in subset do not "
                            f"exist in the DataFrame."
                        )
                else:
                    subsetted = unique_cols

                keys = [
                    # Can't use .values because the column label needs to be preserved
                    self._selected_obj.iloc[:, idx]
                    for idx, name in enumerate(self._selected_obj.columns)
                    if name not in in_axis_names and name in subsetted
                ]

            # Extend the original groupings with one grouping per counted
            # column, then count combinations via a size() on the union.
            groupings = list(self.grouper.groupings)
            for key in keys:
                grouper, _, _ = get_grouper(
                    df,
                    key=key,
                    axis=self.axis,
                    sort=self.sort,
                    observed=False,
                    dropna=dropna,
                )
                groupings += list(grouper.groupings)

            # Take the size of the overall columns
            gb = df.groupby(
                groupings,
                sort=self.sort,
                observed=self.observed,
                dropna=self.dropna,
            )
            result_series = cast(Series, gb.size())

            # GH-46357 Include non-observed categories
            # of non-grouping columns regardless of `observed`
            if any(
                isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
                and not grouping._observed
                for grouping in groupings
            ):
                levels_list = [ping.result_index for ping in groupings]
                multi_index, _ = MultiIndex.from_product(
                    levels_list, names=[ping.name for ping in groupings]
                ).sortlevel()
                result_series = result_series.reindex(multi_index, fill_value=0)

            if normalize:
                # Normalize the results by dividing by the original group sizes.
                # We are guaranteed to have the first N levels be the
                # user-requested grouping.
                levels = list(
                    range(len(self.grouper.groupings), result_series.index.nlevels)
                )
                indexed_group_size = result_series.groupby(
                    result_series.index.droplevel(levels),
                    sort=self.sort,
                    dropna=self.dropna,
                ).transform("sum")
                result_series /= indexed_group_size

                # Handle groups of non-observed categories
                result_series = result_series.fillna(0.0)

            if sort:
                # Sort the values and then resort by the main grouping
                index_level = range(len(self.grouper.groupings))
                result_series = result_series.sort_values(
                    ascending=ascending
                ).sort_index(level=index_level, sort_remaining=False)

            result: Series | DataFrame
            if self.as_index:
                result = result_series
            else:
                # Convert to frame
                name = "proportion" if normalize else "count"
                index = result_series.index
                columns = com.fill_missing_names(index.names)
                if name in columns:
                    raise ValueError(
                        f"Column label '{name}' is duplicate of result column"
                    )
                result_series.name = name
                result_series.index = index.set_names(range(len(columns)))
                result_frame = result_series.reset_index()
                result_frame.columns = columns + [name]
                result = result_frame
            return result.__finalize__(self.obj, method="value_counts")
1919def _wrap_transform_general_frame(
1920 obj: DataFrame, group: DataFrame, res: DataFrame | Series
1921) -> DataFrame:
1922 from pandas import concat
1924 if isinstance(res, Series):
1925 # we need to broadcast across the
1926 # other dimension; this will preserve dtypes
1927 # GH14457
1928 if res.index.is_(obj.index):
1929 res_frame = concat([res] * len(group.columns), axis=1)
1930 res_frame.columns = group.columns
1931 res_frame.index = group.index
1932 else:
1933 res_frame = obj._constructor(
1934 np.tile(res.values, (len(group.index), 1)),
1935 columns=group.columns,
1936 index=group.index,
1937 )
1938 assert isinstance(res_frame, DataFrame)
1939 return res_frame
1940 else:
1941 return res