Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/generic.py: 12%

742 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Define the SeriesGroupBy and DataFrameGroupBy 

3classes that hold the groupby interfaces (and some implementations). 

4 

5These are user facing as the result of the ``df.groupby(...)`` operations, 

6which here returns a DataFrameGroupBy object. 

7""" 

8from __future__ import annotations 

9 

10from collections import abc 

11from functools import partial 

12from textwrap import dedent 

13from typing import ( 

14 TYPE_CHECKING, 

15 Any, 

16 Callable, 

17 Hashable, 

18 Iterable, 

19 Mapping, 

20 NamedTuple, 

21 Sequence, 

22 TypeVar, 

23 Union, 

24 cast, 

25) 

26import warnings 

27 

28import numpy as np 

29 

30from pandas._libs import ( 

31 Interval, 

32 lib, 

33 reduction as libreduction, 

34) 

35from pandas._typing import ( 

36 ArrayLike, 

37 Manager, 

38 Manager2D, 

39 SingleManager, 

40) 

41from pandas.errors import SpecificationError 

42from pandas.util._decorators import ( 

43 Appender, 

44 Substitution, 

45 doc, 

46) 

47from pandas.util._exceptions import find_stack_level 

48 

49from pandas.core.dtypes.common import ( 

50 ensure_int64, 

51 is_bool, 

52 is_categorical_dtype, 

53 is_dict_like, 

54 is_integer_dtype, 

55 is_interval_dtype, 

56 is_scalar, 

57) 

58from pandas.core.dtypes.missing import ( 

59 isna, 

60 notna, 

61) 

62 

63from pandas.core import ( 

64 algorithms, 

65 nanops, 

66) 

67from pandas.core.apply import ( 

68 GroupByApply, 

69 maybe_mangle_lambdas, 

70 reconstruct_func, 

71 validate_func_kwargs, 

72) 

73from pandas.core.arrays.categorical import Categorical 

74import pandas.core.common as com 

75from pandas.core.construction import create_series_with_explicit_dtype 

76from pandas.core.frame import DataFrame 

77from pandas.core.groupby import base 

78from pandas.core.groupby.groupby import ( 

79 GroupBy, 

80 _agg_template, 

81 _apply_docs, 

82 _transform_template, 

83 warn_dropping_nuisance_columns_deprecated, 

84) 

85from pandas.core.groupby.grouper import get_grouper 

86from pandas.core.indexes.api import ( 

87 Index, 

88 MultiIndex, 

89 all_indexes_same, 

90) 

91from pandas.core.indexes.category import CategoricalIndex 

92from pandas.core.series import Series 

93from pandas.core.shared_docs import _shared_docs 

94from pandas.core.util.numba_ import maybe_use_numba 

95 

96from pandas.plotting import boxplot_frame_groupby 

97 

98if TYPE_CHECKING:  # coverage: line 98 never jumped to line 99 (the condition on line 98 was never true at runtime)

99 from pandas.core.generic import NDFrame 

100 

101# TODO(typing) the return value on this callable should be any *scalar*. 

102AggScalar = Union[str, Callable[..., Any]] 

103# TODO: validate types on ScalarResult and move to _typing 

104# Blocked from use by https://github.com/python/mypy/issues/1484 

105# See note at _mangle_lambda_list 

106ScalarResult = TypeVar("ScalarResult") 

107 

108 

109class NamedAgg(NamedTuple): 

110 column: Hashable 

111 aggfunc: AggScalar 

112 
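# Illustrative sketch (hypothetical frame ``df``): NamedAgg pairs a source
# column with an aggregation so keyword-based "named aggregation" can rename
# outputs, e.g.
#   df.groupby("A").agg(b_min=NamedAgg(column="B", aggfunc="min"))
# is equivalent to the plain tuple form df.groupby("A").agg(b_min=("B", "min")).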

113 

114def generate_property(name: str, klass: type[DataFrame | Series]): 

115 """ 

116 Create a property for a GroupBy subclass to dispatch to DataFrame/Series. 

117 

118 Parameters 

119 ---------- 

120 name : str 

121 klass : {DataFrame, Series} 

122 

123 Returns 

124 ------- 

125 property 

126 """ 

127 

128 def prop(self): 

129 return self._make_wrapper(name) 

130 

131 parent_method = getattr(klass, name) 

132 prop.__doc__ = parent_method.__doc__ or "" 

133 prop.__name__ = name 

134 return property(prop) 
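# Illustrative sketch of the dispatch above (assuming "skew" is in the
# allowlist): generate_property("skew", Series) builds a property whose getter
# returns self._make_wrapper("skew"), so SeriesGroupBy.skew carries
# Series.skew's docstring while routing the call through the groupby wrapper.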

135 

136 

137def pin_allowlisted_properties( 

138 klass: type[DataFrame | Series], allowlist: frozenset[str] 

139): 

140 """ 

141 Create GroupBy member defs for DataFrame/Series names in an allowlist. 

142 

143 Parameters 

144 ---------- 

145 klass : DataFrame or Series class 

146 Class where members are defined. 

147 allowlist : frozenset[str] 

148 Set of names of klass methods to be constructed. 

149 

150 Returns 

151 ------- 

152 class decorator 

153 

154 Notes 

155 ----- 

156 Since we don't want to override methods explicitly defined in the 

157 base class, any such name is skipped. 

158 """ 

159 

160 def pinner(cls): 

161 for name in allowlist: 

162 if hasattr(cls, name): 

163 # don't override anything that was explicitly defined 

164 # in the base class 

165 continue 

166 

167 prop = generate_property(name, klass) 

168 setattr(cls, name, prop) 

169 

170 return cls 

171 

172 return pinner 

173 

174 

175@pin_allowlisted_properties(Series, base.series_apply_allowlist) 

176class SeriesGroupBy(GroupBy[Series]): 

177 _apply_allowlist = base.series_apply_allowlist 

178 

179 def _wrap_agged_manager(self, mgr: Manager) -> Series: 

180 if mgr.ndim == 1: 

181 mgr = cast(SingleManager, mgr) 

182 single = mgr 

183 else: 

184 mgr = cast(Manager2D, mgr) 

185 single = mgr.iget(0) 

186 ser = self.obj._constructor(single, name=self.obj.name) 

187 # NB: caller is responsible for setting ser.index 

188 return ser 

189 

190 def _get_data_to_aggregate(self) -> SingleManager: 

191 ser = self._obj_with_exclusions 

192 single = ser._mgr 

193 return single 

194 

195 def _iterate_slices(self) -> Iterable[Series]: 

196 yield self._selected_obj 

197 

198 _agg_examples_doc = dedent( 

199 """ 

200 Examples 

201 -------- 

202 >>> s = pd.Series([1, 2, 3, 4]) 

203 

204 >>> s 

205 0 1 

206 1 2 

207 2 3 

208 3 4 

209 dtype: int64 

210 

211 >>> s.groupby([1, 1, 2, 2]).min() 

212 1 1 

213 2 3 

214 dtype: int64 

215 

216 >>> s.groupby([1, 1, 2, 2]).agg('min') 

217 1 1 

218 2 3 

219 dtype: int64 

220 

221 >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max']) 

222 min max 

223 1 1 2 

224 2 3 4 

225 

226 The output column names can be controlled by passing 

227 the desired column names and aggregations as keyword arguments. 

228 

229 >>> s.groupby([1, 1, 2, 2]).agg( 

230 ... minimum='min', 

231 ... maximum='max', 

232 ... ) 

233 minimum maximum 

234 1 1 2 

235 2 3 4 

236 

237 .. versionchanged:: 1.3.0 

238 

239 The resulting dtype will reflect the return value of the aggregating function. 

240 

241 >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) 

242 1 1.0 

243 2 3.0 

244 dtype: float64 

245 """ 

246 ) 

247 

248 @Appender( 

249 _apply_docs["template"].format( 

250 input="series", examples=_apply_docs["series_examples"] 

251 ) 

252 ) 

253 def apply(self, func, *args, **kwargs) -> Series: 

254 return super().apply(func, *args, **kwargs) 

255 

256 @doc(_agg_template, examples=_agg_examples_doc, klass="Series") 

257 def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): 

258 

259 if maybe_use_numba(engine): 

260 with self._group_selection_context(): 

261 data = self._selected_obj 

262 result = self._aggregate_with_numba( 

263 data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs 

264 ) 

265 index = self.grouper.result_index 

266 return self.obj._constructor(result.ravel(), index=index, name=data.name) 

267 

268 relabeling = func is None 

269 columns = None 

270 if relabeling: 

271 columns, func = validate_func_kwargs(kwargs) 

272 kwargs = {} 

273 

274 if isinstance(func, str): 

275 return getattr(self, func)(*args, **kwargs) 

276 

277 elif isinstance(func, abc.Iterable): 

278 # Catch instances of lists / tuples 

279 # but not the class list / tuple itself. 

280 func = maybe_mangle_lambdas(func) 

281 ret = self._aggregate_multiple_funcs(func) 

282 if relabeling: 

283 # columns is not narrowed by mypy from relabeling flag 

284 assert columns is not None # for mypy 

285 ret.columns = columns 

286 return ret 

287 

288 else: 

289 cyfunc = com.get_cython_func(func) 

290 if cyfunc and not args and not kwargs: 

291 return getattr(self, cyfunc)() 

292 

293 if self.grouper.nkeys > 1: 

294 return self._python_agg_general(func, *args, **kwargs) 

295 

296 try: 

297 return self._python_agg_general(func, *args, **kwargs) 

298 except KeyError: 

299 # TODO: KeyError is raised in _python_agg_general, 

300 # see test_groupby.test_basic 

301 result = self._aggregate_named(func, *args, **kwargs) 

302 

303 # result is a dict whose keys are the elements of result_index 

304 index = self.grouper.result_index 

305 return create_series_with_explicit_dtype( 

306 result, index=index, dtype_if_empty=object 

307 ) 

308 

309 agg = aggregate 
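# Informal dispatch summary for aggregate() above: a string resolves to the
# same-named groupby method; an iterable of funcs goes through
# _aggregate_multiple_funcs; a bare callable first tries the cython fast path
# (get_cython_func) and otherwise falls back to _python_agg_general, where for
# a single grouping key a KeyError falls back to _aggregate_named.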

310 

311 def _aggregate_multiple_funcs(self, arg) -> DataFrame: 

312 if isinstance(arg, dict): 

313 

314 # show the deprecation, but only if we 

315 # have not shown a higher level one 

316 # GH 15931 

317 raise SpecificationError("nested renamer is not supported") 

318 

319 elif any(isinstance(x, (tuple, list)) for x in arg): 

320 arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] 

321 

322 # indicated column order 

323 columns = next(zip(*arg)) 

324 else: 

325 # list of functions / function names 

326 columns = [] 

327 for f in arg: 

328 columns.append(com.get_callable_name(f) or f) 

329 

330 arg = zip(columns, arg) 

331 

332 results: dict[base.OutputKey, DataFrame | Series] = {} 

333 for idx, (name, func) in enumerate(arg): 

334 

335 key = base.OutputKey(label=name, position=idx) 

336 results[key] = self.aggregate(func) 

337 

338 if any(isinstance(x, DataFrame) for x in results.values()): 

339 from pandas import concat 

340 

341 res_df = concat( 

342 results.values(), axis=1, keys=[key.label for key in results.keys()] 

343 ) 

344 return res_df 

345 

346 indexed_output = {key.position: val for key, val in results.items()} 

347 output = self.obj._constructor_expanddim(indexed_output, index=None) 

348 output.columns = Index(key.label for key in results) 

349 

350 output = self._reindex_output(output) 

351 return output 

352 

353 def _indexed_output_to_ndframe( 

354 self, output: Mapping[base.OutputKey, ArrayLike] 

355 ) -> Series: 

356 """ 

357 Wrap the dict result of a GroupBy aggregation into a Series. 

358 """ 

359 assert len(output) == 1 

360 values = next(iter(output.values())) 

361 result = self.obj._constructor(values) 

362 result.name = self.obj.name 

363 return result 

364 

365 def _wrap_applied_output( 

366 self, 

367 data: Series, 

368 values: list[Any], 

369 not_indexed_same: bool = False, 

370 override_group_keys: bool = False, 

371 ) -> DataFrame | Series: 

372 """ 

373 Wrap the output of SeriesGroupBy.apply into the expected result. 

374 

375 Parameters 

376 ---------- 

377 data : Series 

378 Input data for groupby operation. 

379 values : List[Any] 

380 Applied output for each group. 

381 not_indexed_same : bool, default False 

382 Whether the applied outputs are not indexed the same as the group axes. 

383 

384 Returns 

385 ------- 

386 DataFrame or Series 

387 """ 

388 if len(values) == 0: 

389 # GH #6265 

390 return self.obj._constructor( 

391 [], 

392 name=self.obj.name, 

393 index=self.grouper.result_index, 

394 dtype=data.dtype, 

395 ) 

396 assert values is not None 

397 

398 if isinstance(values[0], dict): 

399 # GH #823 #24880 

400 index = self.grouper.result_index 

401 res_df = self.obj._constructor_expanddim(values, index=index) 

402 res_df = self._reindex_output(res_df) 

403 # if self.observed is False, 

404 # keep all-NaN rows created while re-indexing 

405 res_ser = res_df.stack(dropna=self.observed) 

406 res_ser.name = self.obj.name 

407 return res_ser 

408 elif isinstance(values[0], (Series, DataFrame)): 

409 result = self._concat_objects( 

410 values, 

411 not_indexed_same=not_indexed_same, 

412 override_group_keys=override_group_keys, 

413 ) 

414 result.name = self.obj.name 

415 return result 

416 else: 

417 # GH #6265 #24880 

418 result = self.obj._constructor( 

419 data=values, index=self.grouper.result_index, name=self.obj.name 

420 ) 

421 return self._reindex_output(result) 

422 

423 def _aggregate_named(self, func, *args, **kwargs): 

424 # Note: this is very similar to _aggregate_series_pure_python, 

425 # but that does not pin group.name 

426 result = {} 

427 initialized = False 

428 

429 for name, group in self: 

430 object.__setattr__(group, "name", name) 

431 

432 output = func(group, *args, **kwargs) 

433 output = libreduction.extract_result(output) 

434 if not initialized: 

435 # We only do this validation on the first iteration 

436 libreduction.check_result_array(output, group.dtype) 

437 initialized = True 

438 result[name] = output 

439 

440 return result 
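# Informal note on _aggregate_named: object.__setattr__ pins group.name
# without going through Series.__setattr__, and check_result_array validates
# the output only for the first group (see the ``initialized`` flag), on the
# assumption that later groups, sharing the same dtype, behave the same.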

441 

442 @Substitution(klass="Series") 

443 @Appender(_transform_template) 

444 def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): 

445 return self._transform( 

446 func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs 

447 ) 

448 

449 def _cython_transform( 

450 self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs 

451 ): 

452 assert axis == 0 # handled by caller 

453 

454 obj = self._selected_obj 

455 

456 try: 

457 result = self.grouper._cython_operation( 

458 "transform", obj._values, how, axis, **kwargs 

459 ) 

460 except NotImplementedError as err: 

461 raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err 

462 

463 return obj._constructor(result, index=self.obj.index, name=obj.name) 

464 

465 def _transform_general(self, func: Callable, *args, **kwargs) -> Series: 

466 """ 

467 Transform with a callable ``func``. 

468 """ 

469 assert callable(func) 

470 klass = type(self.obj) 

471 

472 results = [] 

473 for name, group in self.grouper.get_iterator( 

474 self._selected_obj, axis=self.axis 

475 ): 

476 # this setattr is needed for test_transform_lambda_with_datetimetz 

477 object.__setattr__(group, "name", name) 

478 res = func(group, *args, **kwargs) 

479 

480 results.append(klass(res, index=group.index)) 

481 

482 # check for empty "results" to avoid concat ValueError 

483 if results: 

484 from pandas.core.reshape.concat import concat 

485 

486 concatenated = concat(results) 

487 result = self._set_result_index_ordered(concatenated) 

488 else: 

489 result = self.obj._constructor(dtype=np.float64) 

490 

491 result.name = self.obj.name 

492 return result 

493 

494 def filter(self, func, dropna: bool = True, *args, **kwargs): 

495 """ 

496 Return a copy of a Series excluding elements from groups that 

497 do not satisfy the boolean criterion specified by func. 

498 

499 Parameters 

500 ---------- 

501 func : function 

502 To apply to each group. Should return True or False. 

503 dropna : bool, default True 

504 Drop groups that do not pass the filter; if False, groups that evaluate False are filled with NaNs. 

505 

506 Notes 

507 ----- 

508 Functions that mutate the passed object can produce unexpected 

509 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` 

510 for more details. 

511 

512 Examples 

513 -------- 

514 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 

515 ... 'foo', 'bar'], 

516 ... 'B' : [1, 2, 3, 4, 5, 6], 

517 ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) 

518 >>> grouped = df.groupby('A') 

519 >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) 

520 1 2 

521 3 4 

522 5 6 

523 Name: B, dtype: int64 

524 

525 Returns 

526 ------- 

527 filtered : Series 

528 """ 

529 if isinstance(func, str): 

530 wrapper = lambda x: getattr(x, func)(*args, **kwargs) 

531 else: 

532 wrapper = lambda x: func(x, *args, **kwargs) 

533 

534 # Interpret np.nan as False. 

535 def true_and_notna(x) -> bool: 

536 b = wrapper(x) 

537 return b and notna(b) 

538 

539 try: 

540 indices = [ 

541 self._get_index(name) for name, group in self if true_and_notna(group) 

542 ] 

543 except (ValueError, TypeError) as err: 

544 raise TypeError("the filter must return a boolean result") from err 

545 

546 filtered = self._apply_filter(indices, dropna) 

547 return filtered 

548 

549 def nunique(self, dropna: bool = True) -> Series: 

550 """ 

551 Return number of unique elements in the group. 

552 

553 Returns 

554 ------- 

555 Series 

556 Number of unique values within each group. 

557 """ 

558 ids, _, _ = self.grouper.group_info 

559 

560 val = self.obj._values 

561 

562 codes, _ = algorithms.factorize(val, sort=False) 

563 sorter = np.lexsort((codes, ids)) 

564 codes = codes[sorter] 

565 ids = ids[sorter] 

566 

567 # group boundaries are where group ids change 

568 # unique observations are where sorted values change 

569 idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] 

570 inc = np.r_[1, codes[1:] != codes[:-1]] 

571 

572 # 1st item of each group is a new unique observation 

573 mask = codes == -1 

574 if dropna: 

575 inc[idx] = 1 

576 inc[mask] = 0 

577 else: 

578 inc[mask & np.r_[False, mask[:-1]]] = 0 

579 inc[idx] = 1 

580 

581 out = np.add.reduceat(inc, idx).astype("int64", copy=False) 

582 if len(ids): 

583 # NaN/NaT group exists if the head of ids is -1, 

584 # so remove it from res and exclude its index from idx 

585 if ids[0] == -1: 

586 res = out[1:] 

587 idx = idx[np.flatnonzero(idx)] 

588 else: 

589 res = out 

590 else: 

591 res = out[1:] 

592 ri = self.grouper.result_index 

593 

594 # we might have duplications among the bins 

595 if len(res) != len(ri): 

596 res, out = np.zeros(len(ri), dtype=out.dtype), res 

597 res[ids[idx]] = out 

598 

599 result = self.obj._constructor(res, index=ri, name=self.obj.name) 

600 return self._reindex_output(result, fill_value=0) 
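# Worked example of the reduceat trick in nunique above (hypothetical input):
#   val = ['a', 'b', 'b', 'a'], ids = [0, 0, 1, 1]
#   factorize -> codes = [0, 1, 1, 0]; lexsort((codes, ids)) orders each
#   group's codes, giving codes = [0, 1, 0, 1] with idx = [0, 2] marking group
#   starts; inc flags positions where the sorted code changes (plus each group
#   start), so np.add.reduceat(inc, idx) == [2, 2]: two uniques per group,
#   computed without a Python loop.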

601 

602 @doc(Series.describe) 

603 def describe(self, **kwargs): 

604 return super().describe(**kwargs) 

605 

606 def value_counts( 

607 self, 

608 normalize: bool = False, 

609 sort: bool = True, 

610 ascending: bool = False, 

611 bins=None, 

612 dropna: bool = True, 

613 ) -> Series: 

614 

615 from pandas.core.reshape.merge import get_join_indexers 

616 from pandas.core.reshape.tile import cut 

617 

618 ids, _, _ = self.grouper.group_info 

619 val = self.obj._values 

620 

621 names = self.grouper.names + [self.obj.name] 

622 

623 if is_categorical_dtype(val.dtype) or ( 

624 bins is not None and not np.iterable(bins) 

625 ): 

626 # scalar bins cannot be done at top level 

627 # in a backward compatible way 

628 # GH38672 relates to categorical dtype 

629 ser = self.apply( 

630 Series.value_counts, 

631 normalize=normalize, 

632 sort=sort, 

633 ascending=ascending, 

634 bins=bins, 

635 ) 

636 ser.index.names = names 

637 return ser 

638 

639 # groupby removes null keys from groupings 

640 mask = ids != -1 

641 ids, val = ids[mask], val[mask] 

642 

643 if bins is None: 

644 lab, lev = algorithms.factorize(val, sort=True) 

645 llab = lambda lab, inc: lab[inc] 

646 else: 

647 

648 # lab is a Categorical with categories an IntervalIndex 

649 lab = cut(Series(val), bins, include_lowest=True) 

650 # error: "ndarray" has no attribute "cat" 

651 lev = lab.cat.categories # type: ignore[attr-defined] 

652 # error: No overload variant of "take" of "_ArrayOrScalarCommon" matches 

653 # argument types "Any", "bool", "Union[Any, float]" 

654 lab = lev.take( # type: ignore[call-overload] 

655 # error: "ndarray" has no attribute "cat" 

656 lab.cat.codes, # type: ignore[attr-defined] 

657 allow_fill=True, 

658 # error: Item "ndarray" of "Union[ndarray, Index]" has no attribute 

659 # "_na_value" 

660 fill_value=lev._na_value, # type: ignore[union-attr] 

661 ) 

662 llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] 

663 

664 if is_interval_dtype(lab.dtype): 

665 # TODO: should we do this inside II? 

666 lab_interval = cast(Interval, lab) 

667 

668 sorter = np.lexsort((lab_interval.left, lab_interval.right, ids)) 

669 else: 

670 sorter = np.lexsort((lab, ids)) 

671 

672 ids, lab = ids[sorter], lab[sorter] 

673 

674 # group boundaries are where group ids change 

675 idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0] 

676 idx = np.r_[0, idchanges] 

677 if not len(ids): 

678 idx = idchanges 

679 

680 # new values are where sorted labels change 

681 lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) 

682 inc = np.r_[True, lchanges] 

683 if not len(val): 

684 inc = lchanges 

685 inc[idx] = True # group boundaries are also new values 

686 out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts 

687 

688 # num. of times each group should be repeated 

689 rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) 

690 

691 # multi-index components 

692 codes = self.grouper.reconstructed_codes 

693 # error: Incompatible types in assignment (expression has type 

694 # "List[ndarray[Any, dtype[_SCT]]]", 

695 # variable has type "List[ndarray[Any, dtype[signedinteger[Any]]]]") 

696 codes = [ # type: ignore[assignment] 

697 rep(level_codes) for level_codes in codes 

698 ] + [llab(lab, inc)] 

699 # error: List item 0 has incompatible type "Union[ndarray[Any, Any], Index]"; 

700 # expected "Index" 

701 levels = [ping.group_index for ping in self.grouper.groupings] + [ 

702 lev # type: ignore[list-item] 

703 ] 

704 

705 if dropna: 

706 mask = codes[-1] != -1 

707 if mask.all(): 

708 dropna = False 

709 else: 

710 out, codes = out[mask], [level_codes[mask] for level_codes in codes] 

711 

712 if normalize: 

713 out = out.astype("float") 

714 d = np.diff(np.r_[idx, len(ids)]) 

715 if dropna: 

716 m = ids[lab == -1] 

717 np.add.at(d, m, -1) 

718 acc = rep(d)[mask] 

719 else: 

720 acc = rep(d) 

721 out /= acc 

722 

723 if sort and bins is None: 

724 cat = ids[inc][mask] if dropna else ids[inc] 

725 sorter = np.lexsort((out if ascending else -out, cat)) 

726 out, codes[-1] = out[sorter], codes[-1][sorter] 

727 

728 if bins is not None: 

729 # for compat. with libgroupby.value_counts need to ensure every 

730 # bin is present at every index level, null filled with zeros 

731 diff = np.zeros(len(out), dtype="bool") 

732 for level_codes in codes[:-1]: 

733 diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] 

734 

735 ncat, nbin = diff.sum(), len(levels[-1]) 

736 

737 left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] 

738 

739 right = [diff.cumsum() - 1, codes[-1]] 

740 

741 _, idx = get_join_indexers(left, right, sort=False, how="left") 

742 out = np.where(idx != -1, out[idx], 0) 

743 

744 if sort: 

745 sorter = np.lexsort((out if ascending else -out, left[0])) 

746 out, left[-1] = out[sorter], left[-1][sorter] 

747 

748 # build the multi-index w/ full levels 

749 def build_codes(lev_codes: np.ndarray) -> np.ndarray: 

750 return np.repeat(lev_codes[diff], nbin) 

751 

752 codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] 

753 codes.append(left[-1]) 

754 

755 mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) 

756 

757 if is_integer_dtype(out.dtype): 

758 out = ensure_int64(out) 

759 return self.obj._constructor(out, index=mi, name=self.obj.name) 

760 

761 @doc(Series.nlargest) 

762 def nlargest(self, n: int = 5, keep: str = "first") -> Series: 

763 f = partial(Series.nlargest, n=n, keep=keep) 

764 data = self._obj_with_exclusions 

765 # Don't change behavior if result index happens to be the same, i.e. 

766 # already ordered and n >= all group sizes. 

767 result = self._python_apply_general(f, data, not_indexed_same=True) 

768 return result 

769 

770 @doc(Series.nsmallest) 

771 def nsmallest(self, n: int = 5, keep: str = "first") -> Series: 

772 f = partial(Series.nsmallest, n=n, keep=keep) 

773 data = self._obj_with_exclusions 

774 # Don't change behavior if result index happens to be the same, i.e. 

775 # already ordered and n >= all group sizes. 

776 result = self._python_apply_general(f, data, not_indexed_same=True) 

777 return result 

778 

779 

780@pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist) 

781class DataFrameGroupBy(GroupBy[DataFrame]): 

782 

783 _apply_allowlist = base.dataframe_apply_allowlist 

784 

785 _agg_examples_doc = dedent( 

786 """ 

787 Examples 

788 -------- 

789 >>> df = pd.DataFrame( 

790 ... { 

791 ... "A": [1, 1, 2, 2], 

792 ... "B": [1, 2, 3, 4], 

793 ... "C": [0.362838, 0.227877, 1.267767, -0.562860], 

794 ... } 

795 ... ) 

796 

797 >>> df 

798 A B C 

799 0 1 1 0.362838 

800 1 1 2 0.227877 

801 2 2 3 1.267767 

802 3 2 4 -0.562860 

803 

804 The aggregation is for each column. 

805 

806 >>> df.groupby('A').agg('min') 

807 B C 

808 A 

809 1 1 0.227877 

810 2 3 -0.562860 

811 

812 Multiple aggregations 

813 

814 >>> df.groupby('A').agg(['min', 'max']) 

815 B C 

816 min max min max 

817 A 

818 1 1 2 0.227877 0.362838 

819 2 3 4 -0.562860 1.267767 

820 

821 Select a column for aggregation 

822 

823 >>> df.groupby('A').B.agg(['min', 'max']) 

824 min max 

825 A 

826 1 1 2 

827 2 3 4 

828 

829 User-defined function for aggregation 

830 

831 >>> df.groupby('A').agg(lambda x: sum(x) + 2) 

832 B C 

833 A 

834 1 5 2.590715 

835 2 9 2.704907 

836 

837 Different aggregations per column 

838 

839 >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}) 

840 B C 

841 min max sum 

842 A 

843 1 1 2 0.590715 

844 2 3 4 0.704907 

845 

846 To control the output names with different aggregations per column, 

847 pandas supports "named aggregation" 

848 

849 >>> df.groupby("A").agg( 

850 ... b_min=pd.NamedAgg(column="B", aggfunc="min"), 

851 ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) 

852 b_min c_sum 

853 A 

854 1 1 0.590715 

855 2 3 0.704907 

856 

857 - The keywords are the *output* column names 

858 - The values are tuples whose first element is the column to select 

859 and the second element is the aggregation to apply to that column. 

860 Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields 

861 ``['column', 'aggfunc']`` to make it clearer what the arguments are. 

862 As usual, the aggregation can be a callable or a string alias. 

863 

864 See :ref:`groupby.aggregate.named` for more. 

865 

866 .. versionchanged:: 1.3.0 

867 

868 The resulting dtype will reflect the return value of the aggregating function. 

869 

870 >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) 

871 B 

872 A 

873 1 1.0 

874 2 3.0 

875 """ 

876 ) 

877 

878 @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame") 

879 def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): 

880 

881 if maybe_use_numba(engine): 

882 with self._group_selection_context(): 

883 data = self._selected_obj 

884 result = self._aggregate_with_numba( 

885 data, func, *args, engine_kwargs=engine_kwargs, **kwargs 

886 ) 

887 index = self.grouper.result_index 

888 return self.obj._constructor(result, index=index, columns=data.columns) 

889 

890 relabeling, func, columns, order = reconstruct_func(func, **kwargs) 

891 func = maybe_mangle_lambdas(func) 

892 

893 op = GroupByApply(self, func, args, kwargs) 

894 result = op.agg() 

895 if not is_dict_like(func) and result is not None: 

896 return result 

897 elif relabeling and result is not None: 

898 # this should be the only (non-raising) case with relabeling 

899 # used reordered index of columns 

900 result = result.iloc[:, order] 

901 result.columns = columns 

902 

903 if result is None: 

904 

905 # grouper specific aggregations 

906 if self.grouper.nkeys > 1: 

907 # test_groupby_as_index_series_scalar gets here with 'not self.as_index' 

908 return self._python_agg_general(func, *args, **kwargs) 

909 elif args or kwargs: 

910 # test_pass_args_kwargs gets here (with and without as_index) 

911 # can't return early 

912 result = self._aggregate_frame(func, *args, **kwargs) 

913 

914 elif self.axis == 1: 

915 # _aggregate_multiple_funcs does not allow self.axis == 1 

916 # Note: axis == 1 precludes 'not self.as_index', see __init__ 

917 result = self._aggregate_frame(func) 

918 return result 

919 

920 else: 

921 

922 # try to treat as if we are passing a list 

923 gba = GroupByApply(self, [func], args=(), kwargs={}) 

924 try: 

925 result = gba.agg() 

926 

927 except ValueError as err: 

928 if "no results" not in str(err): 

929 # raised directly by _aggregate_multiple_funcs 

930 raise 

931 result = self._aggregate_frame(func) 

932 

933 else: 

934 sobj = self._selected_obj 

935 

936 if isinstance(sobj, Series): 

937 # GH#35246 test_groupby_as_index_select_column_sum_empty_df 

938 result.columns = self._obj_with_exclusions.columns.copy() 

939 else: 

940 # Retain our column names 

941 result.columns._set_names( 

942 sobj.columns.names, level=list(range(sobj.columns.nlevels)) 

943 ) 

944 # select everything except for the last level, which is the one 

945 # containing the name of the function(s), see GH#32040 

946 result.columns = result.columns.droplevel(-1) 

947 

948 if not self.as_index: 

949 self._insert_inaxis_grouper_inplace(result) 

950 result.index = Index(range(len(result))) 

951 

952 return result 

953 

954 agg = aggregate 

955 

956 def _iterate_slices(self) -> Iterable[Series]: 

957 obj = self._selected_obj 

958 if self.axis == 1: 

959 obj = obj.T 

960 

961 if isinstance(obj, Series) and obj.name not in self.exclusions: 

962 # Occurs when doing DataFrameGroupBy(...)["X"] 

963 yield obj 

964 else: 

965 for label, values in obj.items(): 

966 if label in self.exclusions: 

967 continue 

968 

969 yield values 

970 

971 def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: 

972 if self.grouper.nkeys != 1: 

973 raise AssertionError("Number of keys must be 1") 

974 

975 obj = self._obj_with_exclusions 

976 

977 result: dict[Hashable, NDFrame | np.ndarray] = {} 

978 if self.axis == 0: 

979 # test_pass_args_kwargs_duplicate_columns gets here with non-unique columns 

980 for name, data in self.grouper.get_iterator(obj, self.axis): 

981 fres = func(data, *args, **kwargs) 

982 result[name] = fres 

983 else: 

984 # we get here in a number of test_multilevel tests 

985 for name in self.indices: 

986 grp_df = self.get_group(name, obj=obj) 

987 fres = func(grp_df, *args, **kwargs) 

988 result[name] = fres 

989 

990 result_index = self.grouper.result_index 

991 other_ax = obj.axes[1 - self.axis] 

992 out = self.obj._constructor(result, index=other_ax, columns=result_index) 

993 if self.axis == 0: 

994 out = out.T 

995 

996 return out 

997 

998 def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: 

999 # only for axis==0 

1000 # tests that get here with non-unique cols: 

1001 # test_resample_with_timedelta_yields_no_empty_groups, 

1002 # test_resample_apply_product 

1003 

1004 obj = self._obj_with_exclusions 

1005 result: dict[int, NDFrame] = {} 

1006 

1007 for i, (item, sgb) in enumerate(self._iterate_column_groupbys(obj)): 

1008 result[i] = sgb.aggregate(func, *args, **kwargs) 

1009 

1010 res_df = self.obj._constructor(result) 

1011 res_df.columns = obj.columns 

1012 return res_df 

1013 

1014 def _wrap_applied_output( 

1015 self, 

1016 data: DataFrame, 

1017 values: list, 

1018 not_indexed_same: bool = False, 

1019 override_group_keys: bool = False, 

1020 ): 

1021 

1022 if len(values) == 0: 

1023 result = self.obj._constructor( 

1024 index=self.grouper.result_index, columns=data.columns 

1025 ) 

1026 result = result.astype(data.dtypes, copy=False) 

1027 return result 

1028 

1029 # GH12824 

1030 first_not_none = next(com.not_none(*values), None) 

1031 

1032 if first_not_none is None: 

1033 # GH9684 - All values are None, return an empty frame. 

1034 return self.obj._constructor() 

1035 elif isinstance(first_not_none, DataFrame): 

1036 return self._concat_objects( 

1037 values, 

1038 not_indexed_same=not_indexed_same, 

1039 override_group_keys=override_group_keys, 

1040 ) 

1041 

1042 key_index = self.grouper.result_index if self.as_index else None 

1043 

1044 if isinstance(first_not_none, (np.ndarray, Index)): 

1045 # GH#1738: values is list of arrays of unequal lengths 

1046 # fall through to the outer else clause 

1047 # TODO: sure this is right? we used to do this 

1048 # after raising AttributeError above 

1049 return self.obj._constructor_sliced( 

1050 values, index=key_index, name=self._selection 

1051 ) 

1052 elif not isinstance(first_not_none, Series): 

1053 # values are not series or array-like but scalars 

1054 # self._selection not passed through to Series as the 

1055 # result should not take the name of original selection 

1056 # of columns 

1057 if self.as_index: 

1058 return self.obj._constructor_sliced(values, index=key_index) 

1059 else: 

1060 result = self.obj._constructor(values, columns=[self._selection]) 

1061 self._insert_inaxis_grouper_inplace(result) 

1062 return result 

1063 else: 

1064 # values are Series 

1065 return self._wrap_applied_output_series( 

1066 values, 

1067 not_indexed_same, 

1068 first_not_none, 

1069 key_index, 

1070 override_group_keys, 

1071 ) 

1072 

1073 def _wrap_applied_output_series( 

1074 self, 

1075 values: list[Series], 

1076 not_indexed_same: bool, 

1077 first_not_none, 

1078 key_index, 

1079 override_group_keys: bool, 

1080 ) -> DataFrame | Series: 

1081 # this is to silence a DeprecationWarning 

1082 # TODO(2.0): Remove when default dtype of empty Series is object 

1083 kwargs = first_not_none._construct_axes_dict() 

1084 backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) 

1085 values = [x if (x is not None) else backup for x in values] 

1086 

1087 all_indexed_same = all_indexes_same(x.index for x in values) 

1088 

1089 # GH3596 

1090 # provide a reduction (Frame -> Series) if groups are 

1091 # unique 

1092 if self.squeeze: 

1093 applied_index = self._selected_obj._get_axis(self.axis) 

1094 singular_series = len(values) == 1 and applied_index.nlevels == 1 

1095 

1096 if singular_series: 

1097 # GH2893 

1098 # we have series in the values array, we want to 

1099 # produce a series: 

1100 # if any of the sub-series are not indexed the same 

1101 # OR we don't have a multi-index and we have only a 

1102 # single value 

1103 return self._concat_objects( 

1104 values, 

1105 not_indexed_same=not_indexed_same, 

1106 override_group_keys=override_group_keys, 

1107 ) 

1108 

1109 # still a series 

1110 # path added as of GH 5545 

1111 elif all_indexed_same: 

1112 from pandas.core.reshape.concat import concat 

1113 

1114 return concat(values) 

1115 

1116 if not all_indexed_same: 

1117 # GH 8467 

1118 return self._concat_objects( 

1119 values, 

1120 not_indexed_same=True, 

1121 override_group_keys=override_group_keys, 

1122 ) 

1123 

1124 # Combine values 

1125 # vstack+constructor is faster than concat and handles MI-columns 

1126 stacked_values = np.vstack([np.asarray(v) for v in values]) 

1127 

1128 if self.axis == 0: 

1129 index = key_index 

1130 columns = first_not_none.index.copy() 

1131 if columns.name is None: 

1132 # GH6124 - propagate name of Series when it's consistent 

1133 names = {v.name for v in values} 

1134 if len(names) == 1: 

1135 columns.name = list(names)[0] 

1136 else: 

1137 index = first_not_none.index 

1138 columns = key_index 

1139 stacked_values = stacked_values.T 

1140 

1141 if stacked_values.dtype == object: 

1142 # We'll have the DataFrame constructor do inference 

1143 stacked_values = stacked_values.tolist() 

1144 result = self.obj._constructor(stacked_values, index=index, columns=columns) 

1145 

1146 if not self.as_index: 

1147 self._insert_inaxis_grouper_inplace(result) 

1148 

1149 return self._reindex_output(result) 

1150 

1151 def _cython_transform( 

1152 self, 

1153 how: str, 

1154 numeric_only: bool | lib.NoDefault = lib.no_default, 

1155 axis: int = 0, 

1156 **kwargs, 

1157 ) -> DataFrame: 

1158 assert axis == 0 # handled by caller 

1159 # TODO: no tests with self.ndim == 1 for DataFrameGroupBy 

1160 numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis) 

1161 

1162 # With self.axis == 0, we have multi-block tests 

1163 # e.g. test_rank_min_int, test_cython_transform_frame 

1164 # test_transform_numeric_ret 

1165 # With self.axis == 1, _get_data_to_aggregate does a transpose 

1166 # so we always have a single block. 

1167 mgr: Manager2D = self._get_data_to_aggregate() 

1168 orig_mgr_len = len(mgr) 

1169 if numeric_only_bool: 

1170 mgr = mgr.get_numeric_data(copy=False) 

1171 

1172 def arr_func(bvalues: ArrayLike) -> ArrayLike: 

1173 return self.grouper._cython_operation( 

1174 "transform", bvalues, how, 1, **kwargs 

1175 ) 

1176 

1177 # We could use `mgr.apply` here and not have to set_axis, but 

1178 # we would have to do shape gymnastics for ArrayManager compat 

1179 res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True) 

1180 res_mgr.set_axis(1, mgr.axes[1]) 

1181 

1182 if len(res_mgr) < orig_mgr_len: 

1183 warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) 

1184 

1185 res_df = self.obj._constructor(res_mgr) 

1186 if self.axis == 1: 

1187 res_df = res_df.T 

1188 return res_df 

1189 

1190 def _transform_general(self, func, *args, **kwargs): 

1191 from pandas.core.reshape.concat import concat 

1192 

1193 applied = [] 

1194 obj = self._obj_with_exclusions 

1195 gen = self.grouper.get_iterator(obj, axis=self.axis) 

1196 fast_path, slow_path = self._define_paths(func, *args, **kwargs) 

1197 

1198 # Determine whether to use slow or fast path by evaluating on the first group. 

1199 # Need to handle the case of an empty generator and process the result so that 

1200 # it does not need to be computed again. 

1201 try: 

1202 name, group = next(gen) 

1203 except StopIteration: 

1204 pass 

1205 else: 

1206 object.__setattr__(group, "name", name) 

1207 try: 

1208 path, res = self._choose_path(fast_path, slow_path, group) 

1209 except TypeError: 

1210 return self._transform_item_by_item(obj, fast_path) 

1211 except ValueError as err: 

1212 msg = "transform must return a scalar value for each group" 

1213 raise ValueError(msg) from err 

1214 if group.size > 0: 

1215 res = _wrap_transform_general_frame(self.obj, group, res) 

1216 applied.append(res) 

1217 

1218 # Compute and process with the remaining groups 

1219 emit_alignment_warning = False 

1220 for name, group in gen: 

1221 if group.size == 0: 

1222 continue 

1223 object.__setattr__(group, "name", name) 

1224 res = path(group) 

1225 if ( 

1226 not emit_alignment_warning 

1227 and res.ndim == 2 

1228 and not res.index.equals(group.index) 

1229 ): 

1230 emit_alignment_warning = True 

1231 

1232 res = _wrap_transform_general_frame(self.obj, group, res) 

1233 applied.append(res) 

1234 

1235 if emit_alignment_warning: 

1236 # GH#45648 

1237 warnings.warn( 

1238 "In a future version of pandas, returning a DataFrame in " 

1239 "groupby.transform will align with the input's index. Apply " 

1240 "`.to_numpy()` to the result in the transform function to keep " 

1241 "the current behavior and silence this warning.", 

1242 FutureWarning, 

1243 stacklevel=find_stack_level(), 

1244 ) 

1245 

1246 concat_index = obj.columns if self.axis == 0 else obj.index 

1247 other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 

1248 concatenated = concat(applied, axis=self.axis, verify_integrity=False) 

1249 concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) 

1250 return self._set_result_index_ordered(concatenated) 

1251 

1252 @Substitution(klass="DataFrame") 

1253 @Appender(_transform_template) 

1254 def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): 

1255 return self._transform( 

1256 func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs 

1257 ) 

1258 

1259 def _define_paths(self, func, *args, **kwargs): 

1260 if isinstance(func, str): 

1261 fast_path = lambda group: getattr(group, func)(*args, **kwargs) 

1262 slow_path = lambda group: group.apply( 

1263 lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis 

1264 ) 

1265 else: 

1266 fast_path = lambda group: func(group, *args, **kwargs) 

1267 slow_path = lambda group: group.apply( 

1268 lambda x: func(x, *args, **kwargs), axis=self.axis 

1269 ) 

1270 return fast_path, slow_path 

1271 

1272 def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): 

1273 path = slow_path 

1274 res = slow_path(group) 

1275 

1276 if self.ngroups == 1: 

1277 # no need to evaluate multiple paths when only 

1278 # a single group exists 

1279 return path, res 

1280 

1281 # if we make it here, test if we can use the fast path 

1282 try: 

1283 res_fast = fast_path(group) 

1284 except AssertionError: 

1285 raise # pragma: no cover 

1286 except Exception: 

1287 # GH#29631 For user-defined function, we can't predict what may be 

1288 # raised; see test_transform.test_transform_fastpath_raises 

1289 return path, res 

1290 

1291 # verify fast path returns either: 

1292 # a DataFrame with columns equal to group.columns 

1293 # OR a Series with index equal to group.columns 

1294 if isinstance(res_fast, DataFrame): 

1295 if not res_fast.columns.equals(group.columns): 

1296 return path, res 

1297 elif isinstance(res_fast, Series): 

1298 if not res_fast.index.equals(group.columns): 

1299 return path, res 

1300 else: 

1301 return path, res 

1302 

1303 if res_fast.equals(res): 

1304 path = fast_path 

1305 

1306 return path, res 
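# Informal note on _choose_path: the slow path applies func element-wise via
# group.apply, while the fast path calls func on the whole group; the fast
# path is adopted only when res_fast.equals(res), and the slow result is what
# gets returned for the probe group either way, so correctness never depends
# on the fast path.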

1307 

1308 def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: 

1309 # iterate through columns, see test_transform_exclude_nuisance 

1310 # gets here with non-unique columns 

1311 output = {} 

1312 inds = [] 

1313 for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)): 

1314 try: 

1315 output[i] = sgb.transform(wrapper) 

1316 except TypeError: 

1317 # e.g. trying to call nanmean with string values 

1318 warn_dropping_nuisance_columns_deprecated( 

1319 type(self), "transform", numeric_only=False 

1320 ) 

1321 else: 

1322 inds.append(i) 

1323 

1324 if not output: 

1325 raise TypeError("Transform function invalid for data types") 

1326 

1327 columns = obj.columns.take(inds) 

1328 

1329 result = self.obj._constructor(output, index=obj.index) 

1330 result.columns = columns 

1331 return result 

1332 

1333 def filter(self, func, dropna=True, *args, **kwargs): 

1334 """ 

1335 Return a copy of a DataFrame excluding filtered elements. 

1336 

1337 Elements from groups are filtered if they do not satisfy the 

1338 boolean criterion specified by func. 

1339 

1340 Parameters 

1341 ---------- 

1342 func : function 

1343 Function to apply to each subframe. Should return True or False. 

1344 dropna : bool, default True 

1345 Drop groups that do not pass the filter; if False, groups that evaluate False are filled with NaNs. 

1346 

1347 Returns 

1348 ------- 

1349 filtered : DataFrame 

1350 

1351 Notes 

1352 ----- 

1353 Each subframe is endowed with the attribute 'name' in case you need to know 

1354 which group you are working on. 

1355 

1356 Functions that mutate the passed object can produce unexpected 

1357 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` 

1358 for more details. 

1359 

1360 Examples 

1361 -------- 

1362 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 

1363 ... 'foo', 'bar'], 

1364 ... 'B' : [1, 2, 3, 4, 5, 6], 

1365 ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) 

1366 >>> grouped = df.groupby('A') 

1367 >>> grouped.filter(lambda x: x['B'].mean() > 3.) 

1368 A B C 

1369 1 bar 2 5.0 

1370 3 bar 4 1.0 

1371 5 bar 6 9.0 

1372 """ 

1373 indices = [] 

1374 

1375 obj = self._selected_obj 

1376 gen = self.grouper.get_iterator(obj, axis=self.axis) 

1377 

1378 for name, group in gen: 

1379 object.__setattr__(group, "name", name) 

1380 

1381 res = func(group, *args, **kwargs) 

1382 

1383 try: 

1384 res = res.squeeze() 

1385 except AttributeError: # allow e.g., scalars and frames to pass 

1386 pass 

1387 

1388 # interpret the result of the filter 

1389 if is_bool(res) or (is_scalar(res) and isna(res)): 

1390 if res and notna(res): 

1391 indices.append(self._get_index(name)) 

1392 else: 

1393 # non scalars aren't allowed 

1394 raise TypeError( 

1395 f"filter function returned a {type(res).__name__}, " 

1396 "but expected a scalar bool" 

1397 ) 

1398 

1399 return self._apply_filter(indices, dropna) 

1400 

1401 def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: 

1402 if self.axis == 1: 

1403 # GH 37725 

1404 raise ValueError("Cannot subset columns when using axis=1") 

1405 # per GH 23566 

1406 if isinstance(key, tuple) and len(key) > 1: 

1407 # if len == 1, then it becomes a SeriesGroupBy and this is actually 

1408 # valid syntax, so don't raise warning 

1409 warnings.warn( 

1410 "Indexing with multiple keys (implicitly converted to a tuple " 

1411 "of keys) will be deprecated, use a list instead.", 

1412 FutureWarning, 

1413 stacklevel=find_stack_level(), 

1414 ) 

1415 return super().__getitem__(key) 

1416 

1417 def _gotitem(self, key, ndim: int, subset=None): 

1418 """ 

1419 Sub-classes to define; 

1420 return a sliced object. 

1421 

1422 Parameters 

1423 ---------- 

1424 key : string / list of selections 

1425 ndim : {1, 2} 

1426 requested ndim of result 

1427 subset : object, default None 

1428 subset to act on 

1429 """ 

1430 if ndim == 2: 

1431 if subset is None: 

1432 subset = self.obj 

1433 return DataFrameGroupBy( 

1434 subset, 

1435 self.grouper, 

1436 axis=self.axis, 

1437 level=self.level, 

1438 grouper=self.grouper, 

1439 exclusions=self.exclusions, 

1440 selection=key, 

1441 as_index=self.as_index, 

1442 sort=self.sort, 

1443 group_keys=self.group_keys, 

1444 squeeze=self.squeeze, 

1445 observed=self.observed, 

1446 mutated=self.mutated, 

1447 dropna=self.dropna, 

1448 ) 

1449 elif ndim == 1: 

1450 if subset is None: 

1451 subset = self.obj[key] 

1452 return SeriesGroupBy( 

1453 subset, 

1454 level=self.level, 

1455 grouper=self.grouper, 

1456 selection=key, 

1457 sort=self.sort, 

1458 group_keys=self.group_keys, 

1459 squeeze=self.squeeze, 

1460 observed=self.observed, 

1461 dropna=self.dropna, 

1462 ) 

1463 

1464 raise AssertionError("invalid ndim for _gotitem") 

1465 

1466 def _get_data_to_aggregate(self) -> Manager2D: 

1467 obj = self._obj_with_exclusions 

1468 if self.axis == 1: 

1469 return obj.T._mgr 

1470 else: 

1471 return obj._mgr 

1472 

1473 def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: 

1474 # zip in reverse so we can always insert at loc 0 

1475 columns = result.columns 

1476 for name, lev, in_axis in zip( 

1477 reversed(self.grouper.names), 

1478 reversed(self.grouper.get_group_levels()), 

1479 reversed([grp.in_axis for grp in self.grouper.groupings]), 

1480 ): 

1481 # GH #28549 

1482 # When using .apply(-), name will be in columns already 

1483 if in_axis and name not in columns: 

1484 result.insert(0, name, lev) 

1485 

1486 def _indexed_output_to_ndframe( 

1487 self, output: Mapping[base.OutputKey, ArrayLike] 

1488 ) -> DataFrame: 

1489 """ 

1490 Wrap the dict result of a GroupBy aggregation into a DataFrame. 

1491 """ 

1492 indexed_output = {key.position: val for key, val in output.items()} 

1493 columns = Index([key.label for key in output]) 

1494 columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names) 

1495 

1496 result = self.obj._constructor(indexed_output) 

1497 result.columns = columns 

1498 return result 

1499 

1500 def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: 

1501 if not self.as_index: 

1502 # GH 41998 - empty mgr always gets index of length 0 

1503 rows = mgr.shape[1] if mgr.shape[0] > 0 else 0 

1504 index = Index(range(rows)) 

1505 mgr.set_axis(1, index) 

1506 result = self.obj._constructor(mgr) 

1507 

1508 self._insert_inaxis_grouper_inplace(result) 

1509 result = result._consolidate() 

1510 else: 

1511 index = self.grouper.result_index 

1512 mgr.set_axis(1, index) 

1513 result = self.obj._constructor(mgr) 

1514 

1515 if self.axis == 1: 

1516 result = result.T 

1517 

1518 # Note: we only need to pass datetime=True in order to get numeric 

1519 # values converted 

1520 return self._reindex_output(result)._convert(datetime=True) 

1521 

1522 def _iterate_column_groupbys(self, obj: DataFrame | Series): 

1523 for i, colname in enumerate(obj.columns): 

1524 yield colname, SeriesGroupBy( 

1525 obj.iloc[:, i], 

1526 selection=colname, 

1527 grouper=self.grouper, 

1528 exclusions=self.exclusions, 

1529 observed=self.observed, 

1530 ) 

1531 

1532 def _apply_to_column_groupbys(self, func, obj: DataFrame | Series) -> DataFrame: 

1533 from pandas.core.reshape.concat import concat 

1534 

1535 columns = obj.columns 

1536 results = [ 

1537 func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj) 

1538 ] 

1539 

1540 if not len(results): 

1541 # concat would raise 

1542 return DataFrame([], columns=columns, index=self.grouper.result_index) 

1543 else: 

1544 return concat(results, keys=columns, axis=1) 

1545 

1546 def nunique(self, dropna: bool = True) -> DataFrame: 

1547 """ 

1548 Return DataFrame with counts of unique elements in each position. 

1549 

1550 Parameters 

1551 ---------- 

1552 dropna : bool, default True 

1553 Don't include NaN in the counts. 

1554 

1555 Returns 

1556 ------- 

1557 nunique : DataFrame 

1558 

1559 Examples 

1560 -------- 

1561 >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', 

1562 ... 'ham', 'ham'], 

1563 ... 'value1': [1, 5, 5, 2, 5, 5], 

1564 ... 'value2': list('abbaxy')}) 

1565 >>> df 

1566 id value1 value2 

1567 0 spam 1 a 

1568 1 egg 5 b 

1569 2 egg 5 b 

1570 3 spam 2 a 

1571 4 ham 5 x 

1572 5 ham 5 y 

1573 

1574 >>> df.groupby('id').nunique() 

1575 value1 value2 

1576 id 

1577 egg 1 1 

1578 ham 1 2 

1579 spam 2 1 

1580 

1581 Check for rows with the same id but conflicting values: 

1582 

1583 >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) 

1584 id value1 value2 

1585 0 spam 1 a 

1586 3 spam 2 a 

1587 4 ham 5 x 

1588 5 ham 5 y 

1589 """ 

1590 

1591 if self.axis != 0: 

1592 # see test_groupby_crash_on_nunique 

1593 return self._python_agg_general(lambda sgb: sgb.nunique(dropna)) 

1594 

1595 obj = self._obj_with_exclusions 

1596 results = self._apply_to_column_groupbys( 

1597 lambda sgb: sgb.nunique(dropna), obj=obj 

1598 ) 

1599 

1600 if not self.as_index: 

1601 results.index = Index(range(len(results))) 

1602 self._insert_inaxis_grouper_inplace(results) 

1603 

1604 return results 

1605 

1606 @doc( 

1607 _shared_docs["idxmax"], 

1608 numeric_only_default="True for axis=0, False for axis=1", 

1609 ) 

1610 def idxmax( 

1611 self, 

1612 axis=0, 

1613 skipna: bool = True, 

1614 numeric_only: bool | lib.NoDefault = lib.no_default, 

1615 ) -> DataFrame: 

1616 axis = DataFrame._get_axis_number(axis) 

1617 if numeric_only is lib.no_default: 

1618 # Cannot use self._resolve_numeric_only; we must pass None to 

1619 # DataFrame.idxmax for backwards compatibility 

1620 numeric_only_arg = None if axis == 0 else False 

1621 else: 

1622 numeric_only_arg = numeric_only 

1623 

1624 def func(df): 

1625 with warnings.catch_warnings(): 

1626 # Suppress numeric_only warnings here, will warn below 

1627 warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax") 

1628 res = df._reduce( 

1629 nanops.nanargmax, 

1630 "argmax", 

1631 axis=axis, 

1632 skipna=skipna, 

1633 numeric_only=numeric_only_arg, 

1634 ) 

1635 indices = res._values 

1636 index = df._get_axis(axis) 

1637 result = [index[i] if i >= 0 else np.nan for i in indices] 

1638 return df._constructor_sliced(result, index=res.index) 

1639 

1640 func.__name__ = "idxmax" 

1641 result = self._python_apply_general( 

1642 func, self._obj_with_exclusions, not_indexed_same=True 

1643 ) 

1644 self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only) 

1645 return result 
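# Informal note on idxmax (idxmin below mirrors it): DataFrame._reduce with
# nanargmax yields positional indices; the comprehension above maps each
# position back to its axis label, substituting NaN where the reduction
# returned -1.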

1646 

1647 @doc( 

1648 _shared_docs["idxmin"], 

1649 numeric_only_default="True for axis=0, False for axis=1", 

1650 ) 

1651 def idxmin( 

1652 self, 

1653 axis=0, 

1654 skipna: bool = True, 

1655 numeric_only: bool | lib.NoDefault = lib.no_default, 

1656 ) -> DataFrame: 

1657 axis = DataFrame._get_axis_number(axis) 

1658 if numeric_only is lib.no_default: 

1659 # Cannot use self._resolve_numeric_only; we must pass None to 

1660 # DataFrame.idxmin for backwards compatibility 

1661 numeric_only_arg = None if axis == 0 else False 

1662 else: 

1663 numeric_only_arg = numeric_only 

1664 

1665 def func(df): 

1666 with warnings.catch_warnings(): 

1667 # Suppress numeric_only warnings here, will warn below 

1668 warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin") 

1669 res = df._reduce( 

1670 nanops.nanargmin, 

1671 "argmin", 

1672 axis=axis, 

1673 skipna=skipna, 

1674 numeric_only=numeric_only_arg, 

1675 ) 

1676 indices = res._values 

1677 index = df._get_axis(axis) 

1678 result = [index[i] if i >= 0 else np.nan for i in indices] 

1679 return df._constructor_sliced(result, index=res.index) 

1680 

1681 func.__name__ = "idxmin" 

1682 result = self._python_apply_general( 

1683 func, self._obj_with_exclusions, not_indexed_same=True 

1684 ) 

1685 self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only) 

1686 return result 

1687 

1688 boxplot = boxplot_frame_groupby 

1689 

1690 def value_counts( 

1691 self, 

1692 subset: Sequence[Hashable] | None = None, 

1693 normalize: bool = False, 

1694 sort: bool = True, 

1695 ascending: bool = False, 

1696 dropna: bool = True, 

1697 ) -> DataFrame | Series: 

1698 """ 

1699 Return a Series or DataFrame containing counts of unique rows. 

1700 

1701 .. versionadded:: 1.4.0 

1702 

1703 Parameters 

1704 ---------- 

1705 subset : list-like, optional 

1706 Columns to use when counting unique combinations. 

1707 normalize : bool, default False 

1708 Return proportions rather than frequencies. 

1709 sort : bool, default True 

1710 Sort by frequencies. 

1711 ascending : bool, default False 

1712 Sort in ascending order. 

1713 dropna : bool, default True 

1714 Don’t include counts of rows that contain NA values. 

1715 

1716 Returns 

1717 ------- 

1718 Series or DataFrame 

1719 Series if the groupby as_index is True, otherwise DataFrame. 

1720 

1721 See Also 

1722 -------- 

1723 Series.value_counts: Equivalent method on Series. 

1724 DataFrame.value_counts: Equivalent method on DataFrame. 

1725 SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. 

1726 

1727 Notes 

1728 ----- 

1729 - If the groupby as_index is True then the returned Series will have a 

1730 MultiIndex with one level per input column. 

1731 - If the groupby as_index is False then the returned DataFrame will have an 

1732 additional column with the value_counts. The column is labelled 'count' or 

1733 'proportion', depending on the ``normalize`` parameter. 

1734 

1735 By default, rows that contain any NA values are omitted from 

1736 the result. 

1737 

1738 By default, the result will be in descending order so that the 

1739 first element of each group is the most frequently-occurring row. 

1740 

1741 Examples 

1742 -------- 

1743 >>> df = pd.DataFrame({ 

1744 ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], 

1745 ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], 

1746 ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] 

1747 ... }) 

1748 

1749 >>> df 

1750 gender education country 

1751 0 male low US 

1752 1 male medium FR 

1753 2 female high US 

1754 3 male low FR 

1755 4 female high FR 

1756 5 male low FR 

1757 

1758 >>> df.groupby('gender').value_counts() 

1759 gender education country 

1760 female high FR 1 

1761 US 1 

1762 male low FR 2 

1763 US 1 

1764 medium FR 1 

1765 dtype: int64 

1766 

1767 >>> df.groupby('gender').value_counts(ascending=True) 

1768 gender education country 

1769 female high FR 1 

1770 US 1 

1771 male low US 1 

1772 medium FR 1 

1773 low FR 2 

1774 dtype: int64 

1775 

1776 >>> df.groupby('gender').value_counts(normalize=True) 

1777 gender education country 

1778 female high FR 0.50 

1779 US 0.50 

1780 male low FR 0.50 

1781 US 0.25 

1782 medium FR 0.25 

1783 dtype: float64 

1784 

1785 >>> df.groupby('gender', as_index=False).value_counts() 

1786 gender education country count 

1787 0 female high FR 1 

1788 1 female high US 1 

1789 2 male low FR 2 

1790 3 male low US 1 

1791 4 male medium FR 1 

1792 

1793 >>> df.groupby('gender', as_index=False).value_counts(normalize=True) 

1794 gender education country proportion 

1795 0 female high FR 0.50 

1796 1 female high US 0.50 

1797 2 male low FR 0.50 

1798 3 male low US 0.25 

1799 4 male medium FR 0.25 

1800 """ 

1801 if self.axis == 1: 

1802 raise NotImplementedError( 

1803 "DataFrameGroupBy.value_counts only handles axis=0" 

1804 ) 

1805 

1806 with self._group_selection_context(): 

1807 df = self.obj 

1808 

1809 in_axis_names = { 

1810 grouping.name for grouping in self.grouper.groupings if grouping.in_axis 

1811 } 

1812 if isinstance(self._selected_obj, Series): 

1813 name = self._selected_obj.name 

1814 keys = [] if name in in_axis_names else [self._selected_obj] 

1815 else: 

1816 unique_cols = set(self._selected_obj.columns) 

1817 if subset is not None: 

1818 subsetted = set(subset) 

1819 clashing = subsetted & set(in_axis_names) 

1820 if clashing: 

1821 raise ValueError( 

1822 f"Keys {clashing} in subset cannot be in " 

1823 "the groupby column keys." 

1824 ) 

1825 doesnt_exist = subsetted - unique_cols 

1826 if doesnt_exist: 

1827 raise ValueError( 

1828 f"Keys {doesnt_exist} in subset do not " 

1829 f"exist in the DataFrame." 

1830 ) 

1831 else: 

1832 subsetted = unique_cols 

1833 

1834 keys = [ 

1835 # Can't use .values because the column label needs to be preserved 

1836 self._selected_obj.iloc[:, idx] 

1837 for idx, name in enumerate(self._selected_obj.columns) 

1838 if name not in in_axis_names and name in subsetted 

1839 ] 

1840 

1841 groupings = list(self.grouper.groupings) 

1842 for key in keys: 

1843 grouper, _, _ = get_grouper( 

1844 df, 

1845 key=key, 

1846 axis=self.axis, 

1847 sort=self.sort, 

1848 observed=False, 

1849 dropna=dropna, 

1850 ) 

1851 groupings += list(grouper.groupings) 

1852 

1853 # Take the size of the overall columns 

1854 gb = df.groupby( 

1855 groupings, 

1856 sort=self.sort, 

1857 observed=self.observed, 

1858 dropna=self.dropna, 

1859 ) 

1860 result_series = cast(Series, gb.size()) 

1861 

1862 # GH-46357 Include non-observed categories 

1863 # of non-grouping columns regardless of `observed` 

1864 if any( 

1865 isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) 

1866 and not grouping._observed 

1867 for grouping in groupings 

1868 ): 

1869 levels_list = [ping.result_index for ping in groupings] 

1870 multi_index, _ = MultiIndex.from_product( 

1871 levels_list, names=[ping.name for ping in groupings] 

1872 ).sortlevel() 

1873 result_series = result_series.reindex(multi_index, fill_value=0) 

1874 

1875 if normalize: 

1876 # Normalize the results by dividing by the original group sizes. 

1877 # We are guaranteed to have the first N levels be the 

1878 # user-requested grouping. 

1879 levels = list( 

1880 range(len(self.grouper.groupings), result_series.index.nlevels) 

1881 ) 

1882 indexed_group_size = result_series.groupby( 

1883 result_series.index.droplevel(levels), 

1884 sort=self.sort, 

1885 dropna=self.dropna, 

1886 ).transform("sum") 

1887 result_series /= indexed_group_size 

1888 

1889 # Handle groups of non-observed categories 

1890 result_series = result_series.fillna(0.0) 

1891 

1892 if sort: 

1893 # Sort the values and then resort by the main grouping 

1894 index_level = range(len(self.grouper.groupings)) 

1895 result_series = result_series.sort_values( 

1896 ascending=ascending 

1897 ).sort_index(level=index_level, sort_remaining=False) 

1898 

1899 result: Series | DataFrame 

1900 if self.as_index: 

1901 result = result_series 

1902 else: 

1903 # Convert to frame 

1904 name = "proportion" if normalize else "count" 

1905 index = result_series.index 

1906 columns = com.fill_missing_names(index.names) 

1907 if name in columns: 

1908 raise ValueError( 

1909 f"Column label '{name}' is duplicate of result column" 

1910 ) 

1911 result_series.name = name 

1912 result_series.index = index.set_names(range(len(columns))) 

1913 result_frame = result_series.reset_index() 

1914 result_frame.columns = columns + [name] 

1915 result = result_frame 

1916 return result.__finalize__(self.obj, method="value_counts") 

1917 

1918 

1919def _wrap_transform_general_frame( 

1920 obj: DataFrame, group: DataFrame, res: DataFrame | Series 

1921) -> DataFrame: 

1922 from pandas import concat 

1923 

1924 if isinstance(res, Series): 

1925 # we need to broadcast across the 

1926 # other dimension; this will preserve dtypes 

1927 # GH14457 

1928 if res.index.is_(obj.index): 

1929 res_frame = concat([res] * len(group.columns), axis=1) 

1930 res_frame.columns = group.columns 

1931 res_frame.index = group.index 

1932 else: 

1933 res_frame = obj._constructor( 

1934 np.tile(res.values, (len(group.index), 1)), 

1935 columns=group.columns, 

1936 index=group.index, 

1937 ) 

1938 assert isinstance(res_frame, DataFrame) 

1939 return res_frame 

1940 else: 

1941 return res