Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/generic.py: 12%
742 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Define the SeriesGroupBy and DataFrameGroupBy
3classes that hold the groupby interfaces (and some implementations).
5These are user facing as the result of the ``df.groupby(...)`` operations,
6which here returns a DataFrameGroupBy object.
7"""
8from __future__ import annotations
10from collections import abc
11from functools import partial
12from textwrap import dedent
13from typing import (
14 TYPE_CHECKING,
15 Any,
16 Callable,
17 Hashable,
18 Iterable,
19 Mapping,
20 NamedTuple,
21 Sequence,
22 TypeVar,
23 Union,
24 cast,
25)
26import warnings
28import numpy as np
30from pandas._libs import (
31 Interval,
32 lib,
33 reduction as libreduction,
34)
35from pandas._typing import (
36 ArrayLike,
37 Manager,
38 Manager2D,
39 SingleManager,
40)
41from pandas.errors import SpecificationError
42from pandas.util._decorators import (
43 Appender,
44 Substitution,
45 doc,
46)
47from pandas.util._exceptions import find_stack_level
49from pandas.core.dtypes.common import (
50 ensure_int64,
51 is_bool,
52 is_categorical_dtype,
53 is_dict_like,
54 is_integer_dtype,
55 is_interval_dtype,
56 is_scalar,
57)
58from pandas.core.dtypes.missing import (
59 isna,
60 notna,
61)
63from pandas.core import (
64 algorithms,
65 nanops,
66)
67from pandas.core.apply import (
68 GroupByApply,
69 maybe_mangle_lambdas,
70 reconstruct_func,
71 validate_func_kwargs,
72)
73from pandas.core.arrays.categorical import Categorical
74import pandas.core.common as com
75from pandas.core.construction import create_series_with_explicit_dtype
76from pandas.core.frame import DataFrame
77from pandas.core.groupby import base
78from pandas.core.groupby.groupby import (
79 GroupBy,
80 _agg_template,
81 _apply_docs,
82 _transform_template,
83 warn_dropping_nuisance_columns_deprecated,
84)
85from pandas.core.groupby.grouper import get_grouper
86from pandas.core.indexes.api import (
87 Index,
88 MultiIndex,
89 all_indexes_same,
90)
91from pandas.core.indexes.category import CategoricalIndex
92from pandas.core.series import Series
93from pandas.core.shared_docs import _shared_docs
94from pandas.core.util.numba_ import maybe_use_numba
96from pandas.plotting import boxplot_frame_groupby
98if TYPE_CHECKING: 98 ↛ 99line 98 didn't jump to line 99, because the condition on line 98 was never true
99 from pandas.core.generic import NDFrame
101# TODO(typing) the return value on this callable should be any *scalar*.
102AggScalar = Union[str, Callable[..., Any]]
103# TODO: validate types on ScalarResult and move to _typing
104# Blocked from using by https://github.com/python/mypy/issues/1484
105# See note at _mangle_lambda_list
106ScalarResult = TypeVar("ScalarResult")
class NamedAgg(NamedTuple):
    """
    Helper for column-specific aggregation with control over output column
    names, e.g. ``df.groupby(...).agg(out=NamedAgg(column="B", aggfunc="min"))``.
    """

    # label of the input column to aggregate
    column: Hashable
    # aggregation to apply: a callable or a string alias (e.g. "min")
    aggfunc: AggScalar
def generate_property(name: str, klass: type[DataFrame | Series]):
    """
    Create a property for a GroupBy subclass to dispatch to DataFrame/Series.

    Parameters
    ----------
    name : str
        Name of the DataFrame/Series attribute being dispatched to.
    klass : {DataFrame, Series}
        Class whose attribute supplies the docstring for the property.

    Returns
    -------
    property
    """
    # Borrow the docstring from the parent DataFrame/Series method so the
    # pinned GroupBy attribute shows the same help text.
    parent = getattr(klass, name)

    def prop(self):
        # Dispatch happens lazily, at attribute access time.
        return self._make_wrapper(name)

    prop.__name__ = name
    prop.__doc__ = parent.__doc__ or ""
    return property(prop)
def pin_allowlisted_properties(
    klass: type[DataFrame | Series], allowlist: frozenset[str]
):
    """
    Create GroupBy member defs for DataFrame/Series names in a allowlist.

    Parameters
    ----------
    klass : DataFrame or Series class
        class where members are defined.
    allowlist : frozenset[str]
        Set of names of klass methods to be constructed

    Returns
    -------
    class decorator

    Notes
    -----
    Since we don't want to override methods explicitly defined in the
    base class, any such name is skipped.
    """

    def pinner(cls):
        # Only pin names the class does not already provide; anything
        # explicitly defined on the base class takes precedence.
        to_pin = (name for name in allowlist if not hasattr(cls, name))
        for name in to_pin:
            setattr(cls, name, generate_property(name, klass))
        return cls

    return pinner
@pin_allowlisted_properties(Series, base.series_apply_allowlist)
class SeriesGroupBy(GroupBy[Series]):
    # Series methods/properties pinned onto this class by the decorator
    # above; explicit definitions below take precedence over pinned ones.
    _apply_allowlist = base.series_apply_allowlist

    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        # Wrap an aggregated internal manager (1D, or 2D with a single
        # column) back into a Series.
        if mgr.ndim == 1:
            mgr = cast(SingleManager, mgr)
            single = mgr
        else:
            mgr = cast(Manager2D, mgr)
            single = mgr.iget(0)
        ser = self.obj._constructor(single, name=self.obj.name)
        # NB: caller is responsible for setting ser.index
        return ser

    def _get_data_to_aggregate(self) -> SingleManager:
        # Internal manager of the Series after applying exclusions.
        ser = self._obj_with_exclusions
        single = ser._mgr
        return single

    def _iterate_slices(self) -> Iterable[Series]:
        # A Series has exactly one "column"; yield it.
        yield self._selected_obj

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        # NOTE: docstring comes from the @doc decorator; adding one here
        # would alter the rendered documentation.

        if maybe_use_numba(engine):
            # numba fast-path: aggregate via the numba engine and rebuild
            # the result Series directly.
            with self._group_selection_context():
                data = self._selected_obj
            result = self._aggregate_with_numba(
                data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
            index = self.grouper.result_index
            return self.obj._constructor(result.ravel(), index=index, name=data.name)

        # func=None means named aggregation via keyword arguments,
        # e.g. .agg(minimum="min").
        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            ret = self._aggregate_multiple_funcs(func)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            return ret

        else:
            # Prefer the cython-implemented method when one exists and no
            # extra arguments were supplied.
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # TODO: KeyError is raised in _python_agg_general,
                # see test_groupby.test_basic
                result = self._aggregate_named(func, *args, **kwargs)

                # result is a dict whose keys are the elements of result_index
                index = self.grouper.result_index
                return create_series_with_explicit_dtype(
                    result, index=index, dtype_if_empty=object
                )

    agg = aggregate

    def _aggregate_multiple_funcs(self, arg) -> DataFrame:
        # Apply several aggregations, producing one output column per
        # function; `arg` may be a list of funcs or (name, func) pairs.
        if isinstance(arg, dict):

            # show the deprecation, but only if we
            # have not shown a higher level one
            # GH 15931
            raise SpecificationError("nested renamer is not supported")

        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]

            # indicated column order
            columns = next(zip(*arg))
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                columns.append(com.get_callable_name(f) or f)

            arg = zip(columns, arg)

        results: dict[base.OutputKey, DataFrame | Series] = {}
        for idx, (name, func) in enumerate(arg):

            key = base.OutputKey(label=name, position=idx)
            results[key] = self.aggregate(func)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            # some aggregations produced frames; concatenate side by side
            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results.keys()]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        output = self._reindex_output(output)
        return output

    def _indexed_output_to_ndframe(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> Series:
        """
        Wrap the dict result of a GroupBy aggregation into a Series.
        """
        assert len(output) == 1
        values = next(iter(output.values()))
        result = self.obj._constructor(values)
        result.name = self.obj.name
        return result

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.
        override_group_keys : bool, default False
            Passed through to ``_concat_objects``.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=self.grouper.result_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                override_group_keys=override_group_keys,
            )
            result.name = self.obj.name
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Apply `func` group by group with group.name pinned, returning a
        # dict keyed by group name.
        # Note: this is very similar to _aggregate_series_pure_python,
        # but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            # object.__setattr__ bypasses pandas' attribute machinery
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result

    @Substitution(klass="Series")
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
    ):
        # Transform via the cython grouper; raises TypeError for dtypes the
        # cython op does not support.
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self.grouper.get_iterator(
            self._selected_obj, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Return a copy of a Series excluding elements from groups that
        do not satisfy the boolean criterion specified by func.

        Parameters
        ----------
        func : function
            To apply to each group. Should return True or False.
        dropna : Drop groups that do not pass the filter. True by default;
            if False, groups that evaluate False are filled with NaNs.

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64

        Returns
        -------
        filtered : Series
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return b and notna(b)

        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.
        """
        ids, _, _ = self.grouper.group_info

        val = self.obj._values

        # factorize values, then sort codes within each group id
        codes, _ = algorithms.factorize(val, sort=False)
        sorter = np.lexsort((codes, ids))
        codes = codes[sorter]
        ids = ids[sorter]

        # group boundaries are where group ids change
        # unique observations are where sorted values change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
        inc = np.r_[1, codes[1:] != codes[:-1]]

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]

        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            res[ids[idx]] = out

        result = self.obj._constructor(res, index=ri, name=self.obj.name)
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series:
        # Per-group value counts; mirrors Series.value_counts but produces
        # a MultiIndex of (group keys..., value).

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self.grouper.group_info
        val = self.obj._values

        names = self.grouper.names + [self.obj.name]

        if is_categorical_dtype(val.dtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.index.names = names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:

            # lab is a Categorical with categories an IntervalIndex
            lab = cut(Series(val), bins, include_lowest=True)
            # error: "ndarray" has no attribute "cat"
            lev = lab.cat.categories  # type: ignore[attr-defined]
            # error: No overload variant of "take" of "_ArrayOrScalarCommon" matches
            # argument types "Any", "bool", "Union[Any, float]"
            lab = lev.take(  # type: ignore[call-overload]
                # error: "ndarray" has no attribute "cat"
                lab.cat.codes,  # type: ignore[attr-defined]
                allow_fill=True,
                # error: Item "ndarray" of "Union[ndarray, Index]" has no attribute
                # "_na_value"
                fill_value=lev._na_value,  # type: ignore[union-attr]
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab.dtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self.grouper.reconstructed_codes
        # error: Incompatible types in assignment (expression has type
        # "List[ndarray[Any, dtype[_SCT]]]",
        # variable has type "List[ndarray[Any, dtype[signedinteger[Any]]]]")
        codes = [  # type: ignore[assignment]
            rep(level_codes) for level_codes in codes
        ] + [llab(lab, inc)]
        # error: List item 0 has incompatible type "Union[ndarray[Any, Any], Index]";
        # expected "Index"
        levels = [ping.group_index for ping in self.grouper.groupings] + [
            lev  # type: ignore[list-item]
        ]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        return self.obj._constructor(out, index=mi, name=self.obj.name)

    @doc(Series.nlargest)
    def nlargest(self, n: int = 5, keep: str = "first") -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest)
    def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result
780@pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist)
781class DataFrameGroupBy(GroupBy[DataFrame]):
783 _apply_allowlist = base.dataframe_apply_allowlist
785 _agg_examples_doc = dedent(
786 """
787 Examples
788 --------
789 >>> df = pd.DataFrame(
790 ... {
791 ... "A": [1, 1, 2, 2],
792 ... "B": [1, 2, 3, 4],
793 ... "C": [0.362838, 0.227877, 1.267767, -0.562860],
794 ... }
795 ... )
797 >>> df
798 A B C
799 0 1 1 0.362838
800 1 1 2 0.227877
801 2 2 3 1.267767
802 3 2 4 -0.562860
804 The aggregation is for each column.
806 >>> df.groupby('A').agg('min')
807 B C
808 A
809 1 1 0.227877
810 2 3 -0.562860
812 Multiple aggregations
814 >>> df.groupby('A').agg(['min', 'max'])
815 B C
816 min max min max
817 A
818 1 1 2 0.227877 0.362838
819 2 3 4 -0.562860 1.267767
821 Select a column for aggregation
823 >>> df.groupby('A').B.agg(['min', 'max'])
824 min max
825 A
826 1 1 2
827 2 3 4
829 User-defined function for aggregation
831 >>> df.groupby('A').agg(lambda x: sum(x) + 2)
832 B C
833 A
834 1 5 2.590715
835 2 9 2.704907
837 Different aggregations per column
839 >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
840 B C
841 min max sum
842 A
843 1 1 2 0.590715
844 2 3 4 0.704907
846 To control the output names with different aggregations per column,
847 pandas supports "named aggregation"
849 >>> df.groupby("A").agg(
850 ... b_min=pd.NamedAgg(column="B", aggfunc="min"),
851 ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
852 b_min c_sum
853 A
854 1 1 0.590715
855 2 3 0.704907
857 - The keywords are the *output* column names
858 - The values are tuples whose first element is the column to select
859 and the second element is the aggregation to apply to that column.
860 Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
861 ``['column', 'aggfunc']`` to make it clearer what the arguments are.
862 As usual, the aggregation can be a callable or a string alias.
864 See :ref:`groupby.aggregate.named` for more.
866 .. versionchanged:: 1.3.0
868 The resulting dtype will reflect the return value of the aggregating function.
870 >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
871 B
872 A
873 1 1.0
874 2 3.0
875 """
876 )
    @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        # NOTE: docstring comes from the @doc decorator; adding one here
        # would alter the rendered documentation.

        if maybe_use_numba(engine):
            # numba fast-path
            with self._group_selection_context():
                data = self._selected_obj
            result = self._aggregate_with_numba(
                data, func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
            index = self.grouper.result_index
            return self.obj._constructor(result, index=index, columns=data.columns)

        # Normalize named-aggregation kwargs into (func, columns, order).
        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        op = GroupByApply(self, func, args, kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            return result
        elif relabeling and result is not None:
            # this should be the only (non-raising) case with relabeling
            # used reordered index of columns
            result = result.iloc[:, order]
            result.columns = columns

        if result is None:

            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:

                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "no results" not in str(err):
                        # raised directly by _aggregate_multiple_funcs
                        raise
                    result = self._aggregate_frame(func)

                else:
                    sobj = self._selected_obj

                    if isinstance(sobj, Series):
                        # GH#35246 test_groupby_as_index_select_column_sum_empty_df
                        result.columns = self._obj_with_exclusions.columns.copy()
                    else:
                        # Retain our column names
                        result.columns._set_names(
                            sobj.columns.names, level=list(range(sobj.columns.nlevels))
                        )
                        # select everything except for the last level, which is the one
                        # containing the name of the function(s), see GH#32040
                        result.columns = result.columns.droplevel(-1)

        if not self.as_index:
            # group keys become ordinary columns and the index is reset
            self._insert_inaxis_grouper_inplace(result)
            result.index = Index(range(len(result)))

        return result

    agg = aggregate
    def _iterate_slices(self) -> Iterable[Series]:
        # Yield one Series per non-excluded column (transposing first when
        # grouping along axis=1).
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T

        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    continue

                yield values
    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        # Apply `func` to each group's sub-frame as a whole (single grouping
        # key only) and assemble the per-group results into a DataFrame.
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        if self.axis == 0:
            # test_pass_args_kwargs_duplicate_columns gets here with non-unique columns
            for name, data in self.grouper.get_iterator(obj, self.axis):
                fres = func(data, *args, **kwargs)
                result[name] = fres
        else:
            # we get here in a number of test_multilevel tests
            for name in self.indices:
                grp_df = self.get_group(name, obj=obj)
                fres = func(grp_df, *args, **kwargs)
                result[name] = fres

        result_index = self.grouper.result_index
        # the non-grouped axis of the original object
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out
    def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
        # Aggregate column by column via per-column SeriesGroupBy objects.
        # only for axis==0
        # tests that get here with non-unique cols:
        #  test_resample_with_timedelta_yields_no_empty_groups,
        #  test_resample_apply_product
        obj = self._obj_with_exclusions
        result: dict[int, NDFrame] = {}

        # keys are positional to tolerate duplicate column labels
        for i, (item, sgb) in enumerate(self._iterate_column_groupbys(obj)):
            result[i] = sgb.aggregate(func, *args, **kwargs)

        res_df = self.obj._constructor(result)
        res_df.columns = obj.columns
        return res_df
    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ):
        # Wrap the per-group results of DataFrameGroupBy.apply into a
        # DataFrame or Series, dispatching on the type of the first
        # non-None applied value.

        if len(values) == 0:
            # no groups: empty frame with the original columns/dtypes
            result = self.obj._constructor(
                index=self.grouper.result_index, columns=data.columns
            )
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                override_group_keys=override_group_keys,
            )

        key_index = self.grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            #  fall through to the outer else clause
            # TODO: sure this is right?  we used to do this
            #  after raising AttributeError above
            return self.obj._constructor_sliced(
                values, index=key_index, name=self._selection
            )
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            #  result should not take the name of original selection
            #  of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                self._insert_inaxis_grouper_inplace(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                override_group_keys,
            )
    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index,
        override_group_keys: bool,
    ) -> DataFrame | Series:
        # Combine per-group Series results of apply into a DataFrame (or a
        # Series on the squeeze/concat paths).
        # this is to silence a DeprecationWarning
        # TODO(2.0): Remove when default dtype of empty Series is object
        kwargs = first_not_none._construct_axes_dict()
        backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        # GH3596
        # provide a reduction (Frame -> Series) if groups are
        # unique
        if self.squeeze:
            applied_index = self._selected_obj._get_axis(self.axis)
            singular_series = len(values) == 1 and applied_index.nlevels == 1

            if singular_series:
                # GH2893
                # we have series in the values array, we want to
                # produce a series:
                # if any of the sub-series are not indexed the same
                # OR we don't have a multi-index and we have only a
                # single values
                return self._concat_objects(
                    values,
                    not_indexed_same=not_indexed_same,
                    override_group_keys=override_group_keys,
                )

            # still a series
            # path added as of GH 5545
            elif all_indexed_same:
                from pandas.core.reshape.concat import concat

                return concat(values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                override_group_keys=override_group_keys,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = list(names)[0]
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            self._insert_inaxis_grouper_inplace(result)

        return self._reindex_output(result)
    def _cython_transform(
        self,
        how: str,
        numeric_only: bool | lib.NoDefault = lib.no_default,
        axis: int = 0,
        **kwargs,
    ) -> DataFrame:
        """
        Dispatch a transform kernel (e.g. "cumsum", "rank") to the cython
        grouped-operation machinery, operating on the internal manager.

        Parameters
        ----------
        how : str
            Name of the cython transform kernel.
        numeric_only : bool or lib.no_default
            Whether to restrict to numeric columns; resolved per-``how``.
        axis : int
            Must be 0 here; axis handling is done by the caller.
        **kwargs
            Extra keyword arguments forwarded to the kernel.
        """
        assert axis == 0  # handled by caller
        # TODO: no tests with self.ndim == 1 for DataFrameGroupBy
        numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis)

        # With self.axis == 0, we have multi-block tests
        # e.g. test_rank_min_int, test_cython_transform_frame
        # test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        # so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate()
        orig_mgr_len = len(mgr)
        if numeric_only_bool:
            mgr = mgr.get_numeric_data(copy=False)

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            # Run the cython transform kernel on one block's values.
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        # we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
        res_mgr.set_axis(1, mgr.axes[1])

        if len(res_mgr) < orig_mgr_len:
            # Some blocks failed and were dropped: emit the deprecation
            # warning for silent nuisance-column dropping.
            warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

        res_df = self.obj._constructor(res_mgr)
        if self.axis == 1:
            # Undo the transpose done by _get_data_to_aggregate.
            res_df = res_df.T
        return res_df
    def _transform_general(self, func, *args, **kwargs):
        """
        Fallback (pure-python) implementation of ``transform``: apply
        ``func`` group by group, then reassemble the pieces in the
        original row order.

        The first group is used to choose between a "fast path" (call the
        function on the whole group) and a "slow path" (``group.apply``);
        the chosen path is then reused for the remaining groups.
        """
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            # Expose the group key to the UDF via the `.name` attribute.
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except TypeError:
                # Frame-level call failed for a type reason; retry per-column.
                return self._transform_item_by_item(obj, fast_path)
            except ValueError as err:
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        emit_alignment_warning = False
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)
            if (
                not emit_alignment_warning
                and res.ndim == 2
                and not res.index.equals(group.index)
            ):
                # DataFrame result whose index does not match its group's
                # index; remember to warn once after the loop.
                emit_alignment_warning = True

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        if emit_alignment_warning:
            # GH#45648
            warnings.warn(
                "In a future version of pandas, returning a DataFrame in "
                "groupby.transform will align with the input's index. Apply "
                "`.to_numpy()` to the result in the transform function to keep "
                "the current behavior and silence this warning.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)
    @Substitution(klass="DataFrame")
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        # Thin wrapper: the shared GroupBy._transform implements the actual
        # dispatch (including engine selection).  The public docstring is
        # injected by the decorators above, so none is written here.
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )
1259 def _define_paths(self, func, *args, **kwargs):
1260 if isinstance(func, str):
1261 fast_path = lambda group: getattr(group, func)(*args, **kwargs)
1262 slow_path = lambda group: group.apply(
1263 lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
1264 )
1265 else:
1266 fast_path = lambda group: func(group, *args, **kwargs)
1267 slow_path = lambda group: group.apply(
1268 lambda x: func(x, *args, **kwargs), axis=self.axis
1269 )
1270 return fast_path, slow_path
1272 def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
1273 path = slow_path
1274 res = slow_path(group)
1276 if self.ngroups == 1:
1277 # no need to evaluate multiple paths when only
1278 # a single group exists
1279 return path, res
1281 # if we make it here, test if we can use the fast path
1282 try:
1283 res_fast = fast_path(group)
1284 except AssertionError:
1285 raise # pragma: no cover
1286 except Exception:
1287 # GH#29631 For user-defined function, we can't predict what may be
1288 # raised; see test_transform.test_transform_fastpath_raises
1289 return path, res
1291 # verify fast path returns either:
1292 # a DataFrame with columns equal to group.columns
1293 # OR a Series with index equal to group.columns
1294 if isinstance(res_fast, DataFrame):
1295 if not res_fast.columns.equals(group.columns):
1296 return path, res
1297 elif isinstance(res_fast, Series):
1298 if not res_fast.index.equals(group.columns):
1299 return path, res
1300 else:
1301 return path, res
1303 if res_fast.equals(res):
1304 path = fast_path
1306 return path, res
1308 def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
1309 # iterate through columns, see test_transform_exclude_nuisance
1310 # gets here with non-unique columns
1311 output = {}
1312 inds = []
1313 for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)):
1314 try:
1315 output[i] = sgb.transform(wrapper)
1316 except TypeError:
1317 # e.g. trying to call nanmean with string values
1318 warn_dropping_nuisance_columns_deprecated(
1319 type(self), "transform", numeric_only=False
1320 )
1321 else:
1322 inds.append(i)
1324 if not output:
1325 raise TypeError("Transform function invalid for data types")
1327 columns = obj.columns.take(inds)
1329 result = self.obj._constructor(output, index=obj.index)
1330 result.columns = columns
1331 return result
    def filter(self, func, dropna=True, *args, **kwargs):
        """
        Return a copy of a DataFrame excluding filtered elements.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Function to apply to each subframe. Should return True or False.
        dropna : Drop groups that do not pass the filter. True by default;
            If False, groups that evaluate False are filled with NaNs.

        Returns
        -------
        filtered : DataFrame

        Notes
        -----
        Each subframe is endowed the attribute 'name' in case you need to know
        which group you are working on.

        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """
        indices = []

        obj = self._selected_obj
        gen = self.grouper.get_iterator(obj, axis=self.axis)

        for name, group in gen:
            # Expose the group key to the UDF via the `.name` attribute.
            object.__setattr__(group, "name", name)

            res = func(group, *args, **kwargs)

            try:
                # Collapse a 1-element Series/DataFrame result to a scalar.
                res = res.squeeze()
            except AttributeError:  # allow e.g., scalars and frames to pass
                pass

            # interpret the result of the filter
            if is_bool(res) or (is_scalar(res) and isna(res)):
                # NA counts as "group filtered out" (notna check below).
                if res and notna(res):
                    indices.append(self._get_index(name))
            else:
                # non scalars aren't allowed
                raise TypeError(
                    f"filter function returned a {type(res).__name__}, "
                    "but expected a scalar bool"
                )

        return self._apply_filter(indices, dropna)
1401 def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
1402 if self.axis == 1:
1403 # GH 37725
1404 raise ValueError("Cannot subset columns when using axis=1")
1405 # per GH 23566
1406 if isinstance(key, tuple) and len(key) > 1:
1407 # if len == 1, then it becomes a SeriesGroupBy and this is actually
1408 # valid syntax, so don't raise warning
1409 warnings.warn(
1410 "Indexing with multiple keys (implicitly converted to a tuple "
1411 "of keys) will be deprecated, use a list instead.",
1412 FutureWarning,
1413 stacklevel=find_stack_level(),
1414 )
1415 return super().__getitem__(key)
1417 def _gotitem(self, key, ndim: int, subset=None):
1418 """
1419 sub-classes to define
1420 return a sliced object
1422 Parameters
1423 ----------
1424 key : string / list of selections
1425 ndim : {1, 2}
1426 requested ndim of result
1427 subset : object, default None
1428 subset to act on
1429 """
1430 if ndim == 2:
1431 if subset is None:
1432 subset = self.obj
1433 return DataFrameGroupBy(
1434 subset,
1435 self.grouper,
1436 axis=self.axis,
1437 level=self.level,
1438 grouper=self.grouper,
1439 exclusions=self.exclusions,
1440 selection=key,
1441 as_index=self.as_index,
1442 sort=self.sort,
1443 group_keys=self.group_keys,
1444 squeeze=self.squeeze,
1445 observed=self.observed,
1446 mutated=self.mutated,
1447 dropna=self.dropna,
1448 )
1449 elif ndim == 1:
1450 if subset is None:
1451 subset = self.obj[key]
1452 return SeriesGroupBy(
1453 subset,
1454 level=self.level,
1455 grouper=self.grouper,
1456 selection=key,
1457 sort=self.sort,
1458 group_keys=self.group_keys,
1459 squeeze=self.squeeze,
1460 observed=self.observed,
1461 dropna=self.dropna,
1462 )
1464 raise AssertionError("invalid ndim for _gotitem")
1466 def _get_data_to_aggregate(self) -> Manager2D:
1467 obj = self._obj_with_exclusions
1468 if self.axis == 1:
1469 return obj.T._mgr
1470 else:
1471 return obj._mgr
1473 def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None:
1474 # zip in reverse so we can always insert at loc 0
1475 columns = result.columns
1476 for name, lev, in_axis in zip(
1477 reversed(self.grouper.names),
1478 reversed(self.grouper.get_group_levels()),
1479 reversed([grp.in_axis for grp in self.grouper.groupings]),
1480 ):
1481 # GH #28549
1482 # When using .apply(-), name will be in columns already
1483 if in_axis and name not in columns:
1484 result.insert(0, name, lev)
1486 def _indexed_output_to_ndframe(
1487 self, output: Mapping[base.OutputKey, ArrayLike]
1488 ) -> DataFrame:
1489 """
1490 Wrap the dict result of a GroupBy aggregation into a DataFrame.
1491 """
1492 indexed_output = {key.position: val for key, val in output.items()}
1493 columns = Index([key.label for key in output])
1494 columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
1496 result = self.obj._constructor(indexed_output)
1497 result.columns = columns
1498 return result
    def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
        """
        Turn the aggregated 2D manager back into a DataFrame, attaching the
        appropriate result index: the group labels when ``as_index=True``,
        otherwise a positional range with the grouping columns re-inserted
        as regular columns.
        """
        if not self.as_index:
            # GH 41998 - empty mgr always gets index of length 0
            rows = mgr.shape[1] if mgr.shape[0] > 0 else 0
            index = Index(range(rows))
            mgr.set_axis(1, index)
            result = self.obj._constructor(mgr)

            self._insert_inaxis_grouper_inplace(result)
            result = result._consolidate()
        else:
            index = self.grouper.result_index
            mgr.set_axis(1, index)
            result = self.obj._constructor(mgr)

        if self.axis == 1:
            # Aggregation ran on transposed data; undo the transpose.
            result = result.T

        # Note: we only need to pass datetime=True in order to get numeric
        # values converted
        return self._reindex_output(result)._convert(datetime=True)
1522 def _iterate_column_groupbys(self, obj: DataFrame | Series):
1523 for i, colname in enumerate(obj.columns):
1524 yield colname, SeriesGroupBy(
1525 obj.iloc[:, i],
1526 selection=colname,
1527 grouper=self.grouper,
1528 exclusions=self.exclusions,
1529 observed=self.observed,
1530 )
1532 def _apply_to_column_groupbys(self, func, obj: DataFrame | Series) -> DataFrame:
1533 from pandas.core.reshape.concat import concat
1535 columns = obj.columns
1536 results = [
1537 func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
1538 ]
1540 if not len(results):
1541 # concat would raise
1542 return DataFrame([], columns=columns, index=self.grouper.result_index)
1543 else:
1544 return concat(results, keys=columns, axis=1)
    def nunique(self, dropna: bool = True) -> DataFrame:
        """
        Return DataFrame with counts of unique elements in each position.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique: DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1 value2
        0  spam       1      a
        1   egg       5      b
        2   egg       5      b
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y

        >>> df.groupby('id').nunique()
              value1  value2
        id
        egg        1       1
        ham        1       2
        spam       2       1

        Check for rows with the same id but conflicting values:

        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1 value2
        0  spam       1      a
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y
        """

        if self.axis != 0:
            # see test_groupby_crash_on_nunique
            # axis=1: fall back to a python-level per-group aggregation.
            return self._python_agg_general(lambda sgb: sgb.nunique(dropna))

        obj = self._obj_with_exclusions
        # Compute nunique column-by-column via each column's SeriesGroupBy.
        results = self._apply_to_column_groupbys(
            lambda sgb: sgb.nunique(dropna), obj=obj
        )

        if not self.as_index:
            # Flatten to a positional index and re-insert the grouping
            # columns as regular columns.
            results.index = Index(range(len(results)))
            self._insert_inaxis_grouper_inplace(results)

        return results
    @doc(
        _shared_docs["idxmax"],
        numeric_only_default="True for axis=0, False for axis=1",
    )
    def idxmax(
        self,
        axis=0,
        skipna: bool = True,
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ) -> DataFrame:
        # Public docstring comes from _shared_docs["idxmax"] via @doc above.
        axis = DataFrame._get_axis_number(axis)
        if numeric_only is lib.no_default:
            # Cannot use self._resolve_numeric_only; we must pass None to
            # DataFrame.idxmax for backwards compatibility
            numeric_only_arg = None if axis == 0 else False
        else:
            numeric_only_arg = numeric_only

        def func(df):
            # Per group: argmax positions -> labels along the chosen axis.
            with warnings.catch_warnings():
                # Suppress numeric_only warnings here, will warn below
                warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax")
                res = df._reduce(
                    nanops.nanargmax,
                    "argmax",
                    axis=axis,
                    skipna=skipna,
                    numeric_only=numeric_only_arg,
                )
                indices = res._values
                index = df._get_axis(axis)
                # A position of -1 signals all-NA; map it to NaN.
                result = [index[i] if i >= 0 else np.nan for i in indices]
                return df._constructor_sliced(result, index=res.index)

        func.__name__ = "idxmax"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only)
        return result
    @doc(
        _shared_docs["idxmin"],
        numeric_only_default="True for axis=0, False for axis=1",
    )
    def idxmin(
        self,
        axis=0,
        skipna: bool = True,
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ) -> DataFrame:
        # Public docstring comes from _shared_docs["idxmin"] via @doc above.
        axis = DataFrame._get_axis_number(axis)
        if numeric_only is lib.no_default:
            # Cannot use self._resolve_numeric_only; we must pass None to
            # DataFrame.idxmin for backwards compatibility
            numeric_only_arg = None if axis == 0 else False
        else:
            numeric_only_arg = numeric_only

        def func(df):
            # Per group: argmin positions -> labels along the chosen axis.
            with warnings.catch_warnings():
                # Suppress numeric_only warnings here, will warn below
                warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin")
                res = df._reduce(
                    nanops.nanargmin,
                    "argmin",
                    axis=axis,
                    skipna=skipna,
                    numeric_only=numeric_only_arg,
                )
                indices = res._values
                index = df._get_axis(axis)
                # A position of -1 signals all-NA; map it to NaN.
                result = [index[i] if i >= 0 else np.nan for i in indices]
                return df._constructor_sliced(result, index=res.index)

        func.__name__ = "idxmin"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only)
        return result
    # Bind the plotting helper as a method; `boxplot_frame_groupby` is
    # presumably imported at module level - defined outside this view.
    boxplot = boxplot_frame_groupby
    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts: Equivalent method on Series.
        DataFrame.value_counts: Equivalent method on DataFrame.
        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will have an
          additional column with the value_counts. The column is labelled 'count' or
          'proportion', depending on the ``normalize`` parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender  education country
        0    male        low      US
        1    male     medium      FR
        2  female       high      US
        3    male        low      FR
        4  female       high      FR
        5    male        low      FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender education country  count
        0  female      high      FR      1
        1  female      high      US      1
        2    male       low      FR      2
        3    male       low      US      1
        4    male    medium      FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender education country  proportion
        0  female      high      FR        0.50
        1  female      high      US        0.50
        2    male       low      FR        0.50
        3    male       low      US        0.25
        4    male    medium      FR        0.25
        """
        if self.axis == 1:
            raise NotImplementedError(
                "DataFrameGroupBy.value_counts only handles axis=0"
            )

        with self._group_selection_context():
            df = self.obj

            # Labels of groupers that live in the frame itself; these must
            # not be counted a second time below.
            in_axis_names = {
                grouping.name for grouping in self.grouper.groupings if grouping.in_axis
            }
            if isinstance(self._selected_obj, Series):
                name = self._selected_obj.name
                keys = [] if name in in_axis_names else [self._selected_obj]
            else:
                unique_cols = set(self._selected_obj.columns)
                if subset is not None:
                    # Validate `subset`: it may neither overlap the grouping
                    # keys nor reference columns absent from the frame.
                    subsetted = set(subset)
                    clashing = subsetted & set(in_axis_names)
                    if clashing:
                        raise ValueError(
                            f"Keys {clashing} in subset cannot be in "
                            "the groupby column keys."
                        )
                    doesnt_exist = subsetted - unique_cols
                    if doesnt_exist:
                        raise ValueError(
                            f"Keys {doesnt_exist} in subset do not "
                            f"exist in the DataFrame."
                        )
                else:
                    subsetted = unique_cols

                keys = [
                    # Can't use .values because the column label needs to be preserved
                    self._selected_obj.iloc[:, idx]
                    for idx, name in enumerate(self._selected_obj.columns)
                    if name not in in_axis_names and name in subsetted
                ]

            # Extend the original groupings with one grouping per counted
            # column, then count combinations via a size() on the union.
            groupings = list(self.grouper.groupings)
            for key in keys:
                grouper, _, _ = get_grouper(
                    df,
                    key=key,
                    axis=self.axis,
                    sort=self.sort,
                    observed=False,
                    dropna=dropna,
                )
                groupings += list(grouper.groupings)

            # Take the size of the overall columns
            gb = df.groupby(
                groupings,
                sort=self.sort,
                observed=self.observed,
                dropna=self.dropna,
            )
            result_series = cast(Series, gb.size())

            # GH-46357 Include non-observed categories
            # of non-grouping columns regardless of `observed`
            if any(
                isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
                and not grouping._observed
                for grouping in groupings
            ):
                levels_list = [ping.result_index for ping in groupings]
                multi_index, _ = MultiIndex.from_product(
                    levels_list, names=[ping.name for ping in groupings]
                ).sortlevel()
                result_series = result_series.reindex(multi_index, fill_value=0)

            if normalize:
                # Normalize the results by dividing by the original group sizes.
                # We are guaranteed to have the first N levels be the
                # user-requested grouping.
                levels = list(
                    range(len(self.grouper.groupings), result_series.index.nlevels)
                )
                indexed_group_size = result_series.groupby(
                    result_series.index.droplevel(levels),
                    sort=self.sort,
                    dropna=self.dropna,
                ).transform("sum")
                result_series /= indexed_group_size

                # Handle groups of non-observed categories
                result_series = result_series.fillna(0.0)

            if sort:
                # Sort the values and then resort by the main grouping
                index_level = range(len(self.grouper.groupings))
                result_series = result_series.sort_values(
                    ascending=ascending
                ).sort_index(level=index_level, sort_remaining=False)

            result: Series | DataFrame
            if self.as_index:
                result = result_series
            else:
                # Convert to frame
                name = "proportion" if normalize else "count"
                index = result_series.index
                columns = com.fill_missing_names(index.names)
                if name in columns:
                    raise ValueError(
                        f"Column label '{name}' is duplicate of result column"
                    )
                result_series.name = name
                result_series.index = index.set_names(range(len(columns)))
                result_frame = result_series.reset_index()
                result_frame.columns = columns + [name]
                result = result_frame
            return result.__finalize__(self.obj, method="value_counts")
1919def _wrap_transform_general_frame(
1920 obj: DataFrame, group: DataFrame, res: DataFrame | Series
1921) -> DataFrame:
1922 from pandas import concat
1924 if isinstance(res, Series):
1925 # we need to broadcast across the
1926 # other dimension; this will preserve dtypes
1927 # GH14457
1928 if res.index.is_(obj.index):
1929 res_frame = concat([res] * len(group.columns), axis=1)
1930 res_frame.columns = group.columns
1931 res_frame.index = group.index
1932 else:
1933 res_frame = obj._constructor(
1934 np.tile(res.values, (len(group.index), 1)),
1935 columns=group.columns,
1936 index=group.index,
1937 )
1938 assert isinstance(res_frame, DataFrame)
1939 return res_frame
1940 else:
1941 return res