Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/groupby.py: 18%
1236 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Provide the groupby split-apply-combine paradigm. Define the GroupBy
3class providing the base-class of operations.
5The SeriesGroupBy and DataFrameGroupBy sub-class
6(defined in pandas.core.groupby.generic)
7expose these user-facing objects to provide specific functionality.
8"""
9from __future__ import annotations
11from contextlib import contextmanager
12import datetime
13from functools import (
14 partial,
15 wraps,
16)
17import inspect
18from textwrap import dedent
19import types
20from typing import (
21 TYPE_CHECKING,
22 Callable,
23 Hashable,
24 Iterable,
25 Iterator,
26 List,
27 Literal,
28 Mapping,
29 Sequence,
30 TypeVar,
31 Union,
32 cast,
33 final,
34)
35import warnings
37import numpy as np
39from pandas._config.config import option_context
41from pandas._libs import (
42 Timestamp,
43 lib,
44)
45import pandas._libs.groupby as libgroupby
46from pandas._typing import (
47 ArrayLike,
48 IndexLabel,
49 NDFrameT,
50 PositionalIndexer,
51 RandomState,
52 Scalar,
53 T,
54 npt,
55)
56from pandas.compat.numpy import function as nv
57from pandas.errors import (
58 AbstractMethodError,
59 DataError,
60)
61from pandas.util._decorators import (
62 Appender,
63 Substitution,
64 cache_readonly,
65 doc,
66)
67from pandas.util._exceptions import find_stack_level
69from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
70from pandas.core.dtypes.common import (
71 is_bool_dtype,
72 is_datetime64_dtype,
73 is_float_dtype,
74 is_integer,
75 is_integer_dtype,
76 is_numeric_dtype,
77 is_object_dtype,
78 is_scalar,
79 is_timedelta64_dtype,
80)
81from pandas.core.dtypes.missing import (
82 isna,
83 notna,
84)
86from pandas.core import nanops
87from pandas.core._numba import executor
88import pandas.core.algorithms as algorithms
89from pandas.core.arrays import (
90 BaseMaskedArray,
91 BooleanArray,
92 Categorical,
93 ExtensionArray,
94)
95from pandas.core.base import (
96 PandasObject,
97 SelectionMixin,
98)
99import pandas.core.common as com
100from pandas.core.frame import DataFrame
101from pandas.core.generic import NDFrame
102from pandas.core.groupby import (
103 base,
104 numba_,
105 ops,
106)
107from pandas.core.groupby.indexing import (
108 GroupByIndexingMixin,
109 GroupByNthSelector,
110)
111from pandas.core.indexes.api import (
112 CategoricalIndex,
113 Index,
114 MultiIndex,
115 RangeIndex,
116)
117from pandas.core.internals.blocks import ensure_block_shape
118import pandas.core.sample as sample
119from pandas.core.series import Series
120from pandas.core.sorting import get_group_index_sorter
121from pandas.core.util.numba_ import (
122 get_jit_arguments,
123 maybe_use_numba,
124)
if TYPE_CHECKING:
    # Imported for static type checking only: importing pandas.core.window at
    # runtime would create a circular import with pandas.core.groupby.
    from pandas.core.window import (
        ExpandingGroupby,
        ExponentialMovingWindowGroupby,
        RollingGroupby,
    )
# Shared "See Also" docstring fragment; interpolated into method docstrings
# via %-substitution, where %(name)s is the method name (e.g. "max").
_common_see_also = """
See Also
--------
Series.%(name)s : Apply a function %(name)s to a Series.
DataFrame.%(name)s : Apply a function %(name)s
    to each row or column of a DataFrame.
"""
# Docstring building blocks for GroupBy.apply: "template" is the shared body
# (with {input}/{examples} placeholders), and the two "*_examples" entries are
# substituted in by SeriesGroupBy.apply / DataFrameGroupBy.apply respectively.
_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of method that will
    be much faster than using ``apply`` for their specific purposes, so try to
    use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    applied : Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1,2,3],
    ...                    'C': [4,6,5]})
    >>> g1 = df.groupby('A', group_keys=False)
    >>> g2 = df.groupby('A', group_keys=True)

    Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: below the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series.  `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64""",
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g1 = s.groupby(s.index, group_keys=False)
    >>> g2 = s.groupby(s.index, group_keys=True)

    From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
    Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}
# Docstring template for simple reduction methods (sum/prod/min/max/...);
# formatted with fname (method name), no (numeric_only default), mc
# (min_count default).
_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns. If None, will attempt to use
    everything, then use only numeric data.
min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.
"""
# Docstring template for GroupBy.pipe; %(klass)s and %(examples)s are filled
# in via the Substitution decorator at each use site.
_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
       Positional arguments passed into `func`.
kwargs : dict, optional
         A dictionary of keyword arguments passed into `func`.

Returns
-------
object : the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe: Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""
# Docstring template for GroupBy.transform; %(klass)s is filled in via the
# Substitution decorator (Series or DataFrame).
_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function
    Function to apply to each group. See the Notes section below for requirements.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. deprecated:: 1.5.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, currently pandas does not align the result's index
    with the input's index. This behavior is deprecated and alignment will
    be performed in a future version of pandas. You can apply ``.to_numpy()`` to the
    result of the transformation function to avoid alignment.

Examples
--------

>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
...                           'foo', 'bar'],
...                    'B' : ['one', 'one', 'two', 'three',
...                           'two', 'two'],
...                    'C' : [1, 5, 5, 2, 5, 5],
...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
>>> grouped = df.groupby('A')[['C', 'D']]
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
          C         D
0 -1.154701 -0.577350
1  0.577350  0.000000
2  0.577350  1.154701
3 -1.154701 -1.000000
4  0.577350 -0.577350
5  0.577350  1.000000

Broadcast result of the transformation

>>> grouped.transform(lambda x: x.max() - x.min())
     C    D
0  4.0  6.0
1  3.0  8.0
2  4.0  6.0
3  3.0  8.0
4  4.0  6.0
5  3.0  8.0

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    for example:

>>> grouped.transform(lambda x: x.astype(int).max())
   C  D
0  5  8
1  5  9
2  5  8
3  5  9
4  5  8
5  5  9
"""
# Docstring template for GroupBy.aggregate; formatted with {klass} and
# {examples}. Literal braces in the string are doubled ({{...}}) so that
# str.format leaves them intact.
_agg_template = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list or dict
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Aggregate using one or more
    operations over the specified axis.
{klass}.aggregate : Transforms the Series on each group
    based on the given function.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""
@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.

    Both ``grouped.plot(...)`` and ``grouped.plot.<kind>(...)`` are supported;
    each call is dispatched to every group via ``GroupBy.apply``.
    """

    def __init__(self, groupby: GroupBy) -> None:
        # The GroupBy instance whose groups will be plotted.
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        # Plot each group by applying ``obj.plot(*args, **kwargs)`` group-wise.
        def f(self):
            return self.plot(*args, **kwargs)

        # Name the callable "plot" so downstream machinery can recognize it.
        f.__name__ = "plot"
        return self._groupby.apply(f)

    def __getattr__(self, name: str):
        # Forward sub-methods (e.g. ``.plot.hist``) to each group's plot
        # accessor, again via GroupBy.apply.
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr
# Acceptable types for the ``keys``/``by`` argument of groupby: a single
# label, a list of labels, a mapping of labels, or callable(s) applied to
# the axis labels.
_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]
class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    # Labels selected implicitly by excluding in-axis grouping keys;
    # None until _set_group_selection (on GroupBy) computes it.
    _group_selection: IndexLabel | None = None
    # Method names that may be dispatched to the grouped object via apply.
    _apply_allowlist: frozenset[str] = frozenset()
    # Attributes hidden from tab-completion / __dir__.
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "mutated",
        "obj",
        "observed",
        "sort",
        "squeeze",
    }

    axis: int
    grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    group_keys: bool | lib.NoDefault

    @final
    def __len__(self) -> int:
        # Number of groups formed by the grouper.
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.
        """
        return self.grouper.groups

    @final
    @property
    def ngroups(self) -> int:
        # Number of distinct groups.
        return self.grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.
        """
        return self.grouper.indices

    @final
    def _get_indices(self, names):
        """
        Safe get multiple indices, translate keys for
        datelike to underlying repr.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            # Multiple grouping keys: each requested name must be a tuple of
            # the same length as the index keys.
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if not len(name_sample) == len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError as err:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg) from err

            # Convert each element of each name tuple to the key type used
            # in the indices dict (e.g. datetime -> Timestamp).
            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        # Missing names map to an empty list of positions.
        return [self.indices.get(name, []) for name in names]

    @final
    def _get_index(self, name):
        """
        Safe get index, translate keys for datelike to underlying repr.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy

        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                # Restrict to the columns computed by _set_group_selection
                # (i.e. everything except in-axis grouping keys).
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]

    @final
    def _dir_additions(self) -> set[str]:
        # Expose the grouped object's attributes plus dispatchable methods.
        return self.obj._dir_additions() | self._apply_allowlist

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each groups maximum and minimum value in one
        pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    # ``grouped.plot`` accessor; GroupByPlot dispatches plotting per group.
    plot = property(GroupByPlot)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the DataFrame out of.  If
            it is None, the object groupby was called on will
            be used.

        Returns
        -------
        group : same type as obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        if not len(inds):
            # Unknown group name.
            raise KeyError(name)

        return obj._take_with_is_copy(inds, axis=self.axis)

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        keys = self.keys
        if isinstance(keys, list) and len(keys) == 1:
            # Deprecation: iterating a groupby keyed by a 1-element list will
            # yield length-1 tuples as names in a future pandas version.
            warnings.warn(
                (
                    "In a future version of pandas, a length 1 "
                    "tuple will be returned when iterating over a "
                    "groupby with a grouper equal to a list of "
                    "length 1. Don't supply a list with a single grouper "
                    "to avoid this warning."
                ),
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        return self.grouper.get_iterator(self._selected_obj, axis=self.axis)
# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)
845class GroupBy(BaseGroupBy[NDFrameT]):
846 """
847 Class for grouping and aggregating relational data.
849 See aggregate, transform, and apply functions on this object.
851 It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
853 ::
855 grouped = groupby(obj, ...)
857 Parameters
858 ----------
859 obj : pandas object
860 axis : int, default 0
861 level : int, default None
862 Level of MultiIndex
863 groupings : list of Grouping objects
864 Most users should ignore this
865 exclusions : array-like, optional
866 List of columns to exclude
867 name : str
868 Most users should ignore this
870 Returns
871 -------
872 **Attributes**
873 groups : dict
874 {group name -> group labels}
875 len(grouped) : int
876 Number of groups
878 Notes
879 -----
880 After grouping, see aggregate, apply, and transform functions. Here are
881 some other brief notes about usage. When grouping by multiple groups, the
882 result index will be a MultiIndex (hierarchical) by default.
884 Iteration produces (key, group) tuples, i.e. chunking the data by group. So
885 you can write code like:
887 ::
889 grouped = obj.groupby(keys, axis=axis)
890 for key, group in grouped:
891 # do something with the data
893 Function calls on GroupBy, if not specially implemented, "dispatch" to the
894 grouped data. So if you group a DataFrame and wish to invoke the std()
895 method on each group, you can simply do:
897 ::
899 df.groupby(mapper).std()
901 rather than
903 ::
905 df.groupby(mapper).aggregate(np.std)
907 You can pass arguments to these "wrapped" functions, too.
909 See the online documentation for full exposition on these topics and much
910 more
911 """
913 grouper: ops.BaseGrouper
914 as_index: bool
    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: int = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool | lib.NoDefault = True,
        squeeze: bool = False,
        observed: bool = False,
        mutated: bool = False,
        dropna: bool = True,
    ) -> None:
        # Set up grouping state; ``grouper`` is resolved from ``keys``/``level``
        # via get_grouper when not supplied by the caller.

        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            # as_index=False is only meaningful for frame-like, axis=0 grouping.
            if not isinstance(obj, DataFrame):
                raise TypeError("as_index=False only valid with DataFrame")
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.observed = observed
        self.mutated = mutated
        self.dropna = dropna

        if grouper is None:
            # Local import to avoid a circular dependency at module load time.
            from pandas.core.groupby.grouper import get_grouper

            # get_grouper may also prune ``obj`` and report excluded columns.
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                mutated=self.mutated,
                dropna=self.dropna,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()
    def __getattr__(self, attr: str):
        # Internal names must be fetched via object.__getattribute__ first to
        # avoid recursing back into __getattr__ during construction.
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        # Otherwise treat an attribute matching a column/label of the grouped
        # object as a selection, e.g. ``grouped.B`` -> ``grouped["B"]``.
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )
    @final
    def _make_wrapper(self, name: str) -> Callable:
        """
        Build a callable that dispatches method ``name`` (from the
        allowlist) to each group of the underlying object.
        """
        assert name in self._apply_allowlist

        with self._group_selection_context():
            # need to setup the selection
            # as are not passed directly but in the grouper
            f = getattr(self._obj_with_exclusions, name)
            if not isinstance(f, types.MethodType):
                # ``name`` is a plain attribute/property, not a method: fetch
                # it per group via apply instead of wrapping a call.
                # error: Incompatible return value type
                # (got "NDFrameT", expected "Callable[..., Any]")  [return-value]
                return cast(Callable, self.apply(lambda self: getattr(self, name)))

        # Unbound method from the class so the signature can be inspected.
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            if "axis" in sig.parameters:
                if kwargs.get("axis", None) is None:
                    kwargs["axis"] = self.axis

            numeric_only = kwargs.get("numeric_only", lib.no_default)

            def curried(x):
                with warnings.catch_warnings():
                    # Catch any warnings from dispatch to DataFrame; we'll emit
                    # a warning for groupby below
                    match = "The default value of numeric_only "
                    warnings.filterwarnings("ignore", match, FutureWarning)
                    return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in base.plotting_methods:
                return self.apply(curried)

            is_transform = name in base.transformation_kernels

            # Transform needs to keep the same schema, including when empty
            if is_transform and self._obj_with_exclusions.empty:
                return self._obj_with_exclusions

            result = self._python_apply_general(
                curried,
                self._obj_with_exclusions,
                is_transform=is_transform,
                not_indexed_same=not is_transform,
            )

            if self._selected_obj.ndim != 1 and self.axis != 1 and result.ndim != 1:
                # Columns silently dropped from the result indicate nuisance
                # columns; emit the deprecation warning for that behavior.
                missing = self._obj_with_exclusions.columns.difference(result.columns)
                if len(missing) > 0:
                    warn_dropping_nuisance_columns_deprecated(
                        type(self), name, numeric_only
                    )

            if self.grouper.has_dropped_na and is_transform:
                # result will have dropped rows due to nans, fill with null
                # and ensure index is ordered same as the input
                result = self._set_result_index_ordered(result)
            return result

        wrapper.__name__ = name
        return wrapper
1056 # -----------------------------------------------------------------
1057 # Selection
    @final
    def _set_group_selection(self) -> None:
        """
        Create group based selection.

        Used when selection is not passed directly but instead via a grouper.

        NOTE: this should be paired with a call to _reset_group_selection
        """
        # This is a no-op for SeriesGroupBy
        grp = self.grouper
        # Only applies for as_index frame groupbys with no selection yet.
        if not (
            self.as_index
            and grp.groupings is not None
            and self.obj.ndim > 1
            and self._group_selection is None
        ):
            return

        # Names of in-axis groupers (columns used as keys, not index levels).
        groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]

        if len(groupers):
            # GH12839 clear selected obj cache when group selection changes
            ax = self.obj._info_axis
            self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
            self._reset_cache("_selected_obj")
1086 @final
1087 def _reset_group_selection(self) -> None:
1088 """
1089 Clear group based selection.
1091 Used for methods needing to return info on each group regardless of
1092 whether a group selection was previously set.
1093 """
1094 if self._group_selection is not None:
1095 # GH12839 clear cached selection too when changing group selection
1096 self._group_selection = None
1097 self._reset_cache("_selected_obj")
    @contextmanager
    def _group_selection_context(self) -> Iterator[GroupBy]:
        """
        Set / reset the _group_selection_context.

        Ensures _reset_group_selection runs even if the body raises.
        """
        self._set_group_selection()
        try:
            yield self
        finally:
            self._reset_group_selection()
    def _iterate_slices(self) -> Iterable[Series]:
        # Abstract hook: subclasses yield each 1-dimensional slice (column)
        # of the grouped object as a Series.
        raise AbstractMethodError(self)
1113 # -----------------------------------------------------------------
1114 # Dispatch/Wrapping
    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ):
        """
        Concatenate per-group results back into a single Series/DataFrame,
        optionally keyed by group (group_keys) or realigned to the original
        index (when results are like-indexed).
        """
        from pandas.core.reshape.concat import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in com.not_none(*values):
                ax = v._get_axis(self.axis)
                ax._reset_identity()
            return values

        if self.group_keys and not override_group_keys:

            values = reset_identity(values)
            if self.as_index:

                # possible MI return case
                group_keys = self.grouper.result_index
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            # Like-indexed results: restore the original row order.
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                # Exclude rows whose group key was NA (label -1).
                labels = self.grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        # Propagate the selected name onto a Series result.
        name = self.obj.name if self.obj.ndim == 1 else self._selection
        if isinstance(result, Series) and name is not None:

            result.name = name

        return result
    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        """
        Set the result's axis back to the input object's axis, restoring
        the original row order for transform-like results (xref GH#8046).
        """
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self.grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self.grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result
    def _indexed_output_to_ndframe(
        self, result: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        """Convert an OutputKey->array mapping to a Series/DataFrame; abstract."""
        raise AbstractMethodError(self)
    @final
    def _wrap_aggregated_output(
        self,
        output: Series | DataFrame | Mapping[base.OutputKey, ArrayLike],
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        output : Series, DataFrame, or Mapping[base.OutputKey, ArrayLike]
            Data to wrap.
        qs : np.ndarray[float64] or None, default None
            Quantile levels; when given, an extra index level holding the
            quantiles is inserted into the result index.

        Returns
        -------
        Series or DataFrame
        """

        if isinstance(output, (Series, DataFrame)):
            # We get here (for DataFrameGroupBy) if we used Manager.grouped_reduce,
            # in which case our columns are already set correctly.
            # ATM we do not get here for SeriesGroupBy; when we do, we will
            # need to require that result.name already match self.obj.name
            result = output
        else:
            result = self._indexed_output_to_ndframe(output)

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            self._insert_inaxis_grouper_inplace(result)
            result = result._consolidate()
            index = Index(range(self.grouper.ngroups))

        else:
            index = self.grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                result.index = self.obj.index.copy()
                # TODO: Do this more systematically

        return self._reindex_output(result, qs=qs)
    @final
    def _wrap_transformed_output(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        """
        Wraps the output of GroupBy transformations into the expected result.

        Parameters
        ----------
        output : Mapping[base.OutputKey, ArrayLike]
            Data to wrap.

        Returns
        -------
        Series or DataFrame
            Series for SeriesGroupBy, DataFrame for DataFrameGroupBy.
            The result always carries the original object's index (transforms
            are length-preserving).
        """
        if isinstance(output, (Series, DataFrame)):
            result = output
        else:
            result = self._indexed_output_to_ndframe(output)

        if self.axis == 1:
            # Only relevant for DataFrameGroupBy
            result = result.T
            result.columns = self.obj.columns

        result.index = self.obj.index
        return result
    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        override_group_keys: bool = False,
    ):
        """Wrap the list of per-group apply results; abstract, see subclasses."""
        raise AbstractMethodError(self)
    def _resolve_numeric_only(
        self, how: str, numeric_only: bool | lib.NoDefault, axis: int
    ) -> bool:
        """
        Determine subclass-specific default value for 'numeric_only'.

        For SeriesGroupBy we want the default to be False (to match Series behavior).
        For DataFrameGroupBy we want it to be True (for backwards-compat).

        Parameters
        ----------
        how : str
            Groupby kernel name (used only for warning/error messages).
        numeric_only : bool or lib.no_default
        axis : int
            Axis passed to the groupby op (not self.axis).

        Returns
        -------
        bool

        Raises
        ------
        NotImplementedError
            If ``numeric_only=True`` was requested on a non-numeric Series
            (GH#47500); a FutureWarning is emitted first, since this is
            slated to become a TypeError.
        """
        # GH#41291
        if numeric_only is lib.no_default:
            # i.e. not explicitly passed by user
            if self.obj.ndim == 2:
                # i.e. DataFrameGroupBy
                numeric_only = axis != 1
                # GH#42395 GH#43108 GH#43154
                # Regression from 1.2.5 to 1.3 caused object columns to be dropped
                if self.axis:
                    obj = self._obj_with_exclusions.T
                else:
                    obj = self._obj_with_exclusions
                check = obj._get_numeric_data()
                if len(obj.columns) and not len(check.columns) and not obj.empty:
                    numeric_only = False

            else:
                numeric_only = False

        if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
            # GH#47500: warn about the upcoming TypeError, then raise now
            warnings.warn(
                f"{type(self).__name__}.{how} called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will "
                "raise a TypeError in a future version of pandas",
                category=FutureWarning,
                stacklevel=find_stack_level(),
            )
            raise NotImplementedError(
                f"{type(self).__name__}.{how} does not implement numeric_only"
            )

        return numeric_only
1368 def _maybe_warn_numeric_only_depr(
1369 self, how: str, result: DataFrame | Series, numeric_only: bool | lib.NoDefault
1370 ) -> None:
1371 """Emit warning on numeric_only behavior deprecation when appropriate.
1373 Parameters
1374 ----------
1375 how : str
1376 Groupby kernel name.
1377 result :
1378 Result of the groupby operation.
1379 numeric_only : bool or lib.no_default
1380 Argument as passed by user.
1381 """
1382 if (
1383 self._obj_with_exclusions.ndim != 1
1384 and result.ndim > 1
1385 and len(result.columns) < len(self._obj_with_exclusions.columns)
1386 ):
1387 warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
1389 # -----------------------------------------------------------------
1390 # numba
1392 @final
1393 def _numba_prep(self, data):
1394 ids, _, ngroups = self.grouper.group_info
1395 sorted_index = get_group_index_sorter(ids, ngroups)
1396 sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)
1398 sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
1399 if len(self.grouper.groupings) > 1:
1400 raise NotImplementedError(
1401 "More than 1 grouping labels are not supported with engine='numba'"
1402 )
1403 # GH 46867
1404 index_data = data.index
1405 if isinstance(index_data, MultiIndex):
1406 group_key = self.grouper.groupings[0].name
1407 index_data = index_data.get_level_values(group_key)
1408 sorted_index_data = index_data.take(sorted_index).to_numpy()
1410 starts, ends = lib.generate_slices(sorted_ids, ngroups)
1411 return (
1412 starts,
1413 ends,
1414 sorted_index_data,
1415 sorted_data,
1416 )
    def _numba_agg_general(
        self,
        func: Callable,
        engine_kwargs: dict[str, bool] | None,
        *aggregator_args,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.

        Parameters
        ----------
        func : Callable
            Numba-jittable aggregation kernel.
        engine_kwargs : dict or None
            ``nopython``/``nogil``/``parallel`` flags forwarded to numba.
        *aggregator_args
            Extra positional arguments passed through to the kernel.

        Raises
        ------
        NotImplementedError
            If ``as_index=False`` or ``axis=1`` (unsupported with numba).
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        with self._group_selection_context():
            data = self._selected_obj
            # kernels operate on 2D data; lift a Series to a 1-column frame
            df = data if data.ndim == 2 else data.to_frame()
            starts, ends, sorted_index, sorted_data = self._numba_prep(df)
            aggregator = executor.generate_shared_aggregator(
                func, **get_jit_arguments(engine_kwargs)
            )
            result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)

            index = self.grouper.result_index
            if data.ndim == 1:
                # restore a 1D result (and name) for Series input
                result_kwargs = {"name": data.name}
                result = result.ravel()
            else:
                result_kwargs = {"columns": data.columns}
            return data._constructor(result, index=index, **result_kwargs)
    @final
    def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.

        Returns a 2D ndarray in the original (unsorted) row order.
        """
        starts, ends, sorted_index, sorted_data = self._numba_prep(data)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(data.columns),
            *args,
        )
        # result values needs to be resorted to their original positions since we
        # evaluated the data sorted by group
        return result.take(np.argsort(sorted_index), axis=0)
    @final
    def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.

        Returns one row per group; unlike the transform path, no re-sorting
        back to the original row order is needed.
        """
        starts, ends, sorted_index, sorted_data = self._numba_prep(data)
        numba_.validate_udf(func)
        numba_agg_func = numba_.generate_numba_agg_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_agg_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(data.columns),
            *args,
        )
        return result
1502 # -----------------------------------------------------------------
1503 # apply/agg/transform
    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> NDFrameT:

        func = com.is_builtin_func(func)

        if isinstance(func, str):
            # a string resolves to an attribute of this GroupBy
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    # properties accept no arguments
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res

            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    # suppress numpy floating-point warnings inside the UDF
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            elif hasattr(nanops, "nan" + func):
                # TODO: should we wrap this in to e.g. _is_builtin_func?
                f = getattr(nanops, "nan" + func)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:

            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                with self._group_selection_context():
                    return self._python_apply_general(f, self._selected_obj)

        return result
    @final
    def _python_apply_general(
        self,
        f: Callable,
        data: DataFrame | Series,
        not_indexed_same: bool | None = None,
        is_transform: bool = False,
        is_agg: bool = False,
    ) -> NDFrameT:
        """
        Apply function f in python space

        Parameters
        ----------
        f : callable
            Function to apply
        data : Series or DataFrame
            Data to apply f to
        not_indexed_same: bool, optional
            When specified, overrides the value of not_indexed_same. Apply behaves
            differently when the result index is equal to the input index, but
            this can be coincidental leading to value-dependent behavior.
        is_transform : bool, default False
            Indicator for whether the function is actually a transform
            and should not have group keys prepended. This is used
            in _make_wrapper which generates both transforms (e.g. diff)
            and non-transforms (e.g. corr)
        is_agg : bool, default False
            Indicator for whether the function is an aggregation. When the
            result is empty, we don't want to warn for this case.
            See _GroupBy._python_agg_general.

        Returns
        -------
        Series or DataFrame
            data after applying f
        """
        values, mutated = self.grouper.apply(f, data, self.axis)
        if not_indexed_same is None:
            not_indexed_same = mutated or self.mutated
        override_group_keys = False

        is_empty_agg = is_agg and len(values) == 0
        if (not not_indexed_same and self.group_keys is lib.no_default) and not (
            is_transform or is_empty_agg
        ):
            # We've detected value-dependent behavior: the result's index depends on
            # whether the user's function `f` returned the same index or not.
            msg = (
                "Not prepending group keys to the result index of "
                "transform-like apply. In the future, the group keys "
                "will be included in the index, regardless of whether "
                "the applied function returns a like-indexed object.\n"
                "To preserve the previous behavior, use\n\n\t"
                ">>> .groupby(..., group_keys=False)\n\n"
                "To adopt the future behavior and silence this warning, use "
                "\n\n\t>>> .groupby(..., group_keys=True)"
            )
            warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
            # We want to behave as if `self.group_keys=False` when reconstructing
            # the object. However, we don't want to mutate the stateful GroupBy
            # object, so we just override it.
            # When this deprecation is enforced then override_group_keys
            # may be removed.
            override_group_keys = True

        return self._wrap_applied_output(
            data,
            values,
            not_indexed_same,
            override_group_keys=is_transform or override_group_keys,
        )
    @final
    def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs):
        """
        Aggregate each slice with ``func`` in python space.

        Slices on which ``func`` raises TypeError are dropped with a
        deprecation warning unless ``raise_on_typeerror`` is True.
        """
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # agg_series below assumes ngroups > 0
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name

            try:
                # if this function is invalid for this dtype, we will ignore it.
                result = self.grouper.agg_series(obj, f)
            except TypeError:
                if raise_on_typeerror:
                    raise
                warn_dropping_nuisance_columns_deprecated(
                    type(self), "agg", numeric_only=False
                )
                continue

            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            # every slice failed; fall back to the apply path
            return self._python_apply_general(f, self._selected_obj)

        return self._wrap_aggregated_output(output)
1671 @final
1672 def _agg_general(
1673 self,
1674 numeric_only: bool | lib.NoDefault = True,
1675 min_count: int = -1,
1676 *,
1677 alias: str,
1678 npfunc: Callable,
1679 ):
1681 with self._group_selection_context():
1682 # try a cython aggregation if we can
1683 result = self._cython_agg_general(
1684 how=alias,
1685 alt=npfunc,
1686 numeric_only=numeric_only,
1687 min_count=min_count,
1688 )
1689 return result.__finalize__(self.obj, method="groupby")
    def _agg_py_fallback(
        self, values: ArrayLike, ndim: int, alt: Callable
    ) -> ArrayLike:
        """
        Fallback to pure-python aggregation if _cython_operation raises
        NotImplementedError.

        Parameters
        ----------
        values : ArrayLike
            Values for one block; either an ExtensionArray (1D) or an
            object-dtype ndarray.
        ndim : int
            Dimensionality the result block must have.
        alt : Callable
            Pure-python aggregation applied per group via ``agg_series``.
        """
        # We get here with a) EADtypes and b) object dtype

        if values.ndim == 1:
            # For DataFrameGroupBy we only get here with ExtensionArray
            ser = Series(values)
        else:
            # We only get here with values.dtype == object
            # TODO: special case not needed with ArrayManager
            df = DataFrame(values.T)
            # bc we split object blocks in grouped_reduce, we have only 1 col
            # otherwise we'd have to worry about block-splitting GH#39329
            assert df.shape[1] == 1
            # Avoid call to self.values that can occur in DataFrame
            #  reductions; see GH#28949
            ser = df.iloc[:, 0]

        # We do not get here with UDFs, so we know that our dtype
        #  should always be preserved by the implemented aggregations
        # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
        res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)

        if isinstance(values, Categorical):
            # Because we only get here with known dtype-preserving
            #  reductions, we cast back to Categorical.
            # TODO: if we ever get "rank" working, exclude it here.
            res_values = type(values)._from_sequence(res_values, dtype=values.dtype)

        # If we are DataFrameGroupBy and went through a SeriesGroupByPath
        # then we need to reshape
        # GH#32223 includes case with IntegerArray values, ndarray res_values
        # test_groupby_duplicate_columns with object dtype values
        return ensure_block_shape(res_values, ndim=ndim)
    @final
    def _cython_agg_general(
        self,
        how: str,
        alt: Callable,
        numeric_only: bool | lib.NoDefault,
        min_count: int = -1,
        ignore_failures: bool = True,
        **kwargs,
    ):
        """
        Run the cython aggregation kernel ``how`` over each block, falling
        back to the pure-python ``alt`` where the kernel is unimplemented.

        Parameters
        ----------
        how : str
            Cython kernel name.
        alt : Callable
            Per-group fallback used via ``_agg_py_fallback``.
        numeric_only : bool or lib.no_default
            Resolved via ``_resolve_numeric_only``.
        min_count : int, default -1
            Minimum number of valid values per group.
        ignore_failures : bool, default True
            Whether ``grouped_reduce`` drops blocks that raise.
        """
        # Note: we never get here with how="ohlc" for DataFrameGroupBy;
        #  that goes through SeriesGroupBy
        numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)

        data = self._get_data_to_aggregate()
        is_ser = data.ndim == 1

        # remember pre-filter width to detect dropped (nuisance) columns below
        orig_len = len(data)
        if numeric_only_bool:
            if is_ser and not is_numeric_dtype(self._selected_obj.dtype):
                # GH#41291 match Series behavior
                kwd_name = "numeric_only"
                if how in ["any", "all"]:
                    kwd_name = "bool_only"
                raise NotImplementedError(
                    f"{type(self).__name__}.{how} does not implement {kwd_name}."
                )
            elif not is_ser:
                data = data.get_numeric_data(copy=False)

        def array_func(values: ArrayLike) -> ArrayLike:
            try:
                result = self.grouper._cython_operation(
                    "aggregate",
                    values,
                    how,
                    axis=data.ndim - 1,
                    min_count=min_count,
                    **kwargs,
                )
            except NotImplementedError:
                # generally if we have numeric_only=False
                # and non-applicable functions
                # try to python agg
                # TODO: shouldn't min_count matter?
                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

            return result

        # TypeError -> we may have an exception in trying to aggregate
        # continue and exclude the block
        new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)

        if not is_ser and len(new_mgr) < orig_len:
            # some columns were dropped as nuisance; warn (deprecated behavior)
            warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

        res = self._wrap_agged_manager(new_mgr)
        if is_ser:
            res.index = self.grouper.result_index
            return self._reindex_output(res)
        else:
            return res
    def _cython_transform(
        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
    ):
        """Run the cython transform kernel ``how``; abstract, see subclasses."""
        raise AbstractMethodError(self)
    @final
    def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        """
        Shared implementation of ``transform``.

        Dispatches, in order: to the numba engine when requested; to the
        general python path for non-string ``func``; to a cythonized kernel
        for known transform names; otherwise treats ``func`` as a reduction
        and broadcasts its result back to the original shape.
        """

        if maybe_use_numba(engine):
            # TODO: tests with self._selected_obj.ndim == 1 on DataFrameGroupBy
            with self._group_selection_context():
                data = self._selected_obj
                df = data if data.ndim == 2 else data.to_frame()
                result = self._transform_with_numba(
                    df, func, *args, engine_kwargs=engine_kwargs, **kwargs
                )
                if self.obj.ndim == 2:
                    return cast(DataFrame, self.obj)._constructor(
                        result, index=data.index, columns=data.columns
                    )
                else:
                    return cast(Series, self.obj)._constructor(
                        result.ravel(), index=data.index, name=data.name
                    )

        # optimized transforms
        func = com.get_cython_func(func) or func

        if not isinstance(func, str):
            return self._transform_general(func, *args, **kwargs)

        elif func not in base.transform_kernel_allowlist:
            msg = f"'{func}' is not a valid function name for transform(name)"
            raise ValueError(msg)
        elif func in base.cythonized_kernels or func in base.transformation_kernels:
            # cythonized transform or canned "agg+broadcast"
            return getattr(self, func)(*args, **kwargs)

        else:
            # i.e. func in base.reduction_kernels

            # GH#30918 Use _transform_fast only when we know func is an aggregation
            # If func is a reduction, we need to broadcast the
            # result to the whole group. Compute func result
            # and deal with possible broadcasting below.
            # Temporarily set observed for dealing with categoricals.
            with com.temp_setattr(self, "observed", True):
                result = getattr(self, func)(*args, **kwargs)

            return self._wrap_transform_fast_result(result)
    @final
    def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
        """
        Fast transform path for aggregations.

        Broadcasts the one-row-per-group ``result`` back to the shape of the
        original object by taking each input row's group result.
        """
        obj = self._obj_with_exclusions

        # for each col, reshape to size of original frame by take operation
        ids, _, _ = self.grouper.group_info
        result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)

        if self.obj.ndim == 1:
            # i.e. SeriesGroupBy
            out = algorithms.take_nd(result._values, ids)
            output = obj._constructor(out, index=obj.index, name=obj.name)
        else:
            # `.size()` gives Series output on DataFrame input, need axis 0
            axis = 0 if result.ndim == 1 else self.axis
            # GH#46209
            # Don't convert indices: negative indices need to give rise
            # to null values in the result
            output = result._take(ids, axis=axis, convert_indices=False)
            output = output.set_axis(obj._get_axis(self.axis), axis=axis)
        return output
1870 # -----------------------------------------------------------------
1871 # Utilities
1873 @final
1874 def _apply_filter(self, indices, dropna):
1875 if len(indices) == 0:
1876 indices = np.array([], dtype="int64")
1877 else:
1878 indices = np.sort(np.concatenate(indices))
1879 if dropna:
1880 filtered = self._selected_obj.take(indices, axis=self.axis)
1881 else:
1882 mask = np.empty(len(self._selected_obj.index), dtype=bool)
1883 mask.fill(False)
1884 mask[indices.astype(int)] = True
1885 # mask fails to broadcast when passed to where; broadcast manually.
1886 mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
1887 filtered = self._selected_obj.where(mask) # Fill with NaNs.
1888 return filtered
    @final
    def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
        """
        Number the rows within each group, in original row order.

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Returns
        -------
        np.ndarray
            int64 counts, or float64 with NaN at dropped-NA positions when
            the grouper has dropped NA groups.

        Notes
        -----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        # run marks the first position of each group in the sorted ids;
        # rep is each group's length; out is a running within-group counter
        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        if self.grouper.has_dropped_na:
            # positions with group label -1 get NaN (requires float dtype)
            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
        else:
            out = out.astype(np.int64, copy=False)

        # invert the sort to report counts in original row order
        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev]
1928 # -----------------------------------------------------------------
1930 @final
1931 @property
1932 def _obj_1d_constructor(self) -> Callable:
1933 # GH28330 preserve subclassed Series/DataFrames
1934 if isinstance(self.obj, DataFrame):
1935 return self.obj._constructor_sliced
1936 assert isinstance(self.obj, Series)
1937 return self.obj._constructor
    @final
    def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool):
        """
        Shared func to call any / all Cython GroupBy implementations.

        Parameters
        ----------
        val_test : {"any", "all"}
            Which reduction the cython kernel performs.
        skipna : bool
            Whether NA values are ignored during truth testing.
        """

        def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
            # pre-processing: coerce values to an int8 view of booleans
            if is_object_dtype(vals.dtype):
                # GH#37501: don't raise on pd.NA when skipna=True
                if skipna:
                    func = np.vectorize(
                        lambda x: bool(x) if not isna(x) else True, otypes=[bool]
                    )
                    vals = func(vals)
                else:
                    vals = vals.astype(bool, copy=False)

                vals = cast(np.ndarray, vals)
            elif isinstance(vals, BaseMaskedArray):
                vals = vals._data.astype(bool, copy=False)
            else:
                vals = vals.astype(bool, copy=False)

            return vals.view(np.int8), bool

        def result_to_bool(
            result: np.ndarray,
            inference: type,
            nullable: bool = False,
        ) -> ArrayLike:
            # post-processing: -1 marks missing for nullable results
            if nullable:
                return BooleanArray(result.astype(bool, copy=False), result == -1)
            else:
                return result.astype(inference, copy=False)

        return self._get_cythonized_result(
            libgroupby.group_any_all,
            numeric_only=False,
            cython_dtype=np.dtype(np.int8),
            needs_mask=True,
            needs_nullable=True,
            pre_processing=objs_to_bool,
            post_processing=result_to_bool,
            val_test=val_test,
            skipna=skipna,
        )
1986 @final
1987 @Substitution(name="groupby")
1988 @Appender(_common_see_also)
1989 def any(self, skipna: bool = True):
1990 """
1991 Return True if any value in the group is truthful, else False.
1993 Parameters
1994 ----------
1995 skipna : bool, default True
1996 Flag to ignore nan values during truth testing.
1998 Returns
1999 -------
2000 Series or DataFrame
2001 DataFrame or Series of boolean values, where a value is True if any element
2002 is True within its respective group, False otherwise.
2003 """
2004 return self._bool_agg("any", skipna)
2006 @final
2007 @Substitution(name="groupby")
2008 @Appender(_common_see_also)
2009 def all(self, skipna: bool = True):
2010 """
2011 Return True if all values in the group are truthful, else False.
2013 Parameters
2014 ----------
2015 skipna : bool, default True
2016 Flag to ignore nan values during truth testing.
2018 Returns
2019 -------
2020 Series or DataFrame
2021 DataFrame or Series of boolean values, where a value is True if all elements
2022 are True within its respective group, False otherwise.
2023 """
2024 return self._bool_agg("all", skipna)
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self) -> NDFrameT:
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """
        data = self._get_data_to_aggregate()
        ids, _, ngroups = self.grouper.group_info
        # mask out rows belonging to dropped (NA) groups
        mask = ids != -1

        is_series = data.ndim == 1

        def hfunc(bvalues: ArrayLike) -> ArrayLike:
            # TODO(EA2D): reshape would not be necessary with 2D EAs
            if bvalues.ndim == 1:
                # EA
                masked = mask & ~isna(bvalues).reshape(1, -1)
            else:
                masked = mask & ~isna(bvalues)

            counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1)
            if is_series:
                assert counted.ndim == 2
                assert counted.shape[0] == 1
                return counted[0]
            return counted

        new_mgr = data.grouped_reduce(hfunc)

        # If we are grouping on categoricals we want unobserved categories to
        # return zero, rather than the default of NaN which the reindexing in
        # _wrap_agged_manager() returns. GH 35028
        with com.temp_setattr(self, "observed", True):
            result = self._wrap_agged_manager(new_mgr)

        if result.ndim == 1:
            result.index = self.grouper.result_index

        return self._reindex_output(result, fill_value=0)
    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def mean(
        self,
        numeric_only: bool | lib.NoDefault = lib.no_default,
        engine: str = "cython",
        engine_kwargs: dict[str, bool] | None = None,
    ):
        """
        Compute mean of groups, excluding missing values.

        Parameters
        ----------
        numeric_only : bool, default True
            Include only float, int, boolean columns. If None, will attempt to use
            everything, then use only numeric data.

        engine : str, default "cython"
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``

            .. versionadded:: 1.4.0

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
                 C
        A B
        1 2.0  2.0
          4.0  1.0
        2 3.0  1.0
          5.0  2.0

        Groupby one column and return the mean of only particular column in
        the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """
        numeric_only_bool = self._resolve_numeric_only("mean", numeric_only, axis=0)

        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_mean

            return self._numba_agg_general(sliding_mean, engine_kwargs)
        else:
            result = self._cython_agg_general(
                "mean",
                alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
                numeric_only=numeric_only,
            )
            return result.__finalize__(self.obj, method="groupby")
2159 @final
2160 @Substitution(name="groupby")
2161 @Appender(_common_see_also)
2162 def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
2163 """
2164 Compute median of groups, excluding missing values.
2166 For multiple groupings, the result index will be a MultiIndex
2168 Parameters
2169 ----------
2170 numeric_only : bool, default True
2171 Include only float, int, boolean columns. If None, will attempt to use
2172 everything, then use only numeric data.
2174 Returns
2175 -------
2176 Series or DataFrame
2177 Median of values within each group.
2178 """
2179 numeric_only_bool = self._resolve_numeric_only("median", numeric_only, axis=0)
2181 result = self._cython_agg_general(
2182 "median",
2183 alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
2184 numeric_only=numeric_only,
2185 )
2186 return result.__finalize__(self.obj, method="groupby")
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def std(
        self,
        ddof: int = 1,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ):
        """
        Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        engine : str, default None
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or globally setting
              ``compute.use_numba``

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``

            .. versionadded:: 1.4.0

        numeric_only : bool, default True
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series or DataFrame
            Standard deviation of values within each group.
        """
        if maybe_use_numba(engine):
            # Numba path: compute the per-group variance with the JIT kernel
            # and take the square root of the whole result.
            from pandas.core._numba.kernels import sliding_var

            return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
        else:
            # Resolve numeric_only so that var doesn't warn
            numeric_only_bool = self._resolve_numeric_only("std", numeric_only, axis=0)
            # A Series groupby over non-numeric data has nothing to compute
            # std on; raise instead of silently dropping the only column.
            if (
                numeric_only_bool
                and self.obj.ndim == 1
                and not is_numeric_dtype(self.obj.dtype)
            ):
                raise TypeError(
                    f"{type(self).__name__}.std called with "
                    f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
                )
            # Cython path: run group_var and post-process with sqrt so the
            # kernel only has to implement variance.
            result = self._get_cythonized_result(
                libgroupby.group_var,
                cython_dtype=np.dtype(np.float64),
                numeric_only=numeric_only_bool,
                needs_counts=True,
                post_processing=lambda vals, inference: np.sqrt(vals),
                ddof=ddof,
            )
            # Warn about the numeric_only deprecation only after the result is
            # known (the warning depends on whether columns were dropped).
            self._maybe_warn_numeric_only_depr("std", result, numeric_only)
            return result
2262 @final
2263 @Substitution(name="groupby")
2264 @Appender(_common_see_also)
2265 def var(
2266 self,
2267 ddof: int = 1,
2268 engine: str | None = None,
2269 engine_kwargs: dict[str, bool] | None = None,
2270 numeric_only: bool | lib.NoDefault = lib.no_default,
2271 ):
2272 """
2273 Compute variance of groups, excluding missing values.
2275 For multiple groupings, the result index will be a MultiIndex.
2277 Parameters
2278 ----------
2279 ddof : int, default 1
2280 Degrees of freedom.
2282 engine : str, default None
2283 * ``'cython'`` : Runs the operation through C-extensions from cython.
2284 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
2285 * ``None`` : Defaults to ``'cython'`` or globally setting
2286 ``compute.use_numba``
2288 .. versionadded:: 1.4.0
2290 engine_kwargs : dict, default None
2291 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2292 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2293 and ``parallel`` dictionary keys. The values must either be ``True`` or
2294 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
2295 ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
2297 .. versionadded:: 1.4.0
2299 numeric_only : bool, default True
2300 Include only `float`, `int` or `boolean` data.
2302 .. versionadded:: 1.5.0
2304 Returns
2305 -------
2306 Series or DataFrame
2307 Variance of values within each group.
2308 """
2309 if maybe_use_numba(engine):
2310 from pandas.core._numba.kernels import sliding_var
2312 return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
2313 else:
2314 return self._cython_agg_general(
2315 "var",
2316 alt=lambda x: Series(x).var(ddof=ddof),
2317 numeric_only=numeric_only,
2318 ignore_failures=numeric_only is lib.no_default,
2319 ddof=ddof,
2320 )
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
        """
        Compute standard error of the mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        numeric_only : bool, default True
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series or DataFrame
            Standard error of the mean of values within each group.
        """
        # Resolve numeric_only so that std doesn't warn
        numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0)
        # A Series groupby over non-numeric data cannot compute sem; raise
        # rather than silently dropping the only column.
        if (
            numeric_only_bool
            and self.obj.ndim == 1
            and not is_numeric_dtype(self.obj.dtype)
        ):
            raise TypeError(
                f"{type(self).__name__}.sem called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
            )
        result = self.std(ddof=ddof, numeric_only=numeric_only_bool)
        self._maybe_warn_numeric_only_depr("sem", result, numeric_only)

        if result.ndim == 1:
            # Series result: sem = std / sqrt(count), per group.
            result /= np.sqrt(self.count())
        else:
            # DataFrame result: divide only the aggregated (non-grouping)
            # columns by the square root of their per-group counts.
            cols = result.columns.difference(self.exclusions).unique()
            counts = self.count()
            result_ilocs = result.columns.get_indexer_for(cols)
            count_ilocs = counts.columns.get_indexer_for(cols)
            with warnings.catch_warnings():
                # TODO(2.0): once iloc[:, foo] = bar deprecation is enforced,
                # this catching will be unnecessary
                warnings.filterwarnings(
                    "ignore", ".*will attempt to set the values inplace.*"
                )
                result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs])
        return result
2376 @final
2377 @Substitution(name="groupby")
2378 @Appender(_common_see_also)
2379 def size(self) -> DataFrame | Series:
2380 """
2381 Compute group sizes.
2383 Returns
2384 -------
2385 DataFrame or Series
2386 Number of rows in each group as a Series if as_index is True
2387 or a DataFrame if as_index is False.
2388 """
2389 result = self.grouper.size()
2391 # GH28330 preserve subclassed Series/DataFrames through calls
2392 if isinstance(self.obj, Series):
2393 result = self._obj_1d_constructor(result, name=self.obj.name)
2394 else:
2395 result = self._obj_1d_constructor(result)
2397 if not self.as_index:
2398 # error: Incompatible types in assignment (expression has
2399 # type "DataFrame", variable has type "Series")
2400 result = result.rename("size").reset_index() # type: ignore[assignment]
2402 return self._reindex_output(result, fill_value=0)
2404 @final
2405 @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
2406 def sum(
2407 self,
2408 numeric_only: bool | lib.NoDefault = lib.no_default,
2409 min_count: int = 0,
2410 engine: str | None = None,
2411 engine_kwargs: dict[str, bool] | None = None,
2412 ):
2413 if maybe_use_numba(engine):
2414 from pandas.core._numba.kernels import sliding_sum
2416 return self._numba_agg_general(
2417 sliding_sum,
2418 engine_kwargs,
2419 )
2420 else:
2421 # If we are grouping on categoricals we want unobserved categories to
2422 # return zero, rather than the default of NaN which the reindexing in
2423 # _agg_general() returns. GH #31422
2424 with com.temp_setattr(self, "observed", True):
2425 result = self._agg_general(
2426 numeric_only=numeric_only,
2427 min_count=min_count,
2428 alias="sum",
2429 npfunc=np.sum,
2430 )
2432 return self._reindex_output(result, fill_value=0)
2434 @final
2435 @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
2436 def prod(
2437 self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
2438 ):
2439 return self._agg_general(
2440 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
2441 )
2443 @final
2444 @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
2445 def min(
2446 self,
2447 numeric_only: bool = False,
2448 min_count: int = -1,
2449 engine: str | None = None,
2450 engine_kwargs: dict[str, bool] | None = None,
2451 ):
2452 if maybe_use_numba(engine):
2453 from pandas.core._numba.kernels import sliding_min_max
2455 return self._numba_agg_general(sliding_min_max, engine_kwargs, False)
2456 else:
2457 return self._agg_general(
2458 numeric_only=numeric_only,
2459 min_count=min_count,
2460 alias="min",
2461 npfunc=np.min,
2462 )
2464 @final
2465 @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
2466 def max(
2467 self,
2468 numeric_only: bool = False,
2469 min_count: int = -1,
2470 engine: str | None = None,
2471 engine_kwargs: dict[str, bool] | None = None,
2472 ):
2473 if maybe_use_numba(engine):
2474 from pandas.core._numba.kernels import sliding_min_max
2476 return self._numba_agg_general(sliding_min_max, engine_kwargs, True)
2477 else:
2478 return self._agg_general(
2479 numeric_only=numeric_only,
2480 min_count=min_count,
2481 alias="max",
2482 npfunc=np.max,
2483 )
2485 @final
2486 @Substitution(name="groupby")
2487 def first(self, numeric_only: bool = False, min_count: int = -1):
2488 """
2489 Compute the first non-null entry of each column.
2491 Parameters
2492 ----------
2493 numeric_only : bool, default False
2494 Include only float, int, boolean columns.
2495 min_count : int, default -1
2496 The required number of valid values to perform the operation. If fewer
2497 than ``min_count`` non-NA values are present the result will be NA.
2499 Returns
2500 -------
2501 Series or DataFrame
2502 First non-null of values within each group.
2504 See Also
2505 --------
2506 DataFrame.groupby : Apply a function groupby to each row or column of a
2507 DataFrame.
2508 DataFrame.core.groupby.GroupBy.last : Compute the last non-null entry of each
2509 column.
2510 DataFrame.core.groupby.GroupBy.nth : Take the nth row from each group.
2512 Examples
2513 --------
2514 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
2515 ... D=['3/11/2000', '3/12/2000', '3/13/2000']))
2516 >>> df['D'] = pd.to_datetime(df['D'])
2517 >>> df.groupby("A").first()
2518 B C D
2519 A
2520 1 5.0 1 2000-03-11
2521 3 6.0 3 2000-03-13
2522 >>> df.groupby("A").first(min_count=2)
2523 B C D
2524 A
2525 1 NaN 1.0 2000-03-11
2526 3 NaN NaN NaT
2527 >>> df.groupby("A").first(numeric_only=True)
2528 B C
2529 A
2530 1 5.0 1
2531 3 6.0 3
2532 """
2534 def first_compat(obj: NDFrameT, axis: int = 0):
2535 def first(x: Series):
2536 """Helper function for first item that isn't NA."""
2537 arr = x.array[notna(x.array)]
2538 if not len(arr):
2539 return np.nan
2540 return arr[0]
2542 if isinstance(obj, DataFrame):
2543 return obj.apply(first, axis=axis)
2544 elif isinstance(obj, Series):
2545 return first(obj)
2546 else: # pragma: no cover
2547 raise TypeError(type(obj))
2549 return self._agg_general(
2550 numeric_only=numeric_only,
2551 min_count=min_count,
2552 alias="first",
2553 npfunc=first_compat,
2554 )
2556 @final
2557 @Substitution(name="groupby")
2558 def last(self, numeric_only: bool = False, min_count: int = -1):
2559 """
2560 Compute the last non-null entry of each column.
2562 Parameters
2563 ----------
2564 numeric_only : bool, default False
2565 Include only float, int, boolean columns. If None, will attempt to use
2566 everything, then use only numeric data.
2567 min_count : int, default -1
2568 The required number of valid values to perform the operation. If fewer
2569 than ``min_count`` non-NA values are present the result will be NA.
2571 Returns
2572 -------
2573 Series or DataFrame
2574 Last non-null of values within each group.
2576 See Also
2577 --------
2578 DataFrame.groupby : Apply a function groupby to each row or column of a
2579 DataFrame.
2580 DataFrame.core.groupby.GroupBy.first : Compute the first non-null entry of each
2581 column.
2582 DataFrame.core.groupby.GroupBy.nth : Take the nth row from each group.
2584 Examples
2585 --------
2586 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
2587 >>> df.groupby("A").last()
2588 B C
2589 A
2590 1 5.0 2
2591 3 6.0 3
2592 """
2594 def last_compat(obj: NDFrameT, axis: int = 0):
2595 def last(x: Series):
2596 """Helper function for last item that isn't NA."""
2597 arr = x.array[notna(x.array)]
2598 if not len(arr):
2599 return np.nan
2600 return arr[-1]
2602 if isinstance(obj, DataFrame):
2603 return obj.apply(last, axis=axis)
2604 elif isinstance(obj, Series):
2605 return last(obj)
2606 else: # pragma: no cover
2607 raise TypeError(type(obj))
2609 return self._agg_general(
2610 numeric_only=numeric_only,
2611 min_count=min_count,
2612 alias="last",
2613 npfunc=last_compat,
2614 )
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def ohlc(self) -> DataFrame:
        """
        Compute open, high, low and close values of a group, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex

        Returns
        -------
        DataFrame
            Open, high, low and close values within each group.
        """
        if self.obj.ndim == 1:
            # self._iterate_slices() yields only self._selected_obj
            obj = self._selected_obj

            # ohlc only makes sense on numeric data; refuse anything else.
            is_numeric = is_numeric_dtype(obj.dtype)
            if not is_numeric:
                raise DataError("No numeric types to aggregate")

            # Single cython call produces a (ngroups, 4) array of
            # open/high/low/close per group.
            res_values = self.grouper._cython_operation(
                "aggregate", obj._values, "ohlc", axis=0, min_count=-1
            )

            agg_names = ["open", "high", "low", "close"]
            result = self.obj._constructor_expanddim(
                res_values, index=self.grouper.result_index, columns=agg_names
            )
            return self._reindex_output(result)

        # DataFrame input: apply the Series path column by column; the result
        # gets a column MultiIndex of (column, open/high/low/close).
        return self._apply_to_column_groupbys(
            lambda x: x.ohlc(), self._obj_with_exclusions
        )
    @doc(DataFrame.describe)
    def describe(self, **kwargs):
        # Delegates to Series/DataFrame.describe per group via apply.
        with self._group_selection_context():
            if len(self._selected_obj) == 0:
                # Empty input: run describe() on the empty object so the
                # result carries the right statistic labels/columns, then
                # slice to zero rows.
                described = self._selected_obj.describe(**kwargs)
                if self._selected_obj.ndim == 1:
                    result = described
                else:
                    result = described.unstack()
                return result.to_frame().T.iloc[:0]

            result = self._python_apply_general(
                lambda x: x.describe(**kwargs),
                self._selected_obj,
                not_indexed_same=True,
            )
            # axis=1 groupbys come back transposed relative to the expected
            # layout; otherwise unstack the per-group describe blocks.
            if self.axis == 1:
                return result.T
            return result.unstack()
    @final
    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper.

        Given a grouper, the function resamples it according to a string
        "string" -> "frequency".

        See the :ref:`frequency aliases <timeseries.offset_aliases>`
        documentation for more details.

        Parameters
        ----------
        rule : str or DateOffset
            The offset string or object representing target grouper conversion.
        *args, **kwargs
            Possible arguments are `how`, `fill_method`, `limit`, `kind` and
            `on`, and other arguments of `TimeGrouper`.

        Returns
        -------
        Grouper
            Return a new grouper with our resampler appended.

        See Also
        --------
        Grouper : Specify a frequency to resample with when
            grouping by a key.
        DatetimeIndex.resample : Frequency conversion and resampling of
            time series.

        Examples
        --------
        >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
        >>> df = pd.DataFrame(data=4 * [range(2)],
        ...                   index=idx,
        ...                   columns=['a', 'b'])
        >>> df.iloc[2, 0] = 5
        >>> df
                            a  b
        2000-01-01 00:00:00  0  1
        2000-01-01 00:01:00  0  1
        2000-01-01 00:02:00  5  1
        2000-01-01 00:03:00  0  1

        Downsample the DataFrame into 3 minute bins and sum the values of
        the timestamps falling into a bin.

        >>> df.groupby('a').resample('3T').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  2
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:00:00  5  1

        Upsample the series into 30 second bins.

        >>> df.groupby('a').resample('30S').sum()
                            a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:00:30  0  0
            2000-01-01 00:01:00  0  1
            2000-01-01 00:01:30  0  0
            2000-01-01 00:02:00  0  0
            2000-01-01 00:02:30  0  0
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:02:00  5  1

        Resample by month. Values are assigned to the month of the period.

        >>> df.groupby('a').resample('M').sum()
                    a  b
        a
        0   2000-01-31  0  3
        5   2000-01-31  5  1

        Downsample the series into 3 minute bins as above, but close the right
        side of the bin interval.

        >>> df.groupby('a').resample('3T', closed='right').sum()
                                 a  b
        a
        0   1999-12-31 23:57:00  0  1
            2000-01-01 00:00:00  0  2
        5   2000-01-01 00:00:00  5  1

        Downsample the series into 3 minute bins and close the right side of
        the bin interval, but label each bin using the right edge instead of
        the left.

        >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:03:00  0  2
        5   2000-01-01 00:03:00  5  1
        """
        # Imported here rather than at module top, presumably to avoid an
        # import cycle between groupby and resample — confirm before moving.
        from pandas.core.resample import get_resampler_for_grouping

        return get_resampler_for_grouping(self, rule, *args, **kwargs)
2774 @final
2775 @Substitution(name="groupby")
2776 @Appender(_common_see_also)
2777 def rolling(self, *args, **kwargs) -> RollingGroupby:
2778 """
2779 Return a rolling grouper, providing rolling functionality per group.
2780 """
2781 from pandas.core.window import RollingGroupby
2783 return RollingGroupby(
2784 self._selected_obj,
2785 *args,
2786 _grouper=self.grouper,
2787 _as_index=self.as_index,
2788 **kwargs,
2789 )
2791 @final
2792 @Substitution(name="groupby")
2793 @Appender(_common_see_also)
2794 def expanding(self, *args, **kwargs) -> ExpandingGroupby:
2795 """
2796 Return an expanding grouper, providing expanding
2797 functionality per group.
2798 """
2799 from pandas.core.window import ExpandingGroupby
2801 return ExpandingGroupby(
2802 self._selected_obj,
2803 *args,
2804 _grouper=self.grouper,
2805 **kwargs,
2806 )
2808 @final
2809 @Substitution(name="groupby")
2810 @Appender(_common_see_also)
2811 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
2812 """
2813 Return an ewm grouper, providing ewm functionality per group.
2814 """
2815 from pandas.core.window import ExponentialMovingWindowGroupby
2817 return ExponentialMovingWindowGroupby(
2818 self._selected_obj,
2819 *args,
2820 _grouper=self.grouper,
2821 **kwargs,
2822 )
    @final
    def _fill(self, direction: Literal["ffill", "bfill"], limit=None):
        """
        Shared function for `pad` and `backfill` to call Cython method.

        Parameters
        ----------
        direction : {'ffill', 'bfill'}
            Direction passed to underlying Cython function. `bfill` will cause
            values to be filled backwards. `ffill` and any other values will
            default to a forward fill
        limit : int, default None
            Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython

        Returns
        -------
        `Series` or `DataFrame` with filled values

        See Also
        --------
        pad : Returns Series with minimum number of char in object.
        backfill : Backward fill the missing values in the dataset.
        """
        # Need int value for Cython
        if limit is None:
            limit = -1

        ids, _, _ = self.grouper.group_info
        # Stable sort by group label; reversed for bfill so the cython
        # indexer walks each group back-to-front.
        sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
        if direction == "bfill":
            sorted_labels = sorted_labels[::-1]

        # Per-column call into the cython fill-indexer; only ``out``/``mask``
        # vary between calls.
        col_func = partial(
            libgroupby.group_fillna_indexer,
            labels=ids,
            sorted_labels=sorted_labels,
            direction=direction,
            limit=limit,
            dropna=self.dropna,
        )

        def blk_func(values: ArrayLike) -> ArrayLike:
            # Applied per block by the manager; returns the filled values.
            mask = isna(values)
            if values.ndim == 1:
                indexer = np.empty(values.shape, dtype=np.intp)
                col_func(out=indexer, mask=mask)
                return algorithms.take_nd(values, indexer)

            else:
                # We broadcast algorithms.take_nd analogous to
                # np.take_along_axis

                # Note: we only get here with backfill/pad,
                # so if we have a dtype that cannot hold NAs,
                # then there will be no -1s in indexer, so we can use
                # the original dtype (no need to ensure_dtype_can_hold_na)
                if isinstance(values, np.ndarray):
                    dtype = values.dtype
                    if self.grouper.has_dropped_na:
                        # dropped null groups give rise to nan in the result
                        dtype = ensure_dtype_can_hold_na(values.dtype)
                    out = np.empty(values.shape, dtype=dtype)
                else:
                    out = type(values)._empty(values.shape, dtype=values.dtype)

                for i in range(len(values)):
                    # call group_fillna_indexer column-wise
                    indexer = np.empty(values.shape[1], dtype=np.intp)
                    col_func(out=indexer, mask=mask[i])
                    out[i, :] = algorithms.take_nd(values[i], indexer)
                return out

        obj = self._obj_with_exclusions
        # axis=1: fill along columns by transposing first.
        if self.axis == 1:
            obj = obj.T
        mgr = obj._mgr
        res_mgr = mgr.apply(blk_func)

        new_obj = obj._constructor(res_mgr)
        if isinstance(new_obj, Series):
            new_obj.name = obj.name

        return self._wrap_transformed_output(new_obj)
2909 @final
2910 @Substitution(name="groupby")
2911 def ffill(self, limit=None):
2912 """
2913 Forward fill the values.
2915 Parameters
2916 ----------
2917 limit : int, optional
2918 Limit of how many values to fill.
2920 Returns
2921 -------
2922 Series or DataFrame
2923 Object with missing values filled.
2925 See Also
2926 --------
2927 Series.ffill: Returns Series with minimum number of char in object.
2928 DataFrame.ffill: Object with missing values filled or None if inplace=True.
2929 Series.fillna: Fill NaN values of a Series.
2930 DataFrame.fillna: Fill NaN values of a DataFrame.
2931 """
2932 return self._fill("ffill", limit=limit)
2934 def pad(self, limit=None):
2935 """
2936 Forward fill the values.
2938 .. deprecated:: 1.4
2939 Use ffill instead.
2941 Parameters
2942 ----------
2943 limit : int, optional
2944 Limit of how many values to fill.
2946 Returns
2947 -------
2948 Series or DataFrame
2949 Object with missing values filled.
2950 """
2951 warnings.warn(
2952 "pad is deprecated and will be removed in a future version. "
2953 "Use ffill instead.",
2954 FutureWarning,
2955 stacklevel=find_stack_level(),
2956 )
2957 return self.ffill(limit=limit)
2959 @final
2960 @Substitution(name="groupby")
2961 def bfill(self, limit=None):
2962 """
2963 Backward fill the values.
2965 Parameters
2966 ----------
2967 limit : int, optional
2968 Limit of how many values to fill.
2970 Returns
2971 -------
2972 Series or DataFrame
2973 Object with missing values filled.
2975 See Also
2976 --------
2977 Series.bfill : Backward fill the missing values in the dataset.
2978 DataFrame.bfill: Backward fill the missing values in the dataset.
2979 Series.fillna: Fill NaN values of a Series.
2980 DataFrame.fillna: Fill NaN values of a DataFrame.
2981 """
2982 return self._fill("bfill", limit=limit)
2984 def backfill(self, limit=None):
2985 """
2986 Backward fill the values.
2988 .. deprecated:: 1.4
2989 Use bfill instead.
2991 Parameters
2992 ----------
2993 limit : int, optional
2994 Limit of how many values to fill.
2996 Returns
2997 -------
2998 Series or DataFrame
2999 Object with missing values filled.
3000 """
3001 warnings.warn(
3002 "backfill is deprecated and will be removed in a future version. "
3003 "Use bfill instead.",
3004 FutureWarning,
3005 stacklevel=find_stack_level(),
3006 )
3007 return self.bfill(limit=limit)
    # https://github.com/python/mypy/issues/1362
    # Mypy does not support decorated properties
    @final  # type: ignore[misc]
    @property
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def nth(self) -> GroupByNthSelector:
        """
        Take the nth row from each group if n is an int, otherwise a subset of rows.

        Can be either a call or an index. dropna is not available with index notation.
        Index notation accepts a comma separated list of integers and slices.

        If dropna, will take the nth non-null row, dropna is either
        'all' or 'any'; this is equivalent to calling dropna(how=dropna)
        before the groupby.

        Parameters
        ----------
        n : int, slice or list of ints and slices
            A single nth value for the row or a list of nth values or slices.

            .. versionchanged:: 1.4.0
                Added slice and lists containing slices.
                Added index notation.

        dropna : {'any', 'all', None}, default None
            Apply the specified dropna operation before counting which row is
            the nth row. Only supported if n is an int.

        Returns
        -------
        Series or DataFrame
            N-th value within each group.
        %(see_also)s
        Examples
        --------

        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
             B
        A
        1  NaN
        2  3.0
        >>> g.nth(1)
             B
        A
        1  2.0
        2  5.0
        >>> g.nth(-1)
             B
        A
        1  4.0
        2  5.0
        >>> g.nth([0, 1])
             B
        A
        1  NaN
        1  2.0
        2  3.0
        2  5.0
        >>> g.nth(slice(None, -1))
             B
        A
        1  NaN
        1  2.0
        2  3.0

        Index notation may also be used

        >>> g.nth[0, 1]
             B
        A
        1  NaN
        1  2.0
        2  3.0
        2  5.0
        >>> g.nth[:-1]
             B
        A
        1  NaN
        1  2.0
        2  3.0

        Specifying `dropna` allows count ignoring ``NaN``

        >>> g.nth(0, dropna='any')
             B
        A
        1  2.0
        2  3.0

        NaNs denote group exhausted when using dropna

        >>> g.nth(3, dropna='any')
            B
        A
        1 NaN
        2 NaN

        Specifying `as_index=False` in `groupby` keeps the original index.

        >>> df.groupby('A', as_index=False).nth(1)
           A    B
        1  1  2.0
        4  2  5.0
        """
        # Selector object supporting both call (g.nth(0)) and index (g.nth[0])
        # notation, as described in the docstring above; the actual work is
        # done by _nth.
        return GroupByNthSelector(self)
    def _nth(
        self,
        n: PositionalIndexer | tuple,
        dropna: Literal["any", "all", None] = None,
    ) -> NDFrameT:
        """
        Implementation backing the public ``nth`` selector: positional row
        selection per group, optionally after dropping NA rows.
        """
        if not dropna:
            # Fast path: build a positional mask and select rows directly.
            with self._group_selection_context():
                mask = self._make_mask_from_positional_indexer(n)

                ids, _, _ = self.grouper.group_info

                # Drop NA values in grouping
                mask = mask & (ids != -1)

                out = self._mask_selected_obj(mask)
                if not self.as_index:
                    return out

                # result index is set to the group labels of the kept rows
                result_index = self.grouper.result_index
                if self.axis == 0:
                    out.index = result_index[ids[mask]]
                    if not self.observed and isinstance(result_index, CategoricalIndex):
                        # keep unobserved categories in the result
                        out = out.reindex(result_index)

                    out = self._reindex_output(out)
                else:
                    out.columns = result_index[ids[mask]]

                return out.sort_index(axis=self.axis) if self.sort else out

        # dropna is truthy
        if not is_integer(n):
            raise ValueError("dropna option only supported for an integer argument")

        if dropna not in ["any", "all"]:
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError(
                "For a DataFrame or Series groupby.nth, dropna must be "
                "either None, 'any' or 'all', "
                f"(was passed {dropna})."
            )

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        n = cast(int, n)
        # number of valid rows a group needs for its nth entry to exist
        max_len = n if n >= 0 else -1 - n
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:

            # we don't have the grouper info available
            # (e.g. we have selected out
            # a column that is not in the current object)
            axis = self.grouper.axis
            grouper = axis[axis.isin(dropped.index)]

        else:

            # create a grouper with the original parameters, but on dropped
            # object
            from pandas.core.groupby.grouper import get_grouper

            grouper, _, _ = get_grouper(
                dropped,
                key=self.keys,
                axis=self.axis,
                level=self.level,
                sort=self.sort,
                mutated=self.mutated,
            )

        grb = dropped.groupby(
            grouper, as_index=self.as_index, sort=self.sort, axis=self.axis
        )
        sizes, result = grb.size(), grb.nth(n)
        # groups too small to have an nth valid row are masked to NaN below
        mask = (sizes < max_len)._values

        # set the results which don't meet the criteria
        if len(result) and mask.any():
            result.loc[mask] = np.nan

        # reset/reindex to the original groups
        if len(self.obj) == len(dropped) or len(result) == len(
            self.grouper.result_index
        ):
            result.index = self.grouper.result_index
        else:
            result = result.reindex(self.grouper.result_index)

        return result
    @final
    def quantile(
        self,
        q=0.5,
        interpolation: str = "linear",
        numeric_only: bool | lib.NoDefault = lib.no_default,
    ):
        """
        Return group values at the given quantile, a la numpy.percentile.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 providing the quantile(s) to compute.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            Method to use when the desired quantile falls between two points.
        numeric_only : bool, default True
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series or DataFrame
            Return type determined by caller of GroupBy object.

        See Also
        --------
        Series.quantile : Similar method for Series.
        DataFrame.quantile : Similar method for DataFrame.
        numpy.percentile : NumPy method to compute qth percentile.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     ['a', 1], ['a', 2], ['a', 3],
        ...     ['b', 1], ['b', 3], ['b', 5]
        ... ], columns=['key', 'val'])
        >>> df.groupby('key').quantile()
            val
        key
        a    2.0
        b    3.0
        """
        numeric_only_bool = self._resolve_numeric_only("quantile", numeric_only, axis=0)
        if (
            numeric_only_bool
            and self.obj.ndim == 1
            and not is_numeric_dtype(self.obj.dtype)
        ):
            # SeriesGroupBy over a non-numeric dtype with numeric_only in
            # effect has nothing left to compute on.
            raise TypeError(
                f"{type(self).__name__}.quantile called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
            )

        def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
            # Convert one column to an ndarray the cython kernel accepts and
            # remember (via `inference`) the dtype to restore afterwards.
            if is_object_dtype(vals):
                raise TypeError(
                    "'quantile' cannot be performed against 'object' dtypes!"
                )

            inference: np.dtype | None = None
            if is_integer_dtype(vals.dtype):
                if isinstance(vals, ExtensionArray):
                    out = vals.to_numpy(dtype=float, na_value=np.nan)
                else:
                    out = vals
                inference = np.dtype(np.int64)
            elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
                out = vals.to_numpy(dtype=float, na_value=np.nan)
            elif is_datetime64_dtype(vals.dtype):
                # Compute on the float view of the i8 values; restore dtype later.
                inference = np.dtype("datetime64[ns]")
                out = np.asarray(vals).astype(float)
            elif is_timedelta64_dtype(vals.dtype):
                inference = np.dtype("timedelta64[ns]")
                out = np.asarray(vals).astype(float)
            elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
                inference = np.dtype(np.float64)
                out = vals.to_numpy(dtype=float, na_value=np.nan)
            else:
                out = np.asarray(vals)

            return out, inference

        def post_processor(vals: np.ndarray, inference: np.dtype | None) -> np.ndarray:
            # Cast the float result back to the remembered dtype — except for
            # integer input with an interpolation that can yield fractional
            # values (linear/midpoint), which must stay float.
            if inference:
                # Check for edge case
                if not (
                    is_integer_dtype(inference)
                    and interpolation in {"linear", "midpoint"}
                ):
                    vals = vals.astype(inference)

            return vals

        orig_scalar = is_scalar(q)
        if orig_scalar:
            # Normalize scalar q to a 1-element list so one code path serves both.
            q = [q]

        qs = np.array(q, dtype=np.float64)
        ids, _, ngroups = self.grouper.group_info
        nqs = len(qs)

        func = partial(
            libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation
        )

        # Put '-1' (NaN) labels as the last group so it does not interfere
        # with the calculations. Note: length check avoids failure on empty
        # labels. In that case, the value doesn't matter
        na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0
        labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids)

        def blk_func(values: ArrayLike) -> ArrayLike:
            # Per-block driver: preprocess, lexsort values within groups, run
            # the cython kernel per column, then restore the original dtype.
            mask = isna(values)
            vals, inference = pre_processor(values)

            ncols = 1
            if vals.ndim == 2:
                ncols = vals.shape[0]
                shaped_labels = np.broadcast_to(
                    labels_for_lexsort, (ncols, len(labels_for_lexsort))
                )
            else:
                shaped_labels = labels_for_lexsort

            out = np.empty((ncols, ngroups, nqs), dtype=np.float64)

            # Get an index of values sorted by values and then labels
            order = (vals, shaped_labels)
            sort_arr = np.lexsort(order).astype(np.intp, copy=False)

            if vals.ndim == 1:
                func(out[0], values=vals, mask=mask, sort_indexer=sort_arr)
            else:
                for i in range(ncols):
                    func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i])

            if vals.ndim == 1:
                out = out.ravel("K")
            else:
                out = out.reshape(ncols, ngroups * nqs)
            return post_processor(out, inference)

        obj = self._obj_with_exclusions
        is_ser = obj.ndim == 1
        mgr = self._get_data_to_aggregate()
        data = mgr.get_numeric_data() if numeric_only_bool else mgr
        ignore_failures = numeric_only_bool
        res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)

        if (
            numeric_only is lib.no_default
            and not is_ser
            and len(res_mgr.items) != len(mgr.items)
        ):
            # Columns were silently dropped under the legacy default; warn.
            warn_dropping_nuisance_columns_deprecated(
                type(self), "quantile", numeric_only
            )

        if len(res_mgr.items) == 0:
            # re-call grouped_reduce to get the desired exception message
            mgr.grouped_reduce(blk_func, ignore_failures=False)
            # grouped_reduce _should_ raise, so this should not be reached
            raise TypeError(  # pragma: no cover
                "All columns were dropped in grouped_reduce"
            )

        if is_ser:
            res = self._wrap_agged_manager(res_mgr)
        else:
            res = obj._constructor(res_mgr)

        if orig_scalar:
            # Avoid expensive MultiIndex construction
            return self._wrap_aggregated_output(res)
        return self._wrap_aggregated_output(res, qs=qs)
3390 @final
3391 @Substitution(name="groupby")
3392 def ngroup(self, ascending: bool = True):
3393 """
3394 Number each group from 0 to the number of groups - 1.
3396 This is the enumerative complement of cumcount. Note that the
3397 numbers given to the groups match the order in which the groups
3398 would be seen when iterating over the groupby object, not the
3399 order they are first observed.
3401 Parameters
3402 ----------
3403 ascending : bool, default True
3404 If False, number in reverse, from number of group - 1 to 0.
3406 Returns
3407 -------
3408 Series
3409 Unique numbers for each group.
3411 See Also
3412 --------
3413 .cumcount : Number the rows in each group.
3415 Examples
3416 --------
3417 >>> df = pd.DataFrame({"A": list("aaabba")})
3418 >>> df
3419 A
3420 0 a
3421 1 a
3422 2 a
3423 3 b
3424 4 b
3425 5 a
3426 >>> df.groupby('A').ngroup()
3427 0 0
3428 1 0
3429 2 0
3430 3 1
3431 4 1
3432 5 0
3433 dtype: int64
3434 >>> df.groupby('A').ngroup(ascending=False)
3435 0 1
3436 1 1
3437 2 1
3438 3 0
3439 4 0
3440 5 1
3441 dtype: int64
3442 >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup()
3443 0 0
3444 1 0
3445 2 1
3446 3 3
3447 4 2
3448 5 0
3449 dtype: int64
3450 """
3451 with self._group_selection_context():
3452 index = self._selected_obj.index
3453 comp_ids = self.grouper.group_info[0]
3455 dtype: type
3456 if self.grouper.has_dropped_na:
3457 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
3458 dtype = np.float64
3459 else:
3460 dtype = np.int64
3462 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
3463 if not ascending:
3464 result = self.ngroups - 1 - result
3465 return result
3467 @final
3468 @Substitution(name="groupby")
3469 def cumcount(self, ascending: bool = True):
3470 """
3471 Number each item in each group from 0 to the length of that group - 1.
3473 Essentially this is equivalent to
3475 .. code-block:: python
3477 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
3479 Parameters
3480 ----------
3481 ascending : bool, default True
3482 If False, number in reverse, from length of group - 1 to 0.
3484 Returns
3485 -------
3486 Series
3487 Sequence number of each element within each group.
3489 See Also
3490 --------
3491 .ngroup : Number the groups themselves.
3493 Examples
3494 --------
3495 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
3496 ... columns=['A'])
3497 >>> df
3498 A
3499 0 a
3500 1 a
3501 2 a
3502 3 b
3503 4 b
3504 5 a
3505 >>> df.groupby('A').cumcount()
3506 0 0
3507 1 1
3508 2 2
3509 3 0
3510 4 1
3511 5 3
3512 dtype: int64
3513 >>> df.groupby('A').cumcount(ascending=False)
3514 0 3
3515 1 2
3516 2 1
3517 3 1
3518 4 0
3519 5 0
3520 dtype: int64
3521 """
3522 with self._group_selection_context():
3523 index = self._selected_obj._get_axis(self.axis)
3524 cumcounts = self._cumcount_array(ascending=ascending)
3525 return self._obj_1d_constructor(cumcounts, index)
3527 @final
3528 @Substitution(name="groupby")
3529 @Substitution(see_also=_common_see_also)
3530 def rank(
3531 self,
3532 method: str = "average",
3533 ascending: bool = True,
3534 na_option: str = "keep",
3535 pct: bool = False,
3536 axis: int = 0,
3537 ) -> NDFrameT:
3538 """
3539 Provide the rank of values within each group.
3541 Parameters
3542 ----------
3543 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
3544 * average: average rank of group.
3545 * min: lowest rank in group.
3546 * max: highest rank in group.
3547 * first: ranks assigned in order they appear in the array.
3548 * dense: like 'min', but rank always increases by 1 between groups.
3549 ascending : bool, default True
3550 False for ranks by high (1) to low (N).
3551 na_option : {'keep', 'top', 'bottom'}, default 'keep'
3552 * keep: leave NA values where they are.
3553 * top: smallest rank if ascending.
3554 * bottom: smallest rank if descending.
3555 pct : bool, default False
3556 Compute percentage rank of data within each group.
3557 axis : int, default 0
3558 The axis of the object over which to compute the rank.
3560 Returns
3561 -------
3562 DataFrame with ranking of values within each group
3563 %(see_also)s
3564 Examples
3565 --------
3566 >>> df = pd.DataFrame(
3567 ... {
3568 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
3569 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
3570 ... }
3571 ... )
3572 >>> df
3573 group value
3574 0 a 2
3575 1 a 4
3576 2 a 2
3577 3 a 3
3578 4 a 5
3579 5 b 1
3580 6 b 2
3581 7 b 4
3582 8 b 1
3583 9 b 5
3584 >>> for method in ['average', 'min', 'max', 'dense', 'first']:
3585 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
3586 >>> df
3587 group value average_rank min_rank max_rank dense_rank first_rank
3588 0 a 2 1.5 1.0 2.0 1.0 1.0
3589 1 a 4 4.0 4.0 4.0 3.0 4.0
3590 2 a 2 1.5 1.0 2.0 1.0 2.0
3591 3 a 3 3.0 3.0 3.0 2.0 3.0
3592 4 a 5 5.0 5.0 5.0 4.0 5.0
3593 5 b 1 1.5 1.0 2.0 1.0 1.0
3594 6 b 2 3.0 3.0 3.0 2.0 3.0
3595 7 b 4 4.0 4.0 4.0 3.0 4.0
3596 8 b 1 1.5 1.0 2.0 1.0 2.0
3597 9 b 5 5.0 5.0 5.0 4.0 5.0
3598 """
3599 if na_option not in {"keep", "top", "bottom"}:
3600 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
3601 raise ValueError(msg)
3603 kwargs = {
3604 "ties_method": method,
3605 "ascending": ascending,
3606 "na_option": na_option,
3607 "pct": pct,
3608 }
3609 if axis != 0:
3610 # DataFrame uses different keyword name
3611 kwargs["method"] = kwargs.pop("ties_method")
3612 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
3613 result = self._python_apply_general(
3614 f, self._selected_obj, is_transform=True
3615 )
3616 return result
3618 return self._cython_transform(
3619 "rank",
3620 numeric_only=False,
3621 axis=axis,
3622 **kwargs,
3623 )
3625 @final
3626 @Substitution(name="groupby")
3627 @Appender(_common_see_also)
3628 def cumprod(self, axis=0, *args, **kwargs) -> NDFrameT:
3629 """
3630 Cumulative product for each group.
3632 Returns
3633 -------
3634 Series or DataFrame
3635 """
3636 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
3637 if axis != 0:
3638 f = lambda x: x.cumprod(axis=axis, **kwargs)
3639 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3641 return self._cython_transform("cumprod", **kwargs)
3643 @final
3644 @Substitution(name="groupby")
3645 @Appender(_common_see_also)
3646 def cumsum(self, axis=0, *args, **kwargs) -> NDFrameT:
3647 """
3648 Cumulative sum for each group.
3650 Returns
3651 -------
3652 Series or DataFrame
3653 """
3654 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
3655 if axis != 0:
3656 f = lambda x: x.cumsum(axis=axis, **kwargs)
3657 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3659 return self._cython_transform("cumsum", **kwargs)
3661 @final
3662 @Substitution(name="groupby")
3663 @Appender(_common_see_also)
3664 def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
3665 """
3666 Cumulative min for each group.
3668 Returns
3669 -------
3670 Series or DataFrame
3671 """
3672 skipna = kwargs.get("skipna", True)
3673 if axis != 0:
3674 f = lambda x: np.minimum.accumulate(x, axis)
3675 numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
3676 obj = self._selected_obj
3677 if numeric_only_bool:
3678 obj = obj._get_numeric_data()
3679 return self._python_apply_general(f, obj, is_transform=True)
3681 return self._cython_transform(
3682 "cummin", numeric_only=numeric_only, skipna=skipna
3683 )
3685 @final
3686 @Substitution(name="groupby")
3687 @Appender(_common_see_also)
3688 def cummax(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
3689 """
3690 Cumulative max for each group.
3692 Returns
3693 -------
3694 Series or DataFrame
3695 """
3696 skipna = kwargs.get("skipna", True)
3697 if axis != 0:
3698 f = lambda x: np.maximum.accumulate(x, axis)
3699 numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
3700 obj = self._selected_obj
3701 if numeric_only_bool:
3702 obj = obj._get_numeric_data()
3703 return self._python_apply_general(f, obj, is_transform=True)
3705 return self._cython_transform(
3706 "cummax", numeric_only=numeric_only, skipna=skipna
3707 )
    @final
    def _get_cythonized_result(
        self,
        base_func: Callable,
        cython_dtype: np.dtype,
        numeric_only: bool | lib.NoDefault = lib.no_default,
        needs_counts: bool = False,
        needs_nullable: bool = False,
        needs_mask: bool = False,
        pre_processing=None,
        post_processing=None,
        **kwargs,
    ):
        """
        Get result for Cythonized functions.

        Parameters
        ----------
        base_func : callable, Cythonized function to be called
        cython_dtype : np.dtype
            Type of the array that will be modified by the Cython call.
        numeric_only : bool, default True
            Whether only numeric datatypes should be computed
        needs_counts : bool, default False
            Whether the counts should be a part of the Cython call
        needs_mask : bool, default False
            Whether boolean mask needs to be part of the Cython call
            signature
        needs_nullable : bool, default False
            Whether a bool specifying if the input is nullable is part
            of the Cython call signature
        pre_processing : function, default None
            Function to be applied to `values` prior to passing to Cython.
            Function should return a tuple where the first element is the
            values to be passed to Cython and the second element is an optional
            type which the values should be converted to after being returned
            by the Cython operation. This function is also responsible for
            raising a TypeError if the values have an invalid type. Raises
            if `needs_values` is False.
        post_processing : function, default None
            Function to be applied to result of Cython function. Should accept
            an array of values as the first argument and type inferences as its
            second argument, i.e. the signature should be
            (ndarray, Type). If `needs_nullable=True`, a third argument should be
            `nullable`, to allow for processing specific to nullable values.
        **kwargs : dict
            Extra arguments to be passed back to Cython funcs

        Returns
        -------
        `Series` or `DataFrame` with filled values
        """
        # The cython kernel name doubles as the op name for warnings.
        how = base_func.__name__
        numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)

        if post_processing and not callable(post_processing):
            raise ValueError("'post_processing' must be a callable!")
        if pre_processing and not callable(pre_processing):
            raise ValueError("'pre_processing' must be a callable!")

        grouper = self.grouper

        ids, _, ngroups = grouper.group_info

        base_func = partial(base_func, labels=ids)

        def blk_func(values: ArrayLike) -> ArrayLike:
            # Run the cython kernel on one block.  The kernel signature is
            # assembled incrementally with functools.partial as each optional
            # argument (counts/mask/nullable) is enabled.
            values = values.T
            ncols = 1 if values.ndim == 1 else values.shape[1]

            result: ArrayLike
            result = np.zeros(ngroups * ncols, dtype=cython_dtype)
            result = result.reshape((ngroups, ncols))

            func = partial(base_func, out=result)

            inferences = None

            if needs_counts:
                counts = np.zeros(self.ngroups, dtype=np.int64)
                func = partial(func, counts=counts)

            vals = values
            if pre_processing:
                vals, inferences = pre_processing(vals)

            vals = vals.astype(cython_dtype, copy=False)
            if vals.ndim == 1:
                # Kernels expect 2-D input.
                vals = vals.reshape((-1, 1))
            func = partial(func, values=vals)

            if needs_mask:
                mask = isna(values).view(np.uint8)
                if mask.ndim == 1:
                    mask = mask.reshape(-1, 1)
                func = partial(func, mask=mask)

            if needs_nullable:
                is_nullable = isinstance(values, BaseMaskedArray)
                func = partial(func, nullable=is_nullable)

            func(**kwargs)  # Call func to modify indexer values in place

            if values.ndim == 1:
                # Collapse the dummy column dimension for 1-D input.
                assert result.shape[1] == 1, result.shape
                result = result[:, 0]

            if post_processing:
                pp_kwargs = {}
                if needs_nullable:
                    pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)

                result = post_processing(result, inferences, **pp_kwargs)

            # Transpose back to the block's original orientation.
            return result.T

        obj = self._obj_with_exclusions

        # Operate block-wise instead of column-by-column
        is_ser = obj.ndim == 1
        mgr = self._get_data_to_aggregate()
        orig_mgr_len = len(mgr)

        if numeric_only_bool:
            mgr = mgr.get_numeric_data()

        res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)

        if not is_ser and len(res_mgr.items) != orig_mgr_len:
            # Columns were dropped; warn with the user-facing op name.
            howstr = how.replace("group_", "")
            warn_dropping_nuisance_columns_deprecated(type(self), howstr, numeric_only)

        if len(res_mgr.items) == 0:
            # We re-call grouped_reduce to get the right exception message
            mgr.grouped_reduce(blk_func, ignore_failures=False)
            # grouped_reduce _should_ raise, so this should not be reached
            raise TypeError(  # pragma: no cover
                "All columns were dropped in grouped_reduce"
            )

        if is_ser:
            out = self._wrap_agged_manager(res_mgr)
        else:
            out = obj._constructor(res_mgr)

        return self._wrap_aggregated_output(out)
3856 @final
3857 @Substitution(name="groupby")
3858 def shift(self, periods=1, freq=None, axis=0, fill_value=None):
3859 """
3860 Shift each group by periods observations.
3862 If freq is passed, the index will be increased using the periods and the freq.
3864 Parameters
3865 ----------
3866 periods : int, default 1
3867 Number of periods to shift.
3868 freq : str, optional
3869 Frequency string.
3870 axis : axis to shift, default 0
3871 Shift direction.
3872 fill_value : optional
3873 The scalar value to use for newly introduced missing values.
3875 Returns
3876 -------
3877 Series or DataFrame
3878 Object shifted within each group.
3880 See Also
3881 --------
3882 Index.shift : Shift values of Index.
3883 tshift : Shift the time index, using the index’s frequency
3884 if available.
3885 """
3886 if freq is not None or axis != 0:
3887 f = lambda x: x.shift(periods, freq, axis, fill_value)
3888 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3890 ids, _, ngroups = self.grouper.group_info
3891 res_indexer = np.zeros(len(ids), dtype=np.int64)
3893 libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)
3895 obj = self._obj_with_exclusions
3897 res = obj._reindex_with_indexers(
3898 {self.axis: (obj.axes[self.axis], res_indexer)},
3899 fill_value=fill_value,
3900 allow_dups=True,
3901 )
3902 return res
3904 @final
3905 @Substitution(name="groupby")
3906 @Appender(_common_see_also)
3907 def diff(self, periods: int = 1, axis: int = 0) -> NDFrameT:
3908 """
3909 First discrete difference of element.
3911 Calculates the difference of each element compared with another
3912 element in the group (default is element in previous row).
3914 Parameters
3915 ----------
3916 periods : int, default 1
3917 Periods to shift for calculating difference, accepts negative values.
3918 axis : axis to shift, default 0
3919 Take difference over rows (0) or columns (1).
3921 Returns
3922 -------
3923 Series or DataFrame
3924 First differences.
3925 """
3926 if axis != 0:
3927 return self.apply(lambda x: x.diff(periods=periods, axis=axis))
3929 obj = self._obj_with_exclusions
3930 shifted = self.shift(periods=periods, axis=axis)
3932 # GH45562 - to retain existing behavior and match behavior of Series.diff(),
3933 # int8 and int16 are coerced to float32 rather than float64.
3934 dtypes_to_f32 = ["int8", "int16"]
3935 if obj.ndim == 1:
3936 if obj.dtype in dtypes_to_f32:
3937 shifted = shifted.astype("float32")
3938 else:
3939 to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
3940 if len(to_coerce):
3941 shifted = shifted.astype({c: "float32" for c in to_coerce})
3943 return obj - shifted
3945 @final
3946 @Substitution(name="groupby")
3947 @Appender(_common_see_also)
3948 def pct_change(self, periods=1, fill_method="ffill", limit=None, freq=None, axis=0):
3949 """
3950 Calculate pct_change of each value to previous entry in group.
3952 Returns
3953 -------
3954 Series or DataFrame
3955 Percentage changes within each group.
3956 """
3957 # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
3958 # GH#23918 is fixed
3959 if freq is not None or axis != 0:
3960 f = lambda x: x.pct_change(
3961 periods=periods,
3962 fill_method=fill_method,
3963 limit=limit,
3964 freq=freq,
3965 axis=axis,
3966 )
3967 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3969 if fill_method is None: # GH30463
3970 fill_method = "ffill"
3971 limit = 0
3972 filled = getattr(self, fill_method)(limit=limit)
3973 fill_grp = filled.groupby(
3974 self.grouper.codes, axis=self.axis, group_keys=self.group_keys
3975 )
3976 shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
3977 return (filled / shifted) - 1
3979 @final
3980 @Substitution(name="groupby")
3981 @Substitution(see_also=_common_see_also)
3982 def head(self, n: int = 5) -> NDFrameT:
3983 """
3984 Return first n rows of each group.
3986 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
3987 from the original DataFrame with original index and order preserved
3988 (``as_index`` flag is ignored).
3990 Parameters
3991 ----------
3992 n : int
3993 If positive: number of entries to include from start of each group.
3994 If negative: number of entries to exclude from end of each group.
3996 Returns
3997 -------
3998 Series or DataFrame
3999 Subset of original Series or DataFrame as determined by n.
4000 %(see_also)s
4001 Examples
4002 --------
4004 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
4005 ... columns=['A', 'B'])
4006 >>> df.groupby('A').head(1)
4007 A B
4008 0 1 2
4009 2 5 6
4010 >>> df.groupby('A').head(-1)
4011 A B
4012 0 1 2
4013 """
4014 self._reset_group_selection()
4015 mask = self._make_mask_from_positional_indexer(slice(None, n))
4016 return self._mask_selected_obj(mask)
4018 @final
4019 @Substitution(name="groupby")
4020 @Substitution(see_also=_common_see_also)
4021 def tail(self, n: int = 5) -> NDFrameT:
4022 """
4023 Return last n rows of each group.
4025 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
4026 from the original DataFrame with original index and order preserved
4027 (``as_index`` flag is ignored).
4029 Parameters
4030 ----------
4031 n : int
4032 If positive: number of entries to include from end of each group.
4033 If negative: number of entries to exclude from start of each group.
4035 Returns
4036 -------
4037 Series or DataFrame
4038 Subset of original Series or DataFrame as determined by n.
4039 %(see_also)s
4040 Examples
4041 --------
4043 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
4044 ... columns=['A', 'B'])
4045 >>> df.groupby('A').tail(1)
4046 A B
4047 1 a 2
4048 3 b 2
4049 >>> df.groupby('A').tail(-1)
4050 A B
4051 1 a 2
4052 3 b 2
4053 """
4054 self._reset_group_selection()
4055 if n:
4056 mask = self._make_mask_from_positional_indexer(slice(-n, None))
4057 else:
4058 mask = self._make_mask_from_positional_indexer([])
4060 return self._mask_selected_obj(mask)
4062 @final
4063 def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
4064 """
4065 Return _selected_obj with mask applied to the correct axis.
4067 Parameters
4068 ----------
4069 mask : np.ndarray[bool]
4070 Boolean mask to apply.
4072 Returns
4073 -------
4074 Series or DataFrame
4075 Filtered _selected_obj.
4076 """
4077 ids = self.grouper.group_info[0]
4078 mask = mask & (ids != -1)
4080 if self.axis == 0:
4081 return self._selected_obj[mask]
4082 else:
4083 return self._selected_obj.iloc[:, mask]
    @final
    def _reindex_output(
        self,
        output: OutputFrameOrSeries,
        fill_value: Scalar = np.NaN,
        qs: npt.NDArray[np.float64] | None = None,
    ) -> OutputFrameOrSeries:
        """
        If we have categorical groupers, then we might want to make sure that
        we have a fully re-indexed output to the levels. This means expanding
        the output space to accommodate all values in the cartesian product of
        our groups, regardless of whether they were observed in the data or
        not. This will expand the output space if there are missing groups.

        The method returns early without modifying the input if the number of
        groupings is less than 2, self.observed == True or none of the groupers
        are categorical.

        Parameters
        ----------
        output : Series or DataFrame
            Object resulting from grouping and applying an operation.
        fill_value : scalar, default np.NaN
            Value to use for unobserved categories if self.observed is False.
        qs : np.ndarray[float64] or None, default None
            quantile values, only relevant for quantile.

        Returns
        -------
        Series or DataFrame
            Object (potentially) re-indexed to include all possible groups.
        """
        groupings = self.grouper.groupings
        # A single grouping never needs cartesian expansion.
        if len(groupings) == 1:
            return output

        # if we only care about the observed values
        # we are done
        elif self.observed:
            return output

        # reindexing only applies to a Categorical grouper
        elif not any(
            isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
            for ping in groupings
        ):
            return output

        # Build the full cartesian product of group levels (plus the quantile
        # level when called from quantile()).
        levels_list = [ping.group_index for ping in groupings]
        names = self.grouper.names
        if qs is not None:
            # error: Argument 1 to "append" of "list" has incompatible type
            # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
            levels_list.append(qs)  # type: ignore[arg-type]
            names = names + [None]
        index, _ = MultiIndex.from_product(levels_list, names=names).sortlevel()

        if self.as_index:
            # Always holds for SeriesGroupBy unless GH#36507 is implemented
            d = {
                self.obj._get_axis_name(self.axis): index,
                "copy": False,
                "fill_value": fill_value,
            }
            return output.reindex(**d)

        # GH 13204
        # Here, the categorical in-axis groupers, which need to be fully
        # expanded, are columns in `output`. An idea is to do:
        # output = output.set_index(self.grouper.names)
        #                .reindex(index).reset_index()
        # but special care has to be taken because of possible not-in-axis
        # groupers.
        # So, we manually select and drop the in-axis grouper columns,
        # reindex `output`, and then reset the in-axis grouper columns.

        # Select in-axis groupers
        in_axis_grps = (
            (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
        )
        g_nums, g_names = zip(*in_axis_grps)

        output = output.drop(labels=list(g_names), axis=1)

        # Set a temp index and reindex (possibly expanding)
        output = output.set_index(self.grouper.result_index).reindex(
            index, copy=False, fill_value=fill_value
        )

        # Reset in-axis grouper columns
        # (using level numbers `g_nums` because level names may not be unique)
        output = output.reset_index(level=g_nums)

        return output.reset_index(drop=True)
    @final
    def sample(
        self,
        n: int | None = None,
        frac: float | None = None,
        replace: bool = False,
        weights: Sequence | Series | None = None,
        random_state: RandomState | None = None,
    ):
        """
        Return a random sample of items from each group.

        You can use `random_state` for reproducibility.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        n : int, optional
            Number of items to return for each group. Cannot be used with
            `frac` and must be no larger than the smallest group unless
            `replace` is True. Default is one if `frac` is None.
        frac : float, optional
            Fraction of items to return. Cannot be used with `n`.
        replace : bool, default False
            Allow or disallow sampling of the same row more than once.
        weights : list-like, optional
            Default None results in equal probability weighting.
            If passed a list-like then values must have the same length as
            the underlying DataFrame or Series object and will be used as
            sampling probabilities after normalization within each group.
            Values must be non-negative with at least one positive element
            within each group.
        random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
            If int, array-like, or BitGenerator, seed for random number generator.
            If np.random.RandomState or np.random.Generator, use as given.

            .. versionchanged:: 1.4.0

                np.random.Generator objects now accepted

        Returns
        -------
        Series or DataFrame
            A new object of same type as caller containing items randomly
            sampled within each group from the caller object.

        See Also
        --------
        DataFrame.sample: Generate random samples from a DataFrame object.
        numpy.random.choice: Generate a random sample from a given 1-D numpy
            array.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
        ... )
        >>> df
               a  b
        0    red  0
        1    red  1
        2   blue  2
        3   blue  3
        4  black  4
        5  black  5

        Select one row at random for each distinct value in column a. The
        `random_state` argument can be used to guarantee reproducibility:

        >>> df.groupby("a").sample(n=1, random_state=1)
               a  b
        4  black  4
        2   blue  2
        1    red  1

        Set `frac` to sample fixed proportions rather than counts:

        >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
        5    5
        2    2
        0    0
        Name: b, dtype: int64

        Control sample probabilities within groups by setting weights:

        >>> df.groupby("a").sample(
        ...     n=1,
        ...     weights=[1, 1, 1, 0, 0, 1],
        ...     random_state=1,
        ... )
               a  b
        5  black  5
        2   blue  2
        0    red  0
        """  # noqa:E501
        # NOTE: `sample` here is the pandas.core.sample module (imported at
        # file top), not this method — the names shadow each other.
        size = sample.process_sampling_size(n, frac, replace)
        if weights is not None:
            # Validate/align the user weights against the whole object once,
            # then slice per group below.
            weights_arr = sample.preprocess_weights(
                self._selected_obj, weights, axis=self.axis
            )

        random_state = com.random_state(random_state)

        group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)

        sampled_indices = []
        for labels, obj in group_iterator:
            grp_indices = self.indices[labels]
            group_size = len(grp_indices)
            if size is not None:
                sample_size = size
            else:
                # frac-based sampling: round the per-group count.
                assert frac is not None
                sample_size = round(frac * group_size)

            # Positions are sampled within the group, then mapped back to
            # positions in the original object via grp_indices.
            grp_sample = sample.sample(
                group_size,
                size=sample_size,
                replace=replace,
                weights=None if weights is None else weights_arr[grp_indices],
                random_state=random_state,
            )
            sampled_indices.append(grp_indices[grp_sample])

        sampled_indices = np.concatenate(sampled_indices)
        return self._selected_obj.take(sampled_indices, axis=self.axis)
@doc(GroupBy)
def get_groupby(
    obj: NDFrame,
    by: _KeysArgType | None = None,
    axis: int = 0,
    level=None,
    grouper: ops.BaseGrouper | None = None,
    exclusions=None,
    selection=None,
    as_index: bool = True,
    sort: bool = True,
    group_keys: bool | lib.NoDefault = True,
    squeeze: bool = False,
    observed: bool = False,
    mutated: bool = False,
    dropna: bool = True,
) -> GroupBy:
    # Dispatch on the input type; the concrete groupby classes are imported
    # locally to avoid a circular import at module load time.
    if isinstance(obj, Series):
        from pandas.core.groupby.generic import SeriesGroupBy as groupby_cls
    elif isinstance(obj, DataFrame):
        from pandas.core.groupby.generic import DataFrameGroupBy as groupby_cls
    else:  # pragma: no cover
        raise TypeError(f"invalid type: {obj}")

    return groupby_cls(
        obj=obj,
        keys=by,
        axis=axis,
        level=level,
        grouper=grouper,
        exclusions=exclusions,
        selection=selection,
        as_index=as_index,
        sort=sort,
        group_keys=group_keys,
        squeeze=squeeze,
        observed=observed,
        mutated=mutated,
        dropna=dropna,
    )
4357def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
4358 """
4359 Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.
4361 The quantile level in the MultiIndex is a repeated copy of 'qs'.
4363 Parameters
4364 ----------
4365 idx : Index
4366 qs : np.ndarray[float64]
4368 Returns
4369 -------
4370 MultiIndex
4371 """
4372 nqs = len(qs)
4374 if idx._is_multi:
4375 idx = cast(MultiIndex, idx)
4376 lev_codes, lev = Index(qs).factorize()
4377 levels = list(idx.levels) + [lev]
4378 codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
4379 mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
4380 else:
4381 mi = MultiIndex.from_product([idx, qs])
4382 return mi
def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None:
    """
    Emit the FutureWarning for nuisance columns dropped by a groupby op.

    Parameters
    ----------
    cls : type
        The GroupBy subclass performing the operation (used in the message).
    how : str
        Name of the operation, e.g. "quantile".
    numeric_only : bool or lib.no_default
        The numeric_only argument as passed by the user; which warning fires
        depends on whether it was left as the no_default sentinel.
    """
    if numeric_only is lib.no_default:
        # User relied on the legacy default, which silently drops columns.
        warnings.warn(
            f"The default value of numeric_only in {cls.__name__}.{how} "
            "is deprecated. "
            "In a future version, numeric_only will default to False. "
            "Either specify numeric_only or select only columns which "
            "should be valid for the function.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    elif not numeric_only:
        # numeric_only was specified and falsey but still dropped nuisance columns
        warnings.warn(
            f"Dropping invalid columns in {cls.__name__}.{how} is deprecated. "
            "In a future version, a TypeError will be raised. "
            f"Before calling .{how}, select only columns which "
            "should be valid for the function.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )