Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/ops.py: 22%
560 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Provide classes to perform the groupby aggregate operations.
4These are not exposed to the user and provide implementations of the grouping
5operations, primarily in cython. These classes (BaseGrouper and BinGrouper)
6are contained *in* the SeriesGroupBy and DataFrameGroupBy objects.
7"""
8from __future__ import annotations
10import collections
11import functools
12from typing import (
13 TYPE_CHECKING,
14 Callable,
15 Generic,
16 Hashable,
17 Iterator,
18 NoReturn,
19 Sequence,
20 final,
21)
23import numpy as np
25from pandas._libs import (
26 NaT,
27 lib,
28)
29import pandas._libs.groupby as libgroupby
30import pandas._libs.reduction as libreduction
31from pandas._typing import (
32 ArrayLike,
33 DtypeObj,
34 NDFrameT,
35 Shape,
36 npt,
37)
38from pandas.errors import AbstractMethodError
39from pandas.util._decorators import cache_readonly
41from pandas.core.dtypes.cast import (
42 maybe_cast_pointwise_result,
43 maybe_downcast_to_dtype,
44)
45from pandas.core.dtypes.common import (
46 ensure_float64,
47 ensure_int64,
48 ensure_platform_int,
49 ensure_uint64,
50 is_1d_only_ea_dtype,
51 is_bool_dtype,
52 is_complex_dtype,
53 is_datetime64_any_dtype,
54 is_float_dtype,
55 is_integer_dtype,
56 is_numeric_dtype,
57 is_sparse,
58 is_timedelta64_dtype,
59 needs_i8_conversion,
60)
61from pandas.core.dtypes.dtypes import CategoricalDtype
62from pandas.core.dtypes.missing import (
63 isna,
64 maybe_fill,
65)
67from pandas.core.arrays import (
68 Categorical,
69 DatetimeArray,
70 ExtensionArray,
71 PeriodArray,
72 TimedeltaArray,
73)
74from pandas.core.arrays.boolean import BooleanDtype
75from pandas.core.arrays.floating import FloatingDtype
76from pandas.core.arrays.integer import IntegerDtype
77from pandas.core.arrays.masked import (
78 BaseMaskedArray,
79 BaseMaskedDtype,
80)
81from pandas.core.arrays.string_ import StringDtype
82from pandas.core.frame import DataFrame
83from pandas.core.groupby import grouper
84from pandas.core.indexes.api import (
85 CategoricalIndex,
86 Index,
87 MultiIndex,
88 ensure_index,
89)
90from pandas.core.series import Series
91from pandas.core.sorting import (
92 compress_group_index,
93 decons_obs_group_ids,
94 get_flattened_list,
95 get_group_index,
96 get_group_index_sorter,
97 get_indexer_dict,
98)
100if TYPE_CHECKING: 100 ↛ 101line 100 didn't jump to line 101, because the condition on line 100 was never true
101 from pandas.core.generic import NDFrame
class WrappedCythonOp:
    """
    Dispatch logic for functions defined in _libs.groupby

    Parameters
    ----------
    kind: str
        Whether the operation is an aggregate or transform.
    how: str
        Operation name, e.g. "mean".
    has_dropped_na: bool
        True precisely when dropna=True and the grouper contains a null value.
    """

    # Functions for which we do _not_ attempt to cast the cython result
    # back to the original dtype.
    cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"])

    def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
        # kind is "aggregate" or "transform"; how is the op name ("mean", ...)
        self.kind = kind
        self.how = how
        self.has_dropped_na = has_dropped_na

    # Mapping of kind -> {how -> libgroupby function name}; resolved to an
    # actual callable in _get_cython_function.
    _CYTHON_FUNCTIONS = {
        "aggregate": {
            "sum": "group_sum",
            "prod": "group_prod",
            "min": "group_min",
            "max": "group_max",
            "mean": "group_mean",
            "median": "group_median_float64",
            "var": "group_var",
            "first": "group_nth",
            "last": "group_last",
            "ohlc": "group_ohlc",
        },
        "transform": {
            "cumprod": "group_cumprod_float64",
            "cumsum": "group_cumsum",
            "cummin": "group_cummin",
            "cummax": "group_cummax",
            "rank": "group_rank",
        },
    }

    # "group_any" and "group_all" are also support masks, but don't go
    # through WrappedCythonOp
    _MASKED_CYTHON_FUNCTIONS = {
        "cummin",
        "cummax",
        "min",
        "max",
        "last",
        "first",
        "rank",
        "sum",
        "ohlc",
        "cumsum",
        "prod",
    }

    # number of output columns per group for an op; default is 1
    _cython_arity = {"ohlc": 4}  # OHLC
167 # Note: we make this a classmethod and pass kind+how so that caching
168 # works at the class level and not the instance level
169 @classmethod
170 @functools.lru_cache(maxsize=None)
171 def _get_cython_function(
172 cls, kind: str, how: str, dtype: np.dtype, is_numeric: bool
173 ):
175 dtype_str = dtype.name
176 ftype = cls._CYTHON_FUNCTIONS[kind][how]
178 # see if there is a fused-type version of function
179 # only valid for numeric
180 f = getattr(libgroupby, ftype)
181 if is_numeric:
182 return f
183 elif dtype == np.dtype(object):
184 if how in ["median", "cumprod"]:
185 # no fused types -> no __signatures__
186 raise NotImplementedError(
187 f"function is not implemented for this dtype: "
188 f"[how->{how},dtype->{dtype_str}]"
189 )
190 elif "object" not in f.__signatures__:
191 # raise NotImplementedError here rather than TypeError later
192 raise NotImplementedError(
193 f"function is not implemented for this dtype: "
194 f"[how->{how},dtype->{dtype_str}]"
195 )
196 return f
197 else:
198 raise NotImplementedError(
199 "This should not be reached. Please report a bug at "
200 "github.com/pandas-dev/pandas/",
201 dtype,
202 )
204 def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
205 """
206 Cast numeric dtypes to float64 for functions that only support that.
208 Parameters
209 ----------
210 values : np.ndarray
212 Returns
213 -------
214 values : np.ndarray
215 """
216 how = self.how
218 if how in ["median", "cumprod"]:
219 # these two only have float64 implementations
220 # We should only get here with is_numeric, as non-numeric cases
221 # should raise in _get_cython_function
222 values = ensure_float64(values)
224 elif values.dtype.kind in ["i", "u"]:
225 if how in ["var", "mean"] or (
226 self.kind == "transform" and self.has_dropped_na
227 ):
228 # result may still include NaN, so we have to cast
229 values = ensure_float64(values)
231 elif how in ["sum", "ohlc", "prod", "cumsum"]:
232 # Avoid overflow during group op
233 if values.dtype.kind == "i":
234 values = ensure_int64(values)
235 else:
236 values = ensure_uint64(values)
238 return values
240 # TODO: general case implementation overridable by EAs.
241 def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
242 """
243 Check if we can do this operation with our cython functions.
245 Raises
246 ------
247 NotImplementedError
248 This is either not a valid function for this dtype, or
249 valid but not implemented in cython.
250 """
251 how = self.how
253 if is_numeric:
254 # never an invalid op for those dtypes, so return early as fastpath
255 return
257 if isinstance(dtype, CategoricalDtype):
258 # NotImplementedError for methods that can fall back to a
259 # non-cython implementation.
260 if how in ["sum", "prod", "cumsum", "cumprod"]:
261 raise TypeError(f"{dtype} type does not support {how} operations")
262 elif how not in ["rank"]:
263 # only "rank" is implemented in cython
264 raise NotImplementedError(f"{dtype} dtype not supported")
265 elif not dtype.ordered:
266 # TODO: TypeError?
267 raise NotImplementedError(f"{dtype} dtype not supported")
269 elif is_sparse(dtype):
270 # categoricals are only 1d, so we
271 # are not setup for dim transforming
272 raise NotImplementedError(f"{dtype} dtype not supported")
273 elif is_datetime64_any_dtype(dtype):
274 # TODO: same for period_dtype? no for these methods with Period
275 # we raise NotImplemented if this is an invalid operation
276 # entirely, e.g. adding datetimes
277 if how in ["sum", "prod", "cumsum", "cumprod"]:
278 raise TypeError(f"datetime64 type does not support {how} operations")
279 elif is_timedelta64_dtype(dtype):
280 if how in ["prod", "cumprod"]:
281 raise TypeError(f"timedelta64 type does not support {how} operations")
283 def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
284 how = self.how
285 kind = self.kind
287 arity = self._cython_arity.get(how, 1)
289 out_shape: Shape
290 if how == "ohlc":
291 out_shape = (ngroups, 4)
292 elif arity > 1:
293 raise NotImplementedError(
294 "arity of more than 1 is not supported for the 'how' argument"
295 )
296 elif kind == "transform":
297 out_shape = values.shape
298 else:
299 out_shape = (ngroups,) + values.shape[1:]
300 return out_shape
302 def _get_out_dtype(self, dtype: np.dtype) -> np.dtype:
303 how = self.how
305 if how == "rank":
306 out_dtype = "float64"
307 else:
308 if is_numeric_dtype(dtype):
309 out_dtype = f"{dtype.kind}{dtype.itemsize}"
310 else:
311 out_dtype = "object"
312 return np.dtype(out_dtype)
314 def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
315 """
316 Get the desired dtype of a result based on the
317 input dtype and how it was computed.
319 Parameters
320 ----------
321 dtype : np.dtype
323 Returns
324 -------
325 np.dtype
326 The desired dtype of the result.
327 """
328 how = self.how
330 if how in ["sum", "cumsum", "sum", "prod"]:
331 if dtype == np.dtype(bool):
332 return np.dtype(np.int64)
333 elif how in ["mean", "median", "var"]:
334 if is_float_dtype(dtype) or is_complex_dtype(dtype):
335 return dtype
336 elif is_numeric_dtype(dtype):
337 return np.dtype(np.float64)
338 return dtype
340 def uses_mask(self) -> bool:
341 return self.how in self._MASKED_CYTHON_FUNCTIONS
    @final
    def _ea_wrap_cython_operation(
        self,
        values: ExtensionArray,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        **kwargs,
    ) -> ArrayLike:
        """
        If we have an ExtensionArray, unwrap, call _cython_operation, and
        re-wrap if appropriate.
        """
        if isinstance(values, BaseMaskedArray) and self.uses_mask():
            # masked arrays get the optimized mask-aware path
            return self._masked_ea_wrap_cython_operation(
                values,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                **kwargs,
            )

        elif isinstance(values, Categorical) and self.uses_mask():
            assert self.how == "rank"  # the only one implemented ATM
            assert values.ordered  # checked earlier
            # operate on the integer codes, masking out NA positions
            mask = values.isna()
            npvalues = values._ndarray

            res_values = self._cython_op_ndim_compat(
                npvalues,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                mask=mask,
                **kwargs,
            )

            # If we ever have more than just "rank" here, we'll need to do
            #  `if self.how in self.cast_blocklist` like we do for other dtypes.
            return res_values

        # generic path: unwrap to an ndarray the kernels understand
        npvalues = self._ea_to_cython_values(values)

        res_values = self._cython_op_ndim_compat(
            npvalues,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=None,
            **kwargs,
        )

        if self.how in self.cast_blocklist:
            # i.e. how in ["rank"], since other cast_blocklist methods dont go
            #  through cython_operation
            return res_values

        return self._reconstruct_ea_result(values, res_values)
402 # TODO: general case implementation overridable by EAs.
403 def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray:
404 # GH#43682
405 if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)):
406 # All of the functions implemented here are ordinal, so we can
407 # operate on the tz-naive equivalents
408 npvalues = values._ndarray.view("M8[ns]")
409 elif isinstance(values.dtype, (BooleanDtype, IntegerDtype)):
410 # IntegerArray or BooleanArray
411 npvalues = values.to_numpy("float64", na_value=np.nan)
412 elif isinstance(values.dtype, FloatingDtype):
413 # FloatingArray
414 npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
415 elif isinstance(values.dtype, StringDtype):
416 # StringArray
417 npvalues = values.to_numpy(object, na_value=np.nan)
418 else:
419 raise NotImplementedError(
420 f"function is not implemented for this dtype: {values.dtype}"
421 )
422 return npvalues
    # TODO: general case implementation overridable by EAs.
    def _reconstruct_ea_result(
        self, values: ExtensionArray, res_values: np.ndarray
    ) -> ExtensionArray:
        """
        Construct an ExtensionArray result from an ndarray result.
        """
        dtype: BaseMaskedDtype | StringDtype

        if isinstance(values.dtype, StringDtype):
            # rebuild with the same string dtype
            dtype = values.dtype
            string_array_cls = dtype.construct_array_type()
            return string_array_cls._from_sequence(res_values, dtype=dtype)

        elif isinstance(values.dtype, BaseMaskedDtype):
            # masked EA: dtype may have changed (e.g. bool sum -> Int64)
            new_dtype = self._get_result_dtype(values.dtype.numpy_dtype)
            dtype = BaseMaskedDtype.from_numpy_dtype(new_dtype)
            masked_array_cls = dtype.construct_array_type()
            return masked_array_cls._from_sequence(res_values, dtype=dtype)

        elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)):
            # In to_cython_values we took a view as M8[ns]
            assert res_values.dtype == "M8[ns]"
            res_values = res_values.view(values._ndarray.dtype)
            return values._from_backing_data(res_values)

        raise NotImplementedError

    @final
    def _masked_ea_wrap_cython_operation(
        self,
        values: BaseMaskedArray,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        **kwargs,
    ) -> BaseMaskedArray:
        """
        Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's
        and cython algorithms which accept a mask.
        """
        orig_values = values

        # libgroupby functions are responsible for NOT altering mask
        mask = values._mask
        if self.kind != "aggregate":
            # transforms are length-preserving: start from a copy of the mask
            result_mask = mask.copy()
        else:
            # aggregations: one result slot per group, initially unmasked
            result_mask = np.zeros(ngroups, dtype=bool)

        arr = values._data

        res_values = self._cython_op_ndim_compat(
            arr,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=mask,
            result_mask=result_mask,
            **kwargs,
        )

        if self.how == "ohlc":
            # broadcast the per-group mask across the 4 ohlc columns
            result_mask = np.tile(result_mask, (4, 1)).T

        # res_values should already have the correct dtype, we just need to
        #  wrap in a MaskedArray
        return orig_values._maybe_mask_result(res_values, result_mask)
    @final
    def _cython_op_ndim_compat(
        self,
        values: np.ndarray,
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: npt.NDArray[np.bool_] | None = None,
        result_mask: npt.NDArray[np.bool_] | None = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Dispatch to _call_cython_op, promoting 1D inputs to 2D and squeezing
        the result back down when appropriate.
        """
        if values.ndim == 1:
            # expand to 2d, dispatch, then squeeze if appropriate
            values2d = values[None, :]
            if mask is not None:
                mask = mask[None, :]
            if result_mask is not None:
                result_mask = result_mask[None, :]
            res = self._call_cython_op(
                values2d,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                mask=mask,
                result_mask=result_mask,
                **kwargs,
            )
            if res.shape[0] == 1:
                return res[0]

            # otherwise we have OHLC
            return res.T

        return self._call_cython_op(
            values,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=mask,
            result_mask=result_mask,
            **kwargs,
        )

    @final
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: npt.NDArray[np.bool_] | None,
        result_mask: npt.NDArray[np.bool_] | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        """
        Invoke the selected libgroupby kernel on 2D values and post-process
        the result (min_count handling, transpose back, dtype restoration).
        """
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            # operate on the i8 view; dtype is restored via casting below
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.view("uint8")
        if values.dtype == "float16":
            # no float16 kernels exist; upcast
            values = values.astype(np.float32)

        # kernels expect the transposed layout
        values = values.T
        if mask is not None:
            mask = mask.T
        if result_mask is not None:
            result_mask = result_mask.T

        out_shape = self._get_output_shape(ngroups, values)
        func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric)
        values = self._get_cython_vals(values)
        out_dtype = self._get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            # NOTE: each branch below matches a distinct kernel signature
            if self.how in ["min", "max", "mean", "last", "first"]:
                func(
                    out=result,
                    counts=counts,
                    values=values,
                    labels=comp_ids,
                    min_count=min_count,
                    mask=mask,
                    result_mask=result_mask,
                    is_datetimelike=is_datetimelike,
                )
            elif self.how in ["sum"]:
                # We support datetimelike
                func(
                    out=result,
                    counts=counts,
                    values=values,
                    labels=comp_ids,
                    mask=mask,
                    result_mask=result_mask,
                    min_count=min_count,
                    is_datetimelike=is_datetimelike,
                )
            elif self.how in ["ohlc", "prod"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count=min_count,
                    mask=mask,
                    result_mask=result_mask,
                )
            else:
                func(result, counts, values, comp_ids, min_count, **kwargs)
        else:
            # TODO: min_count
            if self.uses_mask():
                if self.how != "rank":
                    # TODO: should rank take result_mask?
                    kwargs["result_mask"] = result_mask
                func(
                    out=result,
                    values=values,
                    labels=comp_ids,
                    ngroups=ngroups,
                    is_datetimelike=is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(
                    out=result,
                    values=values,
                    labels=comp_ids,
                    ngroups=ngroups,
                    is_datetimelike=is_datetimelike,
                    **kwargs,
                )

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                # if the op keeps the int dtypes, we have to use 0
                cutoff = max(0 if self.how in ["sum", "prod"] else 1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    if result_mask is not None and self.uses_mask():
                        # masked path: the mask must already flag these slots
                        assert result_mask[empty_groups].all()
                    else:
                        # Note: this conversion could be lossy, see GH#40767
                        result = result.astype("float64")
                        result[empty_groups] = np.nan

        # undo the transpose applied before calling the kernel
        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            # Casting only needed for float16, bool, datetimelike,
            #  and self.how in ["sum", "prod", "ohlc", "cumprod"]
            res_dtype = self._get_result_dtype(orig_values.dtype)
            op_result = maybe_downcast_to_dtype(result, res_dtype)
        else:
            op_result = result

        return op_result
    @final
    def cython_operation(
        self,
        *,
        values: ArrayLike,
        axis: int,
        min_count: int = -1,
        comp_ids: np.ndarray,
        ngroups: int,
        **kwargs,
    ) -> ArrayLike:
        """
        Call our cython function, with appropriate pre- and post- processing.
        """
        if values.ndim > 2:
            raise NotImplementedError("number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            assert axis == 1, axis
        elif not is_1d_only_ea_dtype(values.dtype):
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 0

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(dtype, is_numeric)

        if not isinstance(values, np.ndarray):
            # i.e. ExtensionArray
            return self._ea_wrap_cython_operation(
                values,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                **kwargs,
            )

        # plain ndarray path
        return self._cython_op_ndim_compat(
            values,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=None,
            **kwargs,
        )
class BaseGrouper:
    """
    This is an internal Grouper class, which actually holds
    the generated groups

    Parameters
    ----------
    axis : Index
    groupings : Sequence[Grouping]
        all the grouping instances to handle in this grouper
        for example for grouper list to groupby, need to pass the list
    sort : bool, default True
        whether this grouper will give sorted result or not
    group_keys : bool, default True
    mutated : bool, default False
    indexer : np.ndarray[np.intp], optional
        the indexer created by Grouper
        some groupers (TimeGrouper) will sort its axis and its
        group_info is also sorted, so need the indexer to reorder

    """

    axis: Index

    def __init__(
        self,
        axis: Index,
        groupings: Sequence[grouper.Grouping],
        sort: bool = True,
        group_keys: bool = True,
        mutated: bool = False,
        indexer: npt.NDArray[np.intp] | None = None,
        dropna: bool = True,
    ) -> None:
        assert isinstance(axis, Index), axis

        self.axis = axis
        # private copy of the groupings; exposed read-only via `groupings`
        self._groupings: list[grouper.Grouping] = list(groupings)
        self._sort = sort
        self.group_keys = group_keys
        self.mutated = mutated
        self.indexer = indexer
        self.dropna = dropna
762 @property
763 def groupings(self) -> list[grouper.Grouping]:
764 return self._groupings
766 @property
767 def shape(self) -> Shape:
768 return tuple(ping.ngroups for ping in self.groupings)
770 def __iter__(self) -> Iterator[Hashable]:
771 return iter(self.indices)
773 @property
774 def nkeys(self) -> int:
775 return len(self.groupings)
777 def get_iterator(
778 self, data: NDFrameT, axis: int = 0
779 ) -> Iterator[tuple[Hashable, NDFrameT]]:
780 """
781 Groupby iterator
783 Returns
784 -------
785 Generator yielding sequence of (name, subsetted object)
786 for each group
787 """
788 splitter = self._get_splitter(data, axis=axis)
789 keys = self.group_keys_seq
790 yield from zip(keys, splitter)
792 @final
793 def _get_splitter(self, data: NDFrame, axis: int = 0) -> DataSplitter:
794 """
795 Returns
796 -------
797 Generator yielding subsetted objects
798 """
799 ids, _, ngroups = self.group_info
800 return get_splitter(data, ids, ngroups, axis=axis)
802 def _get_grouper(self):
803 """
804 We are a grouper as part of another's groupings.
806 We have a specific method of grouping, so cannot
807 convert to a Index for our grouper.
808 """
809 return self.groupings[0].grouping_vector
811 @final
812 @cache_readonly
813 def group_keys_seq(self):
814 if len(self.groupings) == 1:
815 return self.levels[0]
816 else:
817 ids, _, ngroups = self.group_info
819 # provide "flattened" iterator for multi-group setting
820 return get_flattened_list(ids, ngroups, self.levels, self.codes)
    @final
    def apply(
        self, f: Callable, data: DataFrame | Series, axis: int = 0
    ) -> tuple[list, bool]:
        """
        Apply ``f`` to each group of ``data`` split along ``axis``.

        Returns the list of per-group results and a flag indicating whether
        any result's axes no longer match the input group's ("mutated").
        """
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self.group_keys_seq
        result_values = []

        # This calls DataSplitter.__iter__
        zipped = zip(group_keys, splitter)

        for key, group in zipped:
            # attach the group key as the subset's name for f to see
            object.__setattr__(group, "name", key)

            # group might be modified
            group_axes = group.axes
            res = f(group)
            if not mutated and not _is_indexed_like(res, group_axes, axis):
                mutated = True
            result_values.append(res)

        # getattr pattern for __name__ is needed for functools.partial objects
        if len(group_keys) == 0 and getattr(f, "__name__", None) in [
            "mad",
            "skew",
            "sum",
            "prod",
        ]:
            # If group_keys is empty, then no function calls have been made,
            #  so we will not have raised even if this is an invalid dtype.
            #  So do one dummy call here to raise appropriate TypeError.
            f(data.iloc[:0])

        return result_values, mutated
857 @cache_readonly
858 def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
859 """dict {group name -> group indices}"""
860 if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex):
861 # This shows unused categories in indices GH#38642
862 return self.groupings[0].indices
863 codes_list = [ping.codes for ping in self.groupings]
864 keys = [ping.group_index for ping in self.groupings]
865 return get_indexer_dict(codes_list, keys)
    @final
    def result_ilocs(self) -> npt.NDArray[np.intp]:
        """
        Get the original integer locations of result_index in the input.
        """
        # Original indices are where group_index would go via sorting.
        # But when dropna is true, we need to remove null values while accounting for
        #  any gaps that then occur because of them.
        group_index = get_group_index(
            self.codes, self.shape, sort=self._sort, xnull=True
        )
        group_index, _ = compress_group_index(group_index, sort=self._sort)

        if self.has_dropped_na:
            # keep only non-null rows ...
            mask = np.where(group_index >= 0)
            # Count how many gaps are caused by previous null values for each position
            null_gaps = np.cumsum(group_index == -1)[mask]
            group_index = group_index[mask]

        result = get_group_index_sorter(group_index, self.ngroups)

        if self.has_dropped_na:
            # Shift by the number of prior null gaps
            result += np.take(null_gaps, result)

        return result
894 @final
895 @property
896 def codes(self) -> list[npt.NDArray[np.signedinteger]]:
897 return [ping.codes for ping in self.groupings]
899 @property
900 def levels(self) -> list[Index]:
901 return [ping.group_index for ping in self.groupings]
903 @property
904 def names(self) -> list[Hashable]:
905 return [ping.name for ping in self.groupings]
907 @final
908 def size(self) -> Series:
909 """
910 Compute group sizes.
911 """
912 ids, _, ngroups = self.group_info
913 out: np.ndarray | list
914 if ngroups:
915 out = np.bincount(ids[ids != -1], minlength=ngroups)
916 else:
917 out = []
918 return Series(out, index=self.result_index, dtype="int64")
920 @cache_readonly
921 def groups(self) -> dict[Hashable, np.ndarray]:
922 """dict {group name -> group labels}"""
923 if len(self.groupings) == 1:
924 return self.groupings[0].groups
925 else:
926 to_groupby = zip(*(ping.grouping_vector for ping in self.groupings))
927 index = Index(to_groupby)
928 return self.axis.groupby(index)
930 @final
931 @cache_readonly
932 def is_monotonic(self) -> bool:
933 # return if my group orderings are monotonic
934 return Index(self.group_info[0]).is_monotonic_increasing
936 @final
937 @cache_readonly
938 def has_dropped_na(self) -> bool:
939 """
940 Whether grouper has null value(s) that are dropped.
941 """
942 return bool((self.group_info[0] < 0).any())
944 @cache_readonly
945 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
946 comp_ids, obs_group_ids = self._get_compressed_codes()
948 ngroups = len(obs_group_ids)
949 comp_ids = ensure_platform_int(comp_ids)
951 return comp_ids, obs_group_ids, ngroups
953 @final
954 @cache_readonly
955 def codes_info(self) -> npt.NDArray[np.intp]:
956 # return the codes of items in original grouped axis
957 ids, _, _ = self.group_info
958 if self.indexer is not None:
959 sorter = np.lexsort((ids, self.indexer))
960 ids = ids[sorter]
961 ids = ensure_platform_int(ids)
962 # TODO: if numpy annotates np.lexsort, this ensure_platform_int
963 # may become unnecessary
964 return ids
966 @final
967 def _get_compressed_codes(
968 self,
969 ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]:
970 # The first returned ndarray may have any signed integer dtype
971 if len(self.groupings) > 1:
972 group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True)
973 return compress_group_index(group_index, sort=self._sort)
974 # FIXME: compress_group_index's second return value is int64, not intp
976 ping = self.groupings[0]
977 return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
979 @final
980 @cache_readonly
981 def ngroups(self) -> int:
982 return len(self.result_index)
984 @property
985 def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
986 codes = self.codes
987 ids, obs_ids, _ = self.group_info
988 return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
990 @cache_readonly
991 def result_index(self) -> Index:
992 if len(self.groupings) == 1:
993 return self.groupings[0].result_index.rename(self.names[0])
995 codes = self.reconstructed_codes
996 levels = [ping.result_index for ping in self.groupings]
997 return MultiIndex(
998 levels=levels, codes=codes, verify_integrity=False, names=self.names
999 )
1001 @final
1002 def get_group_levels(self) -> list[ArrayLike]:
1003 # Note: only called from _insert_inaxis_grouper_inplace, which
1004 # is only called for BaseGrouper, never for BinGrouper
1005 if len(self.groupings) == 1:
1006 return [self.groupings[0].group_arraylike]
1008 name_list = []
1009 for ping, codes in zip(self.groupings, self.reconstructed_codes):
1010 codes = ensure_platform_int(codes)
1011 levels = ping.group_arraylike.take(codes)
1013 name_list.append(levels)
1015 return name_list
    # ------------------------------------------------------------
    # Aggregation functions

    @final
    def _cython_operation(
        self,
        kind: str,
        values,
        how: str,
        axis: int,
        min_count: int = -1,
        **kwargs,
    ) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        assert kind in ["transform", "aggregate"]

        cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na)

        ids, _, _ = self.group_info
        ngroups = self.ngroups
        return cy_op.cython_operation(
            values=values,
            axis=axis,
            min_count=min_count,
            comp_ids=ids,
            ngroups=ngroups,
            **kwargs,
        )

    @final
    def agg_series(
        self, obj: Series, func: Callable, preserve_dtype: bool = False
    ) -> ArrayLike:
        """
        Parameters
        ----------
        obj : Series
        func : function taking a Series and returning a scalar-like
        preserve_dtype : bool
            Whether the aggregation is known to be dtype-preserving.

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        # test_groupby_empty_with_category gets here with self.ngroups == 0
        #  and len(obj) > 0

        if len(obj) == 0:
            # SeriesGrouper would raise if we were to call _aggregate_series_fast
            result = self._aggregate_series_pure_python(obj, func)

        elif not isinstance(obj._values, np.ndarray):
            result = self._aggregate_series_pure_python(obj, func)

            # we can preserve a little bit more aggressively with EA dtype
            #  because maybe_cast_pointwise_result will do a try/except
            #  with _from_sequence.  NB we are assuming here that _from_sequence
            #  is sufficiently strict that it casts appropriately.
            preserve_dtype = True

        else:
            result = self._aggregate_series_pure_python(obj, func)

        # result is object dtype at this point; convert what we can
        npvalues = lib.maybe_convert_objects(result, try_float=False)
        if preserve_dtype:
            out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
        else:
            out = npvalues
        return out

    @final
    def _aggregate_series_pure_python(
        self, obj: Series, func: Callable
    ) -> npt.NDArray[np.object_]:
        """Apply ``func`` group-by-group in Python; returns object-dtype results."""
        ids, _, ngroups = self.group_info

        counts = np.zeros(ngroups, dtype=int)
        result = np.empty(ngroups, dtype="O")
        initialized = False

        # equiv: splitter = self._get_splitter(obj, axis=0)
        splitter = get_splitter(obj, ids, ngroups, axis=0)

        for i, group in enumerate(splitter):
            res = func(group)
            res = libreduction.extract_result(res)

            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(res, group.dtype)
                initialized = True

            counts[i] = group.shape[0]
            result[i] = res

        return result
class BinGrouper(BaseGrouper):
    """
    This is an internal Grouper class

    Parameters
    ----------
    bins : the split index of binlabels to group the item of axis
    binlabels : the label list
    mutated : bool, default False
    indexer : np.ndarray[np.intp]

    Examples
    --------
    bins: [2, 4, 6, 8, 10]
    binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
        '2005-01-05', '2005-01-07', '2005-01-09'],
        dtype='datetime64[ns]', freq='2D')

    the group_info, which contains the label of each item in grouped
    axis, the index of label in label list, group number, is

    (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)

    means that, the grouped axis has 10 items, can be grouped into 5
    labels, the first and second items belong to the first label, the
    third and forth items belong to the second label, and so on

    """

    bins: npt.NDArray[np.int64]
    binlabels: Index
    mutated: bool

    def __init__(
        self,
        bins,
        binlabels,
        mutated: bool = False,
        indexer=None,
    ) -> None:
        self.bins = ensure_int64(bins)
        self.binlabels = ensure_index(binlabels)
        self.mutated = mutated
        self.indexer = indexer

        # These lengths must match, otherwise we could call agg_series
        #  with empty self.bins, which would raise in libreduction.
        assert len(self.binlabels) == len(self.bins)
1167 @cache_readonly
1168 def groups(self):
1169 """dict {group name -> group labels}"""
1170 # this is mainly for compat
1171 # GH 3881
1172 result = {
1173 key: value
1174 for key, value in zip(self.binlabels, self.bins)
1175 if key is not NaT
1176 }
1177 return result
1179 @property
1180 def nkeys(self) -> int:
1181 # still matches len(self.groupings), but we can hard-code
1182 return 1
1184 def _get_grouper(self):
1185 """
1186 We are a grouper as part of another's groupings.
1188 We have a specific method of grouping, so cannot
1189 convert to a Index for our grouper.
1190 """
1191 return self
1193 def get_iterator(self, data: NDFrame, axis: int = 0):
1194 """
1195 Groupby iterator
1197 Returns
1198 -------
1199 Generator yielding sequence of (name, subsetted object)
1200 for each group
1201 """
1202 if axis == 0:
1203 slicer = lambda start, edge: data.iloc[start:edge]
1204 else:
1205 slicer = lambda start, edge: data.iloc[:, start:edge]
1207 length = len(data.axes[axis])
1209 start = 0
1210 for edge, label in zip(self.bins, self.binlabels):
1211 if label is not NaT:
1212 yield label, slicer(start, edge)
1213 start = edge
1215 if start < length:
1216 yield self.binlabels[-1], slicer(start, None)
1218 @cache_readonly
1219 def indices(self):
1220 indices = collections.defaultdict(list)
1222 i = 0
1223 for label, bin in zip(self.binlabels, self.bins):
1224 if i < bin:
1225 if label is not NaT:
1226 indices[label] = list(range(i, bin))
1227 i = bin
1228 return indices
1230 @cache_readonly
1231 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
1232 ngroups = self.ngroups
1233 obs_group_ids = np.arange(ngroups, dtype=np.intp)
1234 rep = np.diff(np.r_[0, self.bins])
1236 rep = ensure_platform_int(rep)
1237 if ngroups == len(self.bins):
1238 comp_ids = np.repeat(np.arange(ngroups), rep)
1239 else:
1240 comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)
1242 return (
1243 ensure_platform_int(comp_ids),
1244 obs_group_ids,
1245 ngroups,
1246 )
1248 @cache_readonly
1249 def reconstructed_codes(self) -> list[np.ndarray]:
1250 # get unique result indices, and prepend 0 as groupby starts from the first
1251 return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]]
1253 @cache_readonly
1254 def result_index(self) -> Index:
1255 if len(self.binlabels) != 0 and isna(self.binlabels[0]):
1256 return self.binlabels[1:]
1258 return self.binlabels
1260 @property
1261 def levels(self) -> list[Index]:
1262 return [self.binlabels]
1264 @property
1265 def names(self) -> list[Hashable]:
1266 return [self.binlabels.name]
1268 @property
1269 def groupings(self) -> list[grouper.Grouping]:
1270 lev = self.binlabels
1271 ping = grouper.Grouping(lev, lev, in_axis=False, level=None)
1272 return [ping]
1274 def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:
1275 # -> np.ndarray[object]
1276 raise NotImplementedError(
1277 "This should not be reached; use _aggregate_series_pure_python"
1278 )
1281def _is_indexed_like(obj, axes, axis: int) -> bool:
1282 if isinstance(obj, Series):
1283 if len(axes) > 1:
1284 return False
1285 return obj.axes[axis].equals(axes[axis])
1286 elif isinstance(obj, DataFrame):
1287 return obj.axes[axis].equals(axes[axis])
1289 return False
1292# ----------------------------------------------------------------------
1293# Splitting / application
class DataSplitter(Generic[NDFrameT]):
    """
    Iterate over ``data`` one group at a time.

    ``labels`` assigns an integer group code to each row along ``axis``;
    iteration yields one subsetted object per code 0..ngroups-1, produced
    through the ``_chop`` hook that concrete subclasses implement.
    """

    def __init__(
        self,
        data: NDFrameT,
        labels: npt.NDArray[np.intp],
        ngroups: int,
        axis: int = 0,
    ) -> None:
        assert isinstance(axis, int), axis

        self.data = data
        # _should_ already be np.intp; coerce defensively
        self.labels = ensure_platform_int(labels)
        self.ngroups = ngroups
        self.axis = axis

    @cache_readonly
    def _sort_idx(self) -> npt.NDArray[np.intp]:
        # Counting-sort indexer that brings equal labels together.
        return get_group_index_sorter(self.labels, self.ngroups)

    @cache_readonly
    def slabels(self) -> npt.NDArray[np.intp]:
        # Labels rearranged into sorted (grouped) order.
        return self.labels.take(self._sort_idx)

    @cache_readonly
    def sorted_data(self) -> NDFrameT:
        # ``data`` rearranged so each group occupies one contiguous slice.
        return self.data.take(self._sort_idx, axis=self.axis)

    def __iter__(self):
        sdata = self.sorted_data

        if self.ngroups == 0:
            # Inside a generator a bare return signals exhaustion; no need
            # to raise StopIteration explicitly.
            return

        starts, ends = lib.generate_slices(self.slabels, self.ngroups)
        for begin, stop in zip(starts, ends):
            yield self._chop(sdata, slice(begin, stop))

    def _chop(self, sdata, slice_obj: slice) -> NDFrame:
        # Subclass hook: return ``sdata`` restricted to ``slice_obj``.
        raise AbstractMethodError(self)
class SeriesSplitter(DataSplitter):
    def _chop(self, sdata: Series, slice_obj: slice) -> Series:
        # Manager-level fastpath for ``sdata.iloc[slice_obj]``: slice the
        # manager directly and rebuild the Series around it.
        sliced_mgr = sdata._mgr.get_slice(slice_obj)
        out = sdata._constructor(sliced_mgr, name=sdata.name, fastpath=True)
        return out.__finalize__(sdata, method="groupby")
class FrameSplitter(DataSplitter):
    def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
        # Manager-level fastpath equivalent to:
        #   sdata.iloc[slice_obj]      when self.axis == 0
        #   sdata.iloc[:, slice_obj]   when self.axis == 1
        # note the manager axis passed is ``1 - self.axis``.
        sliced_mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis)
        out = sdata._constructor(sliced_mgr)
        return out.__finalize__(sdata, method="groupby")
def get_splitter(
    data: NDFrame, labels: np.ndarray, ngroups: int, axis: int = 0
) -> DataSplitter:
    """
    Construct the DataSplitter subclass appropriate for ``data``:
    SeriesSplitter for a Series, FrameSplitter otherwise (i.e. DataFrame).
    """
    splitter_cls: type[DataSplitter]
    splitter_cls = SeriesSplitter if isinstance(data, Series) else FrameSplitter
    return splitter_cls(data, labels, ngroups, axis)