Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/ops.py: 22%
560 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Provide classes to perform the groupby aggregate operations.
4These are not exposed to the user and provide implementations of the grouping
5operations, primarily in cython. These classes (BaseGrouper and BinGrouper)
6are contained *in* the SeriesGroupBy and DataFrameGroupBy objects.
7"""
8from __future__ import annotations
10import collections
11import functools
12from typing import (
13 TYPE_CHECKING,
14 Callable,
15 Generic,
16 Hashable,
17 Iterator,
18 NoReturn,
19 Sequence,
20 final,
21)
23import numpy as np
25from pandas._libs import (
26 NaT,
27 lib,
28)
29import pandas._libs.groupby as libgroupby
30import pandas._libs.reduction as libreduction
31from pandas._typing import (
32 ArrayLike,
33 DtypeObj,
34 NDFrameT,
35 Shape,
36 npt,
37)
38from pandas.errors import AbstractMethodError
39from pandas.util._decorators import cache_readonly
41from pandas.core.dtypes.cast import (
42 maybe_cast_pointwise_result,
43 maybe_downcast_to_dtype,
44)
45from pandas.core.dtypes.common import (
46 ensure_float64,
47 ensure_int64,
48 ensure_platform_int,
49 ensure_uint64,
50 is_1d_only_ea_dtype,
51 is_bool_dtype,
52 is_complex_dtype,
53 is_datetime64_any_dtype,
54 is_float_dtype,
55 is_integer_dtype,
56 is_numeric_dtype,
57 is_sparse,
58 is_timedelta64_dtype,
59 needs_i8_conversion,
60)
61from pandas.core.dtypes.dtypes import CategoricalDtype
62from pandas.core.dtypes.missing import (
63 isna,
64 maybe_fill,
65)
67from pandas.core.arrays import (
68 Categorical,
69 DatetimeArray,
70 ExtensionArray,
71 PeriodArray,
72 TimedeltaArray,
73)
74from pandas.core.arrays.boolean import BooleanDtype
75from pandas.core.arrays.floating import FloatingDtype
76from pandas.core.arrays.integer import IntegerDtype
77from pandas.core.arrays.masked import (
78 BaseMaskedArray,
79 BaseMaskedDtype,
80)
81from pandas.core.arrays.string_ import StringDtype
82from pandas.core.frame import DataFrame
83from pandas.core.groupby import grouper
84from pandas.core.indexes.api import (
85 CategoricalIndex,
86 Index,
87 MultiIndex,
88 ensure_index,
89)
90from pandas.core.series import Series
91from pandas.core.sorting import (
92 compress_group_index,
93 decons_obs_group_ids,
94 get_flattened_list,
95 get_group_index,
96 get_group_index_sorter,
97 get_indexer_dict,
98)
100if TYPE_CHECKING: 100 ↛ 101line 100 didn't jump to line 101, because the condition on line 100 was never true
101 from pandas.core.generic import NDFrame
class WrappedCythonOp:
    """
    Dispatch logic for functions defined in _libs.groupby

    Parameters
    ----------
    kind: str
        Whether the operation is an aggregate or transform.
    how: str
        Operation name, e.g. "mean".
    has_dropped_na: bool
        True precisely when dropna=True and the grouper contains a null value.
    """

    # Functions for which we do _not_ attempt to cast the cython result
    # back to the original dtype.
    cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"])

    def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
        # kind is "aggregate" or "transform"; how is the op name ("mean", ...)
        self.kind = kind
        self.how = how
        self.has_dropped_na = has_dropped_na

    # Mapping of kind -> {how -> libgroupby function name}; resolved to an
    # actual callable in _get_cython_function.
    _CYTHON_FUNCTIONS = {
        "aggregate": {
            "sum": "group_sum",
            "prod": "group_prod",
            "min": "group_min",
            "max": "group_max",
            "mean": "group_mean",
            "median": "group_median_float64",
            "var": "group_var",
            "first": "group_nth",
            "last": "group_last",
            "ohlc": "group_ohlc",
        },
        "transform": {
            "cumprod": "group_cumprod_float64",
            "cumsum": "group_cumsum",
            "cummin": "group_cummin",
            "cummax": "group_cummax",
            "rank": "group_rank",
        },
    }

    # "group_any" and "group_all" are also support masks, but don't go
    # through WrappedCythonOp
    _MASKED_CYTHON_FUNCTIONS = {
        "cummin",
        "cummax",
        "min",
        "max",
        "last",
        "first",
        "rank",
        "sum",
        "ohlc",
        "cumsum",
        "prod",
    }

    # number of output columns per group for an op; default is 1
    _cython_arity = {"ohlc": 4}  # OHLC
167 # Note: we make this a classmethod and pass kind+how so that caching
168 # works at the class level and not the instance level
169 @classmethod
170 @functools.lru_cache(maxsize=None)
171 def _get_cython_function(
172 cls, kind: str, how: str, dtype: np.dtype, is_numeric: bool
173 ):
175 dtype_str = dtype.name
176 ftype = cls._CYTHON_FUNCTIONS[kind][how]
178 # see if there is a fused-type version of function
179 # only valid for numeric
180 f = getattr(libgroupby, ftype)
181 if is_numeric:
182 return f
183 elif dtype == np.dtype(object):
184 if how in ["median", "cumprod"]:
185 # no fused types -> no __signatures__
186 raise NotImplementedError(
187 f"function is not implemented for this dtype: "
188 f"[how->{how},dtype->{dtype_str}]"
189 )
190 elif "object" not in f.__signatures__:
191 # raise NotImplementedError here rather than TypeError later
192 raise NotImplementedError(
193 f"function is not implemented for this dtype: "
194 f"[how->{how},dtype->{dtype_str}]"
195 )
196 return f
197 else:
198 raise NotImplementedError(
199 "This should not be reached. Please report a bug at "
200 "github.com/pandas-dev/pandas/",
201 dtype,
202 )
204 def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
205 """
206 Cast numeric dtypes to float64 for functions that only support that.
208 Parameters
209 ----------
210 values : np.ndarray
212 Returns
213 -------
214 values : np.ndarray
215 """
216 how = self.how
218 if how in ["median", "cumprod"]:
219 # these two only have float64 implementations
220 # We should only get here with is_numeric, as non-numeric cases
221 # should raise in _get_cython_function
222 values = ensure_float64(values)
224 elif values.dtype.kind in ["i", "u"]:
225 if how in ["var", "mean"] or (
226 self.kind == "transform" and self.has_dropped_na
227 ):
228 # result may still include NaN, so we have to cast
229 values = ensure_float64(values)
231 elif how in ["sum", "ohlc", "prod", "cumsum"]:
232 # Avoid overflow during group op
233 if values.dtype.kind == "i":
234 values = ensure_int64(values)
235 else:
236 values = ensure_uint64(values)
238 return values
240 # TODO: general case implementation overridable by EAs.
241 def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
242 """
243 Check if we can do this operation with our cython functions.
245 Raises
246 ------
247 NotImplementedError
248 This is either not a valid function for this dtype, or
249 valid but not implemented in cython.
250 """
251 how = self.how
253 if is_numeric:
254 # never an invalid op for those dtypes, so return early as fastpath
255 return
257 if isinstance(dtype, CategoricalDtype):
258 # NotImplementedError for methods that can fall back to a
259 # non-cython implementation.
260 if how in ["sum", "prod", "cumsum", "cumprod"]:
261 raise TypeError(f"{dtype} type does not support {how} operations")
262 elif how not in ["rank"]:
263 # only "rank" is implemented in cython
264 raise NotImplementedError(f"{dtype} dtype not supported")
265 elif not dtype.ordered:
266 # TODO: TypeError?
267 raise NotImplementedError(f"{dtype} dtype not supported")
269 elif is_sparse(dtype):
270 # categoricals are only 1d, so we
271 # are not setup for dim transforming
272 raise NotImplementedError(f"{dtype} dtype not supported")
273 elif is_datetime64_any_dtype(dtype):
274 # TODO: same for period_dtype? no for these methods with Period
275 # we raise NotImplemented if this is an invalid operation
276 # entirely, e.g. adding datetimes
277 if how in ["sum", "prod", "cumsum", "cumprod"]:
278 raise TypeError(f"datetime64 type does not support {how} operations")
279 elif is_timedelta64_dtype(dtype):
280 if how in ["prod", "cumprod"]:
281 raise TypeError(f"timedelta64 type does not support {how} operations")
283 def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
284 how = self.how
285 kind = self.kind
287 arity = self._cython_arity.get(how, 1)
289 out_shape: Shape
290 if how == "ohlc":
291 out_shape = (ngroups, 4)
292 elif arity > 1:
293 raise NotImplementedError(
294 "arity of more than 1 is not supported for the 'how' argument"
295 )
296 elif kind == "transform":
297 out_shape = values.shape
298 else:
299 out_shape = (ngroups,) + values.shape[1:]
300 return out_shape
302 def _get_out_dtype(self, dtype: np.dtype) -> np.dtype:
303 how = self.how
305 if how == "rank":
306 out_dtype = "float64"
307 else:
308 if is_numeric_dtype(dtype):
309 out_dtype = f"{dtype.kind}{dtype.itemsize}"
310 else:
311 out_dtype = "object"
312 return np.dtype(out_dtype)
314 def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
315 """
316 Get the desired dtype of a result based on the
317 input dtype and how it was computed.
319 Parameters
320 ----------
321 dtype : np.dtype
323 Returns
324 -------
325 np.dtype
326 The desired dtype of the result.
327 """
328 how = self.how
330 if how in ["sum", "cumsum", "sum", "prod"]:
331 if dtype == np.dtype(bool):
332 return np.dtype(np.int64)
333 elif how in ["mean", "median", "var"]:
334 if is_float_dtype(dtype) or is_complex_dtype(dtype):
335 return dtype
336 elif is_numeric_dtype(dtype):
337 return np.dtype(np.float64)
338 return dtype
340 def uses_mask(self) -> bool:
341 return self.how in self._MASKED_CYTHON_FUNCTIONS
    @final
    def _ea_wrap_cython_operation(
        self,
        values: ExtensionArray,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        **kwargs,
    ) -> ArrayLike:
        """
        If we have an ExtensionArray, unwrap, call _cython_operation, and
        re-wrap if appropriate.
        """
        if isinstance(values, BaseMaskedArray) and self.uses_mask():
            # masked arrays get the optimized mask-aware path
            return self._masked_ea_wrap_cython_operation(
                values,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                **kwargs,
            )

        elif isinstance(values, Categorical) and self.uses_mask():
            assert self.how == "rank"  # the only one implemented ATM
            assert values.ordered  # checked earlier
            # operate on the integer codes, masking out NA positions
            mask = values.isna()
            npvalues = values._ndarray

            res_values = self._cython_op_ndim_compat(
                npvalues,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                mask=mask,
                **kwargs,
            )

            # If we ever have more than just "rank" here, we'll need to do
            #  `if self.how in self.cast_blocklist` like we do for other dtypes.
            return res_values

        # generic path: unwrap to an ndarray the kernels understand
        npvalues = self._ea_to_cython_values(values)

        res_values = self._cython_op_ndim_compat(
            npvalues,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=None,
            **kwargs,
        )

        if self.how in self.cast_blocklist:
            # i.e. how in ["rank"], since other cast_blocklist methods dont go
            #  through cython_operation
            return res_values

        return self._reconstruct_ea_result(values, res_values)
402 # TODO: general case implementation overridable by EAs.
403 def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray:
404 # GH#43682
405 if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)):
406 # All of the functions implemented here are ordinal, so we can
407 # operate on the tz-naive equivalents
408 npvalues = values._ndarray.view("M8[ns]")
409 elif isinstance(values.dtype, (BooleanDtype, IntegerDtype)):
410 # IntegerArray or BooleanArray
411 npvalues = values.to_numpy("float64", na_value=np.nan)
412 elif isinstance(values.dtype, FloatingDtype):
413 # FloatingArray
414 npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
415 elif isinstance(values.dtype, StringDtype):
416 # StringArray
417 npvalues = values.to_numpy(object, na_value=np.nan)
418 else:
419 raise NotImplementedError(
420 f"function is not implemented for this dtype: {values.dtype}"
421 )
422 return npvalues
    # TODO: general case implementation overridable by EAs.
    def _reconstruct_ea_result(
        self, values: ExtensionArray, res_values: np.ndarray
    ) -> ExtensionArray:
        """
        Construct an ExtensionArray result from an ndarray result.
        """
        dtype: BaseMaskedDtype | StringDtype

        if isinstance(values.dtype, StringDtype):
            # rebuild with the same string dtype
            dtype = values.dtype
            string_array_cls = dtype.construct_array_type()
            return string_array_cls._from_sequence(res_values, dtype=dtype)

        elif isinstance(values.dtype, BaseMaskedDtype):
            # masked EA: dtype may have changed (e.g. bool sum -> Int64)
            new_dtype = self._get_result_dtype(values.dtype.numpy_dtype)
            dtype = BaseMaskedDtype.from_numpy_dtype(new_dtype)
            masked_array_cls = dtype.construct_array_type()
            return masked_array_cls._from_sequence(res_values, dtype=dtype)

        elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)):
            # In to_cython_values we took a view as M8[ns]
            assert res_values.dtype == "M8[ns]"
            res_values = res_values.view(values._ndarray.dtype)
            return values._from_backing_data(res_values)

        raise NotImplementedError

    @final
    def _masked_ea_wrap_cython_operation(
        self,
        values: BaseMaskedArray,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        **kwargs,
    ) -> BaseMaskedArray:
        """
        Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's
        and cython algorithms which accept a mask.
        """
        orig_values = values

        # libgroupby functions are responsible for NOT altering mask
        mask = values._mask
        if self.kind != "aggregate":
            # transforms are length-preserving: start from a copy of the mask
            result_mask = mask.copy()
        else:
            # aggregations: one result slot per group, initially unmasked
            result_mask = np.zeros(ngroups, dtype=bool)

        arr = values._data

        res_values = self._cython_op_ndim_compat(
            arr,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=mask,
            result_mask=result_mask,
            **kwargs,
        )

        if self.how == "ohlc":
            # broadcast the per-group mask across the 4 ohlc columns
            result_mask = np.tile(result_mask, (4, 1)).T

        # res_values should already have the correct dtype, we just need to
        #  wrap in a MaskedArray
        return orig_values._maybe_mask_result(res_values, result_mask)
    @final
    def _cython_op_ndim_compat(
        self,
        values: np.ndarray,
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: npt.NDArray[np.bool_] | None = None,
        result_mask: npt.NDArray[np.bool_] | None = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Dispatch to _call_cython_op, promoting 1D inputs to 2D and squeezing
        the result back down when appropriate.
        """
        if values.ndim == 1:
            # expand to 2d, dispatch, then squeeze if appropriate
            values2d = values[None, :]
            if mask is not None:
                mask = mask[None, :]
            if result_mask is not None:
                result_mask = result_mask[None, :]
            res = self._call_cython_op(
                values2d,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                mask=mask,
                result_mask=result_mask,
                **kwargs,
            )
            if res.shape[0] == 1:
                return res[0]

            # otherwise we have OHLC
            return res.T

        return self._call_cython_op(
            values,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=mask,
            result_mask=result_mask,
            **kwargs,
        )

    @final
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: npt.NDArray[np.bool_] | None,
        result_mask: npt.NDArray[np.bool_] | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        """
        Invoke the selected libgroupby kernel on 2D values and post-process
        the result (min_count handling, transpose back, dtype restoration).
        """
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            # operate on the i8 view; dtype is restored via casting below
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.view("uint8")
        if values.dtype == "float16":
            # no float16 kernels exist; upcast
            values = values.astype(np.float32)

        # kernels expect the transposed layout
        values = values.T
        if mask is not None:
            mask = mask.T
        if result_mask is not None:
            result_mask = result_mask.T

        out_shape = self._get_output_shape(ngroups, values)
        func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric)
        values = self._get_cython_vals(values)
        out_dtype = self._get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            # NOTE: each branch below matches a distinct kernel signature
            if self.how in ["min", "max", "mean", "last", "first"]:
                func(
                    out=result,
                    counts=counts,
                    values=values,
                    labels=comp_ids,
                    min_count=min_count,
                    mask=mask,
                    result_mask=result_mask,
                    is_datetimelike=is_datetimelike,
                )
            elif self.how in ["sum"]:
                # We support datetimelike
                func(
                    out=result,
                    counts=counts,
                    values=values,
                    labels=comp_ids,
                    mask=mask,
                    result_mask=result_mask,
                    min_count=min_count,
                    is_datetimelike=is_datetimelike,
                )
            elif self.how in ["ohlc", "prod"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count=min_count,
                    mask=mask,
                    result_mask=result_mask,
                )
            else:
                func(result, counts, values, comp_ids, min_count, **kwargs)
        else:
            # TODO: min_count
            if self.uses_mask():
                if self.how != "rank":
                    # TODO: should rank take result_mask?
                    kwargs["result_mask"] = result_mask
                func(
                    out=result,
                    values=values,
                    labels=comp_ids,
                    ngroups=ngroups,
                    is_datetimelike=is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(
                    out=result,
                    values=values,
                    labels=comp_ids,
                    ngroups=ngroups,
                    is_datetimelike=is_datetimelike,
                    **kwargs,
                )

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                # if the op keeps the int dtypes, we have to use 0
                cutoff = max(0 if self.how in ["sum", "prod"] else 1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    if result_mask is not None and self.uses_mask():
                        # masked path: the mask must already flag these slots
                        assert result_mask[empty_groups].all()
                    else:
                        # Note: this conversion could be lossy, see GH#40767
                        result = result.astype("float64")
                        result[empty_groups] = np.nan

        # undo the transpose applied before calling the kernel
        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            # Casting only needed for float16, bool, datetimelike,
            #  and self.how in ["sum", "prod", "ohlc", "cumprod"]
            res_dtype = self._get_result_dtype(orig_values.dtype)
            op_result = maybe_downcast_to_dtype(result, res_dtype)
        else:
            op_result = result

        return op_result
    @final
    def cython_operation(
        self,
        *,
        values: ArrayLike,
        axis: int,
        min_count: int = -1,
        comp_ids: np.ndarray,
        ngroups: int,
        **kwargs,
    ) -> ArrayLike:
        """
        Call our cython function, with appropriate pre- and post- processing.
        """
        if values.ndim > 2:
            raise NotImplementedError("number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            assert axis == 1, axis
        elif not is_1d_only_ea_dtype(values.dtype):
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 0

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(dtype, is_numeric)

        if not isinstance(values, np.ndarray):
            # i.e. ExtensionArray
            return self._ea_wrap_cython_operation(
                values,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                **kwargs,
            )

        # plain ndarray path
        return self._cython_op_ndim_compat(
            values,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=None,
            **kwargs,
        )
class BaseGrouper:
    """
    This is an internal Grouper class, which actually holds
    the generated groups

    Parameters
    ----------
    axis : Index
    groupings : Sequence[Grouping]
        all the grouping instances to handle in this grouper
        for example for grouper list to groupby, need to pass the list
    sort : bool, default True
        whether this grouper will give sorted result or not
    group_keys : bool, default True
    mutated : bool, default False
    indexer : np.ndarray[np.intp], optional
        the indexer created by Grouper
        some groupers (TimeGrouper) will sort its axis and its
        group_info is also sorted, so need the indexer to reorder

    """

    axis: Index

    def __init__(
        self,
        axis: Index,
        groupings: Sequence[grouper.Grouping],
        sort: bool = True,
        group_keys: bool = True,
        mutated: bool = False,
        indexer: npt.NDArray[np.intp] | None = None,
        dropna: bool = True,
    ) -> None:
        assert isinstance(axis, Index), axis

        self.axis = axis
        # private copy of the groupings; exposed read-only via `groupings`
        self._groupings: list[grouper.Grouping] = list(groupings)
        self._sort = sort
        self.group_keys = group_keys
        self.mutated = mutated
        self.indexer = indexer
        self.dropna = dropna
762 @property
763 def groupings(self) -> list[grouper.Grouping]:
764 return self._groupings
766 @property
767 def shape(self) -> Shape:
768 return tuple(ping.ngroups for ping in self.groupings)
770 def __iter__(self) -> Iterator[Hashable]:
771 return iter(self.indices)
773 @property
774 def nkeys(self) -> int:
775 return len(self.groupings)
777 def get_iterator(
778 self, data: NDFrameT, axis: int = 0
779 ) -> Iterator[tuple[Hashable, NDFrameT]]:
780 """
781 Groupby iterator
783 Returns
784 -------
785 Generator yielding sequence of (name, subsetted object)
786 for each group
787 """
788 splitter = self._get_splitter(data, axis=axis)
789 keys = self.group_keys_seq
790 yield from zip(keys, splitter)
792 @final
793 def _get_splitter(self, data: NDFrame, axis: int = 0) -> DataSplitter:
794 """
795 Returns
796 -------
797 Generator yielding subsetted objects
798 """
799 ids, _, ngroups = self.group_info
800 return get_splitter(data, ids, ngroups, axis=axis)
802 def _get_grouper(self):
803 """
804 We are a grouper as part of another's groupings.
806 We have a specific method of grouping, so cannot
807 convert to a Index for our grouper.
808 """
809 return self.groupings[0].grouping_vector
811 @final
812 @cache_readonly
813 def group_keys_seq(self):
814 if len(self.groupings) == 1:
815 return self.levels[0]
816 else:
817 ids, _, ngroups = self.group_info
819 # provide "flattened" iterator for multi-group setting
820 return get_flattened_list(ids, ngroups, self.levels, self.codes)
    @final
    def apply(
        self, f: Callable, data: DataFrame | Series, axis: int = 0
    ) -> tuple[list, bool]:
        """
        Apply ``f`` to each group of ``data`` split along ``axis``.

        Returns the list of per-group results and a flag indicating whether
        any result's axes no longer match the input group's ("mutated").
        """
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self.group_keys_seq
        result_values = []

        # This calls DataSplitter.__iter__
        zipped = zip(group_keys, splitter)

        for key, group in zipped:
            # attach the group key as the subset's name for f to see
            object.__setattr__(group, "name", key)

            # group might be modified
            group_axes = group.axes
            res = f(group)
            if not mutated and not _is_indexed_like(res, group_axes, axis):
                mutated = True
            result_values.append(res)

        # getattr pattern for __name__ is needed for functools.partial objects
        if len(group_keys) == 0 and getattr(f, "__name__", None) in [
            "mad",
            "skew",
            "sum",
            "prod",
        ]:
            # If group_keys is empty, then no function calls have been made,
            #  so we will not have raised even if this is an invalid dtype.
            #  So do one dummy call here to raise appropriate TypeError.
            f(data.iloc[:0])

        return result_values, mutated
857 @cache_readonly
858 def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
859 """dict {group name -> group indices}"""
860 if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex):
861 # This shows unused categories in indices GH#38642
862 return self.groupings[0].indices
863 codes_list = [ping.codes for ping in self.groupings]
864 keys = [ping.group_index for ping in self.groupings]
865 return get_indexer_dict(codes_list, keys)
    @final
    def result_ilocs(self) -> npt.NDArray[np.intp]:
        """
        Get the original integer locations of result_index in the input.
        """
        # Original indices are where group_index would go via sorting.
        # But when dropna is true, we need to remove null values while accounting for
        #  any gaps that then occur because of them.
        group_index = get_group_index(
            self.codes, self.shape, sort=self._sort, xnull=True
        )
        group_index, _ = compress_group_index(group_index, sort=self._sort)

        if self.has_dropped_na:
            # keep only non-null rows ...
            mask = np.where(group_index >= 0)
            # Count how many gaps are caused by previous null values for each position
            null_gaps = np.cumsum(group_index == -1)[mask]
            group_index = group_index[mask]

        result = get_group_index_sorter(group_index, self.ngroups)

        if self.has_dropped_na:
            # Shift by the number of prior null gaps
            result += np.take(null_gaps, result)

        return result
894 @final
895 @property
896 def codes(self) -> list[npt.NDArray[np.signedinteger]]:
897 return [ping.codes for ping in self.groupings]
899 @property
900 def levels(self) -> list[Index]:
901 return [ping.group_index for ping in self.groupings]
903 @property
904 def names(self) -> list[Hashable]:
905 return [ping.name for ping in self.groupings]
907 @final
908 def size(self) -> Series:
909 """
910 Compute group sizes.
911 """
912 ids, _, ngroups = self.group_info
913 out: np.ndarray | list
914 if ngroups:
915 out = np.bincount(ids[ids != -1], minlength=ngroups)
916 else:
917 out = []
918 return Series(out, index=self.result_index, dtype="int64")
920 @cache_readonly
921 def groups(self) -> dict[Hashable, np.ndarray]:
922 """dict {group name -> group labels}"""
923 if len(self.groupings) == 1:
924 return self.groupings[0].groups
925 else:
926 to_groupby = zip(*(ping.grouping_vector for ping in self.groupings))
927 index = Index(to_groupby)
928 return self.axis.groupby(index)
930 @final
931 @cache_readonly
932 def is_monotonic(self) -> bool:
933 # return if my group orderings are monotonic
934 return Index(self.group_info[0]).is_monotonic_increasing
936 @final
937 @cache_readonly
938 def has_dropped_na(self) -> bool:
939 """
940 Whether grouper has null value(s) that are dropped.
941 """
942 return bool((self.group_info[0] < 0).any())
944 @cache_readonly
945 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
946 comp_ids, obs_group_ids = self._get_compressed_codes()
948 ngroups = len(obs_group_ids)
949 comp_ids = ensure_platform_int(comp_ids)
951 return comp_ids, obs_group_ids, ngroups
953 @final
954 @cache_readonly
955 def codes_info(self) -> npt.NDArray[np.intp]:
956 # return the codes of items in original grouped axis
957 ids, _, _ = self.group_info
958 if self.indexer is not None:
959 sorter = np.lexsort((ids, self.indexer))
960 ids = ids[sorter]
961 ids = ensure_platform_int(ids)
962 # TODO: if numpy annotates np.lexsort, this ensure_platform_int
963 # may become unnecessary
964 return ids
966 @final
967 def _get_compressed_codes(
968 self,
969 ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]:
970 # The first returned ndarray may have any signed integer dtype
971 if len(self.groupings) > 1:
972 group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True)
973 return compress_group_index(group_index, sort=self._sort)
974 # FIXME: compress_group_index's second return value is int64, not intp
976 ping = self.groupings[0]
977 return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
979 @final
980 @cache_readonly
981 def ngroups(self) -> int:
982 return len(self.result_index)
984 @property
985 def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
986 codes = self.codes
987 ids, obs_ids, _ = self.group_info
988 return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
990 @cache_readonly
991 def result_index(self) -> Index:
992 if len(self.groupings) == 1:
993 return self.groupings[0].result_index.rename(self.names[0])
995 codes = self.reconstructed_codes
996 levels = [ping.result_index for ping in self.groupings]
997 return MultiIndex(
998 levels=levels, codes=codes, verify_integrity=False, names=self.names
999 )
1001 @final
1002 def get_group_levels(self) -> list[ArrayLike]:
1003 # Note: only called from _insert_inaxis_grouper_inplace, which
1004 # is only called for BaseGrouper, never for BinGrouper
1005 if len(self.groupings) == 1:
1006 return [self.groupings[0].group_arraylike]
1008 name_list = []
1009 for ping, codes in zip(self.groupings, self.reconstructed_codes):
1010 codes = ensure_platform_int(codes)
1011 levels = ping.group_arraylike.take(codes)
1013 name_list.append(levels)
1015 return name_list
    # ------------------------------------------------------------
    # Aggregation functions

    @final
    def _cython_operation(
        self,
        kind: str,
        values,
        how: str,
        axis: int,
        min_count: int = -1,
        **kwargs,
    ) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        assert kind in ["transform", "aggregate"]

        cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na)

        ids, _, _ = self.group_info
        ngroups = self.ngroups
        return cy_op.cython_operation(
            values=values,
            axis=axis,
            min_count=min_count,
            comp_ids=ids,
            ngroups=ngroups,
            **kwargs,
        )

    @final
    def agg_series(
        self, obj: Series, func: Callable, preserve_dtype: bool = False
    ) -> ArrayLike:
        """
        Parameters
        ----------
        obj : Series
        func : function taking a Series and returning a scalar-like
        preserve_dtype : bool
            Whether the aggregation is known to be dtype-preserving.

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        # test_groupby_empty_with_category gets here with self.ngroups == 0
        #  and len(obj) > 0

        if len(obj) == 0:
            # SeriesGrouper would raise if we were to call _aggregate_series_fast
            result = self._aggregate_series_pure_python(obj, func)

        elif not isinstance(obj._values, np.ndarray):
            result = self._aggregate_series_pure_python(obj, func)

            # we can preserve a little bit more aggressively with EA dtype
            #  because maybe_cast_pointwise_result will do a try/except
            #  with _from_sequence.  NB we are assuming here that _from_sequence
            #  is sufficiently strict that it casts appropriately.
            preserve_dtype = True

        else:
            result = self._aggregate_series_pure_python(obj, func)

        # result is object dtype at this point; convert what we can
        npvalues = lib.maybe_convert_objects(result, try_float=False)
        if preserve_dtype:
            out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
        else:
            out = npvalues
        return out

    @final
    def _aggregate_series_pure_python(
        self, obj: Series, func: Callable
    ) -> npt.NDArray[np.object_]:
        """Apply ``func`` group-by-group in Python; returns object-dtype results."""
        ids, _, ngroups = self.group_info

        counts = np.zeros(ngroups, dtype=int)
        result = np.empty(ngroups, dtype="O")
        initialized = False

        # equiv: splitter = self._get_splitter(obj, axis=0)
        splitter = get_splitter(obj, ids, ngroups, axis=0)

        for i, group in enumerate(splitter):
            res = func(group)
            res = libreduction.extract_result(res)

            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(res, group.dtype)
                initialized = True

            counts[i] = group.shape[0]
            result[i] = res

        return result
class BinGrouper(BaseGrouper):
    """
    This is an internal Grouper class

    Parameters
    ----------
    bins : the split index of binlabels to group the item of axis
    binlabels : the label list
    mutated : bool, default False
    indexer : np.ndarray[np.intp]

    Examples
    --------
    bins: [2, 4, 6, 8, 10]
    binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
        '2005-01-05', '2005-01-07', '2005-01-09'],
        dtype='datetime64[ns]', freq='2D')

    the group_info, which contains the label of each item in grouped
    axis, the index of label in label list, group number, is

    (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)

    means that, the grouped axis has 10 items, can be grouped into 5
    labels, the first and second items belong to the first label, the
    third and forth items belong to the second label, and so on

    """

    bins: npt.NDArray[np.int64]
    binlabels: Index
    mutated: bool

    def __init__(
        self,
        bins,
        binlabels,
        mutated: bool = False,
        indexer=None,
    ) -> None:
        self.bins = ensure_int64(bins)
        self.binlabels = ensure_index(binlabels)
        self.mutated = mutated
        self.indexer = indexer

        # These lengths must match, otherwise we could call agg_series
        #  with empty self.bins, which would raise in libreduction.
        assert len(self.binlabels) == len(self.bins)
1167 @cache_readonly
1168 def groups(self):
1169 """dict {group name -> group labels}"""
1170 # this is mainly for compat
1171 # GH 3881
1172 result = {
1173 key: value
1174 for key, value in zip(self.binlabels, self.bins)
1175 if key is not NaT
1176 }
1177 return result
1179 @property
1180 def nkeys(self) -> int:
1181 # still matches len(self.groupings), but we can hard-code
1182 return 1
1184 def _get_grouper(self):
1185 """
1186 We are a grouper as part of another's groupings.
1188 We have a specific method of grouping, so cannot
1189 convert to a Index for our grouper.
1190 """
1191 return self
1193 def get_iterator(self, data: NDFrame, axis: int = 0):
1194 """
1195 Groupby iterator
1197 Returns
1198 -------
1199 Generator yielding sequence of (name, subsetted object)
1200 for each group
1201 """
1202 if axis == 0:
1203 slicer = lambda start, edge: data.iloc[start:edge]
1204 else:
1205 slicer = lambda start, edge: data.iloc[:, start:edge]
1207 length = len(data.axes[axis])
1209 start = 0
1210 for edge, label in zip(self.bins, self.binlabels):
1211 if label is not NaT:
1212 yield label, slicer(start, edge)
1213 start = edge
1215 if start < length:
1216 yield self.binlabels[-1], slicer(start, None)
1218 @cache_readonly
1219 def indices(self):
1220 indices = collections.defaultdict(list)
1222 i = 0
1223 for label, bin in zip(self.binlabels, self.bins):
1224 if i < bin:
1225 if label is not NaT:
1226 indices[label] = list(range(i, bin))
1227 i = bin
1228 return indices
1230 @cache_readonly
1231 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
1232 ngroups = self.ngroups
1233 obs_group_ids = np.arange(ngroups, dtype=np.intp)
1234 rep = np.diff(np.r_[0, self.bins])
1236 rep = ensure_platform_int(rep)
1237 if ngroups == len(self.bins):
1238 comp_ids = np.repeat(np.arange(ngroups), rep)
1239 else:
1240 comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)
1242 return (
1243 ensure_platform_int(comp_ids),
1244 obs_group_ids,
1245 ngroups,
1246 )
1248 @cache_readonly
1249 def reconstructed_codes(self) -> list[np.ndarray]:
1250 # get unique result indices, and prepend 0 as groupby starts from the first
1251 return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]]
1253 @cache_readonly
1254 def result_index(self) -> Index:
1255 if len(self.binlabels) != 0 and isna(self.binlabels[0]):
1256 return self.binlabels[1:]
1258 return self.binlabels
1260 @property
1261 def levels(self) -> list[Index]:
1262 return [self.binlabels]
1264 @property
1265 def names(self) -> list[Hashable]:
1266 return [self.binlabels.name]
1268 @property
1269 def groupings(self) -> list[grouper.Grouping]:
1270 lev = self.binlabels
1271 ping = grouper.Grouping(lev, lev, in_axis=False, level=None)
1272 return [ping]
1274 def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:
1275 # -> np.ndarray[object]
1276 raise NotImplementedError(
1277 "This should not be reached; use _aggregate_series_pure_python"
1278 )
1281def _is_indexed_like(obj, axes, axis: int) -> bool:
1282 if isinstance(obj, Series):
1283 if len(axes) > 1:
1284 return False
1285 return obj.axes[axis].equals(axes[axis])
1286 elif isinstance(obj, DataFrame):
1287 return obj.axes[axis].equals(axes[axis])
1289 return False
1292# ----------------------------------------------------------------------
1293# Splitting / application
class DataSplitter(Generic[NDFrameT]):
    """
    Iterate over ``data`` one group at a time.

    ``labels`` assigns an integer group code to each row along ``axis``;
    iteration yields one subsetted object per code 0..ngroups-1, produced
    through the ``_chop`` hook that concrete subclasses implement.
    """

    def __init__(
        self,
        data: NDFrameT,
        labels: npt.NDArray[np.intp],
        ngroups: int,
        axis: int = 0,
    ) -> None:
        assert isinstance(axis, int), axis

        self.data = data
        # _should_ already be np.intp; coerce defensively
        self.labels = ensure_platform_int(labels)
        self.ngroups = ngroups
        self.axis = axis

    @cache_readonly
    def _sort_idx(self) -> npt.NDArray[np.intp]:
        # Counting-sort indexer that brings equal labels together.
        return get_group_index_sorter(self.labels, self.ngroups)

    @cache_readonly
    def slabels(self) -> npt.NDArray[np.intp]:
        # Labels rearranged into sorted (grouped) order.
        return self.labels.take(self._sort_idx)

    @cache_readonly
    def sorted_data(self) -> NDFrameT:
        # ``data`` rearranged so each group occupies one contiguous slice.
        return self.data.take(self._sort_idx, axis=self.axis)

    def __iter__(self):
        sdata = self.sorted_data

        if self.ngroups == 0:
            # Inside a generator a bare return signals exhaustion; no need
            # to raise StopIteration explicitly.
            return

        starts, ends = lib.generate_slices(self.slabels, self.ngroups)
        for begin, stop in zip(starts, ends):
            yield self._chop(sdata, slice(begin, stop))

    def _chop(self, sdata, slice_obj: slice) -> NDFrame:
        # Subclass hook: return ``sdata`` restricted to ``slice_obj``.
        raise AbstractMethodError(self)
class SeriesSplitter(DataSplitter):
    def _chop(self, sdata: Series, slice_obj: slice) -> Series:
        # Manager-level fastpath for ``sdata.iloc[slice_obj]``: slice the
        # manager directly and rebuild the Series around it.
        sliced_mgr = sdata._mgr.get_slice(slice_obj)
        out = sdata._constructor(sliced_mgr, name=sdata.name, fastpath=True)
        return out.__finalize__(sdata, method="groupby")
class FrameSplitter(DataSplitter):
    def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
        # Manager-level fastpath equivalent to:
        #   sdata.iloc[slice_obj]      when self.axis == 0
        #   sdata.iloc[:, slice_obj]   when self.axis == 1
        # note the manager axis passed is ``1 - self.axis``.
        sliced_mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis)
        out = sdata._constructor(sliced_mgr)
        return out.__finalize__(sdata, method="groupby")
def get_splitter(
    data: NDFrame, labels: np.ndarray, ngroups: int, axis: int = 0
) -> DataSplitter:
    """
    Construct the DataSplitter subclass appropriate for ``data``:
    SeriesSplitter for a Series, FrameSplitter otherwise (i.e. DataFrame).
    """
    splitter_cls: type[DataSplitter]
    splitter_cls = SeriesSplitter if isinstance(data, Series) else FrameSplitter
    return splitter_cls(data, labels, ngroups, axis)