Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/ops.py: 22%

560 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Provide classes to perform the groupby aggregate operations. 

3 

4These are not exposed to the user and provide implementations of the grouping 

5operations, primarily in cython. These classes (BaseGrouper and BinGrouper) 

6are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. 

7""" 

8from __future__ import annotations 

9 

10import collections 

11import functools 

12from typing import ( 

13 TYPE_CHECKING, 

14 Callable, 

15 Generic, 

16 Hashable, 

17 Iterator, 

18 NoReturn, 

19 Sequence, 

20 final, 

21) 

22 

23import numpy as np 

24 

25from pandas._libs import ( 

26 NaT, 

27 lib, 

28) 

29import pandas._libs.groupby as libgroupby 

30import pandas._libs.reduction as libreduction 

31from pandas._typing import ( 

32 ArrayLike, 

33 DtypeObj, 

34 NDFrameT, 

35 Shape, 

36 npt, 

37) 

38from pandas.errors import AbstractMethodError 

39from pandas.util._decorators import cache_readonly 

40 

41from pandas.core.dtypes.cast import ( 

42 maybe_cast_pointwise_result, 

43 maybe_downcast_to_dtype, 

44) 

45from pandas.core.dtypes.common import ( 

46 ensure_float64, 

47 ensure_int64, 

48 ensure_platform_int, 

49 ensure_uint64, 

50 is_1d_only_ea_dtype, 

51 is_bool_dtype, 

52 is_complex_dtype, 

53 is_datetime64_any_dtype, 

54 is_float_dtype, 

55 is_integer_dtype, 

56 is_numeric_dtype, 

57 is_sparse, 

58 is_timedelta64_dtype, 

59 needs_i8_conversion, 

60) 

61from pandas.core.dtypes.dtypes import CategoricalDtype 

62from pandas.core.dtypes.missing import ( 

63 isna, 

64 maybe_fill, 

65) 

66 

67from pandas.core.arrays import ( 

68 Categorical, 

69 DatetimeArray, 

70 ExtensionArray, 

71 PeriodArray, 

72 TimedeltaArray, 

73) 

74from pandas.core.arrays.boolean import BooleanDtype 

75from pandas.core.arrays.floating import FloatingDtype 

76from pandas.core.arrays.integer import IntegerDtype 

77from pandas.core.arrays.masked import ( 

78 BaseMaskedArray, 

79 BaseMaskedDtype, 

80) 

81from pandas.core.arrays.string_ import StringDtype 

82from pandas.core.frame import DataFrame 

83from pandas.core.groupby import grouper 

84from pandas.core.indexes.api import ( 

85 CategoricalIndex, 

86 Index, 

87 MultiIndex, 

88 ensure_index, 

89) 

90from pandas.core.series import Series 

91from pandas.core.sorting import ( 

92 compress_group_index, 

93 decons_obs_group_ids, 

94 get_flattened_list, 

95 get_group_index, 

96 get_group_index_sorter, 

97 get_indexer_dict, 

98) 

99 

100if TYPE_CHECKING:  # coverage: this branch never ran (TYPE_CHECKING is False at runtime)

101 from pandas.core.generic import NDFrame 

102 

103 

104class WrappedCythonOp: 

105 """ 

106 Dispatch logic for functions defined in _libs.groupby 

107 

108 Parameters 

109 ---------- 

110 kind : str

111 Whether the operation is an aggregate or transform.

112 how : str

113 Operation name, e.g. "mean".

114 has_dropped_na : bool

115 True precisely when dropna=True and the grouper contains a null value. 

116 """ 

117 

118 # Functions for which we do _not_ attempt to cast the cython result 

119 # back to the original dtype. 

120 cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) 

121 

122 def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: 

123 self.kind = kind 

124 self.how = how 

125 self.has_dropped_na = has_dropped_na 

126 

127 _CYTHON_FUNCTIONS = { 

128 "aggregate": { 

129 "sum": "group_sum", 

130 "prod": "group_prod", 

131 "min": "group_min", 

132 "max": "group_max", 

133 "mean": "group_mean", 

134 "median": "group_median_float64", 

135 "var": "group_var", 

136 "first": "group_nth", 

137 "last": "group_last", 

138 "ohlc": "group_ohlc", 

139 }, 

140 "transform": { 

141 "cumprod": "group_cumprod_float64", 

142 "cumsum": "group_cumsum", 

143 "cummin": "group_cummin", 

144 "cummax": "group_cummax", 

145 "rank": "group_rank", 

146 }, 

147 } 

148 
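# A short sketch of the dispatch: a (kind, how) pair is looked up in the table
# above, then resolved to a cython kernel on libgroupby.
# >>> WrappedCythonOp._CYTHON_FUNCTIONS["aggregate"]["mean"]
# 'group_mean'
# >>> getattr(libgroupby, "group_mean")  # what _get_cython_function resolves
# ...                                    # to for numeric dtypes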

149 # "group_any" and "group_all" are also support masks, but don't go 

150 # through WrappedCythonOp 

151 _MASKED_CYTHON_FUNCTIONS = { 

152 "cummin", 

153 "cummax", 

154 "min", 

155 "max", 

156 "last", 

157 "first", 

158 "rank", 

159 "sum", 

160 "ohlc", 

161 "cumsum", 

162 "prod", 

163 } 

164 

165 _cython_arity = {"ohlc": 4} # OHLC 

166 

167 # Note: we make this a classmethod and pass kind+how so that caching 

168 # works at the class level and not the instance level 

169 @classmethod 

170 @functools.lru_cache(maxsize=None) 

171 def _get_cython_function( 

172 cls, kind: str, how: str, dtype: np.dtype, is_numeric: bool 

173 ): 

174 

175 dtype_str = dtype.name 

176 ftype = cls._CYTHON_FUNCTIONS[kind][how] 

177 

178 # see if there is a fused-type version of function 

179 # only valid for numeric 

180 f = getattr(libgroupby, ftype) 

181 if is_numeric: 

182 return f 

183 elif dtype == np.dtype(object): 

184 if how in ["median", "cumprod"]: 

185 # no fused types -> no __signatures__ 

186 raise NotImplementedError( 

187 f"function is not implemented for this dtype: " 

188 f"[how->{how},dtype->{dtype_str}]" 

189 ) 

190 elif "object" not in f.__signatures__: 

191 # raise NotImplementedError here rather than TypeError later 

192 raise NotImplementedError( 

193 f"function is not implemented for this dtype: " 

194 f"[how->{how},dtype->{dtype_str}]" 

195 ) 

196 return f 

197 else: 

198 raise NotImplementedError( 

199 "This should not be reached. Please report a bug at " 

200 "github.com/pandas-dev/pandas/", 

201 dtype, 

202 ) 

203 

204 def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: 

205 """ 

206 Cast numeric dtypes to float64 for functions that only support that. 

207 

208 Parameters 

209 ---------- 

210 values : np.ndarray 

211 

212 Returns 

213 ------- 

214 values : np.ndarray 

215 """ 

216 how = self.how 

217 

218 if how in ["median", "cumprod"]: 

219 # these two only have float64 implementations 

220 # We should only get here with is_numeric, as non-numeric cases 

221 # should raise in _get_cython_function 

222 values = ensure_float64(values) 

223 

224 elif values.dtype.kind in ["i", "u"]: 

225 if how in ["var", "mean"] or ( 

226 self.kind == "transform" and self.has_dropped_na 

227 ): 

228 # result may still include NaN, so we have to cast 

229 values = ensure_float64(values) 

230 

231 elif how in ["sum", "ohlc", "prod", "cumsum"]: 

232 # Avoid overflow during group op 

233 if values.dtype.kind == "i": 

234 values = ensure_int64(values) 

235 else: 

236 values = ensure_uint64(values) 

237 

238 return values 

239 
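# For example (assuming WrappedCythonOp as defined in this file):
# >>> op = WrappedCythonOp(kind="aggregate", how="sum", has_dropped_na=False)
# >>> op._get_cython_vals(np.array([1, 2, 3], dtype=np.int8)).dtype
# dtype('int64')
# >>> op = WrappedCythonOp(kind="aggregate", how="mean", has_dropped_na=False)
# >>> op._get_cython_vals(np.array([1, 2, 3], dtype=np.int64)).dtype
# dtype('float64')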

240 # TODO: general case implementation overridable by EAs. 

241 def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): 

242 """ 

243 Check if we can do this operation with our cython functions. 

244 

245 Raises 

246 ------ 

247 NotImplementedError 

248 This is either not a valid function for this dtype, or 

249 valid but not implemented in cython. 

250 """ 

251 how = self.how 

252 

253 if is_numeric: 

254 # never an invalid op for those dtypes, so return early as fastpath 

255 return 

256 

257 if isinstance(dtype, CategoricalDtype): 

258 # NotImplementedError for methods that can fall back to a 

259 # non-cython implementation. 

260 if how in ["sum", "prod", "cumsum", "cumprod"]: 

261 raise TypeError(f"{dtype} type does not support {how} operations") 

262 elif how not in ["rank"]: 

263 # only "rank" is implemented in cython 

264 raise NotImplementedError(f"{dtype} dtype not supported") 

265 elif not dtype.ordered: 

266 # TODO: TypeError? 

267 raise NotImplementedError(f"{dtype} dtype not supported") 

268 

269 elif is_sparse(dtype): 

270 # sparse arrays are not supported by

271 # the cython groupby algorithms

272 raise NotImplementedError(f"{dtype} dtype not supported") 

273 elif is_datetime64_any_dtype(dtype): 

274 # TODO: same for period_dtype? no for these methods with Period 

275 # we raise TypeError if this is an invalid operation

276 # entirely, e.g. adding datetimes

277 if how in ["sum", "prod", "cumsum", "cumprod"]: 

278 raise TypeError(f"datetime64 type does not support {how} operations") 

279 elif is_timedelta64_dtype(dtype): 

280 if how in ["prod", "cumprod"]: 

281 raise TypeError(f"timedelta64 type does not support {how} operations") 

282 
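# For example (assuming WrappedCythonOp as defined in this file):
# >>> op = WrappedCythonOp(kind="aggregate", how="sum", has_dropped_na=False)
# >>> op._disallow_invalid_ops(np.dtype("datetime64[ns]"))
# Traceback (most recent call last):
#   ...
# TypeError: datetime64 type does not support sum operations
# >>> op._disallow_invalid_ops(np.dtype("float64"), is_numeric=True)  # no-op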

283 def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: 

284 how = self.how 

285 kind = self.kind 

286 

287 arity = self._cython_arity.get(how, 1) 

288 

289 out_shape: Shape 

290 if how == "ohlc": 

291 out_shape = (ngroups, 4) 

292 elif arity > 1: 

293 raise NotImplementedError( 

294 "arity of more than 1 is not supported for the 'how' argument" 

295 ) 

296 elif kind == "transform": 

297 out_shape = values.shape 

298 else: 

299 out_shape = (ngroups,) + values.shape[1:] 

300 return out_shape 

301 
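# For example, "ohlc" aggregates to four columns per group, while transforms
# keep the input shape (a sketch, assuming 2D values):
# >>> agg = WrappedCythonOp(kind="aggregate", how="ohlc", has_dropped_na=False)
# >>> agg._get_output_shape(3, np.zeros((1, 6)))
# (3, 4)
# >>> tr = WrappedCythonOp(kind="transform", how="cumsum", has_dropped_na=False)
# >>> tr._get_output_shape(3, np.zeros((1, 6)))
# (1, 6)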

302 def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: 

303 how = self.how 

304 

305 if how == "rank": 

306 out_dtype = "float64" 

307 else: 

308 if is_numeric_dtype(dtype): 

309 out_dtype = f"{dtype.kind}{dtype.itemsize}" 

310 else: 

311 out_dtype = "object" 

312 return np.dtype(out_dtype) 

313 

314 def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: 

315 """ 

316 Get the desired dtype of a result based on the 

317 input dtype and how it was computed. 

318 

319 Parameters 

320 ---------- 

321 dtype : np.dtype 

322 

323 Returns 

324 ------- 

325 np.dtype 

326 The desired dtype of the result. 

327 """ 

328 how = self.how 

329 

330 if how in ["sum", "cumsum", "prod"]:

331 if dtype == np.dtype(bool): 

332 return np.dtype(np.int64) 

333 elif how in ["mean", "median", "var"]: 

334 if is_float_dtype(dtype) or is_complex_dtype(dtype): 

335 return dtype 

336 elif is_numeric_dtype(dtype): 

337 return np.dtype(np.float64) 

338 return dtype 

339 
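# For example, summing booleans counts them, and means of integers are floats:
# >>> WrappedCythonOp("aggregate", "sum", False)._get_result_dtype(np.dtype(bool))
# dtype('int64')
# >>> WrappedCythonOp("aggregate", "mean", False)._get_result_dtype(np.dtype(np.int32))
# dtype('float64')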

340 def uses_mask(self) -> bool: 

341 return self.how in self._MASKED_CYTHON_FUNCTIONS 

342 

343 @final 

344 def _ea_wrap_cython_operation( 

345 self, 

346 values: ExtensionArray, 

347 min_count: int, 

348 ngroups: int, 

349 comp_ids: np.ndarray, 

350 **kwargs, 

351 ) -> ArrayLike: 

352 """ 

353 If we have an ExtensionArray, unwrap, call _cython_operation, and 

354 re-wrap if appropriate. 

355 """ 

356 if isinstance(values, BaseMaskedArray) and self.uses_mask(): 

357 return self._masked_ea_wrap_cython_operation( 

358 values, 

359 min_count=min_count, 

360 ngroups=ngroups, 

361 comp_ids=comp_ids, 

362 **kwargs, 

363 ) 

364 

365 elif isinstance(values, Categorical) and self.uses_mask(): 

366 assert self.how == "rank" # the only one implemented ATM 

367 assert values.ordered # checked earlier 

368 mask = values.isna() 

369 npvalues = values._ndarray 

370 

371 res_values = self._cython_op_ndim_compat( 

372 npvalues, 

373 min_count=min_count, 

374 ngroups=ngroups, 

375 comp_ids=comp_ids, 

376 mask=mask, 

377 **kwargs, 

378 ) 

379 

380 # If we ever have more than just "rank" here, we'll need to do 

381 # `if self.how in self.cast_blocklist` like we do for other dtypes. 

382 return res_values 

383 

384 npvalues = self._ea_to_cython_values(values) 

385 

386 res_values = self._cython_op_ndim_compat( 

387 npvalues, 

388 min_count=min_count, 

389 ngroups=ngroups, 

390 comp_ids=comp_ids, 

391 mask=None, 

392 **kwargs, 

393 ) 

394 

395 if self.how in self.cast_blocklist: 

396 # i.e. how in ["rank"], since other cast_blocklist methods don't go

397 # through cython_operation 

398 return res_values 

399 

400 return self._reconstruct_ea_result(values, res_values) 

401 

402 # TODO: general case implementation overridable by EAs. 

403 def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray: 

404 # GH#43682 

405 if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)): 

406 # All of the functions implemented here are ordinal, so we can 

407 # operate on the tz-naive equivalents 

408 npvalues = values._ndarray.view("M8[ns]") 

409 elif isinstance(values.dtype, (BooleanDtype, IntegerDtype)): 

410 # IntegerArray or BooleanArray 

411 npvalues = values.to_numpy("float64", na_value=np.nan) 

412 elif isinstance(values.dtype, FloatingDtype): 

413 # FloatingArray 

414 npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) 

415 elif isinstance(values.dtype, StringDtype): 

416 # StringArray 

417 npvalues = values.to_numpy(object, na_value=np.nan) 

418 else: 

419 raise NotImplementedError( 

420 f"function is not implemented for this dtype: {values.dtype}" 

421 ) 

422 return npvalues 

423 
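# For example (assuming ``import pandas as pd`` for the nullable array):
# >>> arr = pd.array([1, 2, None], dtype="Int64")  # IntegerArray
# >>> op = WrappedCythonOp(kind="aggregate", how="mean", has_dropped_na=False)
# >>> op._ea_to_cython_values(arr)
# array([ 1.,  2., nan])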

424 # TODO: general case implementation overridable by EAs. 

425 def _reconstruct_ea_result( 

426 self, values: ExtensionArray, res_values: np.ndarray 

427 ) -> ExtensionArray: 

428 """ 

429 Construct an ExtensionArray result from an ndarray result. 

430 """ 

431 dtype: BaseMaskedDtype | StringDtype 

432 

433 if isinstance(values.dtype, StringDtype): 

434 dtype = values.dtype 

435 string_array_cls = dtype.construct_array_type() 

436 return string_array_cls._from_sequence(res_values, dtype=dtype) 

437 

438 elif isinstance(values.dtype, BaseMaskedDtype): 

439 new_dtype = self._get_result_dtype(values.dtype.numpy_dtype) 

440 dtype = BaseMaskedDtype.from_numpy_dtype(new_dtype) 

441 masked_array_cls = dtype.construct_array_type() 

442 return masked_array_cls._from_sequence(res_values, dtype=dtype) 

443 

444 elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)): 

445 # In _ea_to_cython_values we took a view as M8[ns]

446 assert res_values.dtype == "M8[ns]" 

447 res_values = res_values.view(values._ndarray.dtype) 

448 return values._from_backing_data(res_values) 

449 

450 raise NotImplementedError 

451 

452 @final 

453 def _masked_ea_wrap_cython_operation( 

454 self, 

455 values: BaseMaskedArray, 

456 min_count: int, 

457 ngroups: int, 

458 comp_ids: np.ndarray, 

459 **kwargs, 

460 ) -> BaseMaskedArray: 

461 """ 

462 Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's 

463 and cython algorithms which accept a mask. 

464 """ 

465 orig_values = values 

466 

467 # libgroupby functions are responsible for NOT altering mask 

468 mask = values._mask 

469 if self.kind != "aggregate": 

470 result_mask = mask.copy() 

471 else: 

472 result_mask = np.zeros(ngroups, dtype=bool) 

473 

474 arr = values._data 

475 

476 res_values = self._cython_op_ndim_compat( 

477 arr, 

478 min_count=min_count, 

479 ngroups=ngroups, 

480 comp_ids=comp_ids, 

481 mask=mask, 

482 result_mask=result_mask, 

483 **kwargs, 

484 ) 

485 

486 if self.how == "ohlc": 

487 result_mask = np.tile(result_mask, (4, 1)).T 

488 

489 # res_values should already have the correct dtype, we just need to 

490 # wrap in a MaskedArray 

491 return orig_values._maybe_mask_result(res_values, result_mask) 

492 

493 @final 

494 def _cython_op_ndim_compat( 

495 self, 

496 values: np.ndarray, 

497 *, 

498 min_count: int, 

499 ngroups: int, 

500 comp_ids: np.ndarray, 

501 mask: npt.NDArray[np.bool_] | None = None, 

502 result_mask: npt.NDArray[np.bool_] | None = None, 

503 **kwargs, 

504 ) -> np.ndarray: 

505 if values.ndim == 1: 

506 # expand to 2d, dispatch, then squeeze if appropriate 

507 values2d = values[None, :] 

508 if mask is not None: 

509 mask = mask[None, :] 

510 if result_mask is not None: 

511 result_mask = result_mask[None, :] 

512 res = self._call_cython_op( 

513 values2d, 

514 min_count=min_count, 

515 ngroups=ngroups, 

516 comp_ids=comp_ids, 

517 mask=mask, 

518 result_mask=result_mask, 

519 **kwargs, 

520 ) 

521 if res.shape[0] == 1: 

522 return res[0] 

523 

524 # otherwise we have OHLC 

525 return res.T 

526 

527 return self._call_cython_op( 

528 values, 

529 min_count=min_count, 

530 ngroups=ngroups, 

531 comp_ids=comp_ids, 

532 mask=mask, 

533 result_mask=result_mask, 

534 **kwargs, 

535 ) 

536 

537 @final 

538 def _call_cython_op( 

539 self, 

540 values: np.ndarray, # np.ndarray[ndim=2] 

541 *, 

542 min_count: int, 

543 ngroups: int, 

544 comp_ids: np.ndarray, 

545 mask: npt.NDArray[np.bool_] | None, 

546 result_mask: npt.NDArray[np.bool_] | None, 

547 **kwargs, 

548 ) -> np.ndarray: # np.ndarray[ndim=2] 

549 orig_values = values 

550 

551 dtype = values.dtype 

552 is_numeric = is_numeric_dtype(dtype) 

553 

554 is_datetimelike = needs_i8_conversion(dtype) 

555 

556 if is_datetimelike: 

557 values = values.view("int64") 

558 is_numeric = True 

559 elif is_bool_dtype(dtype): 

560 values = values.view("uint8") 

561 if values.dtype == "float16": 

562 values = values.astype(np.float32) 

563 

564 values = values.T 

565 if mask is not None: 

566 mask = mask.T 

567 if result_mask is not None: 

568 result_mask = result_mask.T 

569 

570 out_shape = self._get_output_shape(ngroups, values) 

571 func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric) 

572 values = self._get_cython_vals(values) 

573 out_dtype = self._get_out_dtype(values.dtype) 

574 

575 result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) 

576 if self.kind == "aggregate": 

577 counts = np.zeros(ngroups, dtype=np.int64) 

578 if self.how in ["min", "max", "mean", "last", "first"]: 

579 func( 

580 out=result, 

581 counts=counts, 

582 values=values, 

583 labels=comp_ids, 

584 min_count=min_count, 

585 mask=mask, 

586 result_mask=result_mask, 

587 is_datetimelike=is_datetimelike, 

588 ) 

589 elif self.how in ["sum"]: 

590 # We support datetimelike 

591 func( 

592 out=result, 

593 counts=counts, 

594 values=values, 

595 labels=comp_ids, 

596 mask=mask, 

597 result_mask=result_mask, 

598 min_count=min_count, 

599 is_datetimelike=is_datetimelike, 

600 ) 

601 elif self.how in ["ohlc", "prod"]: 

602 func( 

603 result, 

604 counts, 

605 values, 

606 comp_ids, 

607 min_count=min_count, 

608 mask=mask, 

609 result_mask=result_mask, 

610 ) 

611 else: 

612 func(result, counts, values, comp_ids, min_count, **kwargs) 

613 else: 

614 # TODO: min_count 

615 if self.uses_mask(): 

616 if self.how != "rank": 

617 # TODO: should rank take result_mask? 

618 kwargs["result_mask"] = result_mask 

619 func( 

620 out=result, 

621 values=values, 

622 labels=comp_ids, 

623 ngroups=ngroups, 

624 is_datetimelike=is_datetimelike, 

625 mask=mask, 

626 **kwargs, 

627 ) 

628 else: 

629 func( 

630 out=result, 

631 values=values, 

632 labels=comp_ids, 

633 ngroups=ngroups, 

634 is_datetimelike=is_datetimelike, 

635 **kwargs, 

636 ) 

637 

638 if self.kind == "aggregate": 

639 # i.e. counts is defined. Locations where count<min_count 

640 # need to have the result set to np.nan, which may require casting, 

641 # see GH#40767 

642 if is_integer_dtype(result.dtype) and not is_datetimelike: 

643 # if the op keeps the int dtypes, we have to use 0 

644 cutoff = max(0 if self.how in ["sum", "prod"] else 1, min_count) 

645 empty_groups = counts < cutoff 

646 if empty_groups.any(): 

647 if result_mask is not None and self.uses_mask(): 

648 assert result_mask[empty_groups].all() 

649 else: 

650 # Note: this conversion could be lossy, see GH#40767 

651 result = result.astype("float64") 

652 result[empty_groups] = np.nan 

653 

654 result = result.T 

655 

656 if self.how not in self.cast_blocklist: 

657 # e.g. if we are int64 and need to restore to datetime64/timedelta64 

658 # "rank" is the only member of cast_blocklist we get here 

659 # Casting only needed for float16, bool, datetimelike, 

660 # and self.how in ["sum", "prod", "ohlc", "cumprod"] 

661 res_dtype = self._get_result_dtype(orig_values.dtype) 

662 op_result = maybe_downcast_to_dtype(result, res_dtype) 

663 else: 

664 op_result = result 

665 

666 return op_result 

667 

668 @final 

669 def cython_operation( 

670 self, 

671 *, 

672 values: ArrayLike, 

673 axis: int, 

674 min_count: int = -1, 

675 comp_ids: np.ndarray, 

676 ngroups: int, 

677 **kwargs, 

678 ) -> ArrayLike: 

679 """ 

680 Call our cython function, with appropriate pre- and post- processing. 

681 """ 

682 if values.ndim > 2: 

683 raise NotImplementedError("number of dimensions is currently limited to 2") 

684 elif values.ndim == 2: 

685 assert axis == 1, axis 

686 elif not is_1d_only_ea_dtype(values.dtype): 

687 # Note: it is *not* the case that axis is always 0 for 1-dim values, 

688 # as we can have 1D ExtensionArrays that we need to treat as 2D 

689 assert axis == 0 

690 

691 dtype = values.dtype 

692 is_numeric = is_numeric_dtype(dtype) 

693 

694 # can we do this operation with our cython functions 

695 # if not raise NotImplementedError 

696 self._disallow_invalid_ops(dtype, is_numeric) 

697 

698 if not isinstance(values, np.ndarray): 

699 # i.e. ExtensionArray 

700 return self._ea_wrap_cython_operation( 

701 values, 

702 min_count=min_count, 

703 ngroups=ngroups, 

704 comp_ids=comp_ids, 

705 **kwargs, 

706 ) 

707 

708 return self._cython_op_ndim_compat( 

709 values, 

710 min_count=min_count, 

711 ngroups=ngroups, 

712 comp_ids=comp_ids, 

713 mask=None, 

714 **kwargs, 

715 ) 

716 
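# An end-to-end sketch of WrappedCythonOp, mirroring how
# BaseGrouper._cython_operation below calls it:
# >>> op = WrappedCythonOp(kind="aggregate", how="mean", has_dropped_na=False)
# >>> op.cython_operation(
# ...     values=np.array([1.0, 2.0, 3.0, 4.0]),
# ...     axis=0,
# ...     min_count=-1,
# ...     comp_ids=np.array([0, 0, 1, 1], dtype=np.intp),
# ...     ngroups=2,
# ... )
# array([1.5, 3.5])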

717 

718class BaseGrouper: 

719 """ 

720 This is an internal Grouper class, which actually holds 

721 the generated groups 

722 

723 Parameters 

724 ---------- 

725 axis : Index 

726 groupings : Sequence[Grouping] 

727 all the grouping instances to handle in this grouper

728 for example, when grouping by a list of groupers, pass that list

729 sort : bool, default True 

730 whether this grouper will give a sorted result or not

731 group_keys : bool, default True 

732 mutated : bool, default False 

733 indexer : np.ndarray[np.intp], optional 

734 the indexer created by Grouper 

735 some groupers (TimeGrouper) will sort their axis and their

736 group_info is also sorted, so the indexer is needed to reorder

737 

738 """ 

739 

740 axis: Index 

741 

742 def __init__( 

743 self, 

744 axis: Index, 

745 groupings: Sequence[grouper.Grouping], 

746 sort: bool = True, 

747 group_keys: bool = True, 

748 mutated: bool = False, 

749 indexer: npt.NDArray[np.intp] | None = None, 

750 dropna: bool = True, 

751 ) -> None: 

752 assert isinstance(axis, Index), axis 

753 

754 self.axis = axis 

755 self._groupings: list[grouper.Grouping] = list(groupings) 

756 self._sort = sort 

757 self.group_keys = group_keys 

758 self.mutated = mutated 

759 self.indexer = indexer 

760 self.dropna = dropna 

761 

762 @property 

763 def groupings(self) -> list[grouper.Grouping]: 

764 return self._groupings 

765 

766 @property 

767 def shape(self) -> Shape: 

768 return tuple(ping.ngroups for ping in self.groupings) 

769 

770 def __iter__(self) -> Iterator[Hashable]: 

771 return iter(self.indices) 

772 

773 @property 

774 def nkeys(self) -> int: 

775 return len(self.groupings) 

776 

777 def get_iterator( 

778 self, data: NDFrameT, axis: int = 0 

779 ) -> Iterator[tuple[Hashable, NDFrameT]]: 

780 """ 

781 Groupby iterator 

782 

783 Returns 

784 ------- 

785 Generator yielding sequence of (name, subsetted object) 

786 for each group 

787 """ 

788 splitter = self._get_splitter(data, axis=axis) 

789 keys = self.group_keys_seq 

790 yield from zip(keys, splitter) 

791 
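# Through the public API this is the iteration behind ``for name, group in gb``
# (assuming ``import pandas as pd``):
# >>> df = pd.DataFrame({"key": ["a", "b", "a"], "val": [1, 2, 3]})
# >>> for name, group in df.groupby("key"):
# ...     print(name, group["val"].tolist())
# a [1, 3]
# b [2]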

792 @final 

793 def _get_splitter(self, data: NDFrame, axis: int = 0) -> DataSplitter: 

794 """ 

795 Returns 

796 ------- 

797 Generator yielding subsetted objects 

798 """ 

799 ids, _, ngroups = self.group_info 

800 return get_splitter(data, ids, ngroups, axis=axis) 

801 

802 def _get_grouper(self): 

803 """ 

804 We are a grouper as part of another's groupings. 

805 

806 We have a specific method of grouping, so cannot 

807 convert to an Index for our grouper.

808 """ 

809 return self.groupings[0].grouping_vector 

810 

811 @final 

812 @cache_readonly 

813 def group_keys_seq(self): 

814 if len(self.groupings) == 1: 

815 return self.levels[0] 

816 else: 

817 ids, _, ngroups = self.group_info 

818 

819 # provide "flattened" iterator for multi-group setting 

820 return get_flattened_list(ids, ngroups, self.levels, self.codes) 

821 

822 @final 

823 def apply( 

824 self, f: Callable, data: DataFrame | Series, axis: int = 0 

825 ) -> tuple[list, bool]: 

826 mutated = self.mutated 

827 splitter = self._get_splitter(data, axis=axis) 

828 group_keys = self.group_keys_seq 

829 result_values = [] 

830 

831 # This calls DataSplitter.__iter__ 

832 zipped = zip(group_keys, splitter) 

833 

834 for key, group in zipped: 

835 object.__setattr__(group, "name", key) 

836 

837 # group might be modified 

838 group_axes = group.axes 

839 res = f(group) 

840 if not mutated and not _is_indexed_like(res, group_axes, axis): 

841 mutated = True 

842 result_values.append(res) 

843 # getattr pattern for __name__ is needed for functools.partial objects 

844 if len(group_keys) == 0 and getattr(f, "__name__", None) in [ 

845 "mad", 

846 "skew", 

847 "sum", 

848 "prod", 

849 ]: 

850 # If group_keys is empty, then no function calls have been made, 

851 # so we will not have raised even if this is an invalid dtype. 

852 # So do one dummy call here to raise appropriate TypeError. 

853 f(data.iloc[:0]) 

854 

855 return result_values, mutated 

856 

857 @cache_readonly 

858 def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: 

859 """dict {group name -> group indices}""" 

860 if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): 

861 # This shows unused categories in indices GH#38642 

862 return self.groupings[0].indices 

863 codes_list = [ping.codes for ping in self.groupings] 

864 keys = [ping.group_index for ping in self.groupings] 

865 return get_indexer_dict(codes_list, keys) 

866 

867 @final 

868 def result_ilocs(self) -> npt.NDArray[np.intp]: 

869 """ 

870 Get the original integer locations of result_index in the input. 

871 """ 

872 # Original indices are where group_index would go via sorting. 

873 # But when dropna is true, we need to remove null values while accounting for 

874 # any gaps that then occur because of them. 

875 group_index = get_group_index( 

876 self.codes, self.shape, sort=self._sort, xnull=True 

877 ) 

878 group_index, _ = compress_group_index(group_index, sort=self._sort) 

879 

880 if self.has_dropped_na: 

881 mask = np.where(group_index >= 0) 

882 # Count how many gaps are caused by previous null values for each position 

883 null_gaps = np.cumsum(group_index == -1)[mask] 

884 group_index = group_index[mask] 

885 

886 result = get_group_index_sorter(group_index, self.ngroups) 

887 

888 if self.has_dropped_na: 

889 # Shift by the number of prior null gaps 

890 result += np.take(null_gaps, result) 

891 

892 return result 

893 

894 @final 

895 @property 

896 def codes(self) -> list[npt.NDArray[np.signedinteger]]: 

897 return [ping.codes for ping in self.groupings] 

898 

899 @property 

900 def levels(self) -> list[Index]: 

901 return [ping.group_index for ping in self.groupings] 

902 

903 @property 

904 def names(self) -> list[Hashable]: 

905 return [ping.name for ping in self.groupings] 

906 

907 @final 

908 def size(self) -> Series: 

909 """ 

910 Compute group sizes. 

911 """ 

912 ids, _, ngroups = self.group_info 

913 out: np.ndarray | list 

914 if ngroups: 

915 out = np.bincount(ids[ids != -1], minlength=ngroups) 

916 else: 

917 out = [] 

918 return Series(out, index=self.result_index, dtype="int64") 

919 

920 @cache_readonly 

921 def groups(self) -> dict[Hashable, np.ndarray]: 

922 """dict {group name -> group labels}""" 

923 if len(self.groupings) == 1: 

924 return self.groupings[0].groups 

925 else: 

926 to_groupby = zip(*(ping.grouping_vector for ping in self.groupings)) 

927 index = Index(to_groupby) 

928 return self.axis.groupby(index) 

929 

930 @final 

931 @cache_readonly 

932 def is_monotonic(self) -> bool: 

933 # return whether my group orderings are monotonic

934 return Index(self.group_info[0]).is_monotonic_increasing 

935 

936 @final 

937 @cache_readonly 

938 def has_dropped_na(self) -> bool: 

939 """ 

940 Whether grouper has null value(s) that are dropped. 

941 """ 

942 return bool((self.group_info[0] < 0).any()) 

943 

944 @cache_readonly 

945 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: 

946 comp_ids, obs_group_ids = self._get_compressed_codes() 

947 

948 ngroups = len(obs_group_ids) 

949 comp_ids = ensure_platform_int(comp_ids) 

950 

951 return comp_ids, obs_group_ids, ngroups 

952 
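# For example (assuming pandas 1.5-era internals, where ``gb.grouper`` is this
# BaseGrouper):
# >>> import pandas as pd
# >>> ser = pd.Series([10, 20, 30], index=["a", "b", "a"])
# >>> ser.groupby(level=0).grouper.group_info
# (array([0, 1, 0]), array([0, 1]), 2)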

953 @final 

954 @cache_readonly 

955 def codes_info(self) -> npt.NDArray[np.intp]: 

956 # return the codes of items in original grouped axis 

957 ids, _, _ = self.group_info 

958 if self.indexer is not None: 

959 sorter = np.lexsort((ids, self.indexer)) 

960 ids = ids[sorter] 

961 ids = ensure_platform_int(ids) 

962 # TODO: if numpy annotates np.lexsort, this ensure_platform_int 

963 # may become unnecessary 

964 return ids 

965 

966 @final 

967 def _get_compressed_codes( 

968 self, 

969 ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: 

970 # The first returned ndarray may have any signed integer dtype 

971 if len(self.groupings) > 1: 

972 group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) 

973 return compress_group_index(group_index, sort=self._sort) 

974 # FIXME: compress_group_index's second return value is int64, not intp 

975 

976 ping = self.groupings[0] 

977 return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) 

978 

979 @final 

980 @cache_readonly 

981 def ngroups(self) -> int: 

982 return len(self.result_index) 

983 

984 @property 

985 def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: 

986 codes = self.codes 

987 ids, obs_ids, _ = self.group_info 

988 return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) 

989 

990 @cache_readonly 

991 def result_index(self) -> Index: 

992 if len(self.groupings) == 1: 

993 return self.groupings[0].result_index.rename(self.names[0]) 

994 

995 codes = self.reconstructed_codes 

996 levels = [ping.result_index for ping in self.groupings] 

997 return MultiIndex( 

998 levels=levels, codes=codes, verify_integrity=False, names=self.names 

999 ) 

1000 

1001 @final 

1002 def get_group_levels(self) -> list[ArrayLike]: 

1003 # Note: only called from _insert_inaxis_grouper_inplace, which 

1004 # is only called for BaseGrouper, never for BinGrouper 

1005 if len(self.groupings) == 1: 

1006 return [self.groupings[0].group_arraylike] 

1007 

1008 name_list = [] 

1009 for ping, codes in zip(self.groupings, self.reconstructed_codes): 

1010 codes = ensure_platform_int(codes) 

1011 levels = ping.group_arraylike.take(codes) 

1012 

1013 name_list.append(levels) 

1014 

1015 return name_list 

1016 

1017 # ------------------------------------------------------------ 

1018 # Aggregation functions 

1019 

1020 @final 

1021 def _cython_operation( 

1022 self, 

1023 kind: str, 

1024 values, 

1025 how: str, 

1026 axis: int, 

1027 min_count: int = -1, 

1028 **kwargs, 

1029 ) -> ArrayLike: 

1030 """ 

1031 Returns the values of a cython operation. 

1032 """ 

1033 assert kind in ["transform", "aggregate"] 

1034 

1035 cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) 

1036 

1037 ids, _, _ = self.group_info 

1038 ngroups = self.ngroups 

1039 return cy_op.cython_operation( 

1040 values=values, 

1041 axis=axis, 

1042 min_count=min_count, 

1043 comp_ids=ids, 

1044 ngroups=ngroups, 

1045 **kwargs, 

1046 ) 

1047 

1048 @final 

1049 def agg_series( 

1050 self, obj: Series, func: Callable, preserve_dtype: bool = False 

1051 ) -> ArrayLike: 

1052 """ 

1053 Parameters 

1054 ---------- 

1055 obj : Series 

1056 func : function taking a Series and returning a scalar-like 

1057 preserve_dtype : bool 

1058 Whether the aggregation is known to be dtype-preserving. 

1059 

1060 Returns 

1061 ------- 

1062 np.ndarray or ExtensionArray 

1063 """ 

1064 # test_groupby_empty_with_category gets here with self.ngroups == 0 

1065 # and len(obj) > 0 

1066 

1067 if len(obj) == 0: 

1068 # SeriesGrouper would raise if we were to call _aggregate_series_fast 

1069 result = self._aggregate_series_pure_python(obj, func) 

1070 

1071 elif not isinstance(obj._values, np.ndarray): 

1072 result = self._aggregate_series_pure_python(obj, func) 

1073 

1074 # we can preserve a little bit more aggressively with EA dtype 

1075 # because maybe_cast_pointwise_result will do a try/except 

1076 # with _from_sequence. NB we are assuming here that _from_sequence 

1077 # is sufficiently strict that it casts appropriately. 

1078 preserve_dtype = True 

1079 

1080 else: 

1081 result = self._aggregate_series_pure_python(obj, func) 

1082 

1083 npvalues = lib.maybe_convert_objects(result, try_float=False) 

1084 if preserve_dtype: 

1085 out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) 

1086 else: 

1087 out = npvalues 

1088 return out 

1089 

1090 @final 

1091 def _aggregate_series_pure_python( 

1092 self, obj: Series, func: Callable 

1093 ) -> npt.NDArray[np.object_]: 

1094 ids, _, ngroups = self.group_info 

1095 

1096 counts = np.zeros(ngroups, dtype=int) 

1097 result = np.empty(ngroups, dtype="O") 

1098 initialized = False 

1099 

1100 # equiv: splitter = self._get_splitter(obj, axis=0) 

1101 splitter = get_splitter(obj, ids, ngroups, axis=0) 

1102 

1103 for i, group in enumerate(splitter): 

1104 res = func(group) 

1105 res = libreduction.extract_result(res) 

1106 

1107 if not initialized: 

1108 # We only do this validation on the first iteration 

1109 libreduction.check_result_array(res, group.dtype) 

1110 initialized = True 

1111 

1112 counts[i] = group.shape[0] 

1113 result[i] = res 

1114 

1115 return result 

1116 

1117 

1118class BinGrouper(BaseGrouper): 

1119 """ 

1120 This is an internal Grouper class 

1121 

1122 Parameters 

1123 ---------- 

1124 bins : the end offsets along the axis that delimit each bin label's group

1125 binlabels : the label list 

1126 mutated : bool, default False 

1127 indexer : np.ndarray[np.intp] 

1128 

1129 Examples 

1130 -------- 

1131 bins: [2, 4, 6, 8, 10] 

1132 binlabels: DatetimeIndex(['2005-01-01', '2005-01-03', 

1133 '2005-01-05', '2005-01-07', '2005-01-09'], 

1134 dtype='datetime64[ns]', freq='2D') 

1135 

1136 the group_info, which contains the label of each item in the grouped

1137 axis, the index of each label in the label list, and the group count, is

1138 

1139 (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5) 

1140 

1141 means that the grouped axis has 10 items which can be grouped into 5

1142 labels; the first and second items belong to the first label, the

1143 third and fourth items belong to the second label, and so on

1144 

1145 """ 

1146 

1147 bins: npt.NDArray[np.int64] 

1148 binlabels: Index 

1149 mutated: bool 

1150 

1151 def __init__( 

1152 self, 

1153 bins, 

1154 binlabels, 

1155 mutated: bool = False, 

1156 indexer=None, 

1157 ) -> None: 

1158 self.bins = ensure_int64(bins) 

1159 self.binlabels = ensure_index(binlabels) 

1160 self.mutated = mutated 

1161 self.indexer = indexer 

1162 

1163 # These lengths must match, otherwise we could call agg_series 

1164 # with empty self.bins, which would raise in libreduction. 

1165 assert len(self.binlabels) == len(self.bins) 

1166 

1167 @cache_readonly 

1168 def groups(self): 

1169 """dict {group name -> group labels}""" 

1170 # this is mainly for compat 

1171 # GH 3881 

1172 result = { 

1173 key: value 

1174 for key, value in zip(self.binlabels, self.bins) 

1175 if key is not NaT 

1176 } 

1177 return result 

1178 

1179 @property 

1180 def nkeys(self) -> int: 

1181 # still matches len(self.groupings), but we can hard-code 

1182 return 1 

1183 

1184 def _get_grouper(self): 

1185 """ 

1186 We are a grouper as part of another's groupings. 

1187 

1188 We have a specific method of grouping, so cannot 

1189 convert to an Index for our grouper.

1190 """ 

1191 return self 

1192 

1193 def get_iterator(self, data: NDFrame, axis: int = 0): 

1194 """ 

1195 Groupby iterator 

1196 

1197 Returns 

1198 ------- 

1199 Generator yielding sequence of (name, subsetted object) 

1200 for each group 

1201 """ 

1202 if axis == 0: 

1203 slicer = lambda start, edge: data.iloc[start:edge] 

1204 else: 

1205 slicer = lambda start, edge: data.iloc[:, start:edge] 

1206 

1207 length = len(data.axes[axis]) 

1208 

1209 start = 0 

1210 for edge, label in zip(self.bins, self.binlabels): 

1211 if label is not NaT: 

1212 yield label, slicer(start, edge) 

1213 start = edge 

1214 

1215 if start < length: 

1216 yield self.binlabels[-1], slicer(start, None) 

1217 

1218 @cache_readonly 

1219 def indices(self): 

1220 indices = collections.defaultdict(list) 

1221 

1222 i = 0 

1223 for label, bin in zip(self.binlabels, self.bins): 

1224 if i < bin: 

1225 if label is not NaT: 

1226 indices[label] = list(range(i, bin)) 

1227 i = bin 

1228 return indices 

1229 

1230 @cache_readonly 

1231 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: 

1232 ngroups = self.ngroups 

1233 obs_group_ids = np.arange(ngroups, dtype=np.intp) 

1234 rep = np.diff(np.r_[0, self.bins]) 

1235 

1236 rep = ensure_platform_int(rep) 

1237 if ngroups == len(self.bins): 

1238 comp_ids = np.repeat(np.arange(ngroups), rep) 

1239 else: 

1240 comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) 

1241 

1242 return ( 

1243 ensure_platform_int(comp_ids), 

1244 obs_group_ids, 

1245 ngroups, 

1246 ) 

1247 
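# Reproducing the class docstring example above (assuming ``import pandas as
# pd`` for the date labels):
# >>> bins = np.array([2, 4, 6, 8, 10], dtype=np.int64)
# >>> labels = pd.date_range("2005-01-01", periods=5, freq="2D")
# >>> BinGrouper(bins, labels).group_info
# (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)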

1248 @cache_readonly 

1249 def reconstructed_codes(self) -> list[np.ndarray]: 

1250 # get unique result indices, and prepend 0 as groupby starts from the first 

1251 return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] 

1252 

1253 @cache_readonly 

1254 def result_index(self) -> Index: 

1255 if len(self.binlabels) != 0 and isna(self.binlabels[0]): 

1256 return self.binlabels[1:] 

1257 

1258 return self.binlabels 

1259 

1260 @property 

1261 def levels(self) -> list[Index]: 

1262 return [self.binlabels] 

1263 

1264 @property 

1265 def names(self) -> list[Hashable]: 

1266 return [self.binlabels.name] 

1267 

1268 @property 

1269 def groupings(self) -> list[grouper.Grouping]: 

1270 lev = self.binlabels 

1271 ping = grouper.Grouping(lev, lev, in_axis=False, level=None) 

1272 return [ping] 

1273 

1274 def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn: 

1275 # -> np.ndarray[object] 

1276 raise NotImplementedError( 

1277 "This should not be reached; use _aggregate_series_pure_python" 

1278 ) 

1279 

1280 

1281def _is_indexed_like(obj, axes, axis: int) -> bool: 

1282 if isinstance(obj, Series): 

1283 if len(axes) > 1: 

1284 return False 

1285 return obj.axes[axis].equals(axes[axis]) 

1286 elif isinstance(obj, DataFrame): 

1287 return obj.axes[axis].equals(axes[axis]) 

1288 

1289 return False 

1290 

1291 

1292# ---------------------------------------------------------------------- 

1293# Splitting / application 

1294 

1295 

1296class DataSplitter(Generic[NDFrameT]): 

1297 def __init__( 

1298 self, 

1299 data: NDFrameT, 

1300 labels: npt.NDArray[np.intp], 

1301 ngroups: int, 

1302 axis: int = 0, 

1303 ) -> None: 

1304 self.data = data 

1305 self.labels = ensure_platform_int(labels) # _should_ already be np.intp 

1306 self.ngroups = ngroups 

1307 

1308 self.axis = axis 

1309 assert isinstance(axis, int), axis 

1310 

1311 @cache_readonly 

1312 def slabels(self) -> npt.NDArray[np.intp]: 

1313 # Sorted labels 

1314 return self.labels.take(self._sort_idx) 

1315 

1316 @cache_readonly 

1317 def _sort_idx(self) -> npt.NDArray[np.intp]: 

1318 # Counting sort indexer 

1319 return get_group_index_sorter(self.labels, self.ngroups) 

1320 

1321 def __iter__(self): 

1322 sdata = self.sorted_data 

1323 

1324 if self.ngroups == 0: 

1325 # we are inside a generator; rather than raise StopIteration

1326 # we merely return to signal the end

1327 return 

1328 

1329 starts, ends = lib.generate_slices(self.slabels, self.ngroups) 

1330 

1331 for start, end in zip(starts, ends): 

1332 yield self._chop(sdata, slice(start, end)) 

1333 

1334 @cache_readonly 

1335 def sorted_data(self) -> NDFrameT: 

1336 return self.data.take(self._sort_idx, axis=self.axis) 

1337 

1338 def _chop(self, sdata, slice_obj: slice) -> NDFrame: 

1339 raise AbstractMethodError(self) 

1340 

1341 

1342class SeriesSplitter(DataSplitter): 

1343 def _chop(self, sdata: Series, slice_obj: slice) -> Series: 

1344 # fastpath equivalent to `sdata.iloc[slice_obj]` 

1345 mgr = sdata._mgr.get_slice(slice_obj) 

1346 ser = sdata._constructor(mgr, name=sdata.name, fastpath=True) 

1347 return ser.__finalize__(sdata, method="groupby") 

1348 

1349 

1350class FrameSplitter(DataSplitter): 

1351 def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: 

1352 # Fastpath equivalent to: 

1353 # if self.axis == 0: 

1354 # return sdata.iloc[slice_obj] 

1355 # else: 

1356 # return sdata.iloc[:, slice_obj] 

1357 mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) 

1358 df = sdata._constructor(mgr) 

1359 return df.__finalize__(sdata, method="groupby") 

1360 

1361 

1362def get_splitter( 

1363 data: NDFrame, labels: np.ndarray, ngroups: int, axis: int = 0 

1364) -> DataSplitter: 

1365 if isinstance(data, Series): 

1366 klass: type[DataSplitter] = SeriesSplitter 

1367 else: 

1368 # i.e. DataFrame 

1369 klass = FrameSplitter 

1370 

1371 return klass(data, labels, ngroups, axis)
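# A small sketch (assuming ``import pandas as pd``): labels give the group id
# of each row, and iterating the splitter yields the rows of each group in
# sorted-group order.
# >>> ser = pd.Series([10, 20, 30, 40])
# >>> labels = np.array([1, 0, 1, 0], dtype=np.intp)
# >>> [chunk.tolist() for chunk in get_splitter(ser, labels, ngroups=2)]
# [[20, 40], [10, 30]]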