Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/groupby/grouper.py: 15%

350 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Provide user facing operators for doing the split part of the 

3split-apply-combine paradigm. 

4""" 

5from __future__ import annotations 

6 

7from typing import ( 

8 TYPE_CHECKING, 

9 Any, 

10 Hashable, 

11 final, 

12) 

13import warnings 

14 

15import numpy as np 

16 

17from pandas._typing import ( 

18 ArrayLike, 

19 NDFrameT, 

20 npt, 

21) 

22from pandas.errors import InvalidIndexError 

23from pandas.util._decorators import cache_readonly 

24from pandas.util._exceptions import find_stack_level 

25 

26from pandas.core.dtypes.cast import sanitize_to_nanoseconds 

27from pandas.core.dtypes.common import ( 

28 is_categorical_dtype, 

29 is_list_like, 

30 is_scalar, 

31) 

32 

33import pandas.core.algorithms as algorithms 

34from pandas.core.arrays import ( 

35 Categorical, 

36 ExtensionArray, 

37) 

38import pandas.core.common as com 

39from pandas.core.frame import DataFrame 

40from pandas.core.groupby import ops 

41from pandas.core.groupby.categorical import ( 

42 recode_for_groupby, 

43 recode_from_groupby, 

44) 

45from pandas.core.indexes.api import ( 

46 CategoricalIndex, 

47 Index, 

48 MultiIndex, 

49) 

50from pandas.core.series import Series 

51 

52from pandas.io.formats.printing import pprint_thing 

53 

54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true

55 from pandas.core.generic import NDFrame 

56 

57 

58class Grouper: 

59 """ 

60 A Grouper allows the user to specify a groupby instruction for an object. 

61 

62 This specification will select a column via the key parameter, or if the 

63 level and/or axis parameters are given, a level of the index of the target 

64 object. 

65 

66 If `axis` and/or `level` are passed as keywords to both `Grouper` and 

67 `groupby`, the values passed to `Grouper` take precedence. 

68 

69 Parameters 

70 ---------- 

71 key : str, defaults to None 

72 Groupby key, which selects the grouping column of the target. 

73 level : name/number, defaults to None 

74 The level for the target index. 

75 freq : str / frequency object, defaults to None 

76 This will groupby the specified frequency if the target selection 

77 (via key or level) is a datetime-like object. For full specification 

78 of available frequencies, please see `here 

79 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_. 

80 axis : str, int, defaults to 0 

81 Number/name of the axis. 

82 sort : bool, default to False 

83 Whether to sort the resulting labels. 

84 closed : {'left' or 'right'} 

85 Closed end of interval. Only when `freq` parameter is passed. 

86 label : {'left' or 'right'} 

87 Interval boundary to use for labeling. 

88 Only when `freq` parameter is passed. 

89 convention : {'start', 'end', 'e', 's'} 

90 If grouper is PeriodIndex and `freq` parameter is passed. 

91 base : int, default 0 

92 Only when `freq` parameter is passed. 

93 For frequencies that evenly subdivide 1 day, the "origin" of the 

94 aggregated intervals. For example, for '5min' frequency, base could 

95 range from 0 through 4. Defaults to 0. 

96 

97 .. deprecated:: 1.1.0 

98 The new arguments that you should use are 'offset' or 'origin'. 

99 

100 loffset : str, DateOffset, timedelta object 

101 Only when `freq` parameter is passed. 

102 

103 .. deprecated:: 1.1.0 

104 loffset is only working for ``.resample(...)`` and not for 

105 Grouper (:issue:`28302`). 

106 However, loffset is also deprecated for ``.resample(...)`` 

107 See: :class:`DataFrame.resample` 

108 

109 origin : Timestamp or str, default 'start_day' 

110 The timestamp on which to adjust the grouping. The timezone of origin must 

111 match the timezone of the index. 

112 If string, must be one of the following: 

113 

114 - 'epoch': `origin` is 1970-01-01 

115 - 'start': `origin` is the first value of the timeseries 

116 - 'start_day': `origin` is the first day at midnight of the timeseries 

117 

118 .. versionadded:: 1.1.0 

119 

120 - 'end': `origin` is the last value of the timeseries 

121 - 'end_day': `origin` is the ceiling midnight of the last day 

122 

123 .. versionadded:: 1.3.0 

124 

125 offset : Timedelta or str, default is None 

126 An offset timedelta added to the origin. 

127 

128 .. versionadded:: 1.1.0 

129 

130 dropna : bool, default True 

131 If True, and if group keys contain NA values, NA values together with 

132 row/column will be dropped. If False, NA values will also be treated as 

133 the key in groups. 

134 

135 .. versionadded:: 1.2.0 

136 

137 Returns 

138 ------- 

139 A specification for a groupby instruction 

140 

141 Examples 

142 -------- 

143 Syntactic sugar for ``df.groupby('A')`` 

144 

145 >>> df = pd.DataFrame( 

146 ... { 

147 ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"], 

148 ... "Speed": [100, 5, 200, 300, 15], 

149 ... } 

150 ... ) 

151 >>> df 

152 Animal Speed 

153 0 Falcon 100 

154 1 Parrot 5 

155 2 Falcon 200 

156 3 Falcon 300 

157 4 Parrot 15 

158 >>> df.groupby(pd.Grouper(key="Animal")).mean() 

159 Speed 

160 Animal 

161 Falcon 200.0 

162 Parrot 10.0 

163 

164 Specify a resample operation on the column 'Publish date' 

165 

166 >>> df = pd.DataFrame( 

167 ... { 

168 ... "Publish date": [ 

169 ... pd.Timestamp("2000-01-02"), 

170 ... pd.Timestamp("2000-01-02"), 

171 ... pd.Timestamp("2000-01-09"), 

172 ... pd.Timestamp("2000-01-16") 

173 ... ], 

174 ... "ID": [0, 1, 2, 3], 

175 ... "Price": [10, 20, 30, 40] 

176 ... } 

177 ... ) 

178 >>> df 

179 Publish date ID Price 

180 0 2000-01-02 0 10 

181 1 2000-01-02 1 20 

182 2 2000-01-09 2 30 

183 3 2000-01-16 3 40 

184 >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean() 

185 ID Price 

186 Publish date 

187 2000-01-02 0.5 15.0 

188 2000-01-09 2.0 30.0 

189 2000-01-16 3.0 40.0 

190 

191 If you want to adjust the start of the bins based on a fixed timestamp: 

192 

193 >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' 

194 >>> rng = pd.date_range(start, end, freq='7min') 

195 >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) 

196 >>> ts 

197 2000-10-01 23:30:00 0 

198 2000-10-01 23:37:00 3 

199 2000-10-01 23:44:00 6 

200 2000-10-01 23:51:00 9 

201 2000-10-01 23:58:00 12 

202 2000-10-02 00:05:00 15 

203 2000-10-02 00:12:00 18 

204 2000-10-02 00:19:00 21 

205 2000-10-02 00:26:00 24 

206 Freq: 7T, dtype: int64 

207 

208 >>> ts.groupby(pd.Grouper(freq='17min')).sum() 

209 2000-10-01 23:14:00 0 

210 2000-10-01 23:31:00 9 

211 2000-10-01 23:48:00 21 

212 2000-10-02 00:05:00 54 

213 2000-10-02 00:22:00 24 

214 Freq: 17T, dtype: int64 

215 

216 >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() 

217 2000-10-01 23:18:00 0 

218 2000-10-01 23:35:00 18 

219 2000-10-01 23:52:00 27 

220 2000-10-02 00:09:00 39 

221 2000-10-02 00:26:00 24 

222 Freq: 17T, dtype: int64 

223 

224 >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() 

225 2000-10-01 23:24:00 3 

226 2000-10-01 23:41:00 15 

227 2000-10-01 23:58:00 45 

228 2000-10-02 00:15:00 45 

229 Freq: 17T, dtype: int64 

230 

231 If you want to adjust the start of the bins with an `offset` Timedelta, the two 

232 following lines are equivalent: 

233 

234 >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() 

235 2000-10-01 23:30:00 9 

236 2000-10-01 23:47:00 21 

237 2000-10-02 00:04:00 54 

238 2000-10-02 00:21:00 24 

239 Freq: 17T, dtype: int64 

240 

241 >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() 

242 2000-10-01 23:30:00 9 

243 2000-10-01 23:47:00 21 

244 2000-10-02 00:04:00 54 

245 2000-10-02 00:21:00 24 

246 Freq: 17T, dtype: int64 

247 

248 To replace the use of the deprecated `base` argument, you can now use `offset`, 

249 in this example it is equivalent to have `base=2`: 

250 

251 >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum() 

252 2000-10-01 23:16:00 0 

253 2000-10-01 23:33:00 9 

254 2000-10-01 23:50:00 36 

255 2000-10-02 00:07:00 39 

256 2000-10-02 00:24:00 24 

257 Freq: 17T, dtype: int64 

258 """ 

259 

260 axis: int 

261 sort: bool 

262 dropna: bool 

263 _gpr_index: Index | None 

264 _grouper: Index | None 

265 

266 _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna") 

267 

268 def __new__(cls, *args, **kwargs): 

269 if kwargs.get("freq") is not None: 

270 from pandas.core.resample import TimeGrouper 

271 

272 _check_deprecated_resample_kwargs(kwargs, origin=cls) 

273 cls = TimeGrouper 

274 return super().__new__(cls) 
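    # Illustrative sketch (not part of the original source): because of the
    # dispatch in ``__new__`` above, passing ``freq`` makes ``Grouper(...)``
    # construct a ``TimeGrouper`` instance, while omitting it yields a plain
    # ``Grouper``.
    #
    # >>> import pandas as pd
    # >>> type(pd.Grouper(key="date", freq="1D")).__name__
    # 'TimeGrouper'
    # >>> type(pd.Grouper(key="date")).__name__
    # 'Grouper'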

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: int = 0,
        sort: bool = False,
        dropna: bool = True,
    ) -> None:
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.dropna = dropna

        self.grouper = None
        self._gpr_index = None
        self.obj = None
        self.indexer = None
        self.binner = None
        self._grouper = None
        self._indexer = None

    @final
    @property
    def ax(self) -> Index:
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index
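    # Illustrative sketch (not part of the original source): ``ax`` is only
    # populated once ``_set_grouper`` has run, so touching it on a fresh
    # Grouper raises. These are internal APIs; behavior may differ across
    # pandas versions.
    #
    # >>> import pandas as pd
    # >>> pd.Grouper(key="A").ax
    # Traceback (most recent call last):
    #     ...
    # ValueError: _set_grouper must be called before ax is accessed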

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[Any, ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        # error: Value of type variable "NDFrameT" of "get_grouper" cannot be
        # "Optional[Any]"
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self.grouper, _, self.obj = get_grouper(  # type: ignore[type-var,assignment]
            self.obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )

        # error: Incompatible return value type (got "Tuple[None, None, None]",
        # expected "Tuple[Any, BaseGrouper, NDFrameT]")
        return self.binner, self.grouper, self.obj  # type: ignore[return-value]

    @final
    def _set_grouper(self, obj: NDFrame, sort: bool = False) -> None:
        """
        Given an object and the specifications, set up the internal grouper
        for this particular specification.

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self.grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = self._gpr_index
            self._indexer = self.indexer

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(self._gpr_index, "name", None) == key and isinstance(
                obj, Series
            ):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a MultiIndex level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self.indexer = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self.obj = obj  # type: ignore[assignment]
        self._gpr_index = ax

    @final
    @property
    def groups(self):
        # error: "None" has no attribute "groups"
        return self.grouper.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"
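    # Illustrative sketch (not part of the original source): ``__repr__``
    # emits every attribute in ``_attributes`` that is not None, so the
    # non-None defaults like ``axis=0``, ``sort=False``, and ``dropna=True``
    # still show up.
    #
    # >>> import pandas as pd
    # >>> pd.Grouper(key="A")
    # Grouper(key='A', axis=0, sort=False, dropna=True)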



@final
class Grouping:
    """
    Holds the grouping information for a single key.

    Parameters
    ----------
    index : Index
    grouper :
    obj : DataFrame or Series
    name : Label
    level :
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among the
        Groupby.exclusions list

    Returns
    -------
    **Attributes**:
      * indices : dict of {group -> index_list}
      * codes : ndarray, group codes
      * group_index : unique groups
      * groups : dict of {group -> label_list}
    """

    _codes: npt.NDArray[np.signedinteger] | None = None
    _group_index: Index | None = None
    _passed_categorical: bool
    _all_grouper: Categorical | None
    _index: Index

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: NDFrame | None = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
    ) -> None:
        self.level = level
        self._orig_grouper = grouper
        self.grouping_vector = _convert_grouper(index, grouper)
        self._all_grouper = None
        self._index = index
        self._sort = sort
        self.obj = obj
        self._observed = observed
        self.in_axis = in_axis
        self._dropna = dropna

        self._passed_categorical = False

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passed-in level

        ilevel = self._ilevel
        if ilevel is not None:
            mapper = self.grouping_vector
            # In extant tests, the new self.grouping_vector matches
            # `index.get_level_values(ilevel)` whenever
            # mapper is None and isinstance(index, MultiIndex)
            (
                self.grouping_vector,  # Index
                self._codes,
                self._group_index,
            ) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna)

        # a passed Grouper-like; directly get the grouper in the same way
        # as a single-grouper groupby, using the group_info to get codes
        elif isinstance(self.grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            _, newgrouper, newobj = self.grouping_vector._get_grouper(
                self.obj, validate=False
            )
            self.obj = newobj

            ng = newgrouper._get_grouper()
            if isinstance(newgrouper, ops.BinGrouper):
                # in this case we have `ng is newgrouper`
                self.grouping_vector = ng
            else:
                # ops.BaseGrouper
                # use Index instead of ndarray so we can recover the name
                self.grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif is_categorical_dtype(self.grouping_vector):
            # a passed Categorical
            self._passed_categorical = True

            self.grouping_vector, self._all_grouper = recode_for_groupby(
                self.grouping_vector, sort, observed
            )

        elif not isinstance(
            self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(self.grouping_vector, "ndim", 1) != 1:
                t = self.name or str(type(self.grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            self.grouping_vector = index.map(self.grouping_vector)

            if not (
                hasattr(self.grouping_vector, "__len__")
                and len(self.grouping_vector) == len(index)
            ):
                grper = pprint_thing(self.grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                self.grouping_vector = None  # Try for sanity
                raise AssertionError(errmsg)

        if isinstance(self.grouping_vector, np.ndarray):
            # if we have a date/time-like grouper, make sure that we have
            # Timestamp-likes
            self.grouping_vector = sanitize_to_nanoseconds(self.grouping_vector)

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self):
        return iter(self.indices)

    @cache_readonly
    def name(self) -> Hashable:
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None
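    # Illustrative sketch (not part of the original source): ``name`` falls
    # back through the level name, the original grouper's name, and finally
    # None. Reaching the Grouping through ``.grouper.groupings`` is internal
    # API and may differ across pandas versions.
    #
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({"A": ["x", "y"], "B": [1, 2]})
    # >>> df.groupby("A").grouper.groupings[0].name
    # 'A'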

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, convert an index level name to an index level position.
        """
        level = self.level
        if level is None:
            return None
        if not isinstance(level, int):
            index = self._index
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            return index.names.index(level)
        return level

    @property
    def ngroups(self) -> int:
        return len(self.group_index)

    @cache_readonly
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        # we have a list of groupers
        if isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.indices

        values = Categorical(self.grouping_vector)
        return values._reverse_indexer()
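    # Illustrative sketch (not part of the original source): the public
    # ``GroupBy.indices`` is backed by this property for single-key groupbys
    # and maps each group label to the integer positions of its rows.
    #
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({"A": ["x", "y", "x"], "B": [1, 2, 3]})
    # >>> df.groupby("A").indices
    # {'x': array([0, 2]), 'y': array([1])}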

    @property
    def codes(self) -> npt.NDArray[np.signedinteger]:
        if self._codes is not None:
            # _codes is set in __init__ for MultiIndex cases
            return self._codes

        return self._codes_and_uniques[0]

    @cache_readonly
    def group_arraylike(self) -> ArrayLike:
        """
        Analogous to result_index, but holding an ArrayLike to ensure
        we can retain ExtensionDtypes.
        """
        if self._group_index is not None:
            # _group_index is set in __init__ for MultiIndex cases
            return self._group_index._values

        elif self._all_grouper is not None:
            # retain dtype for categories, including unobserved ones
            return self.result_index._values

        return self._codes_and_uniques[1]

    @cache_readonly
    def result_index(self) -> Index:
        # result_index retains dtype for categories, including unobserved ones,
        # which group_index does not
        if self._all_grouper is not None:
            group_idx = self.group_index
            assert isinstance(group_idx, CategoricalIndex)
            return recode_from_groupby(self._all_grouper, self._sort, group_idx)
        return self.group_index

    @cache_readonly
    def group_index(self) -> Index:
        if self._group_index is not None:
            # _group_index is set in __init__ for MultiIndex cases
            return self._group_index

        uniques = self._codes_and_uniques[1]
        return Index._with_infer(uniques, name=self.name)

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes;
            # doesn't (yet - GH#46909) handle dropna=False
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort or cat.ordered:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered
            )
            return cat.codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            # error: Incompatible types in assignment (expression has type "Union
            # [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
            uniques = (
                self.grouping_vector.result_index._values  # type: ignore[assignment]
            )
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
            )
        return codes, uniques
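    # Illustrative sketch (not part of the original source): the
    # non-categorical path above is essentially a factorization, which the
    # public ``pd.factorize`` also exposes.
    #
    # >>> import pandas as pd
    # >>> codes, uniques = pd.factorize(["b", "a", "b"], sort=True)
    # >>> codes
    # array([1, 0, 1])
    # >>> uniques
    # array(['a', 'b'], dtype=object)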

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))
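    # Illustrative sketch (not part of the original source): the public
    # ``GroupBy.groups`` view is built the same way, mapping each group label
    # to the index labels of its members. The exact repr of the values (plain
    # lists vs. Index objects) varies by pandas version.
    #
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({"A": ["x", "y", "x"], "B": [1, 2, 3]})
    # >>> df.groupby("A").groups  # doctest: +SKIP
    # {'x': [0, 2], 'y': [1]}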



def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: int = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    mutated: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers.

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers.

    Groupers enable local references to axis, level, and sort, while
    the passed-in axis, level, and sort are 'global'.

    This routine tries to figure out what the passed-in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed and we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: This if-block and the else-block are almost the same.
        # The MultiIndex instance check is removable, but it seems that there
        # are some processes only for non-MultiIndex in the else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if they are applicable,
        # we need to check that they have no side effects on subsequent
        # processes outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are the same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:

        if not _is_label_like(key):
            if obj.ndim == 1:
                return False

            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. an Int64Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            # a lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            # e.g. DatetimeIndex.get_loc(tuple())
            return False

    for gpr, level in zip(keys, levels):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = False
        else:
            in_axis = False

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    elif len(groupings) == 0:
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(
        group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna
    )
    return grouper, frozenset(exclusions), obj
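# Illustrative sketch (not part of the original source): get_grouper is
# internal API, but its return shape can be inspected directly. Behavior
# may differ across pandas versions.
#
# >>> import pandas as pd
# >>> from pandas.core.groupby.grouper import get_grouper
# >>> df = pd.DataFrame({"A": ["x", "y", "x"], "B": [1, 2, 3]})
# >>> grouper, exclusions, obj = get_grouper(df, key="A")
# >>> type(grouper).__name__, sorted(exclusions)
# ('BaseGrouper', ['A'])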



def _is_label_like(val) -> bool:
    return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
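# Illustrative sketch (not part of the original source): strings, tuples,
# and non-None scalars count as labels; None and list-likes do not.
#
# >>> _is_label_like("A"), _is_label_like(("A", 1)), _is_label_like(3.5)
# (True, True, True)
# >>> _is_label_like(None), _is_label_like([1, 2])
# (False, False)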



def _convert_grouper(axis: Index, grouper):
    if isinstance(grouper, dict):
        return grouper.get
    elif isinstance(grouper, Series):
        if grouper.index.equals(axis):
            return grouper._values
        else:
            return grouper.reindex(axis)._values
    elif isinstance(grouper, MultiIndex):
        return grouper._values
    elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
        if len(grouper) != len(axis):
            raise ValueError("Grouper and axis must be same length")

        if isinstance(grouper, (list, tuple)):
            grouper = com.asarray_tuplesafe(grouper)
        return grouper
    else:
        return grouper
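# Illustrative sketch (not part of the original source): the dict branch
# above (returning ``grouper.get``) is what makes grouping by a mapping of
# index labels work.
#
# >>> import pandas as pd
# >>> s = pd.Series([1, 2, 3], index=["a", "b", "c"])
# >>> s.groupby({"a": "g1", "b": "g1", "c": "g2"}).sum()
# g1    3
# g2    3
# dtype: int64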



def _check_deprecated_resample_kwargs(kwargs, origin):
    """
    Check for use of deprecated parameters in ``resample`` and related functions.

    Raises the appropriate warnings if these parameters are detected.
    Only sets an approximate ``stacklevel`` for the warnings (see #37603, #36629).

    Parameters
    ----------
    kwargs : dict
        Dictionary of keyword arguments to check for deprecated parameters.
    origin : object
        From where this function is being called; either Grouper or TimeGrouper. Used
        to determine an approximate stacklevel.
    """
    # Deprecation warning of `base` and `loffset` since v1.1.0:
    # we are raising the warning here to be able to set the `stacklevel`
    # properly since we need to raise the `base` and `loffset` deprecation
    # warning from three different cases:
    #   core/generic.py::NDFrame.resample
    #   core/groupby/groupby.py::GroupBy.resample
    #   core/groupby/grouper.py::Grouper
    # raising these warnings from TimeGrouper directly would fail the test:
    #   tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base

    if kwargs.get("base", None) is not None:
        warnings.warn(
            "'base' in .resample() and in Grouper() is deprecated.\n"
            "The new arguments that you should use are 'offset' or 'origin'.\n"
            '\n>>> df.resample(freq="3s", base=2)\n'
            "\nbecomes:\n"
            '\n>>> df.resample(freq="3s", offset="2s")\n',
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    if kwargs.get("loffset", None) is not None:
        warnings.warn(
            "'loffset' in .resample() and in Grouper() is deprecated.\n"
            '\n>>> df.resample(freq="3s", loffset="8H")\n'
            "\nbecomes:\n"
            "\n>>> from pandas.tseries.frequencies import to_offset"
            '\n>>> df = df.resample(freq="3s").mean()'
            '\n>>> df.index = df.index.to_timestamp() + to_offset("8H")\n',
            FutureWarning,
            stacklevel=find_stack_level(),
        )

997 )