Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/algorithms.py: 9% (575 statements)
coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Generic data algorithms. This module is experimental at the moment and not 

3intended for public consumption 

4""" 

5from __future__ import annotations 

6 

7import inspect 

8import operator 

9from textwrap import dedent 

10from typing import ( 

11 TYPE_CHECKING, 

12 Hashable, 

13 Literal, 

14 Sequence, 

15 cast, 

16 final, 

17 overload, 

18) 

19import warnings 

20 

21import numpy as np 

22 

23from pandas._libs import ( 

24 algos, 

25 hashtable as htable, 

26 iNaT, 

27 lib, 

28) 

29from pandas._typing import ( 

30 AnyArrayLike, 

31 ArrayLike, 

32 DtypeObj, 

33 IndexLabel, 

34 TakeIndexer, 

35 npt, 

36) 

37from pandas.util._decorators import doc 

38from pandas.util._exceptions import find_stack_level 

39 

40from pandas.core.dtypes.cast import ( 

41 construct_1d_object_array_from_listlike, 

42 infer_dtype_from_array, 

43 sanitize_to_nanoseconds, 

44) 

45from pandas.core.dtypes.common import ( 

46 ensure_float64, 

47 ensure_object, 

48 ensure_platform_int, 

49 is_array_like, 

50 is_bool_dtype, 

51 is_categorical_dtype, 

52 is_complex_dtype, 

53 is_datetime64_dtype, 

54 is_extension_array_dtype, 

55 is_float_dtype, 

56 is_integer, 

57 is_integer_dtype, 

58 is_list_like, 

59 is_numeric_dtype, 

60 is_object_dtype, 

61 is_scalar, 

62 is_signed_integer_dtype, 

63 is_timedelta64_dtype, 

64 needs_i8_conversion, 

65) 

66from pandas.core.dtypes.concat import concat_compat 

67from pandas.core.dtypes.dtypes import ( 

68 BaseMaskedDtype, 

69 ExtensionDtype, 

70 PandasDtype, 

71) 

72from pandas.core.dtypes.generic import ( 

73 ABCDatetimeArray, 

74 ABCExtensionArray, 

75 ABCIndex, 

76 ABCMultiIndex, 

77 ABCRangeIndex, 

78 ABCSeries, 

79 ABCTimedeltaArray, 

80) 

81from pandas.core.dtypes.missing import ( 

82 isna, 

83 na_value_for_dtype, 

84) 

85 

86from pandas.core.array_algos.take import take_nd 

87from pandas.core.construction import ( 

88 array as pd_array, 

89 ensure_wrapped_if_datetimelike, 

90 extract_array, 

91) 

92from pandas.core.indexers import validate_indices 

93 

94if TYPE_CHECKING: 94 ↛ 96line 94 didn't jump to line 96, because the condition on line 94 was never true

95 

96 from pandas._typing import ( 

97 NumpySorter, 

98 NumpyValueArrayLike, 

99 ) 

100 

101 from pandas import ( 

102 Categorical, 

103 DataFrame, 

104 Index, 

105 MultiIndex, 

106 Series, 

107 ) 

108 from pandas.core.arrays import ( 

109 BaseMaskedArray, 

110 ExtensionArray, 

111 ) 

112 

113 

114# --------------- # 

115# dtype access # 

116# --------------- # 

117def _ensure_data(values: ArrayLike) -> np.ndarray: 

118 """ 

119 routine to ensure that our data is of the correct 

120 input dtype for lower-level routines 

121 

122 This will coerce: 

123 - ints -> int64 

124 - uint -> uint64 

125 - bool -> uint8 

126 - datetimelike -> i8 

127 - datetime64tz -> i8 (in local tz) 

128 - categorical -> codes 

129 

130 Parameters 

131 ---------- 

132 values : np.ndarray or ExtensionArray 

133 

134 Returns 

135 ------- 

136 np.ndarray 

137 """ 

138 

139 if not isinstance(values, ABCMultiIndex): 

140 # extract_array would raise 

141 values = extract_array(values, extract_numpy=True) 

142 

143 if is_object_dtype(values.dtype): 

144 return ensure_object(np.asarray(values)) 

145 

146 elif isinstance(values.dtype, BaseMaskedDtype): 

147 # i.e. BooleanArray, FloatingArray, IntegerArray 

148 values = cast("BaseMaskedArray", values) 

149 if not values._hasna: 

150 # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816 

151 # recurse to avoid re-implementing logic for eg bool->uint8 

152 return _ensure_data(values._data) 

153 return np.asarray(values) 

154 

155 elif is_categorical_dtype(values.dtype): 

156 # NB: cases that go through here should NOT be using _reconstruct_data 

157 # on the back-end. 

158 values = cast("Categorical", values) 

159 return values.codes 

160 

161 elif is_bool_dtype(values.dtype): 

162 if isinstance(values, np.ndarray): 

163 # i.e. actually dtype == np.dtype("bool") 

164 return np.asarray(values).view("uint8") 

165 else: 

166 # e.g. Sparse[bool, False] # TODO: no test cases get here 

167 return np.asarray(values).astype("uint8", copy=False) 

168 

169 elif is_integer_dtype(values.dtype): 

170 return np.asarray(values) 

171 

172 elif is_float_dtype(values.dtype): 

173 # Note: checking `values.dtype == "float128"` raises on Windows and 32bit 

174 # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]" 

175 # has no attribute "itemsize" 

176 if values.dtype.itemsize in [2, 12, 16]: # type: ignore[union-attr] 

177 # we dont (yet) have float128 hashtable support 

178 return ensure_float64(values) 

179 return np.asarray(values) 

180 

181 elif is_complex_dtype(values.dtype): 

182 return cast(np.ndarray, values) 

183 

184 # datetimelike 

185 elif needs_i8_conversion(values.dtype): 

186 if isinstance(values, np.ndarray): 

187 values = sanitize_to_nanoseconds(values) 

188 npvalues = values.view("i8") 

189 npvalues = cast(np.ndarray, npvalues) 

190 return npvalues 

191 

192 # we have failed, return object 

193 values = np.asarray(values, dtype=object) 

194 return ensure_object(values) 

195 

196 
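

# Illustrative sketch (editor's addition, not pandas source): a few of the
# coercions the docstring above promises, assuming a pandas 1.5-era build.
# The helper name _demo_ensure_data is hypothetical.
def _demo_ensure_data() -> None:
    import pandas as pd

    # bool ndarray -> uint8 view
    assert _ensure_data(np.array([True, False])).dtype == np.uint8
    # masked Int64 with no pd.NA -> plain int64 ndarray (avoids object cast)
    assert _ensure_data(pd.array([1, 2, 3], dtype="Int64")).dtype == np.int64
    # categorical -> its integer codes
    assert _ensure_data(pd.Categorical(["a", "b", "a"])).dtype == np.int8

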

def _reconstruct_data(
    values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
) -> ArrayLike:
    """
    reverse of _ensure_data

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    original : AnyArrayLike

    Returns
    -------
    ExtensionArray or np.ndarray
    """
    if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
        # Catch DatetimeArray/TimedeltaArray
        return values

    if not isinstance(dtype, np.dtype):
        # i.e. ExtensionDtype; note we have ruled out above the possibility
        # that values.dtype == dtype
        cls = dtype.construct_array_type()

        values = cls._from_sequence(values, dtype=dtype)

    else:
        if is_datetime64_dtype(dtype):
            dtype = np.dtype("datetime64[ns]")
        elif is_timedelta64_dtype(dtype):
            dtype = np.dtype("timedelta64[ns]")

        values = values.astype(dtype, copy=False)

    return values


def _ensure_arraylike(values) -> ArrayLike:
    """
    ensure that we are arraylike if not already
    """
    if not is_array_like(values):
        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ["mixed", "string", "mixed-integer"]:
            # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values


_hashtables = {
    "complex128": htable.Complex128HashTable,
    "complex64": htable.Complex64HashTable,
    "float64": htable.Float64HashTable,
    "float32": htable.Float32HashTable,
    "uint64": htable.UInt64HashTable,
    "uint32": htable.UInt32HashTable,
    "uint16": htable.UInt16HashTable,
    "uint8": htable.UInt8HashTable,
    "int64": htable.Int64HashTable,
    "int32": htable.Int32HashTable,
    "int16": htable.Int16HashTable,
    "int8": htable.Int8HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}


def _get_hashtable_algo(values: np.ndarray):
    """
    Parameters
    ----------
    values : np.ndarray

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
    """
    values = _ensure_data(values)

    ndtype = _check_object_for_strings(values)
    htable = _hashtables[ndtype]
    return htable, values


def _check_object_for_strings(values: np.ndarray) -> str:
    """
    Check if we can use string hashtable instead of object hashtable.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    str
    """
    ndtype = values.dtype.name
    if ndtype == "object":

        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.infer_dtype(values, skipna=False) in ["string"]:
            ndtype = "string"
    return ndtype
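

# Illustrative sketch (editor's addition, not pandas source): object arrays of
# pure strings are routed to the cheaper StringHashTable; anything mixed stays
# on PyObjectHashTable. The name _demo_hashtable_choice is hypothetical.
def _demo_hashtable_choice() -> None:
    strings = np.array(["a", "b"], dtype=object)
    mixed = np.array(["a", 1], dtype=object)
    assert _check_object_for_strings(strings) == "string"
    assert _check_object_for_strings(mixed) == "object"
    # _get_hashtable_algo applies the same routing after _ensure_data
    assert _get_hashtable_algo(strings)[0] is htable.StringHashTable

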

# --------------- #
# top-level algos #
# --------------- #


def unique(values):
    """
    Return unique values based on a hash table.

    Uniques are returned in order of appearance. This does NOT sort.

    Significantly faster than numpy.unique for long enough sequences.
    Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

        Return numpy.ndarray or ExtensionArray.

    See Also
    --------
    Index.unique : Return unique values from an Index.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(
    ...     pd.Series(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    <DatetimeArray>
    ['2016-01-01 00:00:00-05:00']
    Length: 1, dtype: datetime64[ns, US/Eastern]

    >>> pd.unique(
    ...     pd.Index(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
                  dtype='datetime64[ns, US/Eastern]',
                  freq=None)

    >>> pd.unique(list("baabc"))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(
    ...     pd.Series(
    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
    ...     )
    ... )
    ['b', 'a', 'c']
    Categories (3, object): ['a' < 'b' < 'c']

    An array of tuples

    >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    return unique_with_mask(values)


def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
    """See algorithms.unique for docs. Takes a mask for masked arrays."""
    values = _ensure_arraylike(values)

    if is_extension_array_dtype(values.dtype):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    htable, values = _get_hashtable_algo(values)

    table = htable(len(values))
    if mask is None:
        uniques = table.unique(values)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        return uniques

    else:
        uniques, mask = table.unique(values, mask=mask)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        assert mask is not None  # for mypy
        return uniques, mask.astype("bool")


unique1d = unique
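

# Illustrative sketch (editor's addition, not pandas source): uniques come back
# in order of appearance, and the masked variant returns a parallel boolean
# mask flagging which unique slots are missing values. Assumes 1.5-era masked
# hashtable semantics; _demo_unique_with_mask is a hypothetical name.
def _demo_unique_with_mask() -> None:
    values = np.array([2, 1, 2, 3], dtype=np.int64)
    assert unique(values).tolist() == [2, 1, 3]

    mask = np.array([False, False, False, True])
    uniques, res_mask = unique_with_mask(values, mask=mask)
    # the masked element contributes a single "missing" unique at the end
    assert res_mask.tolist() == [False, False, True]
    assert len(uniques) == 3

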

def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(comps).__name__}]"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
        orig_values = values
        values = _ensure_arraylike(list(values))

        if is_numeric_dtype(values) and not is_signed_integer_dtype(comps):
            # GH#46485 Use object to avoid upcast to float64 later
            # TODO: Share with _find_common_type_compat
            values = construct_1d_object_array_from_listlike(list(orig_values))

    elif isinstance(values, ABCMultiIndex):
        # Avoid raising in extract_array
        values = np.array(values)
    else:
        values = extract_array(values, extract_numpy=True, extract_range=True)

    comps_array = _ensure_arraylike(comps)
    comps_array = extract_array(comps_array, extract_numpy=True)
    if not isinstance(comps_array, np.ndarray):
        # i.e. Extension Array
        return comps_array.isin(values)

    elif needs_i8_conversion(comps_array.dtype):
        # Dispatch to DatetimeLikeArrayMixin.isin
        return pd_array(comps_array).isin(values)
    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
        # e.g. comps_array are integers and values are datetime64s
        return np.zeros(comps_array.shape, dtype=bool)
        # TODO: not quite right ... Sparse/Categorical
    elif needs_i8_conversion(values.dtype):
        return isin(comps_array, values.astype(object))

    elif isinstance(values.dtype, ExtensionDtype):
        return isin(np.asarray(comps_array), np.asarray(values))

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    # Albeit hashmap has O(1) look-up (vs. O(log n) in sorted array),
    # in1d is faster for small sizes
    if (
        len(comps_array) > 1_000_000
        and len(values) <= 26
        and not is_object_dtype(comps_array)
    ):
        # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
        if isna(values).any():

            def f(c, v):
                return np.logical_or(np.in1d(c, v), np.isnan(c))

        else:
            f = np.in1d

    else:
        common = np.find_common_type([values.dtype, comps_array.dtype], [])
        values = values.astype(common, copy=False)
        comps_array = comps_array.astype(common, copy=False)
        f = htable.ismember

    return f(comps_array, values)
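

# Illustrative sketch (editor's addition, not pandas source): the result is a
# boolean array aligned with `comps`; NaN only matches when NaN is among
# `values`, courtesy of the explicit NaN handling above. The name _demo_isin
# is hypothetical.
def _demo_isin() -> None:
    comps = np.array([1.0, 2.0, np.nan])
    assert isin(comps, [1.0]).tolist() == [True, False, False]
    # NaN is matched deliberately, not via ordinary equality
    assert isin(comps, [np.nan]).tolist() == [False, False, True]

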

def factorize_array(
    values: np.ndarray,
    na_sentinel: int | None = -1,
    size_hint: int | None = None,
    na_value: object = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
    """
    Factorize a numpy array to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    na_sentinel : int, default -1
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).
    mask : ndarray[bool], optional
        If not None, the mask is used as indicator for missing values
        (True = missing, False = valid) instead of `na_value` or
        condition "val != val".

    Returns
    -------
    codes : ndarray[np.intp]
    uniques : ndarray
    """
    ignore_na = na_sentinel is not None
    if not ignore_na:
        na_sentinel = -1

    original = values
    if values.dtype.kind in ["m", "M"]:
        # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
        # need to do the same to na_value. We are assuming here that the passed
        # na_value is an appropriately-typed NaT.
        # e.g. test_where_datetimelike_categorical
        na_value = iNaT

    hash_klass, values = _get_hashtable_algo(values)

    table = hash_klass(size_hint or len(values))
    uniques, codes = table.factorize(
        values,
        na_sentinel=na_sentinel,
        na_value=na_value,
        mask=mask,
        ignore_na=ignore_na,
    )

    # re-cast e.g. i8->dt64/td64, uint8->bool
    uniques = _reconstruct_data(uniques, original.dtype, original)

    codes = ensure_platform_int(codes)
    return codes, uniques
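

# Illustrative sketch (editor's addition, not pandas source): codes index into
# uniques, with -1 marking missing values under the default sentinel. The name
# _demo_factorize_array is hypothetical.
def _demo_factorize_array() -> None:
    values = np.array(["b", "a", "b", None], dtype=object)
    codes, uniques = factorize_array(values)
    assert codes.tolist() == [0, 1, 0, -1]
    assert uniques.tolist() == ["b", "a"]
    # taking the non-missing codes round-trips the original values
    assert uniques.take(codes[:3]).tolist() == ["b", "a", "b"]

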

@doc(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
def factorize(
    values,
    sort: bool = False,
    na_sentinel: int | None | lib.NoDefault = lib.no_default,
    use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    {values}{sort}
    na_sentinel : int or None, default -1
        Value to mark "not found". If None, will not drop the NaN
        from the uniques of the values.

        .. deprecated:: 1.5.0
            The na_sentinel argument is deprecated and
            will be removed in a future version of pandas. Specify use_na_sentinel as
            either True or False.

        .. versionchanged:: 1.1.2

    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.

        .. versionadded:: 1.5.0
    {size_hint}\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

            Even if there's a missing value in `values`, `uniques` will
            *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique values in an array.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.factorize>` for more examples.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0]...)
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1]...)
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    When ``use_na_sentinel=True`` (the default), missing values are indicated in
    the `codes` with the sentinel value ``-1`` and missing values are not
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0]...)
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1]...)
    >>> uniques
    ['a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1]...)
    >>> uniques
    Index(['a', 'c'], dtype='object')

    If NaN is in the values, and we want to include NaN in the uniques of the
    values, it can be achieved by setting ``use_na_sentinel=False``.

    >>> values = np.array([1, 2, 1, np.nan])
    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
    >>> codes
    array([ 0,  1,  0, -1])
    >>> uniques
    array([1., 2.])

    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
    >>> codes
    array([0, 1, 0, 2])
    >>> uniques
    array([ 1.,  2., nan])
    """
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.

    # GH#46910 deprecated na_sentinel in favor of use_na_sentinel:
    # na_sentinel=None corresponds to use_na_sentinel=False
    # na_sentinel=-1 corresponds to use_na_sentinel=True
    # Other na_sentinel values will not be supported when the deprecation is enforced.
    na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
    if isinstance(values, ABCRangeIndex):
        return values.factorize(sort=sort)

    values = _ensure_arraylike(values)
    original = values
    if not isinstance(values, ABCMultiIndex):
        values = extract_array(values, extract_numpy=True)

    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
    # of values, assign na_sentinel=-1 to replace code value for NaN.
    dropna = na_sentinel is not None

    if (
        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
        and values.freq is not None
    ):
        # The presence of 'freq' means we can fast-path sorting and know there
        # aren't NAs
        codes, uniques = values.factorize(sort=sort)
        return _re_wrap_factorize(original, uniques, codes)

    elif not isinstance(values.dtype, np.dtype):
        if (
            na_sentinel == -1 or na_sentinel is None
        ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
            # Avoid using catch_warnings when possible
            # GH#46910 - TimelikeOps has deprecated signature
            codes, uniques = values.factorize(  # type: ignore[call-arg]
                use_na_sentinel=na_sentinel is not None
            )
        else:
            na_sentinel_arg = -1 if na_sentinel is None else na_sentinel
            with warnings.catch_warnings():
                # We've already warned above
                warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
                codes, uniques = values.factorize(na_sentinel=na_sentinel_arg)

    else:
        values = np.asarray(values)  # convert DTA/TDA/MultiIndex
        # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
        # na_sentinel is None we append NA on the end because safe_sort does not
        # handle null values in uniques.
        if na_sentinel is None and sort:
            na_sentinel_arg = -1
        elif na_sentinel is None:
            na_sentinel_arg = None
        else:
            na_sentinel_arg = na_sentinel

        if not dropna and not sort and is_object_dtype(values):
            # factorize can now handle differentiating various types of null values.
            # These can only occur when the array has object dtype.
            # However, for backwards compatibility we only use the null for the
            # provided dtype. This may be revisited in the future, see GH#48476.
            null_mask = isna(values)
            if null_mask.any():
                na_value = na_value_for_dtype(values.dtype, compat=False)
                # Don't modify (potentially user-provided) array
                values = np.where(null_mask, na_value, values)

        codes, uniques = factorize_array(
            values,
            na_sentinel=na_sentinel_arg,
            size_hint=size_hint,
        )

    if sort and len(uniques) > 0:
        if na_sentinel is None:
            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
            na_sentinel = -1
        uniques, codes = safe_sort(
            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
        )

    if not dropna and sort:
        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
        if na_sentinel is None:
            na_sentinel_arg = -1
        else:
            na_sentinel_arg = na_sentinel
        code_is_na = codes == na_sentinel_arg
        if code_is_na.any():
            # na_value is set based on the dtype of uniques, and compat set to False is
            # because we do not want na_value to be 0 for integers
            na_value = na_value_for_dtype(uniques.dtype, compat=False)
            uniques = np.append(uniques, [na_value])
            codes = np.where(code_is_na, len(uniques) - 1, codes)

    uniques = _reconstruct_data(uniques, original.dtype, original)

    return _re_wrap_factorize(original, uniques, codes)


def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    Resolved value of na_sentinel.
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )
    if na_sentinel is lib.no_default:
        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
    else:
        if na_sentinel is None:
            msg = (
                "Specifying `na_sentinel=None` is deprecated, specify "
                "`use_na_sentinel=False` instead."
            )
        elif na_sentinel == -1:
            msg = (
                "Specifying `na_sentinel=-1` is deprecated, specify "
                "`use_na_sentinel=True` instead."
            )
        else:
            msg = (
                "Specifying the specific value to use for `na_sentinel` is "
                "deprecated and will be removed in a future version of pandas. "
                "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
                "`use_na_sentinel=False` to encode NaN values."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
        result = na_sentinel
    return result
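

# Illustrative sketch (editor's addition, not pandas source): the resolution
# table for the non-deprecated argument combinations. The name
# _demo_resolve_na_sentinel is hypothetical.
def _demo_resolve_na_sentinel() -> None:
    # neither argument supplied -> default sentinel -1
    assert resolve_na_sentinel(lib.no_default, lib.no_default) == -1
    # use_na_sentinel=True -> -1; use_na_sentinel=False -> None (keep NaN)
    assert resolve_na_sentinel(lib.no_default, True) == -1
    assert resolve_na_sentinel(lib.no_default, False) is None

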

def _re_wrap_factorize(original, uniques, codes: np.ndarray):
    """
    Wrap factorize results in Series or Index depending on original type.
    """
    if isinstance(original, ABCIndex):
        uniques = ensure_wrapped_if_datetimelike(uniques)
        uniques = original._shallow_copy(uniques, name=None)
    elif isinstance(original, ABCSeries):
        from pandas import Index

        uniques = Index(uniques)

    return codes, uniques


def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize : bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    from pandas import (
        Index,
        Series,
    )

    name = getattr(values, "name", None)

    if bins is not None:
        from pandas.core.reshape.tile import cut

        values = Series(values)
        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError as err:
            raise TypeError("bins argument only works with numeric data.") from err

        # count, remove nulls (from the index), and use the interval bins as the index
        result = ii.value_counts(dropna=dropna)
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropping NA and have no remaining values, return an empty result
        if dropna and (result._values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:

        if is_extension_array_dtype(values):

            # handle Categorical and sparse,
            result = Series(values)._values.value_counts(dropna=dropna)
            result.name = name
            counts = result._values

        else:
            values = _ensure_arraylike(values)
            keys, counts = value_counts_arraylike(values, dropna)

            # For backwards compatibility, we let Index do its normal type
            # inference, _except_ for when it infers from object to bool.
            idx = Index._with_infer(keys)
            if idx.dtype == bool and keys.dtype == object:
                idx = idx.astype(object)

            result = Series(counts, index=idx, name=name)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / counts.sum()

    return result
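

# Illustrative sketch (editor's addition, not pandas source): counts are sorted
# descending by default, and normalize divides by the total count. The name
# _demo_value_counts is hypothetical.
def _demo_value_counts() -> None:
    values = np.array(["a", "b", "a"], dtype=object)
    assert value_counts(values).to_dict() == {"a": 2, "b": 1}
    assert value_counts(values, normalize=True)["a"] == 2 / 3

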

# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
    """
    Parameters
    ----------
    values : np.ndarray
    dropna : bool
    mask : np.ndarray[bool] or None, default None

    Returns
    -------
    uniques : np.ndarray
    counts : np.ndarray[np.int64]
    """
    original = values
    values = _ensure_data(values)

    keys, counts = htable.value_count(values, dropna, mask=mask)

    if needs_i8_conversion(original.dtype):
        # datetime, timedelta, or period

        if dropna:
            mask = keys != iNaT
            keys, counts = keys[mask], counts[mask]

    res_keys = _reconstruct_data(keys, original.dtype, original)
    return res_keys, counts


def duplicated(
    values: ArrayLike, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : np.ndarray, ExtensionArray or Series
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray[bool]
    """
    values = _ensure_data(values)
    return htable.duplicated(values, keep=keep)
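

# Illustrative sketch (editor's addition, not pandas source): the three `keep`
# policies applied to the same input. The name _demo_duplicated is hypothetical.
def _demo_duplicated() -> None:
    values = np.array([1, 2, 1, 1], dtype=np.int64)
    assert duplicated(values, keep="first").tolist() == [False, False, True, True]
    assert duplicated(values, keep="last").tolist() == [True, False, True, False]
    assert duplicated(values, keep=False).tolist() == [True, False, True, True]

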

def mode(
    values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array from which to determine the mode(s).
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    values = _ensure_arraylike(values)
    original = values

    if needs_i8_conversion(values.dtype):
        # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
        values = ensure_wrapped_if_datetimelike(values)
        values = cast("ExtensionArray", values)
        return values._mode(dropna=dropna)

    values = _ensure_data(values)

    npresult = htable.mode(values, dropna=dropna, mask=mask)
    try:
        npresult = np.sort(npresult)
    except TypeError as err:
        warnings.warn(
            f"Unable to sort modes: {err}",
            stacklevel=find_stack_level(),
        )

    result = _reconstruct_data(npresult, original.dtype, original)
    return result


def rank(
    values: ArrayLike,
    axis: int = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
) -> npt.NDArray[np.float64]:
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which ties are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : bool, default True
        Whether or not the elements should be ranked in ascending order.
    pct : bool, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    is_datetimelike = needs_i8_conversion(values.dtype)
    values = _ensure_data(values)

    if values.ndim == 1:
        ranks = algos.rank_1d(
            values,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    elif values.ndim == 2:
        ranks = algos.rank_2d(
            values,
            axis=axis,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    else:
        raise TypeError("Arrays with ndim > 2 are not supported.")

    return ranks
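

# Illustrative sketch (editor's addition, not pandas source): 'average' splits
# tied ranks while 'dense' leaves no gaps. The name _demo_rank is hypothetical.
def _demo_rank() -> None:
    values = np.array([3.0, 1.0, 3.0])
    assert rank(values).tolist() == [2.5, 1.0, 2.5]
    assert rank(values, method="dense").tolist() == [2.0, 1.0, 2.0]

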

def checked_add_with_arr(
    arr: npt.NDArray[np.int64],
    b: int | npt.NDArray[np.int64],
    arr_mask: npt.NDArray[np.bool_] | None = None,
    b_mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.int64]:
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : np.ndarray[int64] addend.
    b : array or scalar addend.
    arr_mask : np.ndarray[bool] or None, default None
        array indicating which elements to exclude from checking
    b_mask : np.ndarray[bool] or None, default None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
        a scalar or an array for elements x + y for each element pair
        (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # For performance reasons, we broadcast 'b' to the new array 'b2'
    # so that it has the same size as 'arr'.
    b2 = np.broadcast_to(b, arr.shape)
    if b_mask is not None:
        # We do the same broadcasting for b_mask as well.
        b2_mask = np.broadcast_to(b_mask, arr.shape)
    else:
        b2_mask = None

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b_mask is not None:
        # error: Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has
        # incompatible type "Optional[ndarray[Any, dtype[bool_]]]";
        # expected "Union[_SupportsArray[dtype[Any]], _NestedSequence
        # [_SupportsArray[dtype[Any]]], bool, int, float, complex, str
        # , bytes, _NestedSequence[Union[bool, int, float, complex, str
        # , bytes]]]"
        not_nan = np.logical_not(b2_mask)  # type: ignore[arg-type]
    else:
        not_nan = np.empty(arr.shape, dtype=bool)
        not_nan.fill(True)

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    i8max = lib.i8max
    i8min = iNaT

    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        to_raise = ((i8min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        to_raise = ((i8max - b2 < arr) & not_nan).any()
    else:
        to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or (
            (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2]
        ).any()

    if to_raise:
        raise OverflowError("Overflow in int64 addition")

    result = arr + b
    if arr_mask is not None or b2_mask is not None:
        np.putmask(result, ~not_nan, iNaT)

    return result
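

# Illustrative sketch (editor's addition, not pandas source): an addition that
# would wrap around int64 raises instead. The name _demo_checked_add is
# hypothetical.
def _demo_checked_add() -> None:
    arr = np.array([1, 2], dtype=np.int64)
    assert checked_add_with_arr(arr, 1).tolist() == [2, 3]
    try:
        checked_add_with_arr(np.array([np.iinfo(np.int64).max], dtype=np.int64), 1)
    except OverflowError:
        pass  # expected: max + 1 overflows
    else:
        raise AssertionError("expected OverflowError")

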

# --------------- #
# select n        #
# --------------- #


class SelectN:
    def __init__(self, obj, n: int, keep: str) -> None:
        self.obj = obj
        self.n = n
        self.keep = keep

        if self.keep not in ("first", "last", "all"):
            raise ValueError('keep must be either "first", "last" or "all"')

    def compute(self, method: str) -> DataFrame | Series:
        raise NotImplementedError

    @final
    def nlargest(self):
        return self.compute("nlargest")

    @final
    def nsmallest(self):
        return self.compute("nsmallest")

    @final
    @staticmethod
    def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
        """
        Helper function to determine if dtype is valid for
        nsmallest/nlargest methods
        """
        return (
            is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
        ) or needs_i8_conversion(dtype)


class SelectNSeries(SelectN):
    """
    Implement n largest/smallest for Series

    Parameters
    ----------
    obj : Series
    n : int
    keep : {'first', 'last'}, default 'first'

    Returns
    -------
    nordered : Series
    """

    def compute(self, method: str) -> Series:

        from pandas.core.reshape.concat import concat

        n = self.n
        dtype = self.obj.dtype
        if not self.is_valid_dtype_n_method(dtype):
            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")

        if n <= 0:
            return self.obj[[]]

        dropped = self.obj.dropna()
        nan_index = self.obj.drop(dropped.index)

        # slow method
        if n >= len(self.obj):
            ascending = method == "nsmallest"
            return self.obj.sort_values(ascending=ascending).head(n)

        # fast method
        new_dtype = dropped.dtype
        arr = _ensure_data(dropped.values)
        if method == "nlargest":
            arr = -arr
            if is_integer_dtype(new_dtype):
                # GH 21426: ensure reverse ordering at boundaries
                arr -= 1

            elif is_bool_dtype(new_dtype):
                # GH 26154: ensure False is smaller than True
                arr = 1 - (-arr)

        if self.keep == "last":
            arr = arr[::-1]

        nbase = n
        narr = len(arr)
        n = min(n, narr)

        # arr passed into kth_smallest must be contiguous. We copy
        # here because kth_smallest will modify its input
        kth_val = algos.kth_smallest(arr.copy(order="C"), n - 1)
        (ns,) = np.nonzero(arr <= kth_val)
        inds = ns[arr[ns].argsort(kind="mergesort")]

        if self.keep != "all":
            inds = inds[:n]
            findex = nbase
        else:
            if len(inds) < nbase and len(nan_index) + len(inds) >= nbase:
                findex = len(nan_index) + len(inds)
            else:
                findex = len(inds)

        if self.keep == "last":
            # reverse indices
            inds = narr - 1 - inds

        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]


class SelectNFrame(SelectN):
    """
    Implement n largest/smallest for DataFrame

    Parameters
    ----------
    obj : DataFrame
    n : int
    keep : {'first', 'last'}, default 'first'
    columns : list or str

    Returns
    -------
    nordered : DataFrame
    """

    def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
        super().__init__(obj, n, keep)
        if not is_list_like(columns) or isinstance(columns, tuple):
            columns = [columns]

        columns = cast(Sequence[Hashable], columns)
        columns = list(columns)
        self.columns = columns

    def compute(self, method: str) -> DataFrame:

        from pandas.core.api import Int64Index

        n = self.n
        frame = self.obj
        columns = self.columns

        for column in columns:
            dtype = frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError(
                    f"Column {repr(column)} has dtype {dtype}, "
                    f"cannot use method {repr(method)} with this dtype"
                )

        def get_indexer(current_indexer, other_indexer):
            """
            Helper function to concat `current_indexer` and `other_indexer`
            depending on `method`
            """
            if method == "nsmallest":
                return current_indexer.append(other_indexer)
            else:
                return other_indexer.append(current_indexer)

        # Below we save and reset the index in case index contains duplicates
        original_index = frame.index
        cur_frame = frame = frame.reset_index(drop=True)
        cur_n = n
        indexer = Int64Index([])

        for i, column in enumerate(columns):
            # For each column we apply method to cur_frame[column].
            # If it's the last column or if we have the number of
            # results desired we are done.
            # Otherwise there are duplicates of the largest/smallest
            # value and we need to look at the rest of the columns
            # to determine which of the rows with the largest/smallest
            # value in the column to keep.
            series = cur_frame[column]
            is_last_column = len(columns) - 1 == i
            values = getattr(series, method)(
                cur_n, keep=self.keep if is_last_column else "all"
            )

            if is_last_column or len(values) <= cur_n:
                indexer = get_indexer(indexer, values.index)
                break

            # Now find all values which are equal to
            # the (nsmallest: largest)/(nlargest: smallest)
            # from our series.
            border_value = values == values[values.index[-1]]

            # Some of these values are among the top-n
            # some aren't.
            unsafe_values = values[border_value]

            # These values are definitely among the top-n
            safe_values = values[~border_value]
            indexer = get_indexer(indexer, safe_values.index)

            # Go on and separate the unsafe_values on the remaining
            # columns.
            cur_frame = cur_frame.loc[unsafe_values.index]
            cur_n = n - len(indexer)

        frame = frame.take(indexer)

        # Restore the index on frame
        frame.index = original_index.take(indexer)

        # If there is only one column, the frame is already sorted.
        if len(columns) == 1:
            return frame

        ascending = method == "nsmallest"

        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
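

# Illustrative sketch (editor's addition, not pandas source): with keep="all",
# every row tied with the n-th value is retained, so the result may be longer
# than n. The name _demo_select_n is hypothetical.
def _demo_select_n() -> None:
    import pandas as pd

    s = pd.Series([3, 3, 2, 1])
    assert SelectNSeries(s, n=1, keep="all").nlargest().tolist() == [3, 3]

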

# ---- #
# take #
# ---- #


def take(
    arr,
    indices: TakeIndexer,
    axis: int = 0,
    allow_fill: bool = False,
    fill_value=None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : array-like or scalar value
        Non array-likes (sequences/scalars without a dtype) are coerced
        to an ndarray.
    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> import pandas as pd

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...         fill_value=-10)
    array([ 10,  10, -10])
    """
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_nd(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result


# ------------ #
# searchsorted #
# ------------ #


def searchsorted(
    arr: ArrayLike,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
    """
    Find indices where elements should be inserted to maintain order.

    .. versionadded:: 0.25.0

    Find the indices into a sorted array `arr` (a) such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= arr[i]``
    right   ``arr[i-1] <= value < arr[i]``
    ======  ================================

    Parameters
    ----------
    arr : np.ndarray, ExtensionArray, Series
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array-like or scalar
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index. If there is no suitable
        index, return either 0 or N (where N is the length of `arr`).
    sorter : 1-D array-like, optional
        Optional array of integer indices that sort array `arr` into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints or int
        If value is array-like, array of insertion points.
        If value is scalar, a single integer.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    if (
        isinstance(arr, np.ndarray)
        and is_integer_dtype(arr.dtype)
        and (is_integer(value) or is_integer_dtype(value))
    ):
        # if `arr` and `value` have different dtypes, `arr` would be
        # recast by numpy, causing a slow search.
        # Before searching below, we therefore try to give `value` the
        # same dtype as `arr`, while guarding against integer overflows.
        iinfo = np.iinfo(arr.dtype.type)
        value_arr = np.array([value]) if is_scalar(value) else np.array(value)
        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
            # value within bounds, so no overflow, so can convert value dtype
            # to dtype of arr
            dtype = arr.dtype
        else:
            dtype = value_arr.dtype

        if is_scalar(value):
            # We know that value is int
            value = cast(int, dtype.type(value))
        else:
            value = pd_array(cast(ArrayLike, value), dtype=dtype)
    else:
        # E.g. if `arr` is an array with dtype='datetime64[ns]'
        # and `value` is a pd.Timestamp, we may need to convert value
        arr = ensure_wrapped_if_datetimelike(arr)

    # Argument 1 to "searchsorted" of "ndarray" has incompatible type
    # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
    return arr.searchsorted(value, side=side, sorter=sorter)  # type: ignore[arg-type]
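

# Illustrative sketch (editor's addition, not pandas source): integer `value`
# is cast to the array's dtype when it fits, so the array itself is never
# upcast. The name _demo_searchsorted is hypothetical.
def _demo_searchsorted() -> None:
    arr = np.array([1, 3, 5], dtype=np.int8)
    # 4 fits in int8, so the search runs without recasting `arr`
    assert searchsorted(arr, 4) == 2
    assert searchsorted(arr, [0, 6], side="right").tolist() == [0, 3]

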

1668# ---- # 

1669# diff # 

1670# ---- # 

1671 

1672_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"} 

1673 

1674 

1675def diff(arr, n: int, axis: int = 0): 

1676 """ 

1677 difference of n between self, 

1678 analogous to s-s.shift(n) 

1679 

1680 Parameters 

1681 ---------- 

1682 arr : ndarray or ExtensionArray 

1683 n : int 

1684 number of periods 

1685 axis : {0, 1} 

1686 axis to shift on 

1687 stacklevel : int, default 3 

1688 The stacklevel for the lost dtype warning. 

1689 

1690 Returns 

1691 ------- 

1692 shifted 

1693 """ 

1694 

1695 n = int(n) 

1696 na = np.nan 

1697 dtype = arr.dtype 

1698 

1699 is_bool = is_bool_dtype(dtype) 

1700 if is_bool: 

1701 op = operator.xor 

1702 else: 

1703 op = operator.sub 

1704 

1705 if isinstance(dtype, PandasDtype): 

1706 # PandasArray cannot necessarily hold shifted versions of itself. 

1707 arr = arr.to_numpy() 

1708 dtype = arr.dtype 

1709 

1710 if not isinstance(dtype, np.dtype): 

1711 # i.e ExtensionDtype 

1712 if hasattr(arr, f"__{op.__name__}__"): 

1713 if axis != 0: 

1714 raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") 

1715 return op(arr, arr.shift(n)) 

1716 else: 

1717 warnings.warn( 

1718 "dtype lost in 'diff()'. In the future this will raise a " 

1719 "TypeError. Convert to a suitable dtype prior to calling 'diff'.", 

1720 FutureWarning, 

1721 stacklevel=find_stack_level(), 

1722 ) 

1723 arr = np.asarray(arr) 

1724 dtype = arr.dtype 

1725 

1726 is_timedelta = False 

1727 if needs_i8_conversion(arr.dtype): 

1728 dtype = np.int64 

1729 arr = arr.view("i8") 

1730 na = iNaT 

1731 is_timedelta = True 

1732 

1733 elif is_bool: 

1734 # We have to cast in order to be able to hold np.nan 

1735 dtype = np.object_ 

1736 

1737 elif is_integer_dtype(dtype): 

1738 # We have to cast in order to be able to hold np.nan 

1739 

1740 # int8, int16 are incompatible with float64, 

1741 # see https://github.com/cython/cython/issues/2646 

1742 if arr.dtype.name in ["int8", "int16"]: 

1743 dtype = np.float32 

1744 else: 

1745 dtype = np.float64 

1746 

1747 orig_ndim = arr.ndim 

1748 if orig_ndim == 1: 

1749 # reshape so we can always use algos.diff_2d 

1750 arr = arr.reshape(-1, 1) 

1751 # TODO: require axis == 0 

1752 

1753 dtype = np.dtype(dtype) 

1754 out_arr = np.empty(arr.shape, dtype=dtype) 

1755 

1756 na_indexer = [slice(None)] * 2 

1757 na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None) 

1758 out_arr[tuple(na_indexer)] = na 

1759 

1760 if arr.dtype.name in _diff_special: 

1761 # TODO: can diff_2d dtype specialization troubles be fixed by defining 

1762 # out_arr inside diff_2d? 

1763 algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta) 

1764 else: 

1765 # To keep mypy happy, _res_indexer is a list while res_indexer is 

1766 # a tuple, ditto for lag_indexer. 

1767 _res_indexer = [slice(None)] * 2 

1768 _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n) 

1769 res_indexer = tuple(_res_indexer) 

1770 

1771 _lag_indexer = [slice(None)] * 2 

1772 _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None) 

1773 lag_indexer = tuple(_lag_indexer) 

1774 

1775 out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer]) 

1776 

1777 if is_timedelta: 

1778 out_arr = out_arr.view("timedelta64[ns]") 

1779 

1780 if orig_ndim == 1: 

1781 out_arr = out_arr[:, 0] 

1782 return out_arr 
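
# Editor's sketch (not part of this module): `diff` is observable through
# the public API and matches the ``s - s.shift(n)`` definition; boolean
# input uses XOR rather than subtraction, as selected above.
import pandas as pd

_s = pd.Series([1, 4, 9, 16], dtype="int64")
# int64 is upcast to float64 so the leading NaN can be represented
assert _s.diff(1).equals(_s - _s.shift(1))

_b = pd.Series([True, True, False])
# adjacent-element XOR; object dtype so NaN can sit alongside bools
assert list(_b.diff())[1:] == [False, True]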

1783 

1784 

1785# -------------------------------------------------------------------- 

1786# Helper functions 

1787 

1788# Note: safe_sort is in algorithms.py instead of sorting.py because it is 

1789 # low-dependency, is used in this module, and uses private methods from 

1790# this module. 

1791def safe_sort( 

1792 values, 

1793 codes=None, 

1794 na_sentinel: int = -1, 

1795 assume_unique: bool = False, 

1796 verify: bool = True, 

1797) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]: 

1798 """ 

1799 Sort ``values`` and reorder corresponding ``codes``. 

1800 

1801 ``values`` should be unique if ``codes`` is not None. 

1802 Safe for use with mixed types (int, str), orders ints before strs. 

1803 

1804 Parameters 

1805 ---------- 

1806 values : list-like 

1807 Sequence; must be unique if ``codes`` is not None. 

1808 codes : list-like, optional 

1809 Indices to ``values``. All out of bound indices are treated as 

1810 "not found" and will be masked with ``na_sentinel``. 

1811 na_sentinel : int, default -1 

1812 Value in ``codes`` to mark "not found". 

1813 Ignored when ``codes`` is None. 

1814 assume_unique : bool, default False 

1815 When True, ``values`` are assumed to be unique, which can speed up 

1816 the calculation. Ignored when ``codes`` is None. 

1817 verify : bool, default True 

1818 Check if codes are out of bound for the values and put out of bound 

1819 codes equal to na_sentinel. If ``verify=False``, it is assumed there 

1820 are no out of bound codes. Ignored when ``codes`` is None. 

1821 

1822 .. versionadded:: 0.25.0 

1823 

1824 Returns 

1825 ------- 

1826 ordered : ndarray or MultiIndex 

1827 Sorted ``values`` 

1828 new_codes : ndarray 

1829 Reordered ``codes``; returned when ``codes`` is not None. 

1830 

1831 Raises 

1832 ------ 

1833 TypeError 

1834 * If ``values`` is not list-like or if ``codes`` is neither None 

1835 nor list-like 

1836 * If ``values`` cannot be sorted 

1837 ValueError 

1838 * If ``codes`` is not None and ``values`` contain duplicates. 

1839 """ 

1840 if not is_list_like(values): 

1841 raise TypeError( 

1842 "Only list-like objects are allowed to be passed to safe_sort as values" 

1843 ) 

1844 original_values = values 

1845 is_mi = isinstance(original_values, ABCMultiIndex) 

1846 

1847 if not isinstance(values, (np.ndarray, ABCExtensionArray)): 

1848 # don't convert to string types 

1849 dtype, _ = infer_dtype_from_array(values) 

1850 # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any], 

1851 # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str, 

1852 # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], 

1853 # _DTypeDict, Tuple[Any, Any]]]" 

1854 values = np.asarray(values, dtype=dtype) # type: ignore[arg-type] 

1855 

1856 sorter = None 

1857 ordered: np.ndarray | MultiIndex 

1858 

1859 if ( 

1860 not is_extension_array_dtype(values) 

1861 and lib.infer_dtype(values, skipna=False) == "mixed-integer" 

1862 ): 

1863 ordered = _sort_mixed(values) 

1864 else: 

1865 try: 

1866 sorter = values.argsort() 

1867 if is_mi: 

1868 # Operate on original object instead of casted array (MultiIndex) 

1869 ordered = original_values.take(sorter) 

1870 else: 

1871 ordered = values.take(sorter) 

1872 except TypeError: 

1873 # Previous sorters failed or were not applicable; fall back to 

1874 # `_sort_mixed`, except for the special case of 1d arrays of tuples, 

1875 # which `_sort_mixed` cannot handle and `_sort_tuples` handles instead. 

1876 if values.size and isinstance(values[0], tuple): 

1877 ordered = _sort_tuples(values, original_values) 

1878 else: 

1879 ordered = _sort_mixed(values) 

1880 

1881 # codes: 

1882 

1883 if codes is None: 

1884 return ordered 

1885 

1886 if not is_list_like(codes): 

1887 raise TypeError( 

1888 "Only list-like objects or None are allowed to " 

1889 "be passed to safe_sort as codes" 

1890 ) 

1891 codes = ensure_platform_int(np.asarray(codes)) 

1892 

1893 if not assume_unique and not len(unique(values)) == len(values): 

1894 raise ValueError("values should be unique if codes is not None") 

1895 

1896 if sorter is None: 

1897 # mixed types 

1898 hash_klass, values = _get_hashtable_algo(values) 

1899 t = hash_klass(len(values)) 

1900 t.map_locations(values) 

1901 sorter = ensure_platform_int(t.lookup(ordered)) 

1902 

1903 if na_sentinel == -1: 

1904 # take_nd is faster, but only works for na_sentinels of -1 

1905 order2 = sorter.argsort() 

1906 new_codes = take_nd(order2, codes, fill_value=-1) 

1907 if verify: 

1908 mask = (codes < -len(values)) | (codes >= len(values)) 

1909 else: 

1910 mask = None 

1911 else: 

1912 reverse_indexer = np.empty(len(sorter), dtype=np.int_) 

1913 reverse_indexer.put(sorter, np.arange(len(sorter))) 

1914 # Out of bound indices will be masked with `na_sentinel` next, so we 

1915 # may deal with them here without performance loss using `mode='wrap'` 

1916 new_codes = reverse_indexer.take(codes, mode="wrap") 

1917 

1918 mask = codes == na_sentinel 

1919 if verify: 

1920 mask = mask | (codes < -len(values)) | (codes >= len(values)) 

1921 

1922 if mask is not None: 

1923 np.putmask(new_codes, mask, na_sentinel) 

1924 

1925 return ordered, ensure_platform_int(new_codes) 
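
# Editor's sketch (not part of this module, and hedged: `safe_sort` is a
# private API, subject to change): after sorting, the remapped codes
# still point at the same values they labelled before.
import numpy as np
from pandas.core.algorithms import safe_sort as _safe_sort

_values = np.array([3, 1, 2])
_codes = np.array([0, 1, 2, 0, -1])          # -1 is the na_sentinel
_ordered, _new_codes = _safe_sort(_values, _codes)
assert list(_ordered) == [1, 2, 3]
# code 0 pointed at value 3, which now lives at position 2, and so on
assert list(_new_codes) == [2, 0, 1, 2, -1]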

1926 

1927 

1928def _sort_mixed(values) -> np.ndarray: 

1929 """order ints before strings in 1d arrays, safe in py3""" 

1930 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) 

1931 none_pos = np.array([x is None for x in values], dtype=bool) 

1932 nums = np.sort(values[~str_pos & ~none_pos]) 

1933 strs = np.sort(values[str_pos]) 

1934 return np.concatenate( 

1935 [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])] 

1936 ) 
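
# Editor's sketch (not part of this module): the ordering contract of the
# private helper above, shown on an object array; run standalone via
# `from pandas.core.algorithms import _sort_mixed` (hedged: private API).
import numpy as np

_vals = np.array(["b", 2, None, 1, "a"], dtype=object)
assert list(_sort_mixed(_vals)) == [1, 2, "a", "b", None]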

1937 

1938 

1939@overload 

1940def _sort_tuples(values: np.ndarray, original_values: np.ndarray) -> np.ndarray: 

1941 ... 

1942 

1943 

1944@overload 

1945def _sort_tuples(values: np.ndarray, original_values: MultiIndex) -> MultiIndex: 

1946 ... 

1947 

1948 

1949def _sort_tuples( 

1950 values: np.ndarray, original_values: np.ndarray | MultiIndex 

1951) -> np.ndarray | MultiIndex: 

1952 """ 

1953 Sort a 1d array of tuples by splitting it into a 2d collection of columns. 

1954 The columns must be kept separate because they may contain different 

1955 types and NaNs (`np.sort` can fail when str and nan are mixed in a 

1956 column, since the types cannot be compared). 

1957 The indexer is applied to the original values to preserve the dtypes 

1958 in the MultiIndex case. 

1959 """ 

1960 from pandas.core.internals.construction import to_arrays 

1961 from pandas.core.sorting import lexsort_indexer 

1962 

1963 arrays, _ = to_arrays(values, None) 

1964 indexer = lexsort_indexer(arrays, orders=True) 

1965 return original_values[indexer] 
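
# Editor's sketch (not part of this module): the same column-wise idea in
# plain numpy, since `to_arrays`/`lexsort_indexer` are deep internals; each
# column becomes its own sort key so mixed types never get compared.
import numpy as np

_data = [(2, "b"), (1, "b"), (1, "a")]
_tuples = np.empty(len(_data), dtype=object)
for _i, _t in enumerate(_data):              # element-wise fill keeps the
    _tuples[_i] = _t                         # tuples from being unpacked
_cols = list(zip(*_tuples))                  # [(2, 1, 1), ('b', 'b', 'a')]
# np.lexsort treats its LAST key as primary, so pass the columns reversed.
_idx = np.lexsort([np.asarray(c) for c in reversed(_cols)])
assert list(_tuples[_idx]) == [(1, "a"), (1, "b"), (2, "b")]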

1966 

1967 

1968def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: 

1969 """ 

1970 Extract the union of lvals and rvals, accounting for duplicates and 

1971 NaNs in both arrays. 

1972 

1973 Parameters 

1974 ---------- 

1975 lvals : np.ndarray or ExtensionArray 

1976 Left values, which are ordered first in the result. 

1977 rvals : np.ndarray or ExtensionArray 

1978 Right values, which are ordered after lvals. 

1979 

1980 Returns 

1981 ------- 

1982 np.ndarray or ExtensionArray 

1983 Containing the unsorted union of both arrays. 

1984 

1985 Notes 

1986 ----- 

1987 Caller is responsible for ensuring lvals.dtype == rvals.dtype. 

1988 """ 

1989 indexer = [] 

1990 l_count = value_counts(lvals, dropna=False) 

1991 r_count = value_counts(rvals, dropna=False) 

1992 l_count, r_count = l_count.align(r_count, fill_value=0) 

1993 unique_array = unique(concat_compat([lvals, rvals])) 

1994 unique_array = ensure_wrapped_if_datetimelike(unique_array) 

1995 

1996 for i, value in enumerate(unique_array): 

1997 indexer += [i] * int(max(l_count.at[value], r_count.at[value])) 

1998 return unique_array.take(indexer)
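
# Editor's sketch (not part of this module): Index.union reaches this
# helper when either side contains duplicates, so the behaviour is visible
# publicly; each value keeps max(left count, right count) occurrences.
import pandas as pd

_left = pd.Index([1, 1, 3, 4])
_right = pd.Index([1, 2, 3, 3])
# 1 is doubled on the left, 3 on the right; both multiplicities survive,
# and the default sort=None orders the result when possible.
assert list(_left.union(_right)) == [1, 1, 2, 3, 3, 4]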