Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/algorithms.py: 9%
575 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Generic data algorithms. This module is experimental at the moment and not
3intended for public consumption
4"""
5from __future__ import annotations
7import inspect
8import operator
9from textwrap import dedent
10from typing import (
11 TYPE_CHECKING,
12 Hashable,
13 Literal,
14 Sequence,
15 cast,
16 final,
17 overload,
18)
19import warnings
21import numpy as np
23from pandas._libs import (
24 algos,
25 hashtable as htable,
26 iNaT,
27 lib,
28)
29from pandas._typing import (
30 AnyArrayLike,
31 ArrayLike,
32 DtypeObj,
33 IndexLabel,
34 TakeIndexer,
35 npt,
36)
37from pandas.util._decorators import doc
38from pandas.util._exceptions import find_stack_level
40from pandas.core.dtypes.cast import (
41 construct_1d_object_array_from_listlike,
42 infer_dtype_from_array,
43 sanitize_to_nanoseconds,
44)
45from pandas.core.dtypes.common import (
46 ensure_float64,
47 ensure_object,
48 ensure_platform_int,
49 is_array_like,
50 is_bool_dtype,
51 is_categorical_dtype,
52 is_complex_dtype,
53 is_datetime64_dtype,
54 is_extension_array_dtype,
55 is_float_dtype,
56 is_integer,
57 is_integer_dtype,
58 is_list_like,
59 is_numeric_dtype,
60 is_object_dtype,
61 is_scalar,
62 is_signed_integer_dtype,
63 is_timedelta64_dtype,
64 needs_i8_conversion,
65)
66from pandas.core.dtypes.concat import concat_compat
67from pandas.core.dtypes.dtypes import (
68 BaseMaskedDtype,
69 ExtensionDtype,
70 PandasDtype,
71)
72from pandas.core.dtypes.generic import (
73 ABCDatetimeArray,
74 ABCExtensionArray,
75 ABCIndex,
76 ABCMultiIndex,
77 ABCRangeIndex,
78 ABCSeries,
79 ABCTimedeltaArray,
80)
81from pandas.core.dtypes.missing import (
82 isna,
83 na_value_for_dtype,
84)
86from pandas.core.array_algos.take import take_nd
87from pandas.core.construction import (
88 array as pd_array,
89 ensure_wrapped_if_datetimelike,
90 extract_array,
91)
92from pandas.core.indexers import validate_indices
94if TYPE_CHECKING: 94 ↛ 96line 94 didn't jump to line 96, because the condition on line 94 was never true
96 from pandas._typing import (
97 NumpySorter,
98 NumpyValueArrayLike,
99 )
101 from pandas import (
102 Categorical,
103 DataFrame,
104 Index,
105 MultiIndex,
106 Series,
107 )
108 from pandas.core.arrays import (
109 BaseMaskedArray,
110 ExtensionArray,
111 )
114# --------------- #
115# dtype access #
116# --------------- #
def _ensure_data(values: ArrayLike) -> np.ndarray:
    """
    routine to ensure that our data is of the correct
    input dtype for lower-level routines

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint8
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    np.ndarray
    """

    if not isinstance(values, ABCMultiIndex):
        # extract_array would raise
        values = extract_array(values, extract_numpy=True)

    # Dispatch on dtype. Object is checked first, so none of the numeric
    # branches below ever see object-dtype data.
    if is_object_dtype(values.dtype):
        return ensure_object(np.asarray(values))

    elif isinstance(values.dtype, BaseMaskedDtype):
        # i.e. BooleanArray, FloatingArray, IntegerArray
        values = cast("BaseMaskedArray", values)
        if not values._hasna:
            # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
            # recurse to avoid re-implementing logic for eg bool->uint8
            return _ensure_data(values._data)
        # Has pd.NA entries: np.asarray goes through object dtype so the NAs
        # survive the conversion.
        return np.asarray(values)

    elif is_categorical_dtype(values.dtype):
        # NB: cases that go through here should NOT be using _reconstruct_data
        # on the back-end.
        values = cast("Categorical", values)
        return values.codes

    elif is_bool_dtype(values.dtype):
        if isinstance(values, np.ndarray):
            # i.e. actually dtype == np.dtype("bool")
            return np.asarray(values).view("uint8")
        else:
            # e.g. Sparse[bool, False]  # TODO: no test cases get here
            return np.asarray(values).astype("uint8", copy=False)

    elif is_integer_dtype(values.dtype):
        return np.asarray(values)

    elif is_float_dtype(values.dtype):
        # Note: checking `values.dtype == "float128"` raises on Windows and 32bit
        # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
        # has no attribute "itemsize"
        if values.dtype.itemsize in [2, 12, 16]:  # type: ignore[union-attr]
            # we dont (yet) have float128 hashtable support
            return ensure_float64(values)
        return np.asarray(values)

    elif is_complex_dtype(values.dtype):
        return cast(np.ndarray, values)

    # datetimelike
    elif needs_i8_conversion(values.dtype):
        if isinstance(values, np.ndarray):
            values = sanitize_to_nanoseconds(values)
        npvalues = values.view("i8")
        npvalues = cast(np.ndarray, npvalues)
        return npvalues

    # we have failed, return object
    values = np.asarray(values, dtype=object)
    return ensure_object(values)
197def _reconstruct_data(
198 values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
199) -> ArrayLike:
200 """
201 reverse of _ensure_data
203 Parameters
204 ----------
205 values : np.ndarray or ExtensionArray
206 dtype : np.dtype or ExtensionDtype
207 original : AnyArrayLike
209 Returns
210 -------
211 ExtensionArray or np.ndarray
212 """
213 if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
214 # Catch DatetimeArray/TimedeltaArray
215 return values
217 if not isinstance(dtype, np.dtype):
218 # i.e. ExtensionDtype; note we have ruled out above the possibility
219 # that values.dtype == dtype
220 cls = dtype.construct_array_type()
222 values = cls._from_sequence(values, dtype=dtype)
224 else:
225 if is_datetime64_dtype(dtype):
226 dtype = np.dtype("datetime64[ns]")
227 elif is_timedelta64_dtype(dtype):
228 dtype = np.dtype("timedelta64[ns]")
230 values = values.astype(dtype, copy=False)
232 return values
235def _ensure_arraylike(values) -> ArrayLike:
236 """
237 ensure that we are arraylike if not already
238 """
239 if not is_array_like(values):
240 inferred = lib.infer_dtype(values, skipna=False)
241 if inferred in ["mixed", "string", "mixed-integer"]:
242 # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
243 if isinstance(values, tuple):
244 values = list(values)
245 values = construct_1d_object_array_from_listlike(values)
246 else:
247 values = np.asarray(values)
248 return values
# Mapping from a normalized dtype name (as produced by _ensure_data /
# _check_object_for_strings) to the matching C hashtable class in
# pandas._libs.hashtable. "string" is a special case: object arrays that
# infer as all-string get the cheaper StringHashTable.
_hashtables = {
    "complex128": htable.Complex128HashTable,
    "complex64": htable.Complex64HashTable,
    "float64": htable.Float64HashTable,
    "float32": htable.Float32HashTable,
    "uint64": htable.UInt64HashTable,
    "uint32": htable.UInt32HashTable,
    "uint16": htable.UInt16HashTable,
    "uint8": htable.UInt8HashTable,
    "int64": htable.Int64HashTable,
    "int32": htable.Int32HashTable,
    "int16": htable.Int16HashTable,
    "int8": htable.Int8HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}
def _get_hashtable_algo(values: np.ndarray):
    """
    Parameters
    ----------
    values : np.ndarray

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
    """
    # Coerce to a dtype the hashtable layer understands, then select the
    # table class keyed by the resulting dtype name.
    values = _ensure_data(values)
    dtype_name = _check_object_for_strings(values)
    return _hashtables[dtype_name], values
287def _check_object_for_strings(values: np.ndarray) -> str:
288 """
289 Check if we can use string hashtable instead of object hashtable.
291 Parameters
292 ----------
293 values : ndarray
295 Returns
296 -------
297 str
298 """
299 ndtype = values.dtype.name
300 if ndtype == "object":
302 # it's cheaper to use a String Hash Table than Object; we infer
303 # including nulls because that is the only difference between
304 # StringHashTable and ObjectHashtable
305 if lib.infer_dtype(values, skipna=False) in ["string"]:
306 ndtype = "string"
307 return ndtype
310# --------------- #
311# top-level algos #
312# --------------- #
def unique(values):
    """
    Return unique values based on a hash table.

    Uniques are returned in order of appearance. This does NOT sort.

    Significantly faster than numpy.unique for long enough sequences.
    Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

        Return numpy.ndarray or ExtensionArray.

    See Also
    --------
    Index.unique : Return unique values from an Index.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(
    ...     pd.Series(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    <DatetimeArray>
    ['2016-01-01 00:00:00-05:00']
    Length: 1, dtype: datetime64[ns, US/Eastern]

    >>> pd.unique(
    ...     pd.Index(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
                  dtype='datetime64[ns, US/Eastern]',
                  freq=None)

    >>> pd.unique(list("baabc"))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(
    ...     pd.Series(
    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
    ...     )
    ... )
    ['b', 'a', 'c']
    Categories (3, object): ['a' < 'b' < 'c']

    An array of tuples

    >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    # Thin wrapper: the shared implementation additionally supports an
    # optional NA mask for masked arrays (see unique_with_mask).
    return unique_with_mask(values)
def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
    """See algorithms.unique for docs. Takes a mask for masked arrays."""
    values = _ensure_arraylike(values)

    if is_extension_array_dtype(values.dtype):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    table_cls, values = _get_hashtable_algo(values)
    table = table_cls(len(values))

    if mask is None:
        # Unmasked path: single return value.
        return _reconstruct_data(table.unique(values), original.dtype, original)

    # Masked path: the hashtable also returns the mask of the uniques.
    uniques, mask = table.unique(values, mask=mask)
    uniques = _reconstruct_data(uniques, original.dtype, original)
    assert mask is not None  # for mypy
    return uniques, mask.astype("bool")
436unique1d = unique
def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.

    Raises
    ------
    TypeError
        If either `comps` or `values` is not list-like.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(comps).__name__}]"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    # Normalize `values` to an ndarray/ExtensionArray.
    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
        orig_values = values
        values = _ensure_arraylike(list(values))

        if is_numeric_dtype(values) and not is_signed_integer_dtype(comps):
            # GH#46485 Use object to avoid upcast to float64 later
            # TODO: Share with _find_common_type_compat
            values = construct_1d_object_array_from_listlike(list(orig_values))

    elif isinstance(values, ABCMultiIndex):
        # Avoid raising in extract_array
        values = np.array(values)
    else:
        values = extract_array(values, extract_numpy=True, extract_range=True)

    comps_array = _ensure_arraylike(comps)
    comps_array = extract_array(comps_array, extract_numpy=True)
    if not isinstance(comps_array, np.ndarray):
        # i.e. Extension Array
        return comps_array.isin(values)

    elif needs_i8_conversion(comps_array.dtype):
        # Dispatch to DatetimeLikeArrayMixin.isin
        return pd_array(comps_array).isin(values)
    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
        # e.g. comps_array are integers and values are datetime64s
        return np.zeros(comps_array.shape, dtype=bool)
        # TODO: not quite right ... Sparse/Categorical
    elif needs_i8_conversion(values.dtype):
        # e.g. comps_array are objects: retry with values cast to object.
        return isin(comps_array, values.astype(object))

    elif isinstance(values.dtype, ExtensionDtype):
        # Retry with both sides as plain ndarrays.
        return isin(np.asarray(comps_array), np.asarray(values))

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
    # in1d is faster for small sizes
    if (
        len(comps_array) > 1_000_000
        and len(values) <= 26
        and not is_object_dtype(comps_array)
    ):
        # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
        if isna(values).any():

            def f(c, v):
                return np.logical_or(np.in1d(c, v), np.isnan(c))

        else:
            f = np.in1d

    else:
        # General path: cast both sides to a common dtype and use the
        # C-level hashtable membership check.
        common = np.find_common_type([values.dtype, comps_array.dtype], [])
        values = values.astype(common, copy=False)
        comps_array = comps_array.astype(common, copy=False)
        f = htable.ismember

    return f(comps_array, values)
def factorize_array(
    values: np.ndarray,
    na_sentinel: int | None = -1,
    size_hint: int | None = None,
    na_value: object = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
    """
    Factorize a numpy array to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    na_sentinel : int, default -1
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).
    mask : ndarray[bool], optional
        If not None, the mask is used as indicator for missing values
        (True = missing, False = valid) instead of `na_value` or
        condition "val != val".

    Returns
    -------
    codes : ndarray[np.intp]
    uniques : ndarray
    """
    # na_sentinel=None means "keep NAs as regular uniques"; the hashtable
    # still requires an integer sentinel, so fall back to -1 internally and
    # tell it not to ignore NAs.
    ignore_na = na_sentinel is not None
    if not ignore_na:
        na_sentinel = -1

    orig_values = values
    if values.dtype.kind in "mM":
        # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
        # need to do the same to na_value. We are assuming here that the passed
        # na_value is an appropriately-typed NaT.
        # e.g. test_where_datetimelike_categorical
        na_value = iNaT

    table_cls, values = _get_hashtable_algo(values)

    table = table_cls(size_hint or len(values))
    uniques, codes = table.factorize(
        values,
        na_sentinel=na_sentinel,
        na_value=na_value,
        mask=mask,
        ignore_na=ignore_na,
    )

    # re-cast e.g. i8->dt64/td64, uint8->bool
    uniques = _reconstruct_data(uniques, orig_values.dtype, orig_values)

    return ensure_platform_int(codes), uniques
@doc(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
def factorize(
    values,
    sort: bool = False,
    na_sentinel: int | None | lib.NoDefault = lib.no_default,
    use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    {values}{sort}
    na_sentinel : int or None, default -1
        Value to mark "not found". If None, will not drop the NaN
        from the uniques of the values.

        .. deprecated:: 1.5.0
            The na_sentinel argument is deprecated and
            will be removed in a future version of pandas. Specify use_na_sentinel as
            either True or False.

        .. versionchanged:: 1.1.2

    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.

        .. versionadded:: 1.5.0
    {size_hint}\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

            Even if there's a missing value in `values`, `uniques` will
            *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique value in an array.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.factorize>` for more examples.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0]...)
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is the maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1]...)
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    When ``use_na_sentinel=True`` (the default), missing values are indicated in
    the `codes` with the sentinel value ``-1`` and missing values are not
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0]...)
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1]...)
    >>> uniques
    ['a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1]...)
    >>> uniques
    Index(['a', 'c'], dtype='object')

    If NaN is in the values, and we want to include NaN in the uniques of the
    values, it can be achieved by setting ``use_na_sentinel=False``.

    >>> values = np.array([1, 2, 1, np.nan])
    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
    >>> codes
    array([ 0,  1,  0, -1])
    >>> uniques
    array([1., 2.])

    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
    >>> codes
    array([0, 1, 0, 2])
    >>> uniques
    array([ 1.,  2., nan])
    """
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.

    # GH#46910 deprecated na_sentinel in favor of use_na_sentinel:
    # na_sentinel=None corresponds to use_na_sentinel=False
    # na_sentinel=-1 correspond to use_na_sentinel=True
    # Other na_sentinel values will not be supported when the deprecation is enforced.
    na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
    if isinstance(values, ABCRangeIndex):
        # RangeIndex factorizes itself without materializing an ndarray.
        return values.factorize(sort=sort)

    values = _ensure_arraylike(values)
    original = values
    if not isinstance(values, ABCMultiIndex):
        values = extract_array(values, extract_numpy=True)

    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
    # of values, assign na_sentinel=-1 to replace code value for NaN.
    dropna = na_sentinel is not None

    if (
        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
        and values.freq is not None
    ):
        # The presence of 'freq' means we can fast-path sorting and know there
        # aren't NAs
        codes, uniques = values.factorize(sort=sort)
        return _re_wrap_factorize(original, uniques, codes)

    elif not isinstance(values.dtype, np.dtype):
        # ExtensionArray path: delegate factorization to the array type.
        if (
            na_sentinel == -1 or na_sentinel is None
        ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
            # Avoid using catch_warnings when possible
            # GH#46910 - TimelikeOps has deprecated signature
            codes, uniques = values.factorize(  # type: ignore[call-arg]
                use_na_sentinel=na_sentinel is not None
            )
        else:
            na_sentinel_arg = -1 if na_sentinel is None else na_sentinel
            with warnings.catch_warnings():
                # We've already warned above
                warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
                codes, uniques = values.factorize(na_sentinel=na_sentinel_arg)

    else:
        values = np.asarray(values)  # convert DTA/TDA/MultiIndex
        # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
        # na_sentinel is None we append NA on the end because safe_sort does not
        # handle null values in uniques.
        if na_sentinel is None and sort:
            na_sentinel_arg = -1
        elif na_sentinel is None:
            na_sentinel_arg = None
        else:
            na_sentinel_arg = na_sentinel

        if not dropna and not sort and is_object_dtype(values):
            # factorize can now handle differentiating various types of null values.
            # These can only occur when the array has object dtype.
            # However, for backwards compatibility we only use the null for the
            # provided dtype. This may be revisited in the future, see GH#48476.
            null_mask = isna(values)
            if null_mask.any():
                na_value = na_value_for_dtype(values.dtype, compat=False)
                # Don't modify (potentially user-provided) array
                values = np.where(null_mask, na_value, values)

        codes, uniques = factorize_array(
            values,
            na_sentinel=na_sentinel_arg,
            size_hint=size_hint,
        )

    if sort and len(uniques) > 0:
        if na_sentinel is None:
            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
            na_sentinel = -1
        uniques, codes = safe_sort(
            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
        )

    if not dropna and sort:
        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
        if na_sentinel is None:
            na_sentinel_arg = -1
        else:
            na_sentinel_arg = na_sentinel
        code_is_na = codes == na_sentinel_arg
        if code_is_na.any():
            # na_value is set based on the dtype of uniques, and compat set to False is
            # because we do not want na_value to be 0 for integers
            na_value = na_value_for_dtype(uniques.dtype, compat=False)
            uniques = np.append(uniques, [na_value])
            codes = np.where(code_is_na, len(uniques) - 1, codes)

    uniques = _reconstruct_data(uniques, original.dtype, original)

    return _re_wrap_factorize(original, uniques, codes)
def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    Resolved value of na_sentinel.

    Raises
    ------
    ValueError
        If both `na_sentinel` and `use_na_sentinel` are specified.
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        # BUG FIX: the message previously misspelled the parameter as
        # "use_na_sentile".
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )
    if na_sentinel is lib.no_default:
        # Only use_na_sentinel (or neither) was given; default is the -1 sentinel.
        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
    else:
        # Deprecated na_sentinel was given explicitly: warn, then honor it.
        if na_sentinel is None:
            msg = (
                "Specifying `na_sentinel=None` is deprecated, specify "
                "`use_na_sentinel=False` instead."
            )
        elif na_sentinel == -1:
            msg = (
                "Specifying `na_sentinel=-1` is deprecated, specify "
                "`use_na_sentinel=True` instead."
            )
        else:
            msg = (
                "Specifying the specific value to use for `na_sentinel` is "
                "deprecated and will be removed in a future version of pandas. "
                "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
                "`use_na_sentinel=False` to encode NaN values."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
        result = na_sentinel
    return result
901def _re_wrap_factorize(original, uniques, codes: np.ndarray):
902 """
903 Wrap factorize results in Series or Index depending on original type.
904 """
905 if isinstance(original, ABCIndex):
906 uniques = ensure_wrapped_if_datetimelike(uniques)
907 uniques = original._shallow_copy(uniques, name=None)
908 elif isinstance(original, ABCSeries):
909 from pandas import Index
911 uniques = Index(uniques)
913 return codes, uniques
def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize: bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    from pandas import (
        Index,
        Series,
    )

    # Preserve the name of a named input (e.g. a Series) on the result.
    name = getattr(values, "name", None)

    if bins is not None:
        from pandas.core.reshape.tile import cut

        values = Series(values)
        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError as err:
            raise TypeError("bins argument only works with numeric data.") from err

        # count, remove nulls (from the index), and cast the index to interval
        result = ii.value_counts(dropna=dropna)
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result._values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:

        if is_extension_array_dtype(values):

            # handle Categorical and sparse,
            result = Series(values)._values.value_counts(dropna=dropna)
            result.name = name
            counts = result._values

        else:
            values = _ensure_arraylike(values)
            keys, counts = value_counts_arraylike(values, dropna)

            # For backwards compatibility, we let Index do its normal type
            # inference, _except_ for if it infers from object to bool.
            idx = Index._with_infer(keys)
            if idx.dtype == bool and keys.dtype == object:
                idx = idx.astype(object)

            result = Series(counts, index=idx, name=name)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        # Relative frequencies over ALL counted values (see note above).
        result = result / counts.sum()

    return result
# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
    """
    Parameters
    ----------
    values : np.ndarray
    dropna : bool
    mask : np.ndarray[bool] or None, default None

    Returns
    -------
    uniques : np.ndarray
    counts : np.ndarray[np.int64]
    """
    orig = values
    keys, counts = htable.value_count(_ensure_data(values), dropna, mask=mask)

    if needs_i8_conversion(orig.dtype) and dropna:
        # datetime, timedelta, or period: the hashtable saw i8 values, so
        # missing entries appear as iNaT and must be filtered out here.
        keep = keys != iNaT
        keys, counts = keys[keep], counts[keep]

    return _reconstruct_data(keys, orig.dtype, orig), counts
def duplicated(
    values: ArrayLike, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : nd.array, ExtensionArray or Series
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray[bool]
    """
    # Coerce to a hashtable-friendly dtype, then let the C helper do the work.
    return htable.duplicated(_ensure_data(values), keep=keep)
def mode(
    values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to check for duplicate values.
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    values = _ensure_arraylike(values)
    original = values

    if needs_i8_conversion(values.dtype):
        # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
        dtl_values = cast("ExtensionArray", ensure_wrapped_if_datetimelike(values))
        return dtl_values._mode(dropna=dropna)

    npresult = htable.mode(_ensure_data(values), dropna=dropna, mask=mask)
    try:
        npresult = np.sort(npresult)
    except TypeError as err:
        # e.g. unorderable mixed-type object values; return the modes unsorted.
        warnings.warn(
            f"Unable to sort modes: {err}",
            stacklevel=find_stack_level(),
        )

    return _reconstruct_data(npresult, original.dtype, original)
def rank(
    values: ArrayLike,
    axis: int = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
) -> npt.NDArray[np.float64]:
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which tiebreaks are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : bool, default True
        Whether or not the elements should be ranked in ascending order.
    pct : bool, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    # Must capture this before _ensure_data converts datetimelikes to i8.
    is_datetimelike = needs_i8_conversion(values.dtype)
    values = _ensure_data(values)

    # Options common to the 1-D and 2-D ranking kernels.
    shared_kwargs = {
        "is_datetimelike": is_datetimelike,
        "ties_method": method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }

    if values.ndim == 1:
        return algos.rank_1d(values, **shared_kwargs)
    elif values.ndim == 2:
        return algos.rank_2d(values, axis=axis, **shared_kwargs)
    else:
        raise TypeError("Array with ndim > 2 are not supported.")
def checked_add_with_arr(
    arr: npt.NDArray[np.int64],
    b: int | npt.NDArray[np.int64],
    arr_mask: npt.NDArray[np.bool_] | None = None,
    b_mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.int64]:
    """
    Perform int64 array addition that raises on overflow/underflow.

    Adds an int64 array and an int64 scalar (or array), first verifying that
    no element-wise sum would overflow. Positions flagged by ``arr_mask`` /
    ``b_mask`` (i.e. NaN positions) are exempt from the check and are set to
    ``iNaT`` in the result.

    Parameters
    ----------
    arr : np.ndarray[int64] addend.
    b : array or scalar addend.
    arr_mask : np.ndarray[bool] or None, default None
        array indicating which elements to exclude from checking
    b_mask : np.ndarray[bool] or None, default None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
        a scalar or an array for elements x + y for each element pair
        (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # Broadcast the second operand (and its mask, if any) up-front so every
    # element-wise comparison below lines up with `arr`.
    other = np.broadcast_to(b, arr.shape)
    other_mask = np.broadcast_to(b_mask, arr.shape) if b_mask is not None else None

    # Positions that are NOT masked out; only these participate in the
    # overflow check (NaN slots may hold arbitrary values).
    if arr_mask is not None and other_mask is not None:
        checkable = ~(arr_mask | other_mask)
    elif arr_mask is not None:
        checkable = ~arr_mask
    elif other_mask is not None:
        checkable = ~other_mask
    else:
        checkable = np.ones(arr.shape, dtype=bool)

    # gh-14324: a positive addend can only overflow past int64 max and a
    # negative one past int64 min, so each sign is checked against its own
    # bound: x + y > max  <=>  max - y < x, and x + y < min  <=>  min - y > x.
    upper = lib.i8max
    lower = iNaT  # iNaT is the int64 minimum

    pos = other > 0
    neg = other < 0

    if not pos.any():
        overflowed = ((lower - other > arr) & checkable).any()
    elif not neg.any():
        overflowed = ((upper - other < arr) & checkable).any()
    else:
        overflowed = ((upper - other[pos] < arr[pos]) & checkable[pos]).any() or (
            (lower - other[neg] > arr[neg]) & checkable[neg]
        ).any()

    if overflowed:
        raise OverflowError("Overflow in int64 addition")

    result = arr + b
    if arr_mask is not None or other_mask is not None:
        # Masked (NaN) positions get iNaT regardless of the computed sum.
        np.putmask(result, ~checkable, iNaT)

    return result
1254# --------------- #
1255# select n #
1256# --------------- #
class SelectN:
    """
    Base class for n-largest/n-smallest selection.

    Subclasses implement ``compute``; ``nlargest``/``nsmallest`` are thin,
    final entry points that dispatch to it.
    """

    def __init__(self, obj, n: int, keep: str) -> None:
        # Validate `keep` eagerly so misuse fails at construction time.
        if keep not in ("first", "last", "all"):
            raise ValueError('keep must be either "first", "last" or "all"')
        self.obj = obj
        self.n = n
        self.keep = keep

    def compute(self, method: str) -> DataFrame | Series:
        # Implemented by SelectNSeries / SelectNFrame.
        raise NotImplementedError

    @final
    def nlargest(self):
        return self.compute("nlargest")

    @final
    def nsmallest(self):
        return self.compute("nsmallest")

    @final
    @staticmethod
    def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
        """
        Helper function to determine if dtype is valid for
        nsmallest/nlargest methods
        """
        if needs_i8_conversion(dtype):
            # datetime64/timedelta64/period order correctly via their i8 view
            return True
        return is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
class SelectNSeries(SelectN):
    """
    Implement n largest/smallest for Series

    Parameters
    ----------
    obj : Series
    n : int
    keep : {'first', 'last'}, default 'first'

    Returns
    -------
    nordered : Series
    """

    def compute(self, method: str) -> Series:
        # method is "nlargest" or "nsmallest" (set by SelectN.nlargest/nsmallest)

        from pandas.core.reshape.concat import concat

        n = self.n
        dtype = self.obj.dtype
        if not self.is_valid_dtype_n_method(dtype):
            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")

        if n <= 0:
            # empty selection of the same Series type
            return self.obj[[]]

        dropped = self.obj.dropna()
        # the NA entries that were dropped; re-appended at the end below
        nan_index = self.obj.drop(dropped.index)

        # slow method
        if n >= len(self.obj):
            # asking for at least everything: a full sort is simplest
            ascending = method == "nsmallest"
            return self.obj.sort_values(ascending=ascending).head(n)

        # fast method
        new_dtype = dropped.dtype
        arr = _ensure_data(dropped.values)
        if method == "nlargest":
            # negate so that "nlargest" reduces to the nsmallest machinery
            arr = -arr
            if is_integer_dtype(new_dtype):
                # GH 21426: ensure reverse ordering at boundaries
                arr -= 1

            elif is_bool_dtype(new_dtype):
                # GH 26154: ensure False is smaller than True
                arr = 1 - (-arr)

        if self.keep == "last":
            # reverse so that later duplicates win the "first n" selection
            arr = arr[::-1]

        nbase = n  # the originally requested n, before clipping to len(arr)
        narr = len(arr)
        n = min(n, narr)

        # arr passed into kth_smallest must be contiguous. We copy
        # here because kth_smallest will modify its input
        kth_val = algos.kth_smallest(arr.copy(order="C"), n - 1)
        # positions of all candidates <= the kth value (may exceed n on ties)
        (ns,) = np.nonzero(arr <= kth_val)
        # stable sort of the candidates so ties keep their original order
        inds = ns[arr[ns].argsort(kind="mergesort")]

        if self.keep != "all":
            inds = inds[:n]
            findex = nbase
        else:
            # keep="all": retain every tie; NaNs may pad up to nbase rows
            if len(inds) < nbase and len(nan_index) + len(inds) >= nbase:
                findex = len(nan_index) + len(inds)
            else:
                findex = len(inds)

        if self.keep == "last":
            # reverse indices
            inds = narr - 1 - inds

        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
class SelectNFrame(SelectN):
    """
    Implement n largest/smallest for DataFrame

    Parameters
    ----------
    obj : DataFrame
    n : int
    keep : {'first', 'last'}, default 'first'
    columns : list or str

    Returns
    -------
    nordered : DataFrame
    """

    def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
        super().__init__(obj, n, keep)
        # normalize `columns` to a list; a tuple is a single (MultiIndex)
        # column label, not a sequence of labels
        if not is_list_like(columns) or isinstance(columns, tuple):
            columns = [columns]

        columns = cast(Sequence[Hashable], columns)
        columns = list(columns)
        self.columns = columns

    def compute(self, method: str) -> DataFrame:
        # method is "nlargest" or "nsmallest" (set by SelectN.nlargest/nsmallest)

        from pandas.core.api import Int64Index

        n = self.n
        frame = self.obj
        columns = self.columns

        # validate every ranking column's dtype up-front
        for column in columns:
            dtype = frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError(
                    f"Column {repr(column)} has dtype {dtype}, "
                    f"cannot use method {repr(method)} with this dtype"
                )

        def get_indexer(current_indexer, other_indexer):
            """
            Helper function to concat `current_indexer` and `other_indexer`
            depending on `method`
            """
            if method == "nsmallest":
                return current_indexer.append(other_indexer)
            else:
                return other_indexer.append(current_indexer)

        # Below we save and reset the index in case index contains duplicates
        original_index = frame.index
        cur_frame = frame = frame.reset_index(drop=True)
        cur_n = n
        indexer = Int64Index([])

        for i, column in enumerate(columns):
            # For each column we apply method to cur_frame[column].
            # If it's the last column or if we have the number of
            # results desired we are done.
            # Otherwise there are duplicates of the largest/smallest
            # value and we need to look at the rest of the columns
            # to determine which of the rows with the largest/smallest
            # value in the column to keep.
            series = cur_frame[column]
            is_last_column = len(columns) - 1 == i
            # keep="all" on intermediate columns so ties survive for the
            # later tie-breaking columns
            values = getattr(series, method)(
                cur_n, keep=self.keep if is_last_column else "all"
            )

            if is_last_column or len(values) <= cur_n:
                indexer = get_indexer(indexer, values.index)
                break

            # Now find all values which are equal to
            # the (nsmallest: largest)/(nlargest: smallest)
            # from our series.
            border_value = values == values[values.index[-1]]

            # Some of these values are among the top-n
            # some aren't.
            unsafe_values = values[border_value]

            # These values are definitely among the top-n
            safe_values = values[~border_value]
            indexer = get_indexer(indexer, safe_values.index)

            # Go on and separate the unsafe_values on the remaining
            # columns.
            cur_frame = cur_frame.loc[unsafe_values.index]
            cur_n = n - len(indexer)

        frame = frame.take(indexer)

        # Restore the index on frame
        frame.index = original_index.take(indexer)

        # If there is only one column, the frame is already sorted.
        if len(columns) == 1:
            return frame

        ascending = method == "nsmallest"

        # mergesort is stable, so rows tied on `columns` keep selection order
        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
1475# ---- #
1476# take #
1477# ---- #
def take(
    arr,
    indices: TakeIndexer,
    axis: int = 0,
    allow_fill: bool = False,
    fill_value=None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : array-like or scalar value
        Non array-likes (sequences/scalars without a dtype) are coerced
        to an ndarray.
    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> import pandas as pd

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...                        fill_value=-10)
    array([ 10,  10, -10])
    """
    # Coerce sequences/scalars without a dtype to an ndarray.
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if not allow_fill:
        # NumPy semantics: negative indices count from the end.
        return arr.take(indices, axis=axis)

    # pandas semantics: -1 marks a missing slot to be set to fill_value;
    # any other negative index is rejected by validate_indices.
    validate_indices(indices, arr.shape[axis])
    return take_nd(arr, indices, axis=axis, allow_fill=True, fill_value=fill_value)
1577# ------------ #
1578# searchsorted #
1579# ------------ #
1582def searchsorted(
1583 arr: ArrayLike,
1584 value: NumpyValueArrayLike | ExtensionArray,
1585 side: Literal["left", "right"] = "left",
1586 sorter: NumpySorter = None,
1587) -> npt.NDArray[np.intp] | np.intp:
1588 """
1589 Find indices where elements should be inserted to maintain order.
1591 .. versionadded:: 0.25.0
1593 Find the indices into a sorted array `arr` (a) such that, if the
1594 corresponding elements in `value` were inserted before the indices,
1595 the order of `arr` would be preserved.
1597 Assuming that `arr` is sorted:
1599 ====== ================================
1600 `side` returned index `i` satisfies
1601 ====== ================================
1602 left ``arr[i-1] < value <= self[i]``
1603 right ``arr[i-1] <= value < self[i]``
1604 ====== ================================
1606 Parameters
1607 ----------
1608 arr: np.ndarray, ExtensionArray, Series
1609 Input array. If `sorter` is None, then it must be sorted in
1610 ascending order, otherwise `sorter` must be an array of indices
1611 that sort it.
1612 value : array-like or scalar
1613 Values to insert into `arr`.
1614 side : {'left', 'right'}, optional
1615 If 'left', the index of the first suitable location found is given.
1616 If 'right', return the last such index. If there is no suitable
1617 index, return either 0 or N (where N is the length of `self`).
1618 sorter : 1-D array-like, optional
1619 Optional array of integer indices that sort array a into ascending
1620 order. They are typically the result of argsort.
1622 Returns
1623 -------
1624 array of ints or int
1625 If value is array-like, array of insertion points.
1626 If value is scalar, a single integer.
1628 See Also
1629 --------
1630 numpy.searchsorted : Similar method from NumPy.
1631 """
1632 if sorter is not None:
1633 sorter = ensure_platform_int(sorter)
1635 if (
1636 isinstance(arr, np.ndarray)
1637 and is_integer_dtype(arr.dtype)
1638 and (is_integer(value) or is_integer_dtype(value))
1639 ):
1640 # if `arr` and `value` have different dtypes, `arr` would be
1641 # recast by numpy, causing a slow search.
1642 # Before searching below, we therefore try to give `value` the
1643 # same dtype as `arr`, while guarding against integer overflows.
1644 iinfo = np.iinfo(arr.dtype.type)
1645 value_arr = np.array([value]) if is_scalar(value) else np.array(value)
1646 if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
1647 # value within bounds, so no overflow, so can convert value dtype
1648 # to dtype of arr
1649 dtype = arr.dtype
1650 else:
1651 dtype = value_arr.dtype
1653 if is_scalar(value):
1654 # We know that value is int
1655 value = cast(int, dtype.type(value))
1656 else:
1657 value = pd_array(cast(ArrayLike, value), dtype=dtype)
1658 else:
1659 # E.g. if `arr` is an array with dtype='datetime64[ns]'
1660 # and `value` is a pd.Timestamp, we may need to convert value
1661 arr = ensure_wrapped_if_datetimelike(arr)
1663 # Argument 1 to "searchsorted" of "ndarray" has incompatible type
1664 # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
1665 return arr.searchsorted(value, side=side, sorter=sorter) # type: ignore[arg-type]
1668# ---- #
1669# diff #
1670# ---- #
# dtypes with a specialized cython diff_2d kernel; everything else falls
# back to the generic shifted-subtraction path below
_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: int = 0):
    """
    difference of n between self,
    analogous to s-s.shift(n)

    Parameters
    ----------
    arr : ndarray or ExtensionArray
    n : int
        number of periods
    axis : {0, 1}
        axis to shift on
    stacklevel : int, default 3
        The stacklevel for the lost dtype warning.

    Returns
    -------
    shifted
    """

    n = int(n)
    na = np.nan  # fill value for the leading/trailing positions with no lag
    dtype = arr.dtype

    # booleans diff via xor (True ^ False == "changed"), everything else
    # via ordinary subtraction
    is_bool = is_bool_dtype(dtype)
    if is_bool:
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, PandasDtype):
        # PandasArray cannot necessarily hold shifted versions of itself.
        arr = arr.to_numpy()
        dtype = arr.dtype

    if not isinstance(dtype, np.dtype):
        # i.e ExtensionDtype
        if hasattr(arr, f"__{op.__name__}__"):
            if axis != 0:
                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
            # ExtensionArray knows how to subtract its own shifted self
            return op(arr, arr.shift(n))
        else:
            warnings.warn(
                "dtype lost in 'diff()'. In the future this will raise a "
                "TypeError. Convert to a suitable dtype prior to calling 'diff'.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            arr = np.asarray(arr)
            dtype = arr.dtype

    is_timedelta = False
    if needs_i8_conversion(arr.dtype):
        # datetime64/timedelta64: diff on the int64 view, restore units below
        dtype = np.int64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool:
        # We have to cast in order to be able to hold np.nan
        dtype = np.object_

    elif is_integer_dtype(dtype):
        # We have to cast in order to be able to hold np.nan

        # int8, int16 are incompatible with float64,
        # see https://github.com/cython/cython/issues/2646
        if arr.dtype.name in ["int8", "int16"]:
            dtype = np.float32
        else:
            dtype = np.float64

    orig_ndim = arr.ndim
    if orig_ndim == 1:
        # reshape so we can always use algos.diff_2d
        arr = arr.reshape(-1, 1)
        # TODO: require axis == 0

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    # the first |n| positions along `axis` have no lagged counterpart
    na_indexer = [slice(None)] * 2
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        # out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        # a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * 2
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * 2
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])

    if is_timedelta:
        # differences of datetimelike i8 values are timedeltas
        out_arr = out_arr.view("timedelta64[ns]")

    if orig_ndim == 1:
        # undo the reshape done above
        out_arr = out_arr[:, 0]
    return out_arr
1785# --------------------------------------------------------------------
1786# Helper functions
1788# Note: safe_sort is in algorithms.py instead of sorting.py because it is
1789# low-dependency, is used in this module, and used private methods from
1790# this module.
def safe_sort(
    values,
    codes=None,
    na_sentinel: int = -1,
    assume_unique: bool = False,
    verify: bool = True,
) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : list_like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``codes`` to mark "not found".
        Ignored when ``codes`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray or MultiIndex
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )
    original_values = values
    is_mi = isinstance(original_values, ABCMultiIndex)

    if not isinstance(values, (np.ndarray, ABCExtensionArray)):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any],
        # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        values = np.asarray(values, dtype=dtype)  # type: ignore[arg-type]

    # sorter stays None when argsort is not applicable (mixed types); the
    # codes path below falls back to a hashtable lookup in that case
    sorter = None
    ordered: np.ndarray | MultiIndex

    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            if is_mi:
                # Operate on original object instead of casted array (MultiIndex)
                ordered = original_values.take(sorter)
            else:
                ordered = values.take(sorter)
        except TypeError:
            # Previous sorters failed or were not applicable, try `_sort_mixed`
            # which would work, but which fails for special case of 1d arrays
            # with tuples.
            if values.size and isinstance(values[0], tuple):
                ordered = _sort_tuples(values, original_values)
            else:
                ordered = _sort_mixed(values)

    # codes:

    if codes is None:
        return ordered

    if not is_list_like(codes):
        raise TypeError(
            "Only list-like objects or None are allowed to "
            "be passed to safe_sort as codes"
        )
    codes = ensure_platform_int(np.asarray(codes))

    if not assume_unique and not len(unique(values)) == len(values):
        raise ValueError("values should be unique if codes is not None")

    if sorter is None:
        # mixed types
        hash_klass, values = _get_hashtable_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        # recover the permutation by looking up each sorted value's
        # original position in the hashtable
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_nd is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_codes = take_nd(order2, codes, fill_value=-1)
        if verify:
            mask = (codes < -len(values)) | (codes >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_codes = reverse_indexer.take(codes, mode="wrap")

        mask = codes == na_sentinel
        if verify:
            mask = mask | (codes < -len(values)) | (codes >= len(values))

    if mask is not None:
        # out-of-bound (and sentinel) positions are marked "not found"
        np.putmask(new_codes, mask, na_sentinel)

    return ordered, ensure_platform_int(new_codes)
1928def _sort_mixed(values) -> np.ndarray:
1929 """order ints before strings in 1d arrays, safe in py3"""
1930 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
1931 none_pos = np.array([x is None for x in values], dtype=bool)
1932 nums = np.sort(values[~str_pos & ~none_pos])
1933 strs = np.sort(values[str_pos])
1934 return np.concatenate(
1935 [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])]
1936 )
@overload
def _sort_tuples(values: np.ndarray, original_values: np.ndarray) -> np.ndarray:
    ...


@overload
def _sort_tuples(values: np.ndarray, original_values: MultiIndex) -> MultiIndex:
    ...


def _sort_tuples(
    values: np.ndarray, original_values: np.ndarray | MultiIndex
) -> np.ndarray | MultiIndex:
    """
    Convert array of tuples (1d) to array or array (2d).
    We need to keep the columns separately as they contain different types and
    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
    column as types cannot be compared).
    We have to apply the indexer to the original values to keep the dtypes in
    case of MultiIndexes
    """
    from pandas.core.internals.construction import to_arrays
    from pandas.core.sorting import lexsort_indexer

    # split the tuples into per-position columns so each can keep its own type
    arrays, _ = to_arrays(values, None)
    # lexicographic sort order across those columns, all ascending
    indexer = lexsort_indexer(arrays, orders=True)
    return original_values[indexer]
def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
    """
    Extracts the union from lvals and rvals with respect to duplicates and nans in
    both arrays.

    Parameters
    ----------
    lvals: np.ndarray or ExtensionArray
        left values which is ordered in front.
    rvals: np.ndarray or ExtensionArray
        right values ordered after lvals.

    Returns
    -------
    np.ndarray or ExtensionArray
        Containing the unsorted union of both arrays.

    Notes
    -----
    Caller is responsible for ensuring lvals.dtype == rvals.dtype.
    """
    # Count occurrences per side; align so values missing from one side
    # count as 0 there.
    left_counts = value_counts(lvals, dropna=False)
    right_counts = value_counts(rvals, dropna=False)
    left_counts, right_counts = left_counts.align(right_counts, fill_value=0)

    # Unique values in first-appearance order (lvals first, then rvals).
    unique_vals = unique(concat_compat([lvals, rvals]))
    unique_vals = ensure_wrapped_if_datetimelike(unique_vals)

    # Repeat each unique value max(left count, right count) times.
    repeat_positions: list[int] = []
    for pos, val in enumerate(unique_vals):
        repeat_positions.extend(
            [pos] * int(max(left_counts.at[val], right_counts.at[val]))
        )
    return unique_vals.take(repeat_positions)