Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/algorithms.py: 9%
575 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Generic data algorithms. This module is experimental at the moment and not
3intended for public consumption
4"""
5from __future__ import annotations
7import inspect
8import operator
9from textwrap import dedent
10from typing import (
11 TYPE_CHECKING,
12 Hashable,
13 Literal,
14 Sequence,
15 cast,
16 final,
17 overload,
18)
19import warnings
21import numpy as np
23from pandas._libs import (
24 algos,
25 hashtable as htable,
26 iNaT,
27 lib,
28)
29from pandas._typing import (
30 AnyArrayLike,
31 ArrayLike,
32 DtypeObj,
33 IndexLabel,
34 TakeIndexer,
35 npt,
36)
37from pandas.util._decorators import doc
38from pandas.util._exceptions import find_stack_level
40from pandas.core.dtypes.cast import (
41 construct_1d_object_array_from_listlike,
42 infer_dtype_from_array,
43 sanitize_to_nanoseconds,
44)
45from pandas.core.dtypes.common import (
46 ensure_float64,
47 ensure_object,
48 ensure_platform_int,
49 is_array_like,
50 is_bool_dtype,
51 is_categorical_dtype,
52 is_complex_dtype,
53 is_datetime64_dtype,
54 is_extension_array_dtype,
55 is_float_dtype,
56 is_integer,
57 is_integer_dtype,
58 is_list_like,
59 is_numeric_dtype,
60 is_object_dtype,
61 is_scalar,
62 is_signed_integer_dtype,
63 is_timedelta64_dtype,
64 needs_i8_conversion,
65)
66from pandas.core.dtypes.concat import concat_compat
67from pandas.core.dtypes.dtypes import (
68 BaseMaskedDtype,
69 ExtensionDtype,
70 PandasDtype,
71)
72from pandas.core.dtypes.generic import (
73 ABCDatetimeArray,
74 ABCExtensionArray,
75 ABCIndex,
76 ABCMultiIndex,
77 ABCRangeIndex,
78 ABCSeries,
79 ABCTimedeltaArray,
80)
81from pandas.core.dtypes.missing import (
82 isna,
83 na_value_for_dtype,
84)
86from pandas.core.array_algos.take import take_nd
87from pandas.core.construction import (
88 array as pd_array,
89 ensure_wrapped_if_datetimelike,
90 extract_array,
91)
92from pandas.core.indexers import validate_indices
94if TYPE_CHECKING: 94 ↛ 96line 94 didn't jump to line 96, because the condition on line 94 was never true
96 from pandas._typing import (
97 NumpySorter,
98 NumpyValueArrayLike,
99 )
101 from pandas import (
102 Categorical,
103 DataFrame,
104 Index,
105 MultiIndex,
106 Series,
107 )
108 from pandas.core.arrays import (
109 BaseMaskedArray,
110 ExtensionArray,
111 )
114# --------------- #
115# dtype access #
116# --------------- #
def _ensure_data(values: ArrayLike) -> np.ndarray:
    """
    routine to ensure that our data is of the correct
    input dtype for lower-level routines

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint8
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    np.ndarray
    """

    if not isinstance(values, ABCMultiIndex):
        # extract_array would raise
        values = extract_array(values, extract_numpy=True)

    # Dispatch on dtype. Object is checked first, so none of the numeric
    # branches below ever see object-dtype data.
    if is_object_dtype(values.dtype):
        return ensure_object(np.asarray(values))

    elif isinstance(values.dtype, BaseMaskedDtype):
        # i.e. BooleanArray, FloatingArray, IntegerArray
        values = cast("BaseMaskedArray", values)
        if not values._hasna:
            # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
            # recurse to avoid re-implementing logic for eg bool->uint8
            return _ensure_data(values._data)
        # Has pd.NA entries: np.asarray goes through object dtype so the NAs
        # survive the conversion.
        return np.asarray(values)

    elif is_categorical_dtype(values.dtype):
        # NB: cases that go through here should NOT be using _reconstruct_data
        # on the back-end.
        values = cast("Categorical", values)
        return values.codes

    elif is_bool_dtype(values.dtype):
        if isinstance(values, np.ndarray):
            # i.e. actually dtype == np.dtype("bool")
            return np.asarray(values).view("uint8")
        else:
            # e.g. Sparse[bool, False]  # TODO: no test cases get here
            return np.asarray(values).astype("uint8", copy=False)

    elif is_integer_dtype(values.dtype):
        return np.asarray(values)

    elif is_float_dtype(values.dtype):
        # Note: checking `values.dtype == "float128"` raises on Windows and 32bit
        # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
        # has no attribute "itemsize"
        if values.dtype.itemsize in [2, 12, 16]:  # type: ignore[union-attr]
            # we dont (yet) have float128 hashtable support
            return ensure_float64(values)
        return np.asarray(values)

    elif is_complex_dtype(values.dtype):
        return cast(np.ndarray, values)

    # datetimelike
    elif needs_i8_conversion(values.dtype):
        if isinstance(values, np.ndarray):
            values = sanitize_to_nanoseconds(values)
        npvalues = values.view("i8")
        npvalues = cast(np.ndarray, npvalues)
        return npvalues

    # we have failed, return object
    values = np.asarray(values, dtype=object)
    return ensure_object(values)
197def _reconstruct_data(
198 values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
199) -> ArrayLike:
200 """
201 reverse of _ensure_data
203 Parameters
204 ----------
205 values : np.ndarray or ExtensionArray
206 dtype : np.dtype or ExtensionDtype
207 original : AnyArrayLike
209 Returns
210 -------
211 ExtensionArray or np.ndarray
212 """
213 if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
214 # Catch DatetimeArray/TimedeltaArray
215 return values
217 if not isinstance(dtype, np.dtype):
218 # i.e. ExtensionDtype; note we have ruled out above the possibility
219 # that values.dtype == dtype
220 cls = dtype.construct_array_type()
222 values = cls._from_sequence(values, dtype=dtype)
224 else:
225 if is_datetime64_dtype(dtype):
226 dtype = np.dtype("datetime64[ns]")
227 elif is_timedelta64_dtype(dtype):
228 dtype = np.dtype("timedelta64[ns]")
230 values = values.astype(dtype, copy=False)
232 return values
235def _ensure_arraylike(values) -> ArrayLike:
236 """
237 ensure that we are arraylike if not already
238 """
239 if not is_array_like(values):
240 inferred = lib.infer_dtype(values, skipna=False)
241 if inferred in ["mixed", "string", "mixed-integer"]:
242 # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
243 if isinstance(values, tuple):
244 values = list(values)
245 values = construct_1d_object_array_from_listlike(values)
246 else:
247 values = np.asarray(values)
248 return values
# Mapping from a normalized dtype name (as produced by _ensure_data /
# _check_object_for_strings) to the matching C hashtable class in
# pandas._libs.hashtable. "string" is a special case: object arrays that
# infer as all-string get the cheaper StringHashTable.
_hashtables = {
    "complex128": htable.Complex128HashTable,
    "complex64": htable.Complex64HashTable,
    "float64": htable.Float64HashTable,
    "float32": htable.Float32HashTable,
    "uint64": htable.UInt64HashTable,
    "uint32": htable.UInt32HashTable,
    "uint16": htable.UInt16HashTable,
    "uint8": htable.UInt8HashTable,
    "int64": htable.Int64HashTable,
    "int32": htable.Int32HashTable,
    "int16": htable.Int16HashTable,
    "int8": htable.Int8HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}
def _get_hashtable_algo(values: np.ndarray):
    """
    Parameters
    ----------
    values : np.ndarray

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
    """
    # Coerce to a dtype the hashtable layer understands, then select the
    # table class keyed by the resulting dtype name.
    values = _ensure_data(values)
    dtype_name = _check_object_for_strings(values)
    return _hashtables[dtype_name], values
287def _check_object_for_strings(values: np.ndarray) -> str:
288 """
289 Check if we can use string hashtable instead of object hashtable.
291 Parameters
292 ----------
293 values : ndarray
295 Returns
296 -------
297 str
298 """
299 ndtype = values.dtype.name
300 if ndtype == "object":
302 # it's cheaper to use a String Hash Table than Object; we infer
303 # including nulls because that is the only difference between
304 # StringHashTable and ObjectHashtable
305 if lib.infer_dtype(values, skipna=False) in ["string"]:
306 ndtype = "string"
307 return ndtype
310# --------------- #
311# top-level algos #
312# --------------- #
def unique(values):
    """
    Return unique values based on a hash table.

    Uniques are returned in order of appearance. This does NOT sort.

    Significantly faster than numpy.unique for long enough sequences.
    Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

        Return numpy.ndarray or ExtensionArray.

    See Also
    --------
    Index.unique : Return unique values from an Index.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(
    ...     pd.Series(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    <DatetimeArray>
    ['2016-01-01 00:00:00-05:00']
    Length: 1, dtype: datetime64[ns, US/Eastern]

    >>> pd.unique(
    ...     pd.Index(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
                  dtype='datetime64[ns, US/Eastern]',
                  freq=None)

    >>> pd.unique(list("baabc"))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(
    ...     pd.Series(
    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
    ...     )
    ... )
    ['b', 'a', 'c']
    Categories (3, object): ['a' < 'b' < 'c']

    An array of tuples

    >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    # Thin wrapper: the shared implementation additionally supports an
    # optional NA mask for masked arrays (see unique_with_mask).
    return unique_with_mask(values)
def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
    """See algorithms.unique for docs. Takes a mask for masked arrays."""
    values = _ensure_arraylike(values)

    if is_extension_array_dtype(values.dtype):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    table_cls, values = _get_hashtable_algo(values)
    table = table_cls(len(values))

    if mask is None:
        # Unmasked path: single return value.
        return _reconstruct_data(table.unique(values), original.dtype, original)

    # Masked path: the hashtable also returns the mask of the uniques.
    uniques, mask = table.unique(values, mask=mask)
    uniques = _reconstruct_data(uniques, original.dtype, original)
    assert mask is not None  # for mypy
    return uniques, mask.astype("bool")
436unique1d = unique
def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.

    Raises
    ------
    TypeError
        If either `comps` or `values` is not list-like.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(comps).__name__}]"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    # Normalize `values` to an ndarray/ExtensionArray.
    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
        orig_values = values
        values = _ensure_arraylike(list(values))

        if is_numeric_dtype(values) and not is_signed_integer_dtype(comps):
            # GH#46485 Use object to avoid upcast to float64 later
            # TODO: Share with _find_common_type_compat
            values = construct_1d_object_array_from_listlike(list(orig_values))

    elif isinstance(values, ABCMultiIndex):
        # Avoid raising in extract_array
        values = np.array(values)
    else:
        values = extract_array(values, extract_numpy=True, extract_range=True)

    comps_array = _ensure_arraylike(comps)
    comps_array = extract_array(comps_array, extract_numpy=True)
    if not isinstance(comps_array, np.ndarray):
        # i.e. Extension Array
        return comps_array.isin(values)

    elif needs_i8_conversion(comps_array.dtype):
        # Dispatch to DatetimeLikeArrayMixin.isin
        return pd_array(comps_array).isin(values)
    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
        # e.g. comps_array are integers and values are datetime64s
        return np.zeros(comps_array.shape, dtype=bool)
        # TODO: not quite right ... Sparse/Categorical
    elif needs_i8_conversion(values.dtype):
        # e.g. comps_array are objects: retry with values cast to object.
        return isin(comps_array, values.astype(object))

    elif isinstance(values.dtype, ExtensionDtype):
        # Retry with both sides as plain ndarrays.
        return isin(np.asarray(comps_array), np.asarray(values))

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
    # in1d is faster for small sizes
    if (
        len(comps_array) > 1_000_000
        and len(values) <= 26
        and not is_object_dtype(comps_array)
    ):
        # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
        if isna(values).any():

            def f(c, v):
                return np.logical_or(np.in1d(c, v), np.isnan(c))

        else:
            f = np.in1d

    else:
        # General path: cast both sides to a common dtype and use the
        # C-level hashtable membership check.
        common = np.find_common_type([values.dtype, comps_array.dtype], [])
        values = values.astype(common, copy=False)
        comps_array = comps_array.astype(common, copy=False)
        f = htable.ismember

    return f(comps_array, values)
def factorize_array(
    values: np.ndarray,
    na_sentinel: int | None = -1,
    size_hint: int | None = None,
    na_value: object = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
    """
    Factorize a numpy array to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    na_sentinel : int, default -1
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).
    mask : ndarray[bool], optional
        If not None, the mask is used as indicator for missing values
        (True = missing, False = valid) instead of `na_value` or
        condition "val != val".

    Returns
    -------
    codes : ndarray[np.intp]
    uniques : ndarray
    """
    # na_sentinel=None means "keep NAs as regular uniques"; the hashtable
    # still requires an integer sentinel, so fall back to -1 internally and
    # tell it not to ignore NAs.
    ignore_na = na_sentinel is not None
    if not ignore_na:
        na_sentinel = -1

    orig_values = values
    if values.dtype.kind in "mM":
        # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
        # need to do the same to na_value. We are assuming here that the passed
        # na_value is an appropriately-typed NaT.
        # e.g. test_where_datetimelike_categorical
        na_value = iNaT

    table_cls, values = _get_hashtable_algo(values)

    table = table_cls(size_hint or len(values))
    uniques, codes = table.factorize(
        values,
        na_sentinel=na_sentinel,
        na_value=na_value,
        mask=mask,
        ignore_na=ignore_na,
    )

    # re-cast e.g. i8->dt64/td64, uint8->bool
    uniques = _reconstruct_data(uniques, orig_values.dtype, orig_values)

    return ensure_platform_int(codes), uniques
@doc(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
def factorize(
    values,
    sort: bool = False,
    na_sentinel: int | None | lib.NoDefault = lib.no_default,
    use_na_sentinel: bool | lib.NoDefault = lib.no_default,
    size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    {values}{sort}
    na_sentinel : int or None, default -1
        Value to mark "not found". If None, will not drop the NaN
        from the uniques of the values.

        .. deprecated:: 1.5.0
            The na_sentinel argument is deprecated and
            will be removed in a future version of pandas. Specify use_na_sentinel as
            either True or False.

        .. versionchanged:: 1.1.2

    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.

        .. versionadded:: 1.5.0
    {size_hint}\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

            Even if there's a missing value in `values`, `uniques` will
            *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique value in an array.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.factorize>` for more examples.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0]...)
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is the maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1]...)
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    When ``use_na_sentinel=True`` (the default), missing values are indicated in
    the `codes` with the sentinel value ``-1`` and missing values are not
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0]...)
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1]...)
    >>> uniques
    ['a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1]...)
    >>> uniques
    Index(['a', 'c'], dtype='object')

    If NaN is in the values, and we want to include NaN in the uniques of the
    values, it can be achieved by setting ``use_na_sentinel=False``.

    >>> values = np.array([1, 2, 1, np.nan])
    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
    >>> codes
    array([ 0,  1,  0, -1])
    >>> uniques
    array([1., 2.])

    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
    >>> codes
    array([0, 1, 0, 2])
    >>> uniques
    array([ 1.,  2., nan])
    """
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.

    # GH#46910 deprecated na_sentinel in favor of use_na_sentinel:
    # na_sentinel=None corresponds to use_na_sentinel=False
    # na_sentinel=-1 correspond to use_na_sentinel=True
    # Other na_sentinel values will not be supported when the deprecation is enforced.
    na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
    if isinstance(values, ABCRangeIndex):
        # RangeIndex factorizes itself without materializing an ndarray.
        return values.factorize(sort=sort)

    values = _ensure_arraylike(values)
    original = values
    if not isinstance(values, ABCMultiIndex):
        values = extract_array(values, extract_numpy=True)

    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
    # of values, assign na_sentinel=-1 to replace code value for NaN.
    dropna = na_sentinel is not None

    if (
        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
        and values.freq is not None
    ):
        # The presence of 'freq' means we can fast-path sorting and know there
        # aren't NAs
        codes, uniques = values.factorize(sort=sort)
        return _re_wrap_factorize(original, uniques, codes)

    elif not isinstance(values.dtype, np.dtype):
        # ExtensionArray path: delegate factorization to the array type.
        if (
            na_sentinel == -1 or na_sentinel is None
        ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
            # Avoid using catch_warnings when possible
            # GH#46910 - TimelikeOps has deprecated signature
            codes, uniques = values.factorize(  # type: ignore[call-arg]
                use_na_sentinel=na_sentinel is not None
            )
        else:
            na_sentinel_arg = -1 if na_sentinel is None else na_sentinel
            with warnings.catch_warnings():
                # We've already warned above
                warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
                codes, uniques = values.factorize(na_sentinel=na_sentinel_arg)

    else:
        values = np.asarray(values)  # convert DTA/TDA/MultiIndex
        # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
        # na_sentinel is None we append NA on the end because safe_sort does not
        # handle null values in uniques.
        if na_sentinel is None and sort:
            na_sentinel_arg = -1
        elif na_sentinel is None:
            na_sentinel_arg = None
        else:
            na_sentinel_arg = na_sentinel

        if not dropna and not sort and is_object_dtype(values):
            # factorize can now handle differentiating various types of null values.
            # These can only occur when the array has object dtype.
            # However, for backwards compatibility we only use the null for the
            # provided dtype. This may be revisited in the future, see GH#48476.
            null_mask = isna(values)
            if null_mask.any():
                na_value = na_value_for_dtype(values.dtype, compat=False)
                # Don't modify (potentially user-provided) array
                values = np.where(null_mask, na_value, values)

        codes, uniques = factorize_array(
            values,
            na_sentinel=na_sentinel_arg,
            size_hint=size_hint,
        )

    if sort and len(uniques) > 0:
        if na_sentinel is None:
            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
            na_sentinel = -1
        uniques, codes = safe_sort(
            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
        )

    if not dropna and sort:
        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
        if na_sentinel is None:
            na_sentinel_arg = -1
        else:
            na_sentinel_arg = na_sentinel
        code_is_na = codes == na_sentinel_arg
        if code_is_na.any():
            # na_value is set based on the dtype of uniques, and compat set to False is
            # because we do not want na_value to be 0 for integers
            na_value = na_value_for_dtype(uniques.dtype, compat=False)
            uniques = np.append(uniques, [na_value])
            codes = np.where(code_is_na, len(uniques) - 1, codes)

    uniques = _reconstruct_data(uniques, original.dtype, original)

    return _re_wrap_factorize(original, uniques, codes)
def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    Resolved value of na_sentinel.

    Raises
    ------
    ValueError
        If both `na_sentinel` and `use_na_sentinel` are specified.
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        # BUG FIX: the message previously misspelled the parameter as
        # "use_na_sentile".
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )
    if na_sentinel is lib.no_default:
        # Only use_na_sentinel (or neither) was given; default is the -1 sentinel.
        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
    else:
        # Deprecated na_sentinel was given explicitly: warn, then honor it.
        if na_sentinel is None:
            msg = (
                "Specifying `na_sentinel=None` is deprecated, specify "
                "`use_na_sentinel=False` instead."
            )
        elif na_sentinel == -1:
            msg = (
                "Specifying `na_sentinel=-1` is deprecated, specify "
                "`use_na_sentinel=True` instead."
            )
        else:
            msg = (
                "Specifying the specific value to use for `na_sentinel` is "
                "deprecated and will be removed in a future version of pandas. "
                "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
                "`use_na_sentinel=False` to encode NaN values."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
        result = na_sentinel
    return result
901def _re_wrap_factorize(original, uniques, codes: np.ndarray):
902 """
903 Wrap factorize results in Series or Index depending on original type.
904 """
905 if isinstance(original, ABCIndex):
906 uniques = ensure_wrapped_if_datetimelike(uniques)
907 uniques = original._shallow_copy(uniques, name=None)
908 elif isinstance(original, ABCSeries):
909 from pandas import Index
911 uniques = Index(uniques)
913 return codes, uniques
def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize: bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    from pandas import (
        Index,
        Series,
    )

    # Preserve the name of a named input (e.g. a Series) on the result.
    name = getattr(values, "name", None)

    if bins is not None:
        from pandas.core.reshape.tile import cut

        values = Series(values)
        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError as err:
            raise TypeError("bins argument only works with numeric data.") from err

        # count, remove nulls (from the index), and cast the index to interval
        result = ii.value_counts(dropna=dropna)
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result._values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:

        if is_extension_array_dtype(values):

            # handle Categorical and sparse,
            result = Series(values)._values.value_counts(dropna=dropna)
            result.name = name
            counts = result._values

        else:
            values = _ensure_arraylike(values)
            keys, counts = value_counts_arraylike(values, dropna)

            # For backwards compatibility, we let Index do its normal type
            # inference, _except_ for if it infers from object to bool.
            idx = Index._with_infer(keys)
            if idx.dtype == bool and keys.dtype == object:
                idx = idx.astype(object)

            result = Series(counts, index=idx, name=name)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        # Relative frequencies over ALL counted values (see note above).
        result = result / counts.sum()

    return result
# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
    """
    Parameters
    ----------
    values : np.ndarray
    dropna : bool
    mask : np.ndarray[bool] or None, default None

    Returns
    -------
    uniques : np.ndarray
    counts : np.ndarray[np.int64]
    """
    orig = values
    keys, counts = htable.value_count(_ensure_data(values), dropna, mask=mask)

    if needs_i8_conversion(orig.dtype) and dropna:
        # datetime, timedelta, or period: the hashtable saw i8 values, so
        # missing entries appear as iNaT and must be filtered out here.
        keep = keys != iNaT
        keys, counts = keys[keep], counts[keep]

    return _reconstruct_data(keys, orig.dtype, orig), counts
def duplicated(
    values: ArrayLike, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : nd.array, ExtensionArray or Series
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray[bool]
    """
    # Coerce to a hashtable-friendly dtype, then let the C helper do the work.
    return htable.duplicated(_ensure_data(values), keep=keep)
def mode(
    values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to check for duplicate values.
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    values = _ensure_arraylike(values)
    original = values

    if needs_i8_conversion(values.dtype):
        # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
        dtl_values = cast("ExtensionArray", ensure_wrapped_if_datetimelike(values))
        return dtl_values._mode(dropna=dropna)

    npresult = htable.mode(_ensure_data(values), dropna=dropna, mask=mask)
    try:
        npresult = np.sort(npresult)
    except TypeError as err:
        # e.g. unorderable mixed-type object values; return the modes unsorted.
        warnings.warn(
            f"Unable to sort modes: {err}",
            stacklevel=find_stack_level(),
        )

    return _reconstruct_data(npresult, original.dtype, original)
def rank(
    values: ArrayLike,
    axis: int = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
) -> npt.NDArray[np.float64]:
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which tiebreaks are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : bool, default True
        Whether or not the elements should be ranked in ascending order.
    pct : bool, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    # Must capture this before _ensure_data converts datetimelikes to i8.
    is_datetimelike = needs_i8_conversion(values.dtype)
    values = _ensure_data(values)

    # Options common to the 1-D and 2-D ranking kernels.
    shared_kwargs = {
        "is_datetimelike": is_datetimelike,
        "ties_method": method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }

    if values.ndim == 1:
        return algos.rank_1d(values, **shared_kwargs)
    elif values.ndim == 2:
        return algos.rank_2d(values, axis=axis, **shared_kwargs)
    else:
        raise TypeError("Array with ndim > 2 are not supported.")
def checked_add_with_arr(
    arr: npt.NDArray[np.int64],
    b: int | npt.NDArray[np.int64],
    arr_mask: npt.NDArray[np.bool_] | None = None,
    b_mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.int64]:
    """
    Perform int64 array addition that raises on overflow/underflow.

    Adds an int64 array and an int64 scalar (or array), first verifying that
    no element-wise sum would overflow. Positions flagged by ``arr_mask`` /
    ``b_mask`` (i.e. NaN positions) are exempt from the check and are set to
    ``iNaT`` in the result.

    Parameters
    ----------
    arr : np.ndarray[int64] addend.
    b : array or scalar addend.
    arr_mask : np.ndarray[bool] or None, default None
        array indicating which elements to exclude from checking
    b_mask : np.ndarray[bool] or None, default None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
        a scalar or an array for elements x + y for each element pair
        (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # Broadcast the second operand (and its mask, if any) up-front so every
    # element-wise comparison below lines up with `arr`.
    other = np.broadcast_to(b, arr.shape)
    other_mask = np.broadcast_to(b_mask, arr.shape) if b_mask is not None else None

    # Positions that are NOT masked out; only these participate in the
    # overflow check (NaN slots may hold arbitrary values).
    if arr_mask is not None and other_mask is not None:
        checkable = ~(arr_mask | other_mask)
    elif arr_mask is not None:
        checkable = ~arr_mask
    elif other_mask is not None:
        checkable = ~other_mask
    else:
        checkable = np.ones(arr.shape, dtype=bool)

    # gh-14324: a positive addend can only overflow past int64 max and a
    # negative one past int64 min, so each sign is checked against its own
    # bound: x + y > max  <=>  max - y < x, and x + y < min  <=>  min - y > x.
    upper = lib.i8max
    lower = iNaT  # iNaT is the int64 minimum

    pos = other > 0
    neg = other < 0

    if not pos.any():
        overflowed = ((lower - other > arr) & checkable).any()
    elif not neg.any():
        overflowed = ((upper - other < arr) & checkable).any()
    else:
        overflowed = ((upper - other[pos] < arr[pos]) & checkable[pos]).any() or (
            (lower - other[neg] > arr[neg]) & checkable[neg]
        ).any()

    if overflowed:
        raise OverflowError("Overflow in int64 addition")

    result = arr + b
    if arr_mask is not None or other_mask is not None:
        # Masked (NaN) positions get iNaT regardless of the computed sum.
        np.putmask(result, ~checkable, iNaT)

    return result
1254# --------------- #
1255# select n #
1256# --------------- #
class SelectN:
    """
    Base class for n-largest/n-smallest selection.

    Subclasses implement ``compute``; ``nlargest``/``nsmallest`` are thin,
    final entry points that dispatch to it.
    """

    def __init__(self, obj, n: int, keep: str) -> None:
        # Validate `keep` eagerly so misuse fails at construction time.
        if keep not in ("first", "last", "all"):
            raise ValueError('keep must be either "first", "last" or "all"')
        self.obj = obj
        self.n = n
        self.keep = keep

    def compute(self, method: str) -> DataFrame | Series:
        # Implemented by SelectNSeries / SelectNFrame.
        raise NotImplementedError

    @final
    def nlargest(self):
        return self.compute("nlargest")

    @final
    def nsmallest(self):
        return self.compute("nsmallest")

    @final
    @staticmethod
    def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
        """
        Helper function to determine if dtype is valid for
        nsmallest/nlargest methods
        """
        if needs_i8_conversion(dtype):
            # datetime64/timedelta64/period order correctly via their i8 view
            return True
        return is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
class SelectNSeries(SelectN):
    """
    Implement n largest/smallest for Series

    Parameters
    ----------
    obj : Series
    n : int
    keep : {'first', 'last'}, default 'first'

    Returns
    -------
    nordered : Series
    """

    def compute(self, method: str) -> Series:
        # method is "nlargest" or "nsmallest" (set by SelectN.nlargest/nsmallest)

        from pandas.core.reshape.concat import concat

        n = self.n
        dtype = self.obj.dtype
        if not self.is_valid_dtype_n_method(dtype):
            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")

        if n <= 0:
            # empty selection of the same Series type
            return self.obj[[]]

        dropped = self.obj.dropna()
        # the NA entries that were dropped; re-appended at the end below
        nan_index = self.obj.drop(dropped.index)

        # slow method
        if n >= len(self.obj):
            # asking for at least everything: a full sort is simplest
            ascending = method == "nsmallest"
            return self.obj.sort_values(ascending=ascending).head(n)

        # fast method
        new_dtype = dropped.dtype
        arr = _ensure_data(dropped.values)
        if method == "nlargest":
            # negate so that "nlargest" reduces to the nsmallest machinery
            arr = -arr
            if is_integer_dtype(new_dtype):
                # GH 21426: ensure reverse ordering at boundaries
                arr -= 1

            elif is_bool_dtype(new_dtype):
                # GH 26154: ensure False is smaller than True
                arr = 1 - (-arr)

        if self.keep == "last":
            # reverse so that later duplicates win the "first n" selection
            arr = arr[::-1]

        nbase = n  # the originally requested n, before clipping to len(arr)
        narr = len(arr)
        n = min(n, narr)

        # arr passed into kth_smallest must be contiguous. We copy
        # here because kth_smallest will modify its input
        kth_val = algos.kth_smallest(arr.copy(order="C"), n - 1)
        # positions of all candidates <= the kth value (may exceed n on ties)
        (ns,) = np.nonzero(arr <= kth_val)
        # stable sort of the candidates so ties keep their original order
        inds = ns[arr[ns].argsort(kind="mergesort")]

        if self.keep != "all":
            inds = inds[:n]
            findex = nbase
        else:
            # keep="all": retain every tie; NaNs may pad up to nbase rows
            if len(inds) < nbase and len(nan_index) + len(inds) >= nbase:
                findex = len(nan_index) + len(inds)
            else:
                findex = len(inds)

        if self.keep == "last":
            # reverse indices
            inds = narr - 1 - inds

        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
class SelectNFrame(SelectN):
    """
    Implement n largest/smallest for DataFrame

    Parameters
    ----------
    obj : DataFrame
    n : int
    keep : {'first', 'last'}, default 'first'
    columns : list or str

    Returns
    -------
    nordered : DataFrame
    """

    def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
        super().__init__(obj, n, keep)
        # normalize `columns` to a list; a tuple is a single (MultiIndex)
        # column label, not a sequence of labels
        if not is_list_like(columns) or isinstance(columns, tuple):
            columns = [columns]

        columns = cast(Sequence[Hashable], columns)
        columns = list(columns)
        self.columns = columns

    def compute(self, method: str) -> DataFrame:
        # method is "nlargest" or "nsmallest" (set by SelectN.nlargest/nsmallest)

        from pandas.core.api import Int64Index

        n = self.n
        frame = self.obj
        columns = self.columns

        # validate every ranking column's dtype up-front
        for column in columns:
            dtype = frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError(
                    f"Column {repr(column)} has dtype {dtype}, "
                    f"cannot use method {repr(method)} with this dtype"
                )

        def get_indexer(current_indexer, other_indexer):
            """
            Helper function to concat `current_indexer` and `other_indexer`
            depending on `method`
            """
            if method == "nsmallest":
                return current_indexer.append(other_indexer)
            else:
                return other_indexer.append(current_indexer)

        # Below we save and reset the index in case index contains duplicates
        original_index = frame.index
        cur_frame = frame = frame.reset_index(drop=True)
        cur_n = n
        indexer = Int64Index([])

        for i, column in enumerate(columns):
            # For each column we apply method to cur_frame[column].
            # If it's the last column or if we have the number of
            # results desired we are done.
            # Otherwise there are duplicates of the largest/smallest
            # value and we need to look at the rest of the columns
            # to determine which of the rows with the largest/smallest
            # value in the column to keep.
            series = cur_frame[column]
            is_last_column = len(columns) - 1 == i
            # keep="all" on intermediate columns so ties survive for the
            # later tie-breaking columns
            values = getattr(series, method)(
                cur_n, keep=self.keep if is_last_column else "all"
            )

            if is_last_column or len(values) <= cur_n:
                indexer = get_indexer(indexer, values.index)
                break

            # Now find all values which are equal to
            # the (nsmallest: largest)/(nlargest: smallest)
            # from our series.
            border_value = values == values[values.index[-1]]

            # Some of these values are among the top-n
            # some aren't.
            unsafe_values = values[border_value]

            # These values are definitely among the top-n
            safe_values = values[~border_value]
            indexer = get_indexer(indexer, safe_values.index)

            # Go on and separate the unsafe_values on the remaining
            # columns.
            cur_frame = cur_frame.loc[unsafe_values.index]
            cur_n = n - len(indexer)

        frame = frame.take(indexer)

        # Restore the index on frame
        frame.index = original_index.take(indexer)

        # If there is only one column, the frame is already sorted.
        if len(columns) == 1:
            return frame

        ascending = method == "nsmallest"

        # mergesort is stable, so rows tied on `columns` keep selection order
        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
1475# ---- #
1476# take #
1477# ---- #
def take(
    arr,
    indices: TakeIndexer,
    axis: int = 0,
    allow_fill: bool = False,
    fill_value=None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : array-like or scalar value
        Non array-likes (sequences/scalars without a dtype) are coerced
        to an ndarray.
    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> import pandas as pd

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...                        fill_value=-10)
    array([ 10,  10, -10])
    """
    # Coerce sequences/scalars without a dtype to an ndarray.
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if not allow_fill:
        # NumPy semantics: negative indices count from the end.
        return arr.take(indices, axis=axis)

    # pandas semantics: -1 marks a missing slot to be set to fill_value;
    # any other negative index is rejected by validate_indices.
    validate_indices(indices, arr.shape[axis])
    return take_nd(arr, indices, axis=axis, allow_fill=True, fill_value=fill_value)
1577# ------------ #
1578# searchsorted #
1579# ------------ #
1582def searchsorted(
1583 arr: ArrayLike,
1584 value: NumpyValueArrayLike | ExtensionArray,
1585 side: Literal["left", "right"] = "left",
1586 sorter: NumpySorter = None,
1587) -> npt.NDArray[np.intp] | np.intp:
1588 """
1589 Find indices where elements should be inserted to maintain order.
1591 .. versionadded:: 0.25.0
1593 Find the indices into a sorted array `arr` (a) such that, if the
1594 corresponding elements in `value` were inserted before the indices,
1595 the order of `arr` would be preserved.
1597 Assuming that `arr` is sorted:
1599 ====== ================================
1600 `side` returned index `i` satisfies
1601 ====== ================================
1602 left ``arr[i-1] < value <= self[i]``
1603 right ``arr[i-1] <= value < self[i]``
1604 ====== ================================
1606 Parameters
1607 ----------
1608 arr: np.ndarray, ExtensionArray, Series
1609 Input array. If `sorter` is None, then it must be sorted in
1610 ascending order, otherwise `sorter` must be an array of indices
1611 that sort it.
1612 value : array-like or scalar
1613 Values to insert into `arr`.
1614 side : {'left', 'right'}, optional
1615 If 'left', the index of the first suitable location found is given.
1616 If 'right', return the last such index. If there is no suitable
1617 index, return either 0 or N (where N is the length of `self`).
1618 sorter : 1-D array-like, optional
1619 Optional array of integer indices that sort array a into ascending
1620 order. They are typically the result of argsort.
1622 Returns
1623 -------
1624 array of ints or int
1625 If value is array-like, array of insertion points.
1626 If value is scalar, a single integer.
1628 See Also
1629 --------
1630 numpy.searchsorted : Similar method from NumPy.
1631 """
1632 if sorter is not None:
1633 sorter = ensure_platform_int(sorter)
1635 if (
1636 isinstance(arr, np.ndarray)
1637 and is_integer_dtype(arr.dtype)
1638 and (is_integer(value) or is_integer_dtype(value))
1639 ):
1640 # if `arr` and `value` have different dtypes, `arr` would be
1641 # recast by numpy, causing a slow search.
1642 # Before searching below, we therefore try to give `value` the
1643 # same dtype as `arr`, while guarding against integer overflows.
1644 iinfo = np.iinfo(arr.dtype.type)
1645 value_arr = np.array([value]) if is_scalar(value) else np.array(value)
1646 if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
1647 # value within bounds, so no overflow, so can convert value dtype
1648 # to dtype of arr
1649 dtype = arr.dtype
1650 else:
1651 dtype = value_arr.dtype
1653 if is_scalar(value):
1654 # We know that value is int
1655 value = cast(int, dtype.type(value))
1656 else:
1657 value = pd_array(cast(ArrayLike, value), dtype=dtype)
1658 else:
1659 # E.g. if `arr` is an array with dtype='datetime64[ns]'
1660 # and `value` is a pd.Timestamp, we may need to convert value
1661 arr = ensure_wrapped_if_datetimelike(arr)
1663 # Argument 1 to "searchsorted" of "ndarray" has incompatible type
1664 # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
1665 return arr.searchsorted(value, side=side, sorter=sorter) # type: ignore[arg-type]
1668# ---- #
1669# diff #
1670# ---- #
# dtypes with a specialized cython diff_2d kernel; everything else falls
# back to the generic shifted-subtraction path below
_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: int = 0):
    """
    difference of n between self,
    analogous to s-s.shift(n)

    Parameters
    ----------
    arr : ndarray or ExtensionArray
    n : int
        number of periods
    axis : {0, 1}
        axis to shift on
    stacklevel : int, default 3
        The stacklevel for the lost dtype warning.

    Returns
    -------
    shifted
    """

    n = int(n)
    na = np.nan  # fill value for the leading/trailing positions with no lag
    dtype = arr.dtype

    # booleans diff via xor (True ^ False == "changed"), everything else
    # via ordinary subtraction
    is_bool = is_bool_dtype(dtype)
    if is_bool:
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, PandasDtype):
        # PandasArray cannot necessarily hold shifted versions of itself.
        arr = arr.to_numpy()
        dtype = arr.dtype

    if not isinstance(dtype, np.dtype):
        # i.e ExtensionDtype
        if hasattr(arr, f"__{op.__name__}__"):
            if axis != 0:
                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
            # ExtensionArray knows how to subtract its own shifted self
            return op(arr, arr.shift(n))
        else:
            warnings.warn(
                "dtype lost in 'diff()'. In the future this will raise a "
                "TypeError. Convert to a suitable dtype prior to calling 'diff'.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            arr = np.asarray(arr)
            dtype = arr.dtype

    is_timedelta = False
    if needs_i8_conversion(arr.dtype):
        # datetime64/timedelta64: diff on the int64 view, restore units below
        dtype = np.int64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool:
        # We have to cast in order to be able to hold np.nan
        dtype = np.object_

    elif is_integer_dtype(dtype):
        # We have to cast in order to be able to hold np.nan

        # int8, int16 are incompatible with float64,
        # see https://github.com/cython/cython/issues/2646
        if arr.dtype.name in ["int8", "int16"]:
            dtype = np.float32
        else:
            dtype = np.float64

    orig_ndim = arr.ndim
    if orig_ndim == 1:
        # reshape so we can always use algos.diff_2d
        arr = arr.reshape(-1, 1)
        # TODO: require axis == 0

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    # the first |n| positions along `axis` have no lagged counterpart
    na_indexer = [slice(None)] * 2
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        # out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        # a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * 2
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * 2
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])

    if is_timedelta:
        # differences of datetimelike i8 values are timedeltas
        out_arr = out_arr.view("timedelta64[ns]")

    if orig_ndim == 1:
        # undo the reshape done above
        out_arr = out_arr[:, 0]
    return out_arr
1785# --------------------------------------------------------------------
1786# Helper functions
1788# Note: safe_sort is in algorithms.py instead of sorting.py because it is
1789# low-dependency, is used in this module, and used private methods from
1790# this module.
def safe_sort(
    values,
    codes=None,
    na_sentinel: int = -1,
    assume_unique: bool = False,
    verify: bool = True,
) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : list_like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``codes`` to mark "not found".
        Ignored when ``codes`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray or MultiIndex
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )
    original_values = values
    is_mi = isinstance(original_values, ABCMultiIndex)

    if not isinstance(values, (np.ndarray, ABCExtensionArray)):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any],
        # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        values = np.asarray(values, dtype=dtype)  # type: ignore[arg-type]

    # sorter stays None when argsort is not applicable (mixed types); the
    # codes path below falls back to a hashtable lookup in that case
    sorter = None
    ordered: np.ndarray | MultiIndex

    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            if is_mi:
                # Operate on original object instead of casted array (MultiIndex)
                ordered = original_values.take(sorter)
            else:
                ordered = values.take(sorter)
        except TypeError:
            # Previous sorters failed or were not applicable, try `_sort_mixed`
            # which would work, but which fails for special case of 1d arrays
            # with tuples.
            if values.size and isinstance(values[0], tuple):
                ordered = _sort_tuples(values, original_values)
            else:
                ordered = _sort_mixed(values)

    # codes:

    if codes is None:
        return ordered

    if not is_list_like(codes):
        raise TypeError(
            "Only list-like objects or None are allowed to "
            "be passed to safe_sort as codes"
        )
    codes = ensure_platform_int(np.asarray(codes))

    if not assume_unique and not len(unique(values)) == len(values):
        raise ValueError("values should be unique if codes is not None")

    if sorter is None:
        # mixed types
        hash_klass, values = _get_hashtable_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        # recover the permutation by looking up each sorted value's
        # original position in the hashtable
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_nd is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_codes = take_nd(order2, codes, fill_value=-1)
        if verify:
            mask = (codes < -len(values)) | (codes >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_codes = reverse_indexer.take(codes, mode="wrap")

        mask = codes == na_sentinel
        if verify:
            mask = mask | (codes < -len(values)) | (codes >= len(values))

    if mask is not None:
        # out-of-bound (and sentinel) positions are marked "not found"
        np.putmask(new_codes, mask, na_sentinel)

    return ordered, ensure_platform_int(new_codes)
1928def _sort_mixed(values) -> np.ndarray:
1929 """order ints before strings in 1d arrays, safe in py3"""
1930 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
1931 none_pos = np.array([x is None for x in values], dtype=bool)
1932 nums = np.sort(values[~str_pos & ~none_pos])
1933 strs = np.sort(values[str_pos])
1934 return np.concatenate(
1935 [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])]
1936 )
@overload
def _sort_tuples(values: np.ndarray, original_values: np.ndarray) -> np.ndarray:
    ...


@overload
def _sort_tuples(values: np.ndarray, original_values: MultiIndex) -> MultiIndex:
    ...


def _sort_tuples(
    values: np.ndarray, original_values: np.ndarray | MultiIndex
) -> np.ndarray | MultiIndex:
    """
    Convert array of tuples (1d) to array or array (2d).
    We need to keep the columns separately as they contain different types and
    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
    column as types cannot be compared).
    We have to apply the indexer to the original values to keep the dtypes in
    case of MultiIndexes
    """
    from pandas.core.internals.construction import to_arrays
    from pandas.core.sorting import lexsort_indexer

    # split the tuples into per-position columns so each can keep its own type
    arrays, _ = to_arrays(values, None)
    # lexicographic sort order across those columns, all ascending
    indexer = lexsort_indexer(arrays, orders=True)
    return original_values[indexer]
def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
    """
    Extracts the union from lvals and rvals with respect to duplicates and nans in
    both arrays.

    Parameters
    ----------
    lvals: np.ndarray or ExtensionArray
        left values which is ordered in front.
    rvals: np.ndarray or ExtensionArray
        right values ordered after lvals.

    Returns
    -------
    np.ndarray or ExtensionArray
        Containing the unsorted union of both arrays.

    Notes
    -----
    Caller is responsible for ensuring lvals.dtype == rvals.dtype.
    """
    # Count occurrences per side; align so values missing from one side
    # count as 0 there.
    left_counts = value_counts(lvals, dropna=False)
    right_counts = value_counts(rvals, dropna=False)
    left_counts, right_counts = left_counts.align(right_counts, fill_value=0)

    # Unique values in first-appearance order (lvals first, then rvals).
    unique_vals = unique(concat_compat([lvals, rvals]))
    unique_vals = ensure_wrapped_if_datetimelike(unique_vals)

    # Repeat each unique value max(left count, right count) times.
    repeat_positions: list[int] = []
    for pos, val in enumerate(unique_vals):
        repeat_positions.extend(
            [pos] * int(max(left_counts.at[val], right_counts.at[val]))
        )
    return unique_vals.take(repeat_positions)