Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/sorting.py: 8%
255 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1""" miscellaneous sorting / groupby utilities """
2from __future__ import annotations
4from collections import defaultdict
5from typing import (
6 TYPE_CHECKING,
7 Callable,
8 DefaultDict,
9 Hashable,
10 Iterable,
11 Sequence,
12 cast,
13)
14import warnings
16import numpy as np
18from pandas._libs import (
19 algos,
20 hashtable,
21 lib,
22)
23from pandas._libs.hashtable import unique_label_indices
24from pandas._typing import (
25 IndexKeyFunc,
26 Level,
27 NaPosition,
28 Shape,
29 SortKind,
30 npt,
31)
33from pandas.core.dtypes.common import (
34 ensure_int64,
35 ensure_platform_int,
36 is_extension_array_dtype,
37)
38from pandas.core.dtypes.generic import (
39 ABCMultiIndex,
40 ABCRangeIndex,
41)
42from pandas.core.dtypes.missing import isna
44from pandas.core.construction import extract_array
46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true
47 from pandas import MultiIndex
48 from pandas.core.arrays import ExtensionArray
49 from pandas.core.indexes.base import Index
def get_indexer_indexer(
    target: Index,
    level: Level | list[Level] | None,
    ascending: Sequence[bool] | bool,
    kind: SortKind,
    na_position: NaPosition,
    sort_remaining: bool,
    key: IndexKeyFunc,
) -> npt.NDArray[np.intp] | None:
    """
    Return the indexer used to sort ``target``, or None if already sorted.

    Shared helper behind the ``sort_index`` method of DataFrame and Series.

    Parameters
    ----------
    target : Index
    level : int or level name or list of ints or list of level names
    ascending : bool or list of bools, default True
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
    na_position : {'first', 'last'}, default 'last'
    sort_remaining : bool, default True
    key : callable, optional

    Returns
    -------
    Optional[ndarray[intp]]
        The indexer for the new index, or None when no reordering is needed.
    """
    target = ensure_key_mapped(target, key, levels=level)
    target = target._sort_levels_monotonic()

    if level is not None:
        # sortlevel handles both single levels and lists of levels
        _, indexer = target.sortlevel(
            level, ascending=ascending, sort_remaining=sort_remaining
        )
        return indexer

    if isinstance(target, ABCMultiIndex):
        return lexsort_indexer(
            target._get_codes_for_sorting(), orders=ascending, na_position=na_position
        )

    # Check monotonic-ness before sort an index (GH 11080)
    if (ascending and target.is_monotonic_increasing) or (
        not ascending and target.is_monotonic_decreasing
    ):
        return None

    # ascending can only be a Sequence for MultiIndex, so the cast is safe here
    return nargsort(
        target,
        kind=kind,
        ascending=cast(bool, ascending),
        na_position=na_position,
    )
def get_group_index(
    labels, shape: Shape, sort: bool, xnull: bool
) -> npt.NDArray[np.int64]:
    """
    For the particular label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
    combinations, *as long as* this space fits within int64 bounds;
    otherwise, though group indices identify unique combinations of
    labels, they cannot be deconstructed.
    - If `sort`, rank of returned ids preserve lexical ranks of labels.
      i.e. returned id's can be used to do lexical sort on labels;
    - If `xnull` nulls (-1 labels) are passed through.

    Parameters
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
    shape : tuple[int, ...]
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
    xnull : bool
        If true nulls are excluded. i.e. -1 values in the labels are
        passed through.

    Returns
    -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all location.

    Notes
    -----
    The length of `labels` and `shape` must be identical.
    """

    def _int64_cut_off(shape) -> int:
        # number of leading levels whose cartesian product still fits in int64
        acc = 1
        for i, mul in enumerate(shape):
            acc *= int(mul)
            if not acc < lib.i8max:
                return i
        return len(shape)

    def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
        # promote nan values (assigned -1 label in lab array)
        # so that all output values are non-negative
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

    labels = [ensure_int64(x) for x in labels]
    lshape = list(shape)
    if not xnull:
        for i, (lab, size) in enumerate(zip(labels, shape)):
            lab, size = maybe_lift(lab, size)
            labels[i] = lab
            lshape[i] = size

    labels = list(labels)

    # Iteratively process all the labels in chunks sized so less
    # than lib.i8max unique int ids will be required for each chunk
    while True:
        # how many levels can be done without overflow:
        nlev = _int64_cut_off(lshape)

        # compute flat ids for the first `nlev` levels
        stride = np.prod(lshape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
            if lshape[i] == 0:
                stride = np.int64(0)
            else:
                stride //= lshape[i]
            out += labels[i] * stride

        if xnull:  # exclude nulls: any -1 label poisons the whole row
            mask = labels[0] == -1
            for lab in labels[1:nlev]:
                mask |= lab == -1
            out[mask] = -1

        if nlev == len(lshape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow;
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        labels = [comp_ids] + labels[nlev:]
        lshape = [len(obs_ids)] + lshape[nlev:]

    return out
def get_compressed_ids(
    labels, sizes: Shape
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).

    Parameters
    ----------
    labels : list of label arrays
    sizes : tuple[int] of size of the levels

    Returns
    -------
    np.ndarray[np.intp]
        comp_ids
    np.ndarray[np.int64]
        obs_group_ids
    """
    # xnull=False: lift -1 (null) labels so the flat ids stay non-negative
    ids = get_group_index(labels, sizes, sort=True, xnull=False)
    return compress_group_index(ids, sort=True)
def is_int64_overflow_possible(shape: Shape) -> bool:
    """
    Return True if the cartesian product of the level sizes in ``shape``
    could overflow an int64 (so flat group ids cannot be deconstructed).
    """
    the_prod = 1
    for x in shape:
        # int() avoids numpy-integer overflow during the running product
        the_prod *= int(x)

    return the_prod >= lib.i8max
235def _decons_group_index(
236 comp_labels: npt.NDArray[np.intp], shape: Shape
237) -> list[npt.NDArray[np.intp]]:
238 # reconstruct labels
239 if is_int64_overflow_possible(shape):
240 # at some point group indices are factorized,
241 # and may not be deconstructed here! wrong path!
242 raise ValueError("cannot deconstruct factorized group indices!")
244 label_list = []
245 factor = 1
246 y = np.array(0)
247 x = comp_labels
248 for i in reversed(range(len(shape))):
249 labels = (x - y) % (factor * shape[i]) // factor
250 np.putmask(labels, comp_labels < 0, -1)
251 label_list.append(labels)
252 y = labels * factor
253 factor *= shape[i]
254 return label_list[::-1]
def decons_obs_group_ids(
    comp_ids: npt.NDArray[np.intp],
    obs_ids: npt.NDArray[np.intp],
    shape: Shape,
    labels: Sequence[npt.NDArray[np.signedinteger]],
    xnull: bool,
) -> list[npt.NDArray[np.intp]]:
    """
    Reconstruct labels from observed group ids.

    Parameters
    ----------
    comp_ids : np.ndarray[np.intp]
    obs_ids: np.ndarray[np.intp]
    shape : tuple[int]
    labels : Sequence[np.ndarray[np.signedinteger]]
    xnull : bool
        If nulls are excluded; i.e. -1 labels are passed through.
    """
    if not xnull:
        # labels were lifted by +1 per level containing -1; widen shape to match
        lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp)
        arr_shape = np.asarray(shape, dtype=np.intp) + lift
        shape = tuple(arr_shape)

    if not is_int64_overflow_possible(shape):
        # obs ids are deconstructable! take the fast route!
        out = _decons_group_index(obs_ids, shape)
        # undo the lift applied above (only relevant when not xnull)
        return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]

    # slow route: pick the first occurrence of each unique comp_id
    indexer = unique_label_indices(comp_ids)
    return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
def indexer_from_factorized(
    labels, shape: Shape, compress: bool = True
) -> npt.NDArray[np.intp]:
    """
    Return the indexer that stably sorts rows by the factorized ``labels``.

    Parameters
    ----------
    labels : sequence of integer label arrays
    shape : tuple[int, ...]
        Number of unique levels per label array.
    compress : bool, default True
        Compress the flat group ids before sorting; avoids a counting sort
        over the full (possibly sparse) cartesian-product id space.
    """
    ids = get_group_index(labels, shape, sort=True, xnull=False)

    if not compress:
        # `ids.size and ...` keeps ngroups == 1 for empty input
        ngroups = (ids.size and ids.max()) + 1
    else:
        ids, obs = compress_group_index(ids, sort=True)
        ngroups = len(obs)

    return get_group_index_sorter(ids, ngroups)
def lexsort_indexer(
    keys, orders=None, na_position: str = "last", key: Callable | None = None
) -> npt.NDArray[np.intp]:
    """
    Performs lexical sorting on a set of keys

    Parameters
    ----------
    keys : sequence of arrays
        Sequence of ndarrays to be sorted by the indexer
    orders : bool or list of booleans, optional
        Determines the sorting order for each element in keys. If a list,
        it must be the same length as keys. This determines whether the
        corresponding element in keys should be sorted in ascending
        (True) or descending (False) order. if bool, applied to all
        elements as above. if None, defaults to True.
    na_position : {'first', 'last'}, default 'last'
        Determines placement of NA elements in the sorted list ("last" or "first")
    key : Callable, optional
        Callable key function applied to every element in keys before sorting

        .. versionadded:: 1.0.0

    Returns
    -------
    np.ndarray[np.intp]
    """
    from pandas.core.arrays import Categorical

    # Validate once up front rather than re-checking inside the per-key loop;
    # this also raises for an invalid na_position when `keys` is empty.
    if na_position not in ["last", "first"]:
        raise ValueError(f"invalid na_position: {na_position}")

    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    keys = [ensure_key_mapped(k, key) for k in keys]

    labels = []
    shape = []
    for k, order in zip(keys, orders):
        with warnings.catch_warnings():
            # TODO(2.0): unnecessary once deprecation is enforced
            # GH#45618 don't issue warning user can't do anything about
            warnings.filterwarnings(
                "ignore", ".*(SparseArray|SparseDtype).*", category=FutureWarning
            )

            cat = Categorical(k, ordered=True)

        n = len(cat.categories)
        codes = cat.codes.copy()

        # Remap codes so NaNs (-1) land at the requested end and the requested
        # order becomes plain ascending order on the remapped codes.
        mask = cat.codes == -1
        if order:  # ascending
            if na_position == "last":
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # one extra slot was consumed for the NaN bucket
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, tuple(shape))
def nargsort(
    items,
    kind: str = "quicksort",
    ascending: bool = True,
    na_position: str = "last",
    key: Callable | None = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.intp]:
    """
    Intended to be a drop-in replacement for np.argsort which handles NaNs.

    Adds ascending, na_position, and key parameters.

    (GH #6399, #5231, #27237)

    Parameters
    ----------
    kind : str, default 'quicksort'
    ascending : bool, default True
    na_position : {'first', 'last'}, default 'last'
    key : Optional[Callable], default None
    mask : Optional[np.ndarray[bool]], default None
        Passed when called by ExtensionArray.argsort.

    Returns
    -------
    np.ndarray[np.intp]
    """
    if key is not None:
        # apply the key once, then recurse with key=None to hit the main path
        items = ensure_key_mapped(items, key)
        return nargsort(
            items,
            kind=kind,
            ascending=ascending,
            na_position=na_position,
            key=None,
            mask=mask,
        )

    if isinstance(items, ABCRangeIndex):
        return items.argsort(ascending=ascending)  # TODO: test coverage with key?
    elif not isinstance(items, ABCMultiIndex):
        items = extract_array(items)
    if mask is None:
        mask = np.asarray(isna(items))  # TODO: does this exclude MultiIndex too?

    if is_extension_array_dtype(items):
        # EAs handle their own NA placement
        return items.argsort(ascending=ascending, kind=kind, na_position=na_position)
    else:
        items = np.asanyarray(items)

    idx = np.arange(len(items))
    non_nans = items[~mask]
    non_nan_idx = idx[~mask]

    nan_idx = np.nonzero(mask)[0]
    if not ascending:
        # reverse before the stable sort (and re-reverse after) so that ties
        # keep their original relative order in descending output
        non_nans = non_nans[::-1]
        non_nan_idx = non_nan_idx[::-1]
    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
    if not ascending:
        indexer = indexer[::-1]
    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == "last":
        indexer = np.concatenate([indexer, nan_idx])
    elif na_position == "first":
        indexer = np.concatenate([nan_idx, indexer])
    else:
        raise ValueError(f"invalid na_position: {na_position}")
    return ensure_platform_int(indexer)
def nargminmax(values: ExtensionArray, method: str, axis: int = 0):
    """
    Implementation of np.argmin/argmax but for ExtensionArray and which
    handles missing values.

    Parameters
    ----------
    values : ExtensionArray
    method : {"argmax", "argmin"}
    axis : int, default 0

    Returns
    -------
    int
    """
    assert method in {"argmax", "argmin"}
    func = np.argmax if method == "argmax" else np.argmin

    mask = np.asarray(isna(values))
    arr_values = values._values_for_argsort()

    if arr_values.ndim > 1:
        if mask.any():
            # NA-aware path: reduce row-by-row (or column-by-column)
            if axis == 1:
                zipped = zip(arr_values, mask)
            else:
                zipped = zip(arr_values.T, mask.T)
            return np.array([_nanargminmax(v, m, func) for v, m in zipped])
        # no NAs: defer to numpy directly
        return func(arr_values, axis=axis)

    return _nanargminmax(arr_values, mask, func)
485def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int:
486 """
487 See nanargminmax.__doc__.
488 """
489 idx = np.arange(values.shape[0])
490 non_nans = values[~mask]
491 non_nan_idx = idx[~mask]
493 return non_nan_idx[func(non_nans)]
496def _ensure_key_mapped_multiindex(
497 index: MultiIndex, key: Callable, level=None
498) -> MultiIndex:
499 """
500 Returns a new MultiIndex in which key has been applied
501 to all levels specified in level (or all levels if level
502 is None). Used for key sorting for MultiIndex.
504 Parameters
505 ----------
506 index : MultiIndex
507 Index to which to apply the key function on the
508 specified levels.
509 key : Callable
510 Function that takes an Index and returns an Index of
511 the same shape. This key is applied to each level
512 separately. The name of the level can be used to
513 distinguish different levels for application.
514 level : list-like, int or str, default None
515 Level or list of levels to apply the key function to.
516 If None, key function is applied to all levels. Other
517 levels are left unchanged.
519 Returns
520 -------
521 labels : MultiIndex
522 Resulting MultiIndex with modified levels.
523 """
525 if level is not None:
526 if isinstance(level, (str, int)):
527 sort_levels = [level]
528 else:
529 sort_levels = level
531 sort_levels = [index._get_level_number(lev) for lev in sort_levels]
532 else:
533 sort_levels = list(range(index.nlevels)) # satisfies mypy
535 mapped = [
536 ensure_key_mapped(index._get_level_values(level), key)
537 if level in sort_levels
538 else index._get_level_values(level)
539 for level in range(index.nlevels)
540 ]
542 return type(index).from_arrays(mapped)
def ensure_key_mapped(values, key: Callable | None, levels=None):
    """
    Applies a callable key function to the values function and checks
    that the resulting value has the same shape. Can be called on Index
    subclasses, Series, DataFrames, or ndarrays.

    Parameters
    ----------
    values : Series, DataFrame, Index subclass, or ndarray
    key : Optional[Callable], key to be called on the values array
    levels : Optional[List], if values is a MultiIndex, list of levels to
        apply the key to.

    Returns
    -------
    The key-mapped values, converted back to the input's type (or a new
    Index subclass for Index inputs). ``values`` unchanged when key is None.

    Raises
    ------
    ValueError
        If the key function changes the length of the input.
    TypeError
        If the key's return value cannot be converted back.
    """
    from pandas.core.indexes.api import Index

    if not key:
        return values

    if isinstance(values, ABCMultiIndex):
        return _ensure_key_mapped_multiindex(values, key, level=levels)

    # copy() so a mutating key cannot corrupt the caller's data
    result = key(values.copy())
    if len(result) != len(values):
        raise ValueError(
            "User-provided `key` function must not change the shape of the array."
        )

    try:
        if isinstance(
            values, Index
        ):  # convert to a new Index subclass, not necessarily the same
            result = Index(result)
        else:
            type_of_values = type(values)
            result = type_of_values(result)  # try to revert to original type otherwise
    except TypeError as err:
        # adjacent f-strings (no backslash continuation) keep the message clean,
        # and `from err` preserves the original conversion failure
        raise TypeError(
            f"User-provided `key` function returned an invalid type {type(result)} "
            f"which could not be converted to {type(values)}."
        ) from err

    return result
def get_flattened_list(
    comp_ids: npt.NDArray[np.intp],
    ngroups: int,
    levels: Iterable[Index],
    labels: Iterable[np.ndarray],
) -> list[tuple]:
    """Map compressed group id -> key tuple."""
    comp_ids = comp_ids.astype(np.int64, copy=False)
    arrays: DefaultDict[int, list[int]] = defaultdict(list)
    for labs, level in zip(labels, levels):
        # one hashtable per level: comp_id -> label code for that level
        table = hashtable.Int64HashTable(ngroups)
        table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False))
        for i in range(ngroups):
            # translate the label code to the actual level value
            arrays[i].append(level[table.get_item(i)])
    return [tuple(array) for array in arrays.values()]
def get_indexer_dict(
    label_list: list[np.ndarray], keys: list[Index]
) -> dict[Hashable, npt.NDArray[np.intp]]:
    """
    Returns
    -------
    dict:
        Labels mapped to indexers.
    """
    shape = tuple(len(x) for x in keys)

    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    if np.all(group_index == -1):
        # Short-circuit, lib.indices_fast will return the same
        return {}

    if is_int64_overflow_possible(shape):
        # flat-id space overflows: bound ngroups by the max observed id
        # (`size and ...` keeps this 1 for empty input)
        ngroups = (group_index.size and group_index.max()) + 1
    else:
        ngroups = np.prod(shape, dtype="i8")

    sorter = get_group_index_sorter(group_index, ngroups)

    sorted_labels = [lab.take(sorter) for lab in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
635# ----------------------------------------------------------------------
636# sorting levels...cleverly?
def get_group_index_sorter(
    group_index: npt.NDArray[np.intp], ngroups: int | None = None
) -> npt.NDArray[np.intp]:
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')

    Parameters
    ----------
    group_index : np.ndarray[np.intp]
        signed integer dtype
    ngroups : int or None, default None

    Returns
    -------
    np.ndarray[np.intp]
    """
    if ngroups is None:
        ngroups = 1 + group_index.max()
    count = len(group_index)
    # crude cost model: counting sort ~ alpha + beta * ngroups,
    # mergesort ~ count * log(count)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(
            ensure_platform_int(group_index),
            ngroups,
        )
        # sorter _should_ already be intp, but mypy is not yet able to verify
    else:
        sorter = group_index.argsort(kind="mergesort")
    return ensure_platform_int(sorter)
def compress_group_index(
    group_index: npt.NDArray[np.int64], sort: bool = True
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).
    """
    size_hint = len(group_index)
    table = hashtable.Int64HashTable(size_hint)

    group_index = ensure_int64(group_index)

    # note, group labels come out ascending (ie, 1,2,3 etc)
    comp_ids, obs_group_ids = table.get_labels_groupby(group_index)

    if sort and len(obs_group_ids) > 0:
        # re-label so ids follow the sorted order of the observed uniques
        obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)

    return ensure_int64(comp_ids), ensure_int64(obs_group_ids)
704def _reorder_by_uniques(
705 uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp]
706) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
707 """
708 Parameters
709 ----------
710 uniques : np.ndarray[np.int64]
711 labels : np.ndarray[np.intp]
713 Returns
714 -------
715 np.ndarray[np.int64]
716 np.ndarray[np.intp]
717 """
718 # sorter is index where elements ought to go
719 sorter = uniques.argsort()
721 # reverse_indexer is where elements came from
722 reverse_indexer = np.empty(len(sorter), dtype=np.intp)
723 reverse_indexer.put(sorter, np.arange(len(sorter)))
725 mask = labels < 0
727 # move labels to right locations (ie, unsort ascending labels)
728 labels = reverse_indexer.take(labels)
729 np.putmask(labels, mask, -1)
731 # sort observed ids
732 uniques = uniques.take(sorter)
734 return uniques, labels