Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/sorting.py: 8%
255 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1""" miscellaneous sorting / groupby utilities """
2from __future__ import annotations
4from collections import defaultdict
5from typing import (
6 TYPE_CHECKING,
7 Callable,
8 DefaultDict,
9 Hashable,
10 Iterable,
11 Sequence,
12 cast,
13)
14import warnings
16import numpy as np
18from pandas._libs import (
19 algos,
20 hashtable,
21 lib,
22)
23from pandas._libs.hashtable import unique_label_indices
24from pandas._typing import (
25 IndexKeyFunc,
26 Level,
27 NaPosition,
28 Shape,
29 SortKind,
30 npt,
31)
33from pandas.core.dtypes.common import (
34 ensure_int64,
35 ensure_platform_int,
36 is_extension_array_dtype,
37)
38from pandas.core.dtypes.generic import (
39 ABCMultiIndex,
40 ABCRangeIndex,
41)
42from pandas.core.dtypes.missing import isna
44from pandas.core.construction import extract_array
46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true
47 from pandas import MultiIndex
48 from pandas.core.arrays import ExtensionArray
49 from pandas.core.indexes.base import Index
def get_indexer_indexer(
    target: Index,
    level: Level | list[Level] | None,
    ascending: Sequence[bool] | bool,
    kind: SortKind,
    na_position: NaPosition,
    sort_remaining: bool,
    key: IndexKeyFunc,
) -> npt.NDArray[np.intp] | None:
    """
    Return the indexer used to sort ``target``, or None if already sorted.

    Shared helper behind the ``sort_index`` method of DataFrame and Series.

    Parameters
    ----------
    target : Index
    level : int or level name or list of ints or list of level names
    ascending : bool or list of bools, default True
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
    na_position : {'first', 'last'}, default 'last'
    sort_remaining : bool, default True
    key : callable, optional

    Returns
    -------
    Optional[ndarray[intp]]
        The indexer for the new index, or None when no reordering is needed.
    """
    target = ensure_key_mapped(target, key, levels=level)
    target = target._sort_levels_monotonic()

    if level is not None:
        # sortlevel handles both single levels and lists of levels
        _, indexer = target.sortlevel(
            level, ascending=ascending, sort_remaining=sort_remaining
        )
        return indexer

    if isinstance(target, ABCMultiIndex):
        return lexsort_indexer(
            target._get_codes_for_sorting(), orders=ascending, na_position=na_position
        )

    # Check monotonic-ness before sort an index (GH 11080)
    if (ascending and target.is_monotonic_increasing) or (
        not ascending and target.is_monotonic_decreasing
    ):
        return None

    # ascending can only be a Sequence for MultiIndex, so the cast is safe here
    return nargsort(
        target,
        kind=kind,
        ascending=cast(bool, ascending),
        na_position=na_position,
    )
def get_group_index(
    labels, shape: Shape, sort: bool, xnull: bool
) -> npt.NDArray[np.int64]:
    """
    For the particular label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
    combinations, *as long as* this space fits within int64 bounds;
    otherwise, though group indices identify unique combinations of
    labels, they cannot be deconstructed.
    - If `sort`, rank of returned ids preserve lexical ranks of labels.
      i.e. returned id's can be used to do lexical sort on labels;
    - If `xnull` nulls (-1 labels) are passed through.

    Parameters
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
    shape : tuple[int, ...]
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
    xnull : bool
        If true nulls are excluded. i.e. -1 values in the labels are
        passed through.

    Returns
    -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all location.

    Notes
    -----
    The length of `labels` and `shape` must be identical.
    """

    def _int64_cut_off(shape) -> int:
        # number of leading levels whose cartesian product still fits in int64
        acc = 1
        for i, mul in enumerate(shape):
            acc *= int(mul)
            if not acc < lib.i8max:
                return i
        return len(shape)

    def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
        # promote nan values (assigned -1 label in lab array)
        # so that all output values are non-negative
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

    labels = [ensure_int64(x) for x in labels]
    lshape = list(shape)
    if not xnull:
        for i, (lab, size) in enumerate(zip(labels, shape)):
            lab, size = maybe_lift(lab, size)
            labels[i] = lab
            lshape[i] = size

    labels = list(labels)

    # Iteratively process all the labels in chunks sized so less
    # than lib.i8max unique int ids will be required for each chunk
    while True:
        # how many levels can be done without overflow:
        nlev = _int64_cut_off(lshape)

        # compute flat ids for the first `nlev` levels
        stride = np.prod(lshape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
            if lshape[i] == 0:
                stride = np.int64(0)
            else:
                stride //= lshape[i]
            out += labels[i] * stride

        if xnull:  # exclude nulls: any -1 label poisons the whole row
            mask = labels[0] == -1
            for lab in labels[1:nlev]:
                mask |= lab == -1
            out[mask] = -1

        if nlev == len(lshape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow;
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        labels = [comp_ids] + labels[nlev:]
        lshape = [len(obs_ids)] + lshape[nlev:]

    return out
def get_compressed_ids(
    labels, sizes: Shape
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).

    Parameters
    ----------
    labels : list of label arrays
    sizes : tuple[int] of size of the levels

    Returns
    -------
    np.ndarray[np.intp]
        comp_ids
    np.ndarray[np.int64]
        obs_group_ids
    """
    # xnull=False: lift -1 (null) labels so the flat ids stay non-negative
    ids = get_group_index(labels, sizes, sort=True, xnull=False)
    return compress_group_index(ids, sort=True)
def is_int64_overflow_possible(shape: Shape) -> bool:
    """
    Return True if the cartesian product of the level sizes in ``shape``
    could overflow an int64 (so flat group ids cannot be deconstructed).
    """
    the_prod = 1
    for x in shape:
        # int() avoids numpy-integer overflow during the running product
        the_prod *= int(x)

    return the_prod >= lib.i8max
235def _decons_group_index(
236 comp_labels: npt.NDArray[np.intp], shape: Shape
237) -> list[npt.NDArray[np.intp]]:
238 # reconstruct labels
239 if is_int64_overflow_possible(shape):
240 # at some point group indices are factorized,
241 # and may not be deconstructed here! wrong path!
242 raise ValueError("cannot deconstruct factorized group indices!")
244 label_list = []
245 factor = 1
246 y = np.array(0)
247 x = comp_labels
248 for i in reversed(range(len(shape))):
249 labels = (x - y) % (factor * shape[i]) // factor
250 np.putmask(labels, comp_labels < 0, -1)
251 label_list.append(labels)
252 y = labels * factor
253 factor *= shape[i]
254 return label_list[::-1]
def decons_obs_group_ids(
    comp_ids: npt.NDArray[np.intp],
    obs_ids: npt.NDArray[np.intp],
    shape: Shape,
    labels: Sequence[npt.NDArray[np.signedinteger]],
    xnull: bool,
) -> list[npt.NDArray[np.intp]]:
    """
    Reconstruct labels from observed group ids.

    Parameters
    ----------
    comp_ids : np.ndarray[np.intp]
    obs_ids: np.ndarray[np.intp]
    shape : tuple[int]
    labels : Sequence[np.ndarray[np.signedinteger]]
    xnull : bool
        If nulls are excluded; i.e. -1 labels are passed through.
    """
    if not xnull:
        # labels were lifted by +1 per level containing -1; widen shape to match
        lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp)
        arr_shape = np.asarray(shape, dtype=np.intp) + lift
        shape = tuple(arr_shape)

    if not is_int64_overflow_possible(shape):
        # obs ids are deconstructable! take the fast route!
        out = _decons_group_index(obs_ids, shape)
        # undo the lift applied above (only relevant when not xnull)
        return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]

    # slow route: pick the first occurrence of each unique comp_id
    indexer = unique_label_indices(comp_ids)
    return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
def indexer_from_factorized(
    labels, shape: Shape, compress: bool = True
) -> npt.NDArray[np.intp]:
    """
    Return the indexer that stably sorts rows by the factorized ``labels``.

    Parameters
    ----------
    labels : sequence of integer label arrays
    shape : tuple[int, ...]
        Number of unique levels per label array.
    compress : bool, default True
        Compress the flat group ids before sorting; avoids a counting sort
        over the full (possibly sparse) cartesian-product id space.
    """
    ids = get_group_index(labels, shape, sort=True, xnull=False)

    if not compress:
        # `ids.size and ...` keeps ngroups == 1 for empty input
        ngroups = (ids.size and ids.max()) + 1
    else:
        ids, obs = compress_group_index(ids, sort=True)
        ngroups = len(obs)

    return get_group_index_sorter(ids, ngroups)
def lexsort_indexer(
    keys, orders=None, na_position: str = "last", key: Callable | None = None
) -> npt.NDArray[np.intp]:
    """
    Performs lexical sorting on a set of keys

    Parameters
    ----------
    keys : sequence of arrays
        Sequence of ndarrays to be sorted by the indexer
    orders : bool or list of booleans, optional
        Determines the sorting order for each element in keys. If a list,
        it must be the same length as keys. This determines whether the
        corresponding element in keys should be sorted in ascending
        (True) or descending (False) order. if bool, applied to all
        elements as above. if None, defaults to True.
    na_position : {'first', 'last'}, default 'last'
        Determines placement of NA elements in the sorted list ("last" or "first")
    key : Callable, optional
        Callable key function applied to every element in keys before sorting

        .. versionadded:: 1.0.0

    Returns
    -------
    np.ndarray[np.intp]
    """
    from pandas.core.arrays import Categorical

    # Validate once up front rather than re-checking inside the per-key loop;
    # this also raises for an invalid na_position when `keys` is empty.
    if na_position not in ["last", "first"]:
        raise ValueError(f"invalid na_position: {na_position}")

    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    keys = [ensure_key_mapped(k, key) for k in keys]

    labels = []
    shape = []
    for k, order in zip(keys, orders):
        with warnings.catch_warnings():
            # TODO(2.0): unnecessary once deprecation is enforced
            # GH#45618 don't issue warning user can't do anything about
            warnings.filterwarnings(
                "ignore", ".*(SparseArray|SparseDtype).*", category=FutureWarning
            )

            cat = Categorical(k, ordered=True)

        n = len(cat.categories)
        codes = cat.codes.copy()

        # Remap codes so NaNs (-1) land at the requested end and the requested
        # order becomes plain ascending order on the remapped codes.
        mask = cat.codes == -1
        if order:  # ascending
            if na_position == "last":
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # one extra slot was consumed for the NaN bucket
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, tuple(shape))
def nargsort(
    items,
    kind: str = "quicksort",
    ascending: bool = True,
    na_position: str = "last",
    key: Callable | None = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.intp]:
    """
    Intended to be a drop-in replacement for np.argsort which handles NaNs.

    Adds ascending, na_position, and key parameters.

    (GH #6399, #5231, #27237)

    Parameters
    ----------
    kind : str, default 'quicksort'
    ascending : bool, default True
    na_position : {'first', 'last'}, default 'last'
    key : Optional[Callable], default None
    mask : Optional[np.ndarray[bool]], default None
        Passed when called by ExtensionArray.argsort.

    Returns
    -------
    np.ndarray[np.intp]
    """
    if key is not None:
        # apply the key once, then recurse with key=None to hit the main path
        items = ensure_key_mapped(items, key)
        return nargsort(
            items,
            kind=kind,
            ascending=ascending,
            na_position=na_position,
            key=None,
            mask=mask,
        )

    if isinstance(items, ABCRangeIndex):
        return items.argsort(ascending=ascending)  # TODO: test coverage with key?
    elif not isinstance(items, ABCMultiIndex):
        items = extract_array(items)
    if mask is None:
        mask = np.asarray(isna(items))  # TODO: does this exclude MultiIndex too?

    if is_extension_array_dtype(items):
        # EAs handle their own NA placement
        return items.argsort(ascending=ascending, kind=kind, na_position=na_position)
    else:
        items = np.asanyarray(items)

    idx = np.arange(len(items))
    non_nans = items[~mask]
    non_nan_idx = idx[~mask]

    nan_idx = np.nonzero(mask)[0]
    if not ascending:
        # reverse before the stable sort (and re-reverse after) so that ties
        # keep their original relative order in descending output
        non_nans = non_nans[::-1]
        non_nan_idx = non_nan_idx[::-1]
    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
    if not ascending:
        indexer = indexer[::-1]
    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == "last":
        indexer = np.concatenate([indexer, nan_idx])
    elif na_position == "first":
        indexer = np.concatenate([nan_idx, indexer])
    else:
        raise ValueError(f"invalid na_position: {na_position}")
    return ensure_platform_int(indexer)
def nargminmax(values: ExtensionArray, method: str, axis: int = 0):
    """
    Implementation of np.argmin/argmax but for ExtensionArray and which
    handles missing values.

    Parameters
    ----------
    values : ExtensionArray
    method : {"argmax", "argmin"}
    axis : int, default 0

    Returns
    -------
    int
    """
    assert method in {"argmax", "argmin"}
    func = np.argmax if method == "argmax" else np.argmin

    mask = np.asarray(isna(values))
    arr_values = values._values_for_argsort()

    if arr_values.ndim > 1:
        if mask.any():
            # NA-aware path: reduce row-by-row (or column-by-column)
            if axis == 1:
                zipped = zip(arr_values, mask)
            else:
                zipped = zip(arr_values.T, mask.T)
            return np.array([_nanargminmax(v, m, func) for v, m in zipped])
        # no NAs: defer to numpy directly
        return func(arr_values, axis=axis)

    return _nanargminmax(arr_values, mask, func)
485def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int:
486 """
487 See nanargminmax.__doc__.
488 """
489 idx = np.arange(values.shape[0])
490 non_nans = values[~mask]
491 non_nan_idx = idx[~mask]
493 return non_nan_idx[func(non_nans)]
496def _ensure_key_mapped_multiindex(
497 index: MultiIndex, key: Callable, level=None
498) -> MultiIndex:
499 """
500 Returns a new MultiIndex in which key has been applied
501 to all levels specified in level (or all levels if level
502 is None). Used for key sorting for MultiIndex.
504 Parameters
505 ----------
506 index : MultiIndex
507 Index to which to apply the key function on the
508 specified levels.
509 key : Callable
510 Function that takes an Index and returns an Index of
511 the same shape. This key is applied to each level
512 separately. The name of the level can be used to
513 distinguish different levels for application.
514 level : list-like, int or str, default None
515 Level or list of levels to apply the key function to.
516 If None, key function is applied to all levels. Other
517 levels are left unchanged.
519 Returns
520 -------
521 labels : MultiIndex
522 Resulting MultiIndex with modified levels.
523 """
525 if level is not None:
526 if isinstance(level, (str, int)):
527 sort_levels = [level]
528 else:
529 sort_levels = level
531 sort_levels = [index._get_level_number(lev) for lev in sort_levels]
532 else:
533 sort_levels = list(range(index.nlevels)) # satisfies mypy
535 mapped = [
536 ensure_key_mapped(index._get_level_values(level), key)
537 if level in sort_levels
538 else index._get_level_values(level)
539 for level in range(index.nlevels)
540 ]
542 return type(index).from_arrays(mapped)
def ensure_key_mapped(values, key: Callable | None, levels=None):
    """
    Applies a callable key function to the values function and checks
    that the resulting value has the same shape. Can be called on Index
    subclasses, Series, DataFrames, or ndarrays.

    Parameters
    ----------
    values : Series, DataFrame, Index subclass, or ndarray
    key : Optional[Callable], key to be called on the values array
    levels : Optional[List], if values is a MultiIndex, list of levels to
        apply the key to.

    Returns
    -------
    The key-mapped values, converted back to the input's type (or a new
    Index subclass for Index inputs). ``values`` unchanged when key is None.

    Raises
    ------
    ValueError
        If the key function changes the length of the input.
    TypeError
        If the key's return value cannot be converted back.
    """
    from pandas.core.indexes.api import Index

    if not key:
        return values

    if isinstance(values, ABCMultiIndex):
        return _ensure_key_mapped_multiindex(values, key, level=levels)

    # copy() so a mutating key cannot corrupt the caller's data
    result = key(values.copy())
    if len(result) != len(values):
        raise ValueError(
            "User-provided `key` function must not change the shape of the array."
        )

    try:
        if isinstance(
            values, Index
        ):  # convert to a new Index subclass, not necessarily the same
            result = Index(result)
        else:
            type_of_values = type(values)
            result = type_of_values(result)  # try to revert to original type otherwise
    except TypeError as err:
        # adjacent f-strings (no backslash continuation) keep the message clean,
        # and `from err` preserves the original conversion failure
        raise TypeError(
            f"User-provided `key` function returned an invalid type {type(result)} "
            f"which could not be converted to {type(values)}."
        ) from err

    return result
def get_flattened_list(
    comp_ids: npt.NDArray[np.intp],
    ngroups: int,
    levels: Iterable[Index],
    labels: Iterable[np.ndarray],
) -> list[tuple]:
    """Map compressed group id -> key tuple."""
    comp_ids = comp_ids.astype(np.int64, copy=False)
    arrays: DefaultDict[int, list[int]] = defaultdict(list)
    for labs, level in zip(labels, levels):
        # one hashtable per level: comp_id -> label code for that level
        table = hashtable.Int64HashTable(ngroups)
        table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False))
        for i in range(ngroups):
            # translate the label code to the actual level value
            arrays[i].append(level[table.get_item(i)])
    return [tuple(array) for array in arrays.values()]
def get_indexer_dict(
    label_list: list[np.ndarray], keys: list[Index]
) -> dict[Hashable, npt.NDArray[np.intp]]:
    """
    Returns
    -------
    dict:
        Labels mapped to indexers.
    """
    shape = tuple(len(x) for x in keys)

    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    if np.all(group_index == -1):
        # Short-circuit, lib.indices_fast will return the same
        return {}

    if is_int64_overflow_possible(shape):
        # flat-id space overflows: bound ngroups by the max observed id
        # (`size and ...` keeps this 1 for empty input)
        ngroups = (group_index.size and group_index.max()) + 1
    else:
        ngroups = np.prod(shape, dtype="i8")

    sorter = get_group_index_sorter(group_index, ngroups)

    sorted_labels = [lab.take(sorter) for lab in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
635# ----------------------------------------------------------------------
636# sorting levels...cleverly?
def get_group_index_sorter(
    group_index: npt.NDArray[np.intp], ngroups: int | None = None
) -> npt.NDArray[np.intp]:
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')

    Parameters
    ----------
    group_index : np.ndarray[np.intp]
        signed integer dtype
    ngroups : int or None, default None

    Returns
    -------
    np.ndarray[np.intp]
    """
    if ngroups is None:
        ngroups = 1 + group_index.max()
    count = len(group_index)
    # crude cost model: counting sort ~ alpha + beta * ngroups,
    # mergesort ~ count * log(count)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(
            ensure_platform_int(group_index),
            ngroups,
        )
        # sorter _should_ already be intp, but mypy is not yet able to verify
    else:
        sorter = group_index.argsort(kind="mergesort")
    return ensure_platform_int(sorter)
def compress_group_index(
    group_index: npt.NDArray[np.int64], sort: bool = True
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).
    """
    size_hint = len(group_index)
    table = hashtable.Int64HashTable(size_hint)

    group_index = ensure_int64(group_index)

    # note, group labels come out ascending (ie, 1,2,3 etc)
    comp_ids, obs_group_ids = table.get_labels_groupby(group_index)

    if sort and len(obs_group_ids) > 0:
        # re-label so ids follow the sorted order of the observed uniques
        obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)

    return ensure_int64(comp_ids), ensure_int64(obs_group_ids)
704def _reorder_by_uniques(
705 uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp]
706) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
707 """
708 Parameters
709 ----------
710 uniques : np.ndarray[np.int64]
711 labels : np.ndarray[np.intp]
713 Returns
714 -------
715 np.ndarray[np.int64]
716 np.ndarray[np.intp]
717 """
718 # sorter is index where elements ought to go
719 sorter = uniques.argsort()
721 # reverse_indexer is where elements came from
722 reverse_indexer = np.empty(len(sorter), dtype=np.intp)
723 reverse_indexer.put(sorter, np.arange(len(sorter)))
725 mask = labels < 0
727 # move labels to right locations (ie, unsort ascending labels)
728 labels = reverse_indexer.take(labels)
729 np.putmask(labels, mask, -1)
731 # sort observed ids
732 uniques = uniques.take(sorter)
734 return uniques, labels