Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/tile.py: 8%

1"""

2Quantilization functions and related stuff

3"""

4from __future__ import annotations

6from typing import (

7 Any,

8 Callable,

9 Literal,

10)

12import numpy as np

14from pandas._libs import (

15 Timedelta,

16 Timestamp,

17)

18from pandas._libs.lib import infer_dtype

19from pandas._typing import IntervalLeftRight

21from pandas.core.dtypes.common import (

22 DT64NS_DTYPE,

23 ensure_platform_int,

24 is_bool_dtype,

25 is_categorical_dtype,

26 is_datetime64_dtype,

27 is_datetime64tz_dtype,

28 is_datetime_or_timedelta_dtype,

29 is_extension_array_dtype,

30 is_integer,

31 is_list_like,

32 is_numeric_dtype,

33 is_scalar,

34 is_timedelta64_dtype,

35)

36from pandas.core.dtypes.generic import ABCSeries

37from pandas.core.dtypes.missing import isna

39from pandas import (

40 Categorical,

41 Index,

42 IntervalIndex,

43 to_datetime,

44 to_timedelta,

45)

46import pandas.core.algorithms as algos

47import pandas.core.nanops as nanops

def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Bin values into discrete intervals.

    Use `cut` when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable. For example, `cut` could convert ages to groups of
    age ranges. Supports binning into an equal number of bins, or a
    pre-specified array of bins.

    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.

        * int : Defines the number of equal-width bins in the range of `x`. The
          range of `x` is extended by .1% on each side to include the minimum
          and maximum values of `x`.
        * sequence of scalars : Defines the bin edges allowing for non-uniform
          width. No extension of the range of `x` is done.
        * IntervalIndex : Defines the exact bins to be used. Note that
          IntervalIndex for `bins` must be non-overlapping.

    right : bool, default True
        Indicates whether `bins` includes the rightmost edge or not. If
        ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
        indicate (1,2], (2,3], (3,4]. This argument is ignored when
        `bins` is an IntervalIndex.
    labels : array or False, default None
        Specifies the labels for the returned bins. Must be the same length as
        the resulting bins. If False, returns only integer indicators of the
        bins. This affects the type of the output container (see below).
        This argument is ignored when `bins` is an IntervalIndex. If True,
        raises an error. When `ordered=False`, labels must be provided.
    retbins : bool, default False
        Whether to return the bins or not. Useful when bins is provided
        as a scalar.
    precision : int, default 3
        The precision at which to store and display the bins labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.
    ordered : bool, default True
        Whether the labels are ordered or not. Applies to returned types
        Categorical and Series (with Categorical dtype). If True,
        the resulting categorical will be ordered. If False, the resulting
        categorical will be unordered (labels must be provided).

        .. versionadded:: 1.1.0

    Returns
    -------
    out : Categorical, Series, or ndarray
        An array-like object representing the respective bin for each value
        of `x`. The type depends on the value of `labels`.

        * None (default) : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are Interval dtype.

        * sequence of scalars : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are whatever the type in the sequence is.

        * False : returns an ndarray of integers.

    bins : numpy.ndarray or IntervalIndex.
        The computed or specified bins. Only returned when `retbins=True`.
        For scalar or sequence `bins`, this is an ndarray with the computed
        bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
        an IntervalIndex `bins`, this is equal to `bins`.

    Raises
    ------
    ValueError
        If an integer `bins` is smaller than 1, if `x` is empty or contains
        infinity while `bins` is an integer, if an IntervalIndex `bins` is
        overlapping, or if a sequence of `bins` is not monotonically
        increasing.

    See Also
    --------
    qcut : Discretize variable into equal-sized buckets based on rank
        or based on sample quantiles.
    Categorical : Array type for storing data that come from a
        fixed set of values.
    Series : One-dimensional array with axis labels (including time series).
    IntervalIndex : Immutable Index implementing an ordered, sliceable set.

    Notes
    -----
    Any NA values will be NA in the result. Out of bounds values will be NA in
    the resulting Series or Categorical object.

    Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.

    Examples
    --------
    Discretize into three equal-sized bins.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    ... # doctest: +ELLIPSIS
    [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    ... # doctest: +ELLIPSIS
    ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
    array([0.994, 3.   , 5.   , 7.   ]))

    Discovers the same bins, but assign them specific labels. Notice that
    the returned Categorical's categories are `labels` and is ordered.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    ['bad', 'good', 'medium', 'medium', 'good', 'bad']
    Categories (3, object): ['bad' < 'medium' < 'good']

    ``ordered=False`` will result in unordered categories when labels are passed.
    This parameter can be used to allow non-unique labels:

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
    ...        labels=["B", "A", "B"], ordered=False)
    ['B', 'B', 'A', 'A', 'B', 'B']
    Categories (2, object): ['A', 'B']

    ``labels=False`` implies you just want the bins back.

    >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3])

    Passing a Series as an input returns a Series with categorical dtype:

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, 3)
    ... # doctest: +ELLIPSIS
    a    (1.992, 4.667]
    b    (1.992, 4.667]
    c    (4.667, 7.333]
    d     (7.333, 10.0]
    e     (7.333, 10.0]
    dtype: category
    Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...

    Passing a Series as an input returns a Series with mapping value.
    It is used to map numerically to intervals based on bins.

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    4.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6,  8, 10]))

    Use `drop` optional when bins is not unique

    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
    ...        right=False, duplicates='drop')
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    3.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6, 10]))

    Passing an IntervalIndex for `bins` results in those categories exactly.
    Notice that values not covered by the IntervalIndex are set to NaN. 0
    is to the left of the first bin (which is closed on the right), and 1.5
    falls between two bins.

    >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
    [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
    Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
    """
    # NOTE: this binning code is changed a bit from histogram for var(x) == 0

    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    if not np.iterable(bins):
        # `bins` is a count: derive equal-width edges from the data range.
        if is_scalar(bins) and bins < 1:
            raise ValueError("`bins` should be a positive integer.")

        if not hasattr(x, "size"):
            # not array-like yet; materialise so that `.size` is available
            x = np.asarray(x)

        if x.size == 0:
            raise ValueError("Cannot cut empty array")

        lowest, highest = nanops.nanmin(x), nanops.nanmax(x)
        # force float arithmetic for the edge adjustments below
        lowest, highest = lowest + 0.0, highest + 0.0

        if np.isinf(lowest) or np.isinf(highest):
            # GH 24314
            raise ValueError(
                "cannot specify integer `bins` when input data contains infinity"
            )

        if lowest == highest:
            # degenerate range: widen the end points *before* binning
            lowest -= 0.001 * abs(lowest) if lowest != 0 else 0.001
            highest += 0.001 * abs(highest) if highest != 0 else 0.001
            bins = np.linspace(lowest, highest, bins + 1, endpoint=True)
        else:
            # regular range: nudge the open end point *after* binning
            bins = np.linspace(lowest, highest, bins + 1, endpoint=True)
            margin = (highest - lowest) * 0.001  # 0.1% of the range
            if right:
                bins[0] -= margin
            else:
                bins[-1] += margin

    elif isinstance(bins, IntervalIndex):
        if bins.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    else:
        # explicit sequence of edges; tz-aware edges are stored as datetime64[ns]
        edge_dtype = DT64NS_DTYPE if is_datetime64tz_dtype(bins) else None
        bins = np.asarray(bins, dtype=edge_dtype)
        bins = _convert_bin_to_numeric_type(bins, dtype)

        # GH 26045: cast to float64 to avoid an overflow
        steps = np.diff(bins.astype("float64"))
        if (steps < 0).any():
            raise ValueError("bins must increase monotonically.")

    fac, bins = _bins_to_cuts(
        x,
        bins,
        right=right,
        labels=labels,
        precision=precision,
        include_lowest=include_lowest,
        dtype=dtype,
        duplicates=duplicates,
        ordered=ordered,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)

306

307

def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    # An integer `q` means "that many equal-probability buckets"; anything
    # else is taken as the quantile levels themselves.
    if is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q

    # Bin edges are the sample quantiles of the non-NaN observations.
    values = np.asarray(x)
    is_missing = np.isnan(values)
    bins = np.quantile(values[~is_missing], quantiles)

    fac, bins = _bins_to_cuts(
        x,
        bins,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)

390

391

def _bins_to_cuts(
    x,
    bins: np.ndarray,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Assign each value of `x` to one of the intervals delimited by `bins`.

    Parameters
    ----------
    x : array-like
        Values to bin.
    bins : np.ndarray or IntervalIndex
        Bin edges, or the exact intervals to use.
    right : bool, default True
        Whether the intervals are closed on the right.
    labels : list-like, None or False
        Labels for the bins; None builds interval labels, False returns the
        integer bin positions instead.
    precision : int, default 3
        Precision passed on when interval labels are generated.
    include_lowest : bool, default False
        Whether values equal to the first edge go into the first bin.
    dtype : dtype, optional
        Original datelike dtype of the data, passed on to label formatting.
    duplicates : {'raise', 'drop'}
        What to do when the bin edges are not unique.
    ordered : bool, default True
        Whether the resulting categorical is ordered.

    Returns
    -------
    tuple
        ``(result, bins)`` where `bins` are the edges actually used.

    Raises
    ------
    ValueError
        On an invalid `duplicates` value, non-unique edges with
        ``duplicates="raise"``, or invalid `labels`.
    """
    if labels is None and not ordered:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ("raise", "drop"):
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        codes = bins.get_indexer(x)
        return Categorical.from_codes(codes, categories=bins, ordered=True), bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        bins = unique_bins

    # Right-closed intervals need a left-sided search (and vice versa) so that
    # a value equal to an edge falls into the interval that is closed there.
    side: Literal["left", "right"] = "left" if right else "right"
    ids = ensure_platform_int(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[np.asarray(x) == bins[0]] = 1

    # position 0 / len(bins) means "left of the first edge" / "right of the last"
    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)

    if labels is False:
        result = ids - 1
        if na_mask.any():
            # integers cannot hold NaN, so switch to float first
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)
        return result, bins

    if labels is not None and not is_list_like(labels):
        raise ValueError(
            "Bin labels must either be False, None or passed in as a "
            "list-like argument"
        )

    if labels is None:
        labels = _format_labels(
            bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
        )
    elif ordered and len(set(labels)) != len(labels):
        raise ValueError(
            "labels must be unique if ordered=True; pass ordered=False "
            "for duplicate labels"
        )
    elif len(labels) != len(bins) - 1:
        raise ValueError("Bin labels must be one fewer than the number of bin edges")

    if not is_categorical_dtype(labels):
        labels = Categorical(
            labels,
            categories=labels if len(set(labels)) == len(labels) else None,
            ordered=ordered,
        )
    # TODO: handle mismatch between categorical label order and pandas.cut order.
    # Masked positions become code -1 after the shift below.
    np.putmask(ids, na_mask, 0)
    result = algos.take_nd(labels, ids - 1)

    return result, bins

474

475

def _coerce_to_type(x):
    """
    Convert datetime/timedelta, bool or nullable numeric data to a numeric
    representation that cut and qcut can handle.

    Parameters
    ----------
    x : array-like
        Data exposing a ``dtype`` attribute.

    Returns
    -------
    tuple
        ``(x, dtype)`` where `dtype` is the original datelike dtype, or None
        when the input was not datetime/timedelta data.
    """
    source_dtype = x.dtype
    datelike_dtype = None

    if is_datetime64tz_dtype(source_dtype):
        datelike_dtype = source_dtype
    elif is_datetime64_dtype(source_dtype):
        x = to_datetime(x)
        datelike_dtype = np.dtype("datetime64[ns]")
    elif is_timedelta64_dtype(source_dtype):
        x = to_timedelta(x)
        datelike_dtype = np.dtype("timedelta64[ns]")
    elif is_bool_dtype(source_dtype):
        # GH 20303
        x = x.astype(np.int64)
    elif is_extension_array_dtype(source_dtype) and is_numeric_dtype(source_dtype):
        # To support cut and qcut for IntegerArray we convert to float dtype.
        # Will properly support in the future.
        # https://github.com/pandas-dev/pandas/pull/31290
        # https://github.com/pandas-dev/pandas/issues/31389
        x = x.to_numpy(dtype=np.float64, na_value=np.nan)

    if datelike_dtype is None:
        return x, None

    # GH 19768: force NaT to NaN during integer conversion
    as_numbers = np.where(x.notna(), x.view(np.int64), np.nan)
    return as_numbers, datelike_dtype

507

508

def _convert_bin_to_numeric_type(bins, dtype):
    """
    Convert datetime/timedelta bins to their integer representation.

    Bins are returned unchanged when `dtype` is not a datetime or timedelta
    dtype.

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Returns
    -------
    bins : the integer view of the bins for datelike `dtype`, else the input

    Raises
    ------
    ValueError if bins are not of a compat dtype to dtype
    """
    inferred = infer_dtype(bins, skipna=False)

    if is_timedelta64_dtype(dtype):
        if inferred not in ("timedelta", "timedelta64"):
            raise ValueError("bins must be of timedelta64 dtype")
        return to_timedelta(bins).view(np.int64)

    if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        if inferred not in ("datetime", "datetime64"):
            raise ValueError("bins must be of datetime64 dtype")
        return to_datetime(bins).view(np.int64)

    return bins

536

537

def _convert_bin_to_datelike_type(bins, dtype):
    """
    Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
    datelike; otherwise return them untouched.

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Returns
    -------
    bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
           datelike
    """
    if is_datetime64tz_dtype(dtype):
        # integers are read as UTC instants and then shown in the data's tz
        as_utc = to_datetime(bins.astype(np.int64), utc=True)
        return as_utc.tz_convert(dtype.tz)

    if is_datetime_or_timedelta_dtype(dtype):
        return Index(bins.astype(np.int64), dtype=dtype)

    return bins

558

559

def _format_labels(
    bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None
):
    """
    Build the interval labels for `bins`, formatted according to `dtype`.

    Datetime (tz-aware or naive) edges become Timestamps, timedelta edges
    become Timedeltas, and any other edges are rounded to an inferred
    precision. The result is an IntervalIndex built from those breaks and
    closed on the side selected by `right`.
    """
    closed: IntervalLeftRight = "left" if not right else "right"

    def _back_one_nanosecond(value):
        # smallest possible step for datetime/timedelta edges
        return value - Timedelta("1ns")

    formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]

    if is_datetime64tz_dtype(dtype):
        formatter = lambda x: Timestamp(x, tz=dtype.tz)
        adjust = _back_one_nanosecond
    elif is_datetime64_dtype(dtype):
        formatter = Timestamp
        adjust = _back_one_nanosecond
    elif is_timedelta64_dtype(dtype):
        formatter = Timedelta
        adjust = _back_one_nanosecond
    else:
        digits = _infer_precision(precision, bins)

        def formatter(value):
            return _round_frac(value, digits)

        def adjust(value):
            # step back by one unit in the last displayed decimal place
            return value - 10 ** (-digits)

    breaks = list(map(formatter, bins))
    if right and include_lowest:
        # adjust lhs of first interval by precision to account for being right closed
        breaks[0] = adjust(breaks[0])

    return IntervalIndex.from_breaks(breaks, closed=closed)

588

589

def _preprocess_for_cut(x):
    """
    Prepare the input of cut/qcut: make sure it is array-like and
    one-dimensional.

    Objects that already expose an ``ndim`` attribute (pandas or NumPy
    objects) are passed through as they are, so that a pandas data type
    (e.g. datetimetz) is not stripped away; anything else is converted with
    ``np.asarray``.

    Raises
    ------
    ValueError
        If the (possibly converted) input is not 1-dimensional.
    """
    if getattr(x, "ndim", None) is None:
        # plain Python sequence or scalar: turn it into an ndarray
        x = np.asarray(x)

    if x.ndim != 1:
        raise ValueError("Input array must be 1 dimensional")

    return x

605

606

def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original):
    """
    Finalise the result of cut/qcut.

    When the originally passed data was a Series, the result is wrapped with
    that Series' constructor, index and name. When `retbins` is true the bins
    are converted back to a datelike type (if `dtype` calls for it) and
    returned alongside the result.
    """
    result = fac
    if isinstance(original, ABCSeries):
        result = original._constructor(
            fac, index=original.index, name=original.name
        )

    if retbins:
        return result, _convert_bin_to_datelike_type(bins, dtype)

    return result

622

623

def _round_frac(x, precision: int):
    """
    Round the fractional part of the given number.

    Non-finite values and zero are returned as they are. A number without an
    integer part is rounded so that `precision` digits remain after its
    leading zeros; any other number is rounded to `precision` decimals.
    """
    if x == 0 or not np.isfinite(x):
        return x

    frac, whole = np.modf(x)
    # For pure fractions, skip over the leading zeros before counting digits.
    digits = (
        -int(np.floor(np.log10(abs(frac)))) - 1 + precision
        if whole == 0
        else precision
    )
    return np.around(x, digits)

637

638

def _infer_precision(base_precision: int, bins) -> int:
    """
    Infer an appropriate precision for _round_frac.

    Tries precisions from `base_precision` up to (but not including) 20 and
    returns the first one at which all rounded bin edges are distinct. Falls
    back to `base_precision` when none of them qualifies.
    """
    workable = (
        candidate
        for candidate in range(base_precision, 20)
        if algos.unique([_round_frac(edge, candidate) for edge in bins]).size
        == bins.size
    )
    return next(workable, base_precision)  # default

647 return base_precision # default