Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/sample.py: 11%
57 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Module containing utilities for NDFrame.sample() and .GroupBy.sample()
3"""
4from __future__ import annotations
6from typing import TYPE_CHECKING
8import numpy as np
10from pandas._libs import lib
12from pandas.core.dtypes.generic import (
13 ABCDataFrame,
14 ABCSeries,
15)
17if TYPE_CHECKING: 17 ↛ 18line 17 didn't jump to line 18, because the condition on line 17 was never true
18 from pandas.core.generic import NDFrame
def preprocess_weights(obj: NDFrame, weights, axis: int) -> np.ndarray:
    """
    Process and validate the `weights` argument to `NDFrame.sample` and
    `.GroupBy.sample`.

    Parameters
    ----------
    obj : NDFrame
        The Series or DataFrame being sampled.
    weights : Series, str or array-like
        A Series is aligned to ``obj.axes[axis]`` via ``reindex``. A string
        is looked up as a column of `obj` (only allowed for a DataFrame
        sampled along ``axis=0``). Anything else is handed to the
        Series-like constructor of `obj` as-is.
    axis : int
        The axis of `obj` being sampled.

    Returns
    -------
    np.ndarray
        `weights` converted to float64 with missing values replaced by 0.
        The weights are *not* normalized (that must be done groupwise in
        groupby sampling).

    Raises
    ------
    KeyError
        If `weights` is a string that is not a column of `obj`.
    ValueError
        If `weights` is a string but `obj` is not a DataFrame or `axis` is
        not 0, if the number of weights differs from ``obj.shape[axis]``,
        or if any weight is infinite or negative.
    """
    # If a series, align with frame
    if isinstance(weights, ABCSeries):
        weights = weights.reindex(obj.axes[axis])

    # Strings acceptable if a dataframe and axis = 0
    if isinstance(weights, str):
        if not isinstance(obj, ABCDataFrame):
            raise ValueError(
                "Strings cannot be passed as weights when sampling from a Series."
            )
        if axis != 0:
            raise ValueError(
                "Strings can only be passed to "
                "weights when sampling from rows on "
                "a DataFrame"
            )
        try:
            weights = obj[weights]
        except KeyError as err:
            raise KeyError("String passed to weights not a valid column") from err

    # Build a one-dimensional container of the matching flavour so that the
    # float64 conversion goes through the object's own constructor.
    if isinstance(obj, ABCSeries):
        func = obj._constructor
    else:
        func = obj._constructor_sliced

    weights = func(weights, dtype="float64")._values

    if len(weights) != obj.shape[axis]:
        raise ValueError("Weights and axis to be sampled must be of same length")

    if lib.has_infs(weights):
        raise ValueError("weight vector may not include `inf` values")

    if (weights < 0).any():
        raise ValueError("weight vector may not include negative values")

    missing = np.isnan(weights)
    if missing.any():
        # Don't modify weights in place
        weights = weights.copy()
        weights[missing] = 0
    return weights
def process_sampling_size(
    n: int | None, frac: float | None, replace: bool
) -> int | None:
    """
    Validate the `n` and `frac` arguments to `NDFrame.sample` and
    `.GroupBy.sample` and resolve them to a sampling size.

    Parameters
    ----------
    n : int or None
        Requested number of items.
    frac : float or None
        Requested fraction of items.
    replace : bool
        Whether sampling is done with replacement; required for ``frac > 1``.

    Returns
    -------
    int or None
        The constant sampling size (1 when neither `n` nor `frac` is given),
        or None when `frac` should be used (variable sampling sizes).

    Raises
    ------
    ValueError
        If both `n` and `frac` are given, if either is negative, if `n` is
        not integral, or if ``frac > 1`` without `replace`.
    """
    if n is not None:
        if frac is not None:
            raise ValueError("Please enter a value for `frac` OR `n`, not both")
        if n < 0:
            raise ValueError(
                "A negative number of rows requested. Please provide `n` >= 0."
            )
        if n % 1 != 0:
            raise ValueError("Only integers accepted as `n` values")
        return n

    if frac is None:
        # Neither argument supplied: fall back to a single item.
        return 1

    # Only `frac` was supplied from here on.
    if frac > 1 and not replace:
        raise ValueError(
            "Replace has to be set to `True` when "
            "upsampling the population `frac` > 1."
        )
    if frac < 0:
        raise ValueError(
            "A negative number of rows requested. Please provide `frac` >= 0."
        )
    return None
def sample(
    obj_len: int,
    size: int,
    replace: bool,
    weights: np.ndarray | None,
    random_state: np.random.RandomState | np.random.Generator,
) -> np.ndarray:
    """
    Draw `size` random positions from `np.arange(obj_len)`.

    Parameters
    ----------
    obj_len : int
        Number of positions available to draw from.
    size : int
        How many positions to draw.
    replace : bool
        Whether the same position may be drawn more than once.
    weights : np.ndarray[np.float64] or None
        Relative weight of each position. The vector is normalized to sum
        to one before drawing; None means every position is equally likely.
    random_state : np.random.RandomState or np.random.Generator
        Source of randomness used for the draw.

    Returns
    -------
    np.ndarray[np.intp]
        The drawn positions.

    Raises
    ------
    ValueError
        If `weights` is given and sums to zero.
    """
    probabilities = weights
    if probabilities is not None:
        total = probabilities.sum()
        if total == 0:
            raise ValueError("Invalid weights: weights sum to zero")
        # Division builds a new array, so the caller's weights are untouched.
        probabilities = probabilities / total

    drawn = random_state.choice(
        obj_len, size=size, replace=replace, p=probabilities
    )
    return drawn.astype(np.intp, copy=False)