Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/sample.py: 11%

57 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Module containing utilities for NDFrame.sample() and .GroupBy.sample() 

3""" 

4from __future__ import annotations 

5 

6from typing import TYPE_CHECKING 

7 

8import numpy as np 

9 

10from pandas._libs import lib 

11 

12from pandas.core.dtypes.generic import ( 

13 ABCDataFrame, 

14 ABCSeries, 

15) 

16 

17if TYPE_CHECKING: 17 ↛ 18line 17 didn't jump to line 18, because the condition on line 17 was never true

18 from pandas.core.generic import NDFrame 

19 

20 

def preprocess_weights(obj: NDFrame, weights, axis: int) -> np.ndarray:
    """
    Process and validate the `weights` argument to `NDFrame.sample` and
    `.GroupBy.sample`.

    Returns `weights` as an ndarray[np.float64], validated except for normalizing
    weights (because that must be done groupwise in groupby sampling).

    Parameters
    ----------
    obj : NDFrame
        The object being sampled.
    weights : str, Series or array-like
        A Series is reindexed to ``obj.axes[axis]`` before use. A string is
        looked up as a column of `obj` and is only accepted when `obj` is a
        DataFrame and ``axis == 0``. Anything else is passed to the Series
        constructor of `obj` with ``dtype="float64"``.
    axis : int
        Position in ``obj.axes`` / ``obj.shape`` of the axis being sampled.

    Returns
    -------
    np.ndarray
        float64 weights with NaN entries replaced by 0. Not normalized.

    Raises
    ------
    KeyError
        If `weights` is a string that is not a valid column of `obj`.
    ValueError
        If a string is passed for a Series or for ``axis != 0``, if the number
        of weights differs from ``obj.shape[axis]``, or if any weight is
        infinite or negative.
    """
    # If a series, align with frame
    if isinstance(weights, ABCSeries):
        weights = weights.reindex(obj.axes[axis])

    # Strings acceptable if a dataframe and axis = 0
    if isinstance(weights, str):
        if not isinstance(obj, ABCDataFrame):
            raise ValueError(
                "Strings cannot be passed as weights when sampling from a Series."
            )
        if axis != 0:
            raise ValueError(
                "Strings can only be passed to "
                "weights when sampling from rows on "
                "a DataFrame"
            )
        try:
            weights = obj[weights]
        except KeyError as err:
            raise KeyError("String passed to weights not a valid column") from err

    # Build the weights with the 1-D constructor matching `obj`, then unwrap
    # to the underlying float64 values.
    if isinstance(obj, ABCSeries):
        func = obj._constructor
    else:
        func = obj._constructor_sliced

    weights = func(weights, dtype="float64")._values

    if len(weights) != obj.shape[axis]:
        raise ValueError("Weights and axis to be sampled must be of same length")

    if lib.has_infs(weights):
        raise ValueError("weight vector may not include `inf` values")

    if (weights < 0).any():
        raise ValueError("weight vector may not include negative values")

    missing = np.isnan(weights)
    if missing.any():
        # Don't modify weights in place: the array may share memory with the
        # caller's data.
        weights = weights.copy()
        weights[missing] = 0
    return weights

76 

77 

def process_sampling_size(
    n: int | None, frac: float | None, replace: bool
) -> int | None:
    """
    Process and validate the `n` and `frac` arguments to `NDFrame.sample` and
    `.GroupBy.sample`.

    Returns None if `frac` should be used (variable sampling sizes), otherwise returns
    the constant sampling size.

    Parameters
    ----------
    n : int or None
        Requested number of items. Integral non-integer types (e.g. ``2.0``)
        are accepted and converted to ``int``.
    frac : float or None
        Requested fraction of items.
    replace : bool
        Whether sampling is done with replacement; required for ``frac > 1``.

    Returns
    -------
    int or None
        ``1`` when neither `n` nor `frac` is given, ``int(n)`` when `n` is
        given, and None when `frac` is given.

    Raises
    ------
    ValueError
        If both `n` and `frac` are given, if `n` is negative or not integral,
        if `frac` is negative, or if ``frac > 1`` without `replace`.
    """
    if n is not None and frac is not None:
        raise ValueError("Please enter a value for `frac` OR `n`, not both")

    # If no frac or n, default to n=1.
    if n is None and frac is None:
        return 1

    if n is not None:
        if n < 0:
            raise ValueError(
                "A negative number of rows requested. Please provide `n` >= 0."
            )
        if n % 1 != 0:
            raise ValueError("Only integers accepted as `n` values")
        # Integral floats pass the check above; normalise them so the
        # result honours the annotated ``int`` return type.
        return int(n)

    assert frac is not None  # for mypy
    if frac > 1 and not replace:
        raise ValueError(
            "Replace has to be set to `True` when "
            "upsampling the population `frac` > 1."
        )
    if frac < 0:
        raise ValueError(
            "A negative number of rows requested. Please provide `frac` >= 0."
        )
    return None

113 

114 

def sample(
    obj_len: int,
    size: int,
    replace: bool,
    weights: np.ndarray | None,
    random_state: np.random.RandomState | np.random.Generator,
) -> np.ndarray:
    """
    Draw `size` random positions from ``np.arange(obj_len)``.

    Parameters
    ----------
    obj_len : int
        Number of positions available to draw from.
    size : int
        How many positions to draw.
    replace : bool
        Whether the same position may be drawn more than once.
    weights : np.ndarray[np.float64] or None
        Relative weights of the positions; they are divided by their sum
        before use. None means every position is equally likely.
    random_state : np.random.RandomState or np.random.Generator
        Source of randomness used for the draw.

    Returns
    -------
    np.ndarray[np.intp]

    Raises
    ------
    ValueError
        If `weights` is given and sums to zero.
    """
    probabilities = weights
    if probabilities is not None:
        total = probabilities.sum()
        if total == 0:
            raise ValueError("Invalid weights: weights sum to zero")
        # Division creates a new array, leaving the caller's weights intact.
        probabilities = probabilities / total

    drawn = random_state.choice(
        obj_len, size=size, replace=replace, p=probabilities
    )
    return drawn.astype(np.intp, copy=False)

151 np.intp, copy=False 

152 )