Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/sparse/dtype.py: 28%

139 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1"""Sparse Dtype""" 

2from __future__ import annotations 

3 

4import re 

5from typing import ( 

6 TYPE_CHECKING, 

7 Any, 

8) 

9import warnings 

10 

11import numpy as np 

12 

13from pandas._typing import ( 

14 Dtype, 

15 DtypeObj, 

16 type_t, 

17) 

18from pandas.errors import PerformanceWarning 

19from pandas.util._exceptions import find_stack_level 

20 

21from pandas.core.dtypes.astype import astype_nansafe 

22from pandas.core.dtypes.base import ( 

23 ExtensionDtype, 

24 register_extension_dtype, 

25) 

26from pandas.core.dtypes.common import ( 

27 is_bool_dtype, 

28 is_object_dtype, 

29 is_scalar, 

30 is_string_dtype, 

31 pandas_dtype, 

32) 

33from pandas.core.dtypes.missing import ( 

34 isna, 

35 na_value_for_dtype, 

36) 

37 

38if TYPE_CHECKING: 38 ↛ 39line 38 didn't jump to line 39, because the condition on line 38 was never true

39 from pandas.core.arrays.sparse.array import SparseArray 

40 

41 

@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill value values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")

    def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
        """
        Normalize ``dtype`` and ``fill_value`` and store them on the instance.

        Raises
        ------
        ValueError
            If the resolved ``fill_value`` is not a scalar.
        """
        if isinstance(dtype, type(self)):
            # Unwrap an existing SparseDtype: reuse its subtype and, unless the
            # caller supplied one explicitly, its fill value as well.
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        if is_string_dtype(dtype):
            # String-like subtypes are stored as object dtype.
            dtype = np.dtype("object")

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        self._dtype = dtype
        self._fill_value = fill_value
        self._check_fill_value()

    def __hash__(self) -> int:
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super().__hash__()

    def __eq__(self, other: Any) -> bool:
        """
        Compare two sparse dtypes by subtype and fill value.

        A string ``other`` is first parsed with ``construct_from_string``; a
        string that cannot be parsed compares unequal. Any object that is not
        an instance of this class compares unequal.
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, str):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if isinstance(other, type(self)):
            subtype = self.subtype == other.subtype
            if self._is_na_fill_value:
                # this case is complicated by two things:
                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
                # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
                # i.e. we want to treat any floating-point NaN as equal, but
                # not a floating-point NaN and a datetime NaT.
                #
                # The type check must be grouped explicitly: ``and`` binds
                # tighter than ``or``, so without the parentheses an NA fill
                # value would compare equal to any non-NA fill value of the
                # same type (e.g. nan == 0.0), which is also asymmetric.
                fill_value = other._is_na_fill_value and (
                    isinstance(self.fill_value, type(other.fill_value))
                    or isinstance(other.fill_value, type(self.fill_value))
                )
            else:
                fill_value = self.fill_value == other.fill_value

            return subtype and fill_value
        return False

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    def _check_fill_value(self):
        """
        Validate the stored fill value.

        Raises
        ------
        ValueError
            If ``self._fill_value`` is not a scalar.
        """
        if not is_scalar(self._fill_value):
            raise ValueError(
                f"fill_value must be a scalar. Got {self._fill_value} instead"
            )
        # TODO: Right now a sparse boolean array can be used with any
        #  fill_value. There was an attempt to allow only three values
        #  (True, False or nan), but plenty of tests failed.
        #  See pull 44955.
        # if self._is_boolean and not (
        #    is_bool(self._fill_value) or isna(self._fill_value)
        # ):
        #    raise ValueError(
        #        "fill_value must be True, False or nan "
        #        f"for boolean type. Got {self._fill_value} instead"
        #    )

    @property
    def _is_na_fill_value(self) -> bool:
        """Whether the fill value is considered missing by ``isna``."""
        return isna(self.fill_value)

    @property
    def _is_numeric(self) -> bool:
        """Whether the subtype is anything other than object dtype."""
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self) -> bool:
        """Whether the subtype is a boolean dtype."""
        return is_bool_dtype(self.subtype)

    @property
    def kind(self) -> str:
        """
        The ``kind`` of the subtype (delegates to ``self.subtype.kind``).
        """
        return self.subtype.kind

    @property
    def type(self):
        """The ``type`` of the subtype (delegates to ``self.subtype.type``)."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the stored (non-fill) values."""
        return self._dtype

    @property
    def name(self) -> str:
        """String form ``Sparse[<subtype name>, <repr of fill_value>]``."""
        return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"

    def __repr__(self) -> str:
        return self.name

    @classmethod
    def construct_array_type(cls) -> type_t[SparseArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        # Imported here rather than at module level; the module only imports
        # SparseArray under TYPE_CHECKING.
        from pandas.core.arrays.sparse.array import SparseArray

        return SparseArray

    @classmethod
    def construct_from_string(cls, string: str) -> SparseDtype:
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Can take the following forms.

            string           dtype
            ================ ============================
            'int'            SparseDtype[np.int64, 0]
            'Sparse'         SparseDtype[np.float64, nan]
            'Sparse[int]'    SparseDtype[np.int64, 0]
            'Sparse[int, 0]' SparseDtype[np.int64, 0]
            ================ ============================

            It is not possible to specify non-default fill values
            with a string. An argument like ``'Sparse[int, 1]'``
            will raise a ``TypeError`` because the default fill value
            for integers is 0.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If ``string`` is not a ``str``, does not start with ``"Sparse"``,
            cannot be parsed, or names a fill value whose round-tripped string
            form differs from ``string``.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        msg = f"Cannot construct a 'SparseDtype' from '{string}'"
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
            except ValueError as err:
                raise TypeError(msg) from err
            else:
                result = SparseDtype(sub_type)
                msg = (
                    f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
                    "looks like the fill_value in the string is not "
                    "the default for the dtype. Non-default fill_values "
                    "are not supported. Use the 'SparseDtype()' "
                    "constructor instead."
                )
                # The dtype was built with the default fill value, so a
                # mismatch with the input string means the string asked for a
                # different one.
                if has_fill_value and str(result) != string:
                    raise TypeError(msg)
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype: str) -> tuple[str, bool]:
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
            The text captured for the subtype; ``"float64"`` when ``dtype``
            is exactly ``"Sparse"``.
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            groups = m.groupdict()
            subtype = groups["subtype"]
            has_fill_value = bool(groups["fill_value"])
        elif dtype == "Sparse":
            subtype = "float64"
        else:
            raise ValueError(f"Cannot parse {dtype}")
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Check whether ``dtype`` (or its ``.dtype`` attribute) is accepted here.

        A string starting with ``"Sparse"`` is parsed and its subtype is
        converted with ``np.dtype``; an instance of this class returns True.
        Otherwise the result is whether the value is a ``np.dtype`` or equals
        ``"Sparse"``.

        Raises
        ------
        ValueError
            Propagated from ``_parse_subtype`` for a string that starts with
            ``"Sparse"`` but cannot be parsed.
        """
        dtype = getattr(dtype, "dtype", dtype)
        if isinstance(dtype, str) and dtype.startswith("Sparse"):
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        return isinstance(dtype, np.dtype) or dtype == "Sparse"

    def update_dtype(self, dtype) -> SparseDtype:
        """
        Convert the SparseDtype to a new dtype.

        This takes care of converting the ``fill_value``.

        Parameters
        ----------
        dtype : Union[str, numpy.dtype, SparseDtype]
            The new dtype to use.

            * For a SparseDtype, it is simply returned
            * For a NumPy dtype (or str), the current fill value
              is converted to the new dtype, and a SparseDtype
              with `dtype` and the new fill value is returned.

        Returns
        -------
        SparseDtype
            A new SparseDtype with the correct `dtype` and fill value
            for that `dtype`.

        Raises
        ------
        ValueError
            When the current fill value cannot be converted to the
            new `dtype` (e.g. trying to convert ``np.nan`` to an
            integer dtype).
        TypeError
            When `dtype` resolves to something that is neither a
            SparseDtype nor a NumPy dtype.

        Examples
        --------
        >>> SparseDtype(int, 0).update_dtype(float)
        Sparse[float64, 0.0]

        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
        Sparse[float64, nan]
        """
        cls = type(self)
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, cls):
            if not isinstance(dtype, np.dtype):
                raise TypeError("sparse arrays of extension dtypes not supported")

            # presumably astype_nansafe hands back an array with at least one
            # dimension, since the result is indexed with [0] — verify.
            fvarr = astype_nansafe(np.array(self.fill_value), dtype)
            # NB: not fv_0d.item(), as that casts dt64->int
            fill_value = fvarr[0]
            dtype = cls(dtype, fill_value=fill_value)

        return dtype

    @property
    def _subtype_with_str(self):
        """
        Whether the SparseDtype's subtype should be considered ``str``.

        Typically, pandas will store string data in an object-dtype array.
        When converting values to a dtype, e.g. in ``.astype``, we need to
        be more specific, we need the actual underlying type.

        Returns
        -------
        type or dtype
            ``type(self.fill_value)`` when the fill value is a ``str``,
            otherwise ``self.subtype``.

        Examples
        --------
        >>> SparseDtype(int, 1)._subtype_with_str
        dtype('int64')

        >>> SparseDtype(object, 1)._subtype_with_str
        dtype('O')

        >>> dtype = SparseDtype(str, '')
        >>> dtype.subtype
        dtype('O')

        >>> dtype._subtype_with_str
        <class 'str'>
        """
        if isinstance(self.fill_value, str):
            return type(self.fill_value)
        return self.subtype

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        """
        Return a SparseDtype common to ``dtypes``, or None if unsupported.

        None is returned when any entry is an ExtensionDtype other than
        SparseDtype. Otherwise the first sparse fill value is kept, and a
        ``PerformanceWarning`` is emitted when the sparse fill values differ
        (unless all of them are NA).
        """
        # TODO for now only handle SparseDtypes and numpy dtypes => extend
        # with other compatible extension dtypes
        if any(
            isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
            for x in dtypes
        ):
            return None

        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
        fill_value = fill_values[0]

        # np.nan isn't a singleton, so we may end up with multiple
        # NaNs here, so we ignore the all NA case too.
        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
            warnings.warn(
                "Concatenating sparse arrays with multiple fill "
                f"values: '{fill_values}'. Picking the first and "
                "converting the rest.",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

        np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
        # NOTE(review): np.find_common_type is reportedly deprecated in newer
        # NumPy releases (1.25+) — confirm before upgrading NumPy.
        return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)

415 np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] 

416 return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)