Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/sparse/dtype.py: 28%

1"""Sparse Dtype"""

2from __future__ import annotations

4import re

5from typing import (

6 TYPE_CHECKING,

7 Any,

9import warnings

11import numpy as np

13from pandas._typing import (

14 Dtype,

15 DtypeObj,

16 type_t,

17)

18from pandas.errors import PerformanceWarning

19from pandas.util._exceptions import find_stack_level

21from pandas.core.dtypes.astype import astype_nansafe

22from pandas.core.dtypes.base import (

23 ExtensionDtype,

24 register_extension_dtype,

25)

26from pandas.core.dtypes.common import (

27 is_bool_dtype,

28 is_object_dtype,

29 is_scalar,

30 is_string_dtype,

31 pandas_dtype,

32)

33from pandas.core.dtypes.missing import (

34 isna,

35 na_value_for_dtype,

36)

38if TYPE_CHECKING: 38 ↛ 39line 38 didn't jump to line 39, because the condition on line 38 was never true

39 from pandas.core.arrays.sparse.array import SparseArray

42@register_extension_dtype

43class SparseDtype(ExtensionDtype):

44 """

45 Dtype for data stored in :class:`SparseArray`.

47 This dtype implements the pandas ExtensionDtype interface.

49 Parameters

50 ----------

51 dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64

52 The dtype of the underlying array storing the non-fill value values.

53 fill_value : scalar, optional

54 The scalar value not stored in the SparseArray. By default, this

55 depends on `dtype`.

57 =========== ==========

58 dtype na_value

59 =========== ==========

60 float ``np.nan``

61 int ``0``

62 bool ``False``

63 datetime64 ``pd.NaT``

64 timedelta64 ``pd.NaT``

65 =========== ==========

67 The default value may be overridden by specifying a `fill_value`.

69 Attributes

70 ----------

71 None

73 Methods

74 -------

75 None

76 """

78 # We include `_is_na_fill_value` in the metadata to avoid hash collisions

79 # between SparseDtype(float, 0.0) and SparseDtype(float, nan).

80 # Without is_na_fill_value in the comparison, those would be equal since

81 # hash(nan) is (sometimes?) 0.

82 _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")

84 def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:

86 if isinstance(dtype, type(self)):

87 if fill_value is None:

88 fill_value = dtype.fill_value

89 dtype = dtype.subtype

91 dtype = pandas_dtype(dtype)

92 if is_string_dtype(dtype):

93 dtype = np.dtype("object")

95 if fill_value is None:

96 fill_value = na_value_for_dtype(dtype)

98 self._dtype = dtype

99 self._fill_value = fill_value

100 self._check_fill_value()

101

102 def __hash__(self) -> int:

103 # Python3 doesn't inherit __hash__ when a base class overrides

104 # __eq__, so we explicitly do it here.

105 return super().__hash__()

106

107 def __eq__(self, other: Any) -> bool:

108 # We have to override __eq__ to handle NA values in _metadata.

109 # The base class does simple == checks, which fail for NA.

110 if isinstance(other, str):

111 try:

112 other = self.construct_from_string(other)

113 except TypeError:

114 return False

115

116 if isinstance(other, type(self)):

117 subtype = self.subtype == other.subtype

118 if self._is_na_fill_value:

119 # this case is complicated by two things:

120 # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)

121 # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)

122 # i.e. we want to treat any floating-point NaN as equal, but

123 # not a floating-point NaN and a datetime NaT.

124 fill_value = (

125 other._is_na_fill_value

126 and isinstance(self.fill_value, type(other.fill_value))

127 or isinstance(other.fill_value, type(self.fill_value))

128 )

129 else:

130 fill_value = self.fill_value == other.fill_value

131

132 return subtype and fill_value

133 return False

134

135 @property

136 def fill_value(self):

137 """

138 The fill value of the array.

139

140 Converting the SparseArray to a dense ndarray will fill the

141 array with this value.

142

143 .. warning::

144

145 It's possible to end up with a SparseArray that has ``fill_value``

146 values in ``sp_values``. This can occur, for example, when setting

147 ``SparseArray.fill_value`` directly.

148 """

149 return self._fill_value

150

151 def _check_fill_value(self):

152 if not is_scalar(self._fill_value):

153 raise ValueError(

154 f"fill_value must be a scalar. Got {self._fill_value} instead"

155 )

156 # TODO: Right now we can use Sparse boolean array

157 # with any fill_value. Here was an attempt

158 # to allow only 3 value: True, False or nan

159 # but plenty test has failed.

160 # see pull 44955

161 # if self._is_boolean and not (

162 # is_bool(self._fill_value) or isna(self._fill_value)

163 # ):

164 # raise ValueError(

165 # "fill_value must be True, False or nan "

166 # f"for boolean type. Got {self._fill_value} instead"

167 # )

168

169 @property

170 def _is_na_fill_value(self) -> bool:

171 return isna(self.fill_value)

172

173 @property

174 def _is_numeric(self) -> bool:

175 return not is_object_dtype(self.subtype)

176

177 @property

178 def _is_boolean(self) -> bool:

179 return is_bool_dtype(self.subtype)

180

181 @property

182 def kind(self) -> str:

183 """

184 The sparse kind. Either 'integer', or 'block'.

185 """

186 return self.subtype.kind

187

188 @property

189 def type(self):

190 return self.subtype.type

191

192 @property

193 def subtype(self):

194 return self._dtype

195

196 @property

197 def name(self) -> str:

198 return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"

199

200 def __repr__(self) -> str:

201 return self.name

202

203 @classmethod

204 def construct_array_type(cls) -> type_t[SparseArray]:

205 """

206 Return the array type associated with this dtype.

207

208 Returns

209 -------

210 type

211 """

212 from pandas.core.arrays.sparse.array import SparseArray

213

214 return SparseArray

215

216 @classmethod

217 def construct_from_string(cls, string: str) -> SparseDtype:

218 """

219 Construct a SparseDtype from a string form.

220

221 Parameters

222 ----------

223 string : str

224 Can take the following forms.

225

226 string dtype

227 ================ ============================

228 'int' SparseDtype[np.int64, 0]

229 'Sparse' SparseDtype[np.float64, nan]

230 'Sparse[int]' SparseDtype[np.int64, 0]

231 'Sparse[int, 0]' SparseDtype[np.int64, 0]

232 ================ ============================

233

234 It is not possible to specify non-default fill values

235 with a string. An argument like ``'Sparse[int, 1]'``

236 will raise a ``TypeError`` because the default fill value

237 for integers is 0.

238

239 Returns

240 -------

241 SparseDtype

242 """

243 if not isinstance(string, str):

244 raise TypeError(

245 f"'construct_from_string' expects a string, got {type(string)}"

246 )

247 msg = f"Cannot construct a 'SparseDtype' from '{string}'"

248 if string.startswith("Sparse"):

249 try:

250 sub_type, has_fill_value = cls._parse_subtype(string)

251 except ValueError as err:

252 raise TypeError(msg) from err

253 else:

254 result = SparseDtype(sub_type)

255 msg = (

256 f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "

257 "looks like the fill_value in the string is not "

258 "the default for the dtype. Non-default fill_values "

259 "are not supported. Use the 'SparseDtype()' "

260 "constructor instead."

261 )

262 if has_fill_value and str(result) != string:

263 raise TypeError(msg)

264 return result

265 else:

266 raise TypeError(msg)

267

268 @staticmethod

269 def _parse_subtype(dtype: str) -> tuple[str, bool]:

270 """

271 Parse a string to get the subtype

272

273 Parameters

274 ----------

275 dtype : str

276 A string like

277

278 * Sparse[subtype]

279 * Sparse[subtype, fill_value]

280

281 Returns

282 -------

283 subtype : str

284

285 Raises

286 ------

287 ValueError

288 When the subtype cannot be extracted.

289 """

290 xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")

291 m = xpr.match(dtype)

292 has_fill_value = False

293 if m:

294 subtype = m.groupdict()["subtype"]

295 has_fill_value = bool(m.groupdict()["fill_value"])

296 elif dtype == "Sparse":

297 subtype = "float64"

298 else:

299 raise ValueError(f"Cannot parse {dtype}")

300 return subtype, has_fill_value

301

302 @classmethod

303 def is_dtype(cls, dtype: object) -> bool:

304 dtype = getattr(dtype, "dtype", dtype)

305 if isinstance(dtype, str) and dtype.startswith("Sparse"):

306 sub_type, _ = cls._parse_subtype(dtype)

307 dtype = np.dtype(sub_type)

308 elif isinstance(dtype, cls):

309 return True

310 return isinstance(dtype, np.dtype) or dtype == "Sparse"

311

312 def update_dtype(self, dtype) -> SparseDtype:

313 """

314 Convert the SparseDtype to a new dtype.

315

316 This takes care of converting the ``fill_value``.

317

318 Parameters

319 ----------

320 dtype : Union[str, numpy.dtype, SparseDtype]

321 The new dtype to use.

322

323 * For a SparseDtype, it is simply returned

324 * For a NumPy dtype (or str), the current fill value

325 is converted to the new dtype, and a SparseDtype

326 with `dtype` and the new fill value is returned.

327

328 Returns

329 -------

330 SparseDtype

331 A new SparseDtype with the correct `dtype` and fill value

332 for that `dtype`.

333

334 Raises

335 ------

336 ValueError

337 When the current fill value cannot be converted to the

338 new `dtype` (e.g. trying to convert ``np.nan`` to an

339 integer dtype).

340

341

342 Examples

343 --------

344 >>> SparseDtype(int, 0).update_dtype(float)

345 Sparse[float64, 0.0]

346

347 >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))

348 Sparse[float64, nan]

349 """

350 cls = type(self)

351 dtype = pandas_dtype(dtype)

352

353 if not isinstance(dtype, cls):

354 if not isinstance(dtype, np.dtype):

355 raise TypeError("sparse arrays of extension dtypes not supported")

356

357 fvarr = astype_nansafe(np.array(self.fill_value), dtype)

358 # NB: not fv_0d.item(), as that casts dt64->int

359 fill_value = fvarr[0]

360 dtype = cls(dtype, fill_value=fill_value)

361

362 return dtype

363

364 @property

365 def _subtype_with_str(self):

366 """

367 Whether the SparseDtype's subtype should be considered ``str``.

368

369 Typically, pandas will store string data in an object-dtype array.

370 When converting values to a dtype, e.g. in ``.astype``, we need to

371 be more specific, we need the actual underlying type.

372

373 Returns

374 -------

375 >>> SparseDtype(int, 1)._subtype_with_str

376 dtype('int64')

377

378 >>> SparseDtype(object, 1)._subtype_with_str

379 dtype('O')

380

381 >>> dtype = SparseDtype(str, '')

382 >>> dtype.subtype

383 dtype('O')

384

385 >>> dtype._subtype_with_str

386 <class 'str'>

387 """

388 if isinstance(self.fill_value, str):

389 return type(self.fill_value)

390 return self.subtype

391

392 def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:

393 # TODO for now only handle SparseDtypes and numpy dtypes => extend

394 # with other compatible extension dtypes

395 if any(

396 isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)

397 for x in dtypes

398 ):

399 return None

400

401 fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]

402 fill_value = fill_values[0]

403

404 # np.nan isn't a singleton, so we may end up with multiple

405 # NaNs here, so we ignore the all NA case too.

406 if not (len(set(fill_values)) == 1 or isna(fill_values).all()):

407 warnings.warn(

408 "Concatenating sparse arrays with multiple fill "

409 f"values: '{fill_values}'. Picking the first and "

410 "converting the rest.",

411 PerformanceWarning,

412 stacklevel=find_stack_level(),

413 )

414

415 np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]

416 return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)