Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/sparse/dtype.py: 28%
139 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""Sparse Dtype"""
2from __future__ import annotations
4import re
5from typing import (
6 TYPE_CHECKING,
7 Any,
8)
9import warnings
11import numpy as np
13from pandas._typing import (
14 Dtype,
15 DtypeObj,
16 type_t,
17)
18from pandas.errors import PerformanceWarning
19from pandas.util._exceptions import find_stack_level
21from pandas.core.dtypes.astype import astype_nansafe
22from pandas.core.dtypes.base import (
23 ExtensionDtype,
24 register_extension_dtype,
25)
26from pandas.core.dtypes.common import (
27 is_bool_dtype,
28 is_object_dtype,
29 is_scalar,
30 is_string_dtype,
31 pandas_dtype,
32)
33from pandas.core.dtypes.missing import (
34 isna,
35 na_value_for_dtype,
36)
38if TYPE_CHECKING: 38 ↛ 39line 38 didn't jump to line 39, because the condition on line 38 was never true
39 from pandas.core.arrays.sparse.array import SparseArray
@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill value values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")

    def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
        """
        Normalize `dtype` and `fill_value` and store them on the instance.

        Raises
        ------
        ValueError
            If the resulting fill value is not a scalar.
        """
        # Passing an existing SparseDtype unwraps it: its subtype is used and,
        # unless overridden, so is its fill value.
        if isinstance(dtype, type(self)):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        # String dtypes are stored as object.
        if is_string_dtype(dtype):
            dtype = np.dtype("object")

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        self._dtype = dtype
        self._fill_value = fill_value
        self._check_fill_value()

    def __hash__(self) -> int:
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super().__hash__()

    def __eq__(self, other: Any) -> bool:
        """
        Compare with another SparseDtype, or with a string that parses as one.

        Two SparseDtypes are equal when their subtypes are equal and their
        fill values match. NA fill values match each other only when one is
        an instance of the other's type (so ``float('nan')`` matches
        ``np.nan`` but not ``pd.NaT``).
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, str):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if isinstance(other, type(self)):
            subtype = self.subtype == other.subtype
            if self._is_na_fill_value:
                # this case is complicated by two things:
                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
                # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
                # i.e. we want to treat any floating-point NaN as equal, but
                # not a floating-point NaN and a datetime NaT.
                #
                # The parentheses matter: ``and`` binds tighter than ``or``,
                # so without them a non-NA ``other`` fill value of a
                # compatible type (e.g. 0.0 vs nan) would compare equal.
                fill_value = other._is_na_fill_value and (
                    isinstance(self.fill_value, type(other.fill_value))
                    or isinstance(other.fill_value, type(self.fill_value))
                )
            else:
                fill_value = self.fill_value == other.fill_value

            return subtype and fill_value
        return False

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    def _check_fill_value(self):
        """
        Validate the stored fill value.

        Raises
        ------
        ValueError
            If the fill value is not a scalar.
        """
        if not is_scalar(self._fill_value):
            raise ValueError(
                f"fill_value must be a scalar. Got {self._fill_value} instead"
            )
        # TODO: Right now a boolean sparse array accepts any fill_value.
        #  An attempt to restrict it to True, False or nan was made, but many
        #  tests failed; see pull 44955. The attempted check was:
        # if self._is_boolean and not (
        #    is_bool(self._fill_value) or isna(self._fill_value)
        # ):
        #    raise ValueError(
        #        "fill_value must be True, False or nan "
        #        f"for boolean type. Got {self._fill_value} instead"
        #    )

    @property
    def _is_na_fill_value(self) -> bool:
        """Whether the fill value is considered missing by ``isna``."""
        return isna(self.fill_value)

    @property
    def _is_numeric(self) -> bool:
        """Whether the subtype is anything other than object dtype."""
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self) -> bool:
        """Whether the subtype is a boolean dtype."""
        return is_bool_dtype(self.subtype)

    @property
    def kind(self) -> str:
        """
        The ``kind`` attribute of the subtype.

        For a NumPy subtype this is the dtype's one-character kind code
        (for example ``'f'`` for floats or ``'i'`` for signed integers).
        """
        return self.subtype.kind

    @property
    def type(self):
        """The ``type`` attribute of the subtype."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the stored (non-fill) values."""
        return self._dtype

    @property
    def name(self) -> str:
        """String form ``Sparse[<subtype name>, <repr of fill value>]``."""
        return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"

    def __repr__(self) -> str:
        return self.name

    @classmethod
    def construct_array_type(cls) -> type_t[SparseArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        # Imported here rather than at module level; presumably to avoid a
        # circular import with the array module.
        from pandas.core.arrays.sparse.array import SparseArray

        return SparseArray

    @classmethod
    def construct_from_string(cls, string: str) -> SparseDtype:
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Must start with ``'Sparse'``. Can take the following forms.

            ================== ============================
            string             dtype
            ================== ============================
            'Sparse'           SparseDtype[np.float64, nan]
            'Sparse[int]'      SparseDtype[np.int64, 0]
            'Sparse[int64, 0]' SparseDtype[np.int64, 0]
            ================== ============================

            It is not possible to specify non-default fill values
            with a string. When a fill value is present, the whole string
            must equal ``str()`` of the dtype built from the subtype alone,
            so an argument like ``'Sparse[int, 1]'`` will raise a
            ``TypeError`` because the default fill value for integers is 0.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If `string` is not a ``str``, does not start with ``'Sparse'``,
            cannot be parsed, or names a non-default fill value.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        msg = f"Cannot construct a 'SparseDtype' from '{string}'"
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
            except ValueError as err:
                raise TypeError(msg) from err
            else:
                result = SparseDtype(sub_type)
                msg = (
                    f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
                    "looks like the fill_value in the string is not "
                    "the default for the dtype. Non-default fill_values "
                    "are not supported. Use the 'SparseDtype()' "
                    "constructor instead."
                )
                # A fill value in the string is only accepted when the string
                # round-trips exactly to the default dtype's string form.
                if has_fill_value and str(result) != string:
                    raise TypeError(msg)
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype: str) -> tuple[str, bool]:
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
            The text captured as the subtype; ``"float64"`` for the bare
            string ``"Sparse"``.
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            subtype = m.groupdict()["subtype"]
            has_fill_value = bool(m.groupdict()["fill_value"])
        elif dtype == "Sparse":
            subtype = "float64"
        else:
            raise ValueError(f"Cannot parse {dtype}")
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Check whether `dtype` is, or describes, a dtype accepted here.

        `dtype` is first replaced by its own ``dtype`` attribute when it has
        one. Strings starting with ``'Sparse'`` are parsed and their subtype
        converted with ``np.dtype``; instances of this class return True.
        """
        dtype = getattr(dtype, "dtype", dtype)
        if isinstance(dtype, str) and dtype.startswith("Sparse"):
            # NOTE(review): parse/conversion errors propagate to the caller.
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        return isinstance(dtype, np.dtype) or dtype == "Sparse"

    def update_dtype(self, dtype) -> SparseDtype:
        """
        Convert the SparseDtype to a new dtype.

        This takes care of converting the ``fill_value``.

        Parameters
        ----------
        dtype : Union[str, numpy.dtype, SparseDtype]
            The new dtype to use.

            * For a SparseDtype, it is simply returned
            * For a NumPy dtype (or str), the current fill value
              is converted to the new dtype, and a SparseDtype
              with `dtype` and the new fill value is returned.

        Returns
        -------
        SparseDtype
            A new SparseDtype with the correct `dtype` and fill value
            for that `dtype`.

        Raises
        ------
        ValueError
            When the current fill value cannot be converted to the
            new `dtype` (e.g. trying to convert ``np.nan`` to an
            integer dtype).
        TypeError
            When `dtype` resolves to a non-sparse extension dtype.

        Examples
        --------
        >>> SparseDtype(int, 0).update_dtype(float)
        Sparse[float64, 0.0]

        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
        Sparse[float64, nan]
        """
        cls = type(self)
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, cls):
            if not isinstance(dtype, np.dtype):
                raise TypeError("sparse arrays of extension dtypes not supported")

            fvarr = astype_nansafe(np.array(self.fill_value), dtype)
            # NB: not fv_0d.item(), as that casts dt64->int
            # NOTE(review): indexing with [0] assumes astype_nansafe hands
            # back an array with at least one dimension -- confirm.
            fill_value = fvarr[0]
            dtype = cls(dtype, fill_value=fill_value)

        return dtype

    @property
    def _subtype_with_str(self):
        """
        Whether the SparseDtype's subtype should be considered ``str``.

        Typically, pandas will store string data in an object-dtype array.
        When converting values to a dtype, e.g. in ``.astype``, we need to
        be more specific, we need the actual underlying type.

        Returns
        -------
        type or dtype
            The type of the fill value when the fill value is a ``str``,
            otherwise the subtype.

        Examples
        --------
        >>> SparseDtype(int, 1)._subtype_with_str
        dtype('int64')

        >>> SparseDtype(object, 1)._subtype_with_str
        dtype('O')

        >>> dtype = SparseDtype(str, '')
        >>> dtype.subtype
        dtype('O')

        >>> dtype._subtype_with_str
        <class 'str'>
        """
        if isinstance(self.fill_value, str):
            return type(self.fill_value)
        return self.subtype

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        """
        Find a SparseDtype that all of `dtypes` can be combined into.

        Returns None when any entry is a non-sparse extension dtype.
        Otherwise the result uses the common NumPy type of the subtypes and
        the fill value of the first SparseDtype in `dtypes`; a
        PerformanceWarning is emitted when the fill values differ and are
        not all NA.
        """
        # TODO for now only handle SparseDtypes and numpy dtypes => extend
        # with other compatible extension dtypes
        if any(
            isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
            for x in dtypes
        ):
            return None

        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
        fill_value = fill_values[0]

        # np.nan isn't a singleton, so we may end up with multiple
        # NaNs here, so we ignore the all NA case too.
        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
            warnings.warn(
                "Concatenating sparse arrays with multiple fill "
                f"values: '{fill_values}'. Picking the first and "
                "converting the rest.",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

        np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
        return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)