Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/string_.py: 20%
241 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from typing import TYPE_CHECKING
5import numpy as np
7from pandas._config import get_option
9from pandas._libs import (
10 lib,
11 missing as libmissing,
12)
13from pandas._libs.arrays import NDArrayBacked
14from pandas._typing import (
15 Dtype,
16 Scalar,
17 npt,
18 type_t,
19)
20from pandas.compat import pa_version_under1p01
21from pandas.compat.numpy import function as nv
23from pandas.core.dtypes.base import (
24 ExtensionDtype,
25 StorageExtensionDtype,
26 register_extension_dtype,
27)
28from pandas.core.dtypes.common import (
29 is_array_like,
30 is_bool_dtype,
31 is_dtype_equal,
32 is_integer_dtype,
33 is_object_dtype,
34 is_string_dtype,
35 pandas_dtype,
36)
38from pandas.core import ops
39from pandas.core.array_algos import masked_reductions
40from pandas.core.arrays import (
41 ExtensionArray,
42 FloatingArray,
43 IntegerArray,
44)
45from pandas.core.arrays.floating import FloatingDtype
46from pandas.core.arrays.integer import IntegerDtype
47from pandas.core.arrays.numpy_ import PandasArray
48from pandas.core.construction import extract_array
49from pandas.core.indexers import check_array_indexer
50from pandas.core.missing import isna
52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true
53 import pyarrow
55 from pandas import Series
@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow"}, optional
        Backend that holds the strings. When omitted, the value of
        ``pd.options.mode.string_storage`` is used.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    name = "string"
    _metadata = ("storage",)

    @property
    def na_value(self) -> libmissing.NAType:
        """Missing-value marker for this dtype: always ``pandas.NA``."""
        return libmissing.NA

    def __init__(self, storage=None) -> None:
        # Fall back to the globally configured storage when none is given.
        storage = get_option("mode.string_storage") if storage is None else storage

        if storage not in {"python", "pyarrow"}:
            raise ValueError(
                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
            )

        pyarrow_requested = storage == "pyarrow"
        if pyarrow_requested and pa_version_under1p01:
            raise ImportError(
                "pyarrow>=1.0.0 is required for PyArrow backed StringArray."
            )

        self.storage = storage

    @property
    def type(self) -> type[str]:
        """Scalar type of the elements: ``str``."""
        return str

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The name of the type. The storage type is taken from `string`.
            Valid options and their storage types are

            ========================== ==============================================
            string                     result storage
            ========================== ==============================================
            ``'string'``               pd.options.mode.string_storage, default python
            ``'string[python]'``       python
            ``'string[pyarrow]'``      pyarrow
            ========================== ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If `string` is not a ``str`` or is not one of the valid options.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        # Guard clauses, one per accepted spelling.
        if string == "string":
            return cls()
        if string == "string[python]":
            return cls(storage="python")
        if string == "string[pyarrow]":
            return cls(storage="pyarrow")

        raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
            ``StringArray`` for ``"python"`` storage, ``ArrowStringArray``
            otherwise.
        """
        from pandas.core.arrays.string_arrow import ArrowStringArray

        return StringArray if self.storage == "python" else ArrowStringArray

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.

        With ``"pyarrow"`` storage the input is wrapped in an
        ``ArrowStringArray``. Otherwise every chunk is converted with
        ``StringArray._from_sequence`` and the pieces are concatenated.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)

        import pyarrow

        # A plain Array is handled as a single chunk; anything else is
        # treated as a pyarrow.ChunkedArray.
        chunks = [array] if isinstance(array, pyarrow.Array) else array.chunks

        # using _from_sequence to ensure None is converted to NA
        converted = [StringArray._from_sequence(np.array(chunk)) for chunk in chunks]

        if not converted:
            return StringArray(np.array([], dtype="object"))
        return StringArray._concat_same_type(converted)
class BaseStringArray(ExtensionArray):
    """
    Mixin class shared by StringArray and ArrowStringArray.

    Adds no behavior of its own on top of ``ExtensionArray``.
    """
class StringArray(BaseStringArray, PandasArray):
    """
    Extension array for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes(``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    array
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <PandasArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy=False) -> None:
        data = extract_array(values)

        super().__init__(data, copy=copy)

        # Input that already is an instance of this class is not re-validated.
        already_string_array = isinstance(data, type(self))
        if not already_string_array:
            self._validate()

        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """
        Validate that we only store NA or strings.

        Raises
        ------
        ValueError
            If a non-empty backing array fails ``lib.is_string_array`` or the
            backing array is not object dtype.
        """
        backing = self._ndarray

        if len(backing):
            if not lib.is_string_array(backing, skipna=True):
                raise ValueError(
                    "StringArray requires a sequence of strings or pandas.NA"
                )

        if backing.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )

        # Check to see if need to convert Na values to pd.NA.
        # Ravel if ndims > 2 b/c no cythonized version available.
        target = backing.ravel("K") if backing.ndim > 2 else backing
        lib.convert_nans_to_NA(target)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
        """
        Build a StringArray from a sequence of scalars.

        Non-NA elements are converted to ``str`` and nan-likes to
        ``pandas.NA`` via ``lib.ensure_string_array``.
        """
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype)
            assert dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if not isinstance(scalars, BaseMaskedArray):
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            strings = lib.ensure_string_array(
                scalars, na_value=libmissing.NA, copy=copy
            )
        else:
            # avoid costly conversion to object dtype
            na_positions = scalars._mask
            strings = lib.ensure_string_array(
                scalars._data, copy=copy, convert_na_value=False
            )
            strings[na_positions] = libmissing.NA

        # Manually creating new array avoids the validation step in the __init__,
        # so is faster. Refactor need for validation?
        instance = cls.__new__(cls)
        NDArrayBacked.__init__(instance, strings, StringDtype(storage="python"))
        return instance

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy=False
    ):
        """Delegate to ``_from_sequence`` with the same arguments."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        """Create an array of ``shape`` filled with ``pandas.NA``, cast to ``dtype``."""
        filled = np.empty(shape, dtype=object)
        filled[:] = libmissing.NA
        return cls(filled).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        # Work on a copy so the backing array keeps its NA markers.
        data = self._ndarray.copy()
        data[self.isna()] = None
        return pa.array(data, type=type, from_pandas=True)

    def _values_for_factorize(self):
        """Return a copy of the data with missing entries set to ``None``, and ``None``."""
        missing = self.isna()
        data = self._ndarray.copy()
        data[missing] = None
        return data, None

    def __setitem__(self, key, value):
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        key_is_scalar = lib.is_scalar(key)
        value_is_scalar = lib.is_scalar(value)
        if key_is_scalar and not value_is_scalar:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if not value_is_scalar:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise ValueError("Must provide strings.")

            value[isna(value)] = libmissing.NA
        elif isna(value):
            value = libmissing.NA
        elif not isinstance(value, str):
            raise ValueError(
                f"Cannot set non-string value '{value}' into a StringArray."
            )

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        """
        Set ``value`` where ``mask`` is True.

        The super() method NDArrayBackedExtensionArray._putmask uses
        np.putmask which doesn't properly handle None/pd.NA, so the base
        class implementation that uses __setitem__ is called instead.
        """
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        """
        Cast to ``dtype``.

        Same-dtype, masked integer, masked floating, other extension dtypes
        and NumPy floating dtypes are handled here; everything else is
        passed to the parent implementation.
        """
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            return self.copy() if copy else self

        if isinstance(dtype, IntegerDtype):
            # Fill missing slots with a placeholder; the mask marks them as NA.
            raw = self._ndarray.copy()
            missing = self.isna()
            raw[missing] = 0
            converted = raw.astype(dtype.numpy_dtype)
            return IntegerArray(converted, missing, copy=False)

        if isinstance(dtype, FloatingDtype):
            # Placeholder is the string "0" because the copy is a StringArray.
            duplicate = self.copy()
            missing = self.isna()
            duplicate[missing] = "0"
            converted = duplicate.astype(dtype.numpy_dtype)
            return FloatingArray(converted, missing, copy=False)

        if isinstance(dtype, ExtensionDtype):
            return super().astype(dtype, copy=copy)

        if np.issubdtype(dtype, np.floating):
            # Convert with a placeholder, then restore missing slots as NaN.
            raw = self._ndarray.copy()
            missing = self.isna()
            raw[missing] = 0
            converted = raw.astype(dtype)
            converted[missing] = np.nan
            return converted

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: int | None = 0, **kwargs
    ):
        """
        Dispatch a named reduction; only ``"min"`` and ``"max"`` are supported.

        Raises
        ------
        TypeError
            For any other reduction name.
        """
        if name not in ("min", "max"):
            raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

        reducer = getattr(self, name)
        return reducer(skipna=skipna, axis=axis)

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the minimum, computed with ``masked_reductions.min``."""
        nv.validate_min((), kwargs)
        smallest = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, smallest)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the maximum, computed with ``masked_reductions.max``."""
        nv.validate_max((), kwargs)
        largest = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, largest)

    def value_counts(self, dropna: bool = True) -> Series:
        """Count occurrences of each value as ``Int64``, indexed by this dtype."""
        from pandas import value_counts

        counts = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        counts.index = counts.index.astype(self.dtype)
        return counts

    def memory_usage(self, deep: bool = False) -> int:
        """
        Return the bytes used by the backing array.

        With ``deep=True`` the result of ``lib.memory_usage_of_objects`` on
        the backing array is added.
        """
        shallow = self._ndarray.nbytes
        if not deep:
            return shallow
        return shallow + lib.memory_usage_of_objects(self._ndarray)

    def _cmp_method(self, other, op):
        """
        Apply the binary operator ``op`` elementwise against ``other``.

        Positions missing in either operand are masked out. Operators named
        in ``ops.ARITHMETIC_BINOPS`` give a StringArray; all others give a
        BooleanArray.
        """
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)[valid]

        lhs = self._ndarray

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            combined = np.empty_like(lhs, dtype="object")
            combined[mask] = libmissing.NA
            combined[valid] = op(lhs[valid], other)
            return StringArray(combined)

        # logical
        flags = np.zeros(len(lhs), dtype="bool")
        flags[valid] = op(lhs[valid], other)
        return BooleanArray(flags, mask)

    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "PandasArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        """
        Map ``f`` over the non-missing elements.

        The container of the result depends on ``dtype``: IntegerArray or
        BooleanArray for integer/bool dtypes, StringArray for string dtypes,
        and whatever ``lib.map_infer_mask`` returns otherwise.
        """
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        data = np.asarray(self)

        wants_integer = is_integer_dtype(dtype)
        if wants_integer or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray] = (
                IntegerArray if wants_integer else BooleanArray
            )

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                # Placeholder for masked slots; the mask keeps them missing.
                na_value = 1
            mapped = lib.map_infer_mask(
                data,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                # A concrete na_value was filled in, so nothing stays masked.
                mask[:] = False

            return constructor(mapped, mask)

        if is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            mapped = lib.map_infer_mask(
                data, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(mapped)

        # This is when the result type is object. We reach this when
        # -> We know the result type is truly object (e.g. .encode returns bytes
        #    or .findall returns a list).
        # -> We don't know the result type. E.g. `.get` can return anything.
        return lib.map_infer_mask(data, f, mask.view("uint8"))