Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/string_arrow.py: 18%
240 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from collections.abc import Callable # noqa: PDF001
4import re
5from typing import Union
7import numpy as np
9from pandas._libs import (
10 lib,
11 missing as libmissing,
12)
13from pandas._typing import (
14 Dtype,
15 NpDtype,
16 Scalar,
17 npt,
18)
19from pandas.compat import (
20 pa_version_under1p01,
21 pa_version_under2p0,
22 pa_version_under3p0,
23 pa_version_under4p0,
24)
26from pandas.core.dtypes.common import (
27 is_bool_dtype,
28 is_dtype_equal,
29 is_integer_dtype,
30 is_object_dtype,
31 is_scalar,
32 is_string_dtype,
33 pandas_dtype,
34)
35from pandas.core.dtypes.missing import isna
37from pandas.core.arrays.arrow import ArrowExtensionArray
38from pandas.core.arrays.boolean import BooleanDtype
39from pandas.core.arrays.integer import Int64Dtype
40from pandas.core.arrays.numeric import NumericDtype
41from pandas.core.arrays.string_ import (
42 BaseStringArray,
43 StringDtype,
44)
45from pandas.core.strings.object_array import ObjectStringArrayMixin
47if not pa_version_under1p01: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true
48 import pyarrow as pa
49 import pyarrow.compute as pc
51 from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
53ArrowStringScalarOrNAT = Union[str, libmissing.NAType]
def _chk_pyarrow_available() -> None:
    """
    Guard that the installed pyarrow is recent enough.

    Raises
    ------
    ImportError
        If the ``pa_version_under1p01`` compatibility flag is set.
    """
    if not pa_version_under1p01:
        return
    raise ImportError(
        "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
    )
62# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
63# ObjectStringArrayMixin because we want to have the object-dtype based methods as
64# fallback for the ones that pyarrow doesn't yet support
67class ArrowStringArray(ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin):
68 """
69 Extension array for string data in a ``pyarrow.ChunkedArray``.
71 .. versionadded:: 1.2.0
73 .. warning::
75 ArrowStringArray is considered experimental. The implementation and
76 parts of the API may change without warning.
78 Parameters
79 ----------
80 values : pyarrow.Array or pyarrow.ChunkedArray
81 The array of data.
83 Attributes
84 ----------
85 None
87 Methods
88 -------
89 None
91 See Also
92 --------
93 array
94 The recommended function for creating an ArrowStringArray.
95 Series.str
96 The string methods are available on Series backed by
97 an ArrowStringArray.
99 Notes
100 -----
101 ArrowStringArray returns a BooleanArray for comparison methods.
103 Examples
104 --------
105 >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
106 <ArrowStringArray>
107 ['This is', 'some text', <NA>, 'data.']
108 Length: 4, dtype: string
109 """
111 # error: Incompatible types in assignment (expression has type "StringDtype",
112 # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
113 _dtype: StringDtype # type: ignore[assignment]
def __init__(self, values) -> None:
    """
    Wrap ``values`` and verify that the stored arrow data is of string type.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        Forwarded unchanged to the parent initializer.

    Raises
    ------
    ValueError
        If ``self._data.type`` is not a pyarrow string type.
    """
    super().__init__(values)
    # The dtype is fixed for this array class: string backed by pyarrow.
    self._dtype = StringDtype(storage="pyarrow")

    arrow_type = self._data.type
    if pa.types.is_string(arrow_type):
        return
    raise ValueError(
        "ArrowStringArray requires a PyArrow (chunked) array of string type"
    )
@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
    """
    Build an array from a sequence of scalars, converting non-NA values to str.

    Parameters
    ----------
    scalars : sequence or BaseMaskedArray
        Input values. Masked arrays take a dedicated path that reuses their
        mask instead of going through object dtype.
    dtype : Dtype, optional
        When given and not the plain ``"string"`` alias, it must resolve to a
        ``StringDtype`` whose storage is ``"pyarrow"``.
    copy : bool, default False
        Forwarded to ``lib.ensure_string_array``.
    """
    from pandas.core.arrays.masked import BaseMaskedArray

    _chk_pyarrow_available()

    is_plain_alias = isinstance(dtype, str) and dtype == "string"
    if dtype and not is_plain_alias:
        dtype = pandas_dtype(dtype)
        assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"

    if not isinstance(scalars, BaseMaskedArray):
        # convert non-na-likes to str
        strings = lib.ensure_string_array(scalars, copy=copy)
        arrow_array = pa.array(strings, type=pa.string(), from_pandas=True)
        return cls(arrow_array)

    # Masked input: avoid the costly conversion to object dtype in
    # ensure_string_array and numerical issues with Float32Dtype by
    # stringifying the raw data and passing the existing mask to pyarrow.
    null_mask = scalars._mask
    strings = lib.ensure_string_array(
        scalars._data, copy=copy, convert_na_value=False
    )
    arrow_array = pa.array(strings, mask=null_mask, type=pa.string())
    return cls(arrow_array)
@classmethod
def _from_sequence_of_strings(
    cls, strings, dtype: Dtype | None = None, copy: bool = False
):
    """
    Build an array from a sequence of strings.

    This is a thin alias: all arguments are handed to ``_from_sequence``.
    """
    options = {"dtype": dtype, "copy": copy}
    return cls._from_sequence(strings, **options)
@property
def dtype(self) -> StringDtype:  # type: ignore[override]
    """
    The ``StringDtype`` instance stored on this array ('string[pyarrow]').
    """
    stored_dtype = self._dtype
    return stored_dtype
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    NumPy array protocol hook used by ``np.asarray()``.

    Delegates to ``to_numpy`` with the requested ``dtype``.
    """
    as_ndarray = self.to_numpy(dtype=dtype)
    return as_ndarray
def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value=lib.no_default,
) -> np.ndarray:
    """
    Convert to a NumPy ndarray.

    Parameters
    ----------
    dtype : dtype-like, optional
        Passed to ``np.array`` when materializing the arrow data.
    copy : bool, default False
        Currently ignored.
    na_value : object, optional
        Value written at null positions. When omitted, the dtype's
        ``na_value`` is used, except for floating ``dtype`` requests where
        the converted array is returned untouched.
    """
    # TODO: copy argument is ignored
    converted = np.array(self._data, dtype=dtype)

    # Without nulls there is nothing to patch.
    if self._data.null_count <= 0:
        return converted

    if na_value is lib.no_default:
        if dtype and np.issubdtype(dtype, np.floating):
            return converted
        na_value = self._dtype.na_value

    converted[self.isna()] = na_value
    return converted
def insert(self, loc: int, item) -> ArrowStringArray:
    """
    Insert ``item`` at position ``loc`` after validating its type.

    Raises
    ------
    TypeError
        If ``item`` is neither a ``str`` nor the ``NA`` singleton.
    """
    if isinstance(item, str) or item is libmissing.NA:
        return super().insert(loc, item)
    raise TypeError("Scalar must be NA or str")
def _maybe_convert_setitem_value(self, value):
    """
    Maybe convert value to be pyarrow compatible.

    Scalars: NA-like values become ``None`` and strings pass through.
    Non-scalars: copied into an object ndarray whose NA-like entries are
    replaced by ``None``.

    Raises
    ------
    ValueError
        If a scalar, or any element of a non-scalar, is neither NA-like nor
        a ``str``.
    """
    if not is_scalar(value):
        converted = np.array(value, dtype=object, copy=True)
        converted[isna(converted)] = None
        has_invalid = any(
            not (elem is None or isinstance(elem, str)) for elem in converted
        )
        if has_invalid:
            raise ValueError("Scalar must be NA or str")
        return converted

    if isna(value):
        return None
    if not isinstance(value, str):
        raise ValueError("Scalar must be NA or str")
    return value
def isin(self, values) -> npt.NDArray[np.bool_]:
    """
    Return a boolean ndarray marking which elements are found in ``values``.

    Only candidates that pyarrow infers as string or null are kept for the
    lookup; everything else is dropped. Falls back to the parent
    implementation (with a performance warning) when ``pa_version_under2p0``
    is set.
    """
    if pa_version_under2p0:
        fallback_performancewarning(version="2")
        return super().isin(values)

    arrow_scalars = [pa.scalar(value, from_pandas=True) for value in values]
    accepted_types = (pa.string(), pa.null())
    value_set = []
    for arrow_scalar in arrow_scalars:
        if arrow_scalar.type in accepted_types:
            value_set.append(arrow_scalar.as_py())

    # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns
    # True for null values, so we short-circuit to return an all-False array.
    if len(value_set) == 0:
        return np.zeros(len(self), dtype=bool)

    # in pyarrow 2.0.0 skip_null is ignored but is a required keyword, while it
    # raises with unexpected keyword argument in pyarrow 3.0.0+
    kwargs = {"skip_null": True} if pa_version_under3p0 else {}

    matches = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
    # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert
    # nulls to False
    return np.array(matches, dtype=np.bool_)
def astype(self, dtype, copy: bool = True):
    """
    Cast to ``dtype``.

    * Same dtype: return a copy when ``copy`` is true, otherwise ``self``.
    * ``NumericDtype``: cast the arrow data to the matching arrow type and
      let the target dtype build its array via ``__from_arrow__``.
    * Anything else: defer to the parent implementation.
    """
    dtype = pandas_dtype(dtype)

    if is_dtype_equal(dtype, self.dtype):
        return self.copy() if copy else self

    if isinstance(dtype, NumericDtype):
        arrow_type = pa.from_numpy_dtype(dtype.numpy_dtype)
        casted = self._data.cast(arrow_type)
        return dtype.__from_arrow__(casted)

    return super().astype(dtype, copy=copy)
245 # ------------------------------------------------------------------------
246 # String methods interface
248 # error: Incompatible types in assignment (expression has type "NAType",
249 # base class "ObjectStringArrayMixin" defined the type as "float")
250 _str_na_value = libmissing.NA # type: ignore[assignment]
def _str_map(
    self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
    """
    Apply ``f`` elementwise, skipping missing values, and box the result.

    Parameters
    ----------
    f : callable
        Function applied to every non-missing element.
    na_value : object, optional
        Value used at missing positions; defaults to the dtype's ``na_value``.
    dtype : Dtype, optional
        Requested result dtype; defaults to this array's dtype. Integer and
        boolean dtypes yield masked arrays, string dtypes yield an array of
        this type, anything else yields whatever ``lib.map_infer_mask``
        returns.
    convert : bool, default True
        Accepted for interface compatibility; not used in this method.
    """
    # TODO: de-duplicate with StringArray method. This method is more or less
    # copy and paste.
    from pandas.arrays import (
        BooleanArray,
        IntegerArray,
    )

    if dtype is None:
        dtype = self.dtype
    if na_value is None:
        na_value = self.dtype.na_value

    mask = isna(self)
    arr = np.asarray(self)
    mask_u8 = mask.view("uint8")

    wants_integer = is_integer_dtype(dtype)
    if wants_integer or is_bool_dtype(dtype):
        constructor: type[IntegerArray] | type[BooleanArray]
        constructor = IntegerArray if wants_integer else BooleanArray

        na_value_is_na = isna(na_value)
        if na_value_is_na:
            # Placeholder for masked slots; the mask hides it in the result.
            na_value = 1
        mapped = lib.map_infer_mask(
            arr,
            f,
            mask_u8,
            convert=False,
            na_value=na_value,
            # error: Argument 1 to "dtype" has incompatible type
            # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
            # "Type[object]"
            dtype=np.dtype(dtype),  # type: ignore[arg-type]
        )

        if not na_value_is_na:
            # A concrete fill value was supplied, so no slot stays missing.
            mask[:] = False

        return constructor(mapped, mask)

    if is_string_dtype(dtype) and not is_object_dtype(dtype):
        # i.e. StringDtype
        mapped = lib.map_infer_mask(
            arr, f, mask_u8, convert=False, na_value=na_value
        )
        arrow_result = pa.array(
            mapped, mask=mask, type=pa.string(), from_pandas=True
        )
        return type(self)(arrow_result)

    # This is when the result type is object. We reach this when
    # -> We know the result type is truly object (e.g. .encode returns bytes
    #    or .findall returns a list).
    # -> We don't know the result type. E.g. `.get` can return anything.
    return lib.map_infer_mask(arr, f, mask_u8)
def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
    """
    Test whether ``pat`` occurs in each element.

    Uses the pyarrow substring kernels where possible and falls back to the
    parent implementation (with a performance warning) when ``flags`` are
    given, or when a regex is requested and either ``pa_version_under4p0`` is
    set or ``case`` is ``False``.

    A non-missing ``na`` replaces missing results with ``bool(na)``.
    """
    if flags:
        fallback_performancewarning()
        return super()._str_contains(pat, case, flags, na, regex)

    if regex and (pa_version_under4p0 or case is False):
        fallback_performancewarning(version="4")
        return super()._str_contains(pat, case, flags, na, regex)

    if regex:
        matched = pc.match_substring_regex(self._data, pat)
    elif case:
        matched = pc.match_substring(self._data, pat)
    else:
        # Case-insensitive literal search: compare upper-cased data
        # against the upper-cased needle.
        matched = pc.match_substring(pc.utf8_upper(self._data), pat.upper())

    result = BooleanDtype().__from_arrow__(matched)
    if not isna(na):
        result[isna(result)] = bool(na)
    return result
def _str_startswith(self, pat: str, na=None):
    """
    Test whether each element starts with the literal ``pat``.

    Implemented as a regex search for the escaped pattern anchored with
    ``^``; falls back to the parent implementation when
    ``pa_version_under4p0`` is set.
    """
    if not pa_version_under4p0:
        anchored = "^" + re.escape(pat)
        return self._str_contains(anchored, na=na, regex=True)

    fallback_performancewarning(version="4")
    return super()._str_startswith(pat, na)
def _str_endswith(self, pat: str, na=None):
    """
    Test whether each element ends with the literal ``pat``.

    Implemented as a regex search for the escaped pattern anchored with
    ``$``; falls back to the parent implementation when
    ``pa_version_under4p0`` is set.
    """
    if not pa_version_under4p0:
        anchored = re.escape(pat) + "$"
        return self._str_contains(anchored, na=na, regex=True)

    fallback_performancewarning(version="4")
    return super()._str_endswith(pat, na)
def _str_replace(
    self,
    pat: str | re.Pattern,
    repl: str | Callable,
    n: int = -1,
    case: bool = True,
    flags: int = 0,
    regex: bool = True,
):
    """
    Replace occurrences of ``pat`` with ``repl`` in each element.

    The pyarrow kernels are used only for the plain case: a string pattern, a
    string replacement, case-sensitive matching and no flags. Every other
    combination, or ``pa_version_under4p0`` being set, goes through the
    parent implementation with a performance warning.
    """
    needs_fallback = any(
        (
            pa_version_under4p0,
            isinstance(pat, re.Pattern),
            callable(repl),
            not case,
            flags,
        )
    )
    if needs_fallback:
        fallback_performancewarning(version="4")
        return super()._str_replace(pat, repl, n, case, flags, regex)

    if regex:
        replace_kernel = pc.replace_substring_regex
    else:
        replace_kernel = pc.replace_substring
    replaced = replace_kernel(
        self._data, pattern=pat, replacement=repl, max_replacements=n
    )
    return type(self)(replaced)
def _str_match(
    self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
):
    """
    Test whether each element matches the regex ``pat`` from its start.

    The pattern is prefixed with ``^`` unless it already starts with one and
    then handed to ``_str_contains``; falls back to the parent implementation
    when ``pa_version_under4p0`` is set.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_match(pat, case, flags, na)

    anchored = pat if pat.startswith("^") else "^" + pat
    return self._str_contains(anchored, case, flags, na, regex=True)
def _str_fullmatch(
    self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
):
    """
    Test whether each element matches the regex ``pat`` in its entirety.

    The pattern is terminated with an end anchor (``$``) when it does not
    already end with an unescaped one, then delegated to ``_str_match``,
    which takes care of the start anchor. Falls back to the parent
    implementation when ``pa_version_under4p0`` is set.

    Parameters
    ----------
    pat : str
        Regular expression.
    case : bool, default True
        Forwarded to ``_str_match``.
    flags : int, default 0
        Forwarded to ``_str_match``.
    na : scalar, optional
        Forwarded to ``_str_match``.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_fullmatch(pat, case, flags, na)

    # A trailing "$" only anchors the pattern when it is NOT escaped, i.e.
    # when it is preceded by an even number of backslashes. The previous
    # check looked for "//$" (forward slashes), so a pattern ending in a
    # literal "\$" was never anchored and fullmatch behaved like match.
    if pat.endswith("$"):
        stem = pat[:-1]
        trailing_backslashes = len(stem) - len(stem.rstrip("\\"))
        already_anchored = trailing_backslashes % 2 == 0
    else:
        already_anchored = False

    if not already_anchored:
        pat = pat + "$"
    return self._str_match(pat, case, flags, na)
def _str_isalnum(self):
    """Apply pyarrow's ``utf8_is_alnum`` kernel and box the result via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_alnum(self._data))
def _str_isalpha(self):
    """Apply pyarrow's ``utf8_is_alpha`` kernel and box the result via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_alpha(self._data))
def _str_isdecimal(self):
    """Apply pyarrow's ``utf8_is_decimal`` kernel and box the result via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_decimal(self._data))
def _str_isdigit(self):
    """Apply pyarrow's ``utf8_is_digit`` kernel and box the result via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_digit(self._data))
def _str_islower(self):
    """Apply pyarrow's ``utf8_is_lower`` kernel and box the result via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_lower(self._data))
def _str_isnumeric(self):
    """Apply pyarrow's ``utf8_is_numeric`` kernel and box the result via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_numeric(self._data))
def _str_isspace(self):
    """
    Apply pyarrow's ``utf8_is_space`` kernel and box the result via ``BooleanDtype``.

    Falls back to the parent implementation (with a performance warning) when
    ``pa_version_under2p0`` is set.
    """
    if not pa_version_under2p0:
        return BooleanDtype().__from_arrow__(pc.utf8_is_space(self._data))

    fallback_performancewarning(version="2")
    return super()._str_isspace()
def _str_istitle(self):
    """Apply pyarrow's ``utf8_is_title`` kernel and box the result via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_title(self._data))
def _str_isupper(self):
    """Apply pyarrow's ``utf8_is_upper`` kernel and box the result via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_upper(self._data))
def _str_len(self):
    """
    Apply pyarrow's ``utf8_length`` kernel and box the result via ``Int64Dtype``.

    Falls back to the parent implementation (with a performance warning) when
    ``pa_version_under4p0`` is set.
    """
    if not pa_version_under4p0:
        return Int64Dtype().__from_arrow__(pc.utf8_length(self._data))

    fallback_performancewarning(version="4")
    return super()._str_len()
def _str_lower(self):
    """Return a new array of this type built from pyarrow's ``utf8_lower`` output."""
    lowered = pc.utf8_lower(self._data)
    return type(self)(lowered)
def _str_upper(self):
    """Return a new array of this type built from pyarrow's ``utf8_upper`` output."""
    uppered = pc.utf8_upper(self._data)
    return type(self)(uppered)
def _str_strip(self, to_strip=None):
    """
    Trim both ends of each element.

    With ``to_strip=None`` the whitespace-trimming kernel is used; otherwise
    ``to_strip`` is passed as the ``characters`` argument of ``utf8_trim``.
    Falls back to the parent implementation when ``pa_version_under4p0`` is
    set.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_strip(to_strip)

    trimmed = (
        pc.utf8_trim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_trim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)
def _str_lstrip(self, to_strip=None):
    """
    Trim the left end of each element.

    With ``to_strip=None`` the whitespace-trimming kernel is used; otherwise
    ``to_strip`` is passed as the ``characters`` argument of ``utf8_ltrim``.
    Falls back to the parent implementation when ``pa_version_under4p0`` is
    set.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_lstrip(to_strip)

    trimmed = (
        pc.utf8_ltrim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_ltrim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)
def _str_rstrip(self, to_strip=None):
    """
    Trim the right end of each element.

    With ``to_strip=None`` the whitespace-trimming kernel is used; otherwise
    ``to_strip`` is passed as the ``characters`` argument of ``utf8_rtrim``.
    Falls back to the parent implementation when ``pa_version_under4p0`` is
    set.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_rstrip(to_strip)

    trimmed = (
        pc.utf8_rtrim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_rtrim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)