Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/strings/object_array.py: 15%
304 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from collections.abc import Callable # noqa: PDF001
4import re
5import textwrap
6from typing import TYPE_CHECKING
7import unicodedata
9import numpy as np
11import pandas._libs.lib as lib
12import pandas._libs.missing as libmissing
13import pandas._libs.ops as libops
14from pandas._typing import (
15 NpDtype,
16 Scalar,
17)
19from pandas.core.dtypes.common import is_scalar
20from pandas.core.dtypes.missing import isna
22from pandas.core.strings.base import BaseStringArrayMethods
24if TYPE_CHECKING: 24 ↛ 25line 24 didn't jump to line 25, because the condition on line 24 was never true
25 from pandas import Series
class ObjectStringArrayMixin(BaseStringArrayMethods):
    """
    String Methods operating on object-dtype ndarrays.

    Most ``_str_*`` methods build a per-element callable and hand it to
    ``_str_map``, which applies it to the non-NA elements of the array.
    """

    _str_na_value = np.nan

    def __len__(self):
        # For typing, _str_map relies on the object being sized.
        raise NotImplementedError

    def _str_map(
        self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
    ):
        """
        Map a callable over valid elements of the array.

        Parameters
        ----------
        f : Callable
            A function to call on each non-NA element.
        na_value : Scalar, optional
            The value to set for NA values. Might also be used for the
            fill value if the callable `f` raises an exception.
            This defaults to ``self._str_na_value`` which is ``np.nan``
            for object-dtype and Categorical and ``pd.NA`` for StringArray.
        dtype : Dtype, optional
            The dtype of the result array.
        convert : bool, default True
            Whether to call `maybe_convert_objects` on the resulting ndarray

        Raises
        ------
        TypeError, AttributeError
            Re-raised when the message indicates that `f` was called with
            the wrong number of positional arguments.
        """
        if dtype is None:
            dtype = np.dtype("object")
        if na_value is None:
            na_value = self._str_na_value

        if not len(self):
            return np.array([], dtype=dtype)

        arr = np.asarray(self, dtype=object)
        mask = isna(arr)
        # Skip conversion when every element is NA.
        map_convert = convert and not np.all(mask)
        try:
            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
        except (TypeError, AttributeError) as err:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            p_err = (
                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
                r"(?(3)required )positional arguments?"
            )

            if len(err.args) >= 1 and re.search(p_err, err.args[0]):
                # FIXME: this should be totally avoidable
                raise err

            def g(x):
                # This type of fallback behavior can be removed once
                # we remove object-dtype .str accessor.
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            # NOTE(review): `convert` is not forwarded here, so the retry
            # always runs with the default ``convert=True`` -- confirm intended.
            return self._str_map(g, na_value=na_value, dtype=dtype)
        if not isinstance(result, np.ndarray):
            return result
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if convert and result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result

    def _str_count(self, pat, flags=0):
        """Count the matches of regex ``pat`` in each element."""
        regex = re.compile(pat, flags=flags)
        f = lambda x: len(regex.findall(x))
        return self._str_map(f, dtype="int64")

    def _str_pad(self, width, side="left", fillchar=" "):
        """
        Pad each element to ``width`` with ``fillchar``.

        ``side`` names the side that receives the padding: "left" uses
        ``rjust``, "right" uses ``ljust`` and "both" uses ``center``.
        """
        if side == "left":
            f = lambda x: x.rjust(width, fillchar)
        elif side == "right":
            f = lambda x: x.ljust(width, fillchar)
        elif side == "both":
            f = lambda x: x.center(width, fillchar)
        else:  # pragma: no cover
            raise ValueError("Invalid side")
        return self._str_map(f)

    def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
        """
        Test whether each element contains ``pat``.

        With ``regex=True`` the pattern is compiled and searched; otherwise a
        plain substring test is used, upper-casing both sides when ``case``
        is false.
        """
        if regex:
            if not case:
                flags |= re.IGNORECASE

            pat = re.compile(pat, flags=flags)

            f = lambda x: pat.search(x) is not None
        else:
            if case:
                f = lambda x: pat in x
            else:
                upper_pat = pat.upper()
                f = lambda x: upper_pat in x.upper()
        return self._str_map(f, na, dtype=np.dtype("bool"))

    def _str_startswith(self, pat, na=None):
        """Test whether each element starts with ``pat``."""
        f = lambda x: x.startswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_endswith(self, pat, na=None):
        """Test whether each element ends with ``pat``."""
        f = lambda x: x.endswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        """
        Replace occurrences of ``pat`` in each element with ``repl``.

        The ``re`` module is used whenever ``regex`` is true, any flag is
        set, or ``repl`` is callable; a literal ``pat`` is escaped first in
        that case. Otherwise ``str.replace`` is used.
        """
        if case is False:
            # add case flag, if provided
            flags |= re.IGNORECASE

        if regex or flags or callable(repl):
            if not isinstance(pat, re.Pattern):
                if regex is False:
                    pat = re.escape(pat)
                pat = re.compile(pat, flags=flags)

            # Negative n is mapped to 0 before being passed as ``count``.
            n = n if n >= 0 else 0
            f = lambda x: pat.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)

        return self._str_map(f, dtype=str)

    def _str_repeat(self, repeats):
        """
        Repeat each element ``repeats`` times.

        ``repeats`` may be a scalar or a sequence; a sequence is paired with
        the array element-wise through ``libops.vec_binop``.
        """
        if is_scalar(repeats):

            def scalar_rep(x):
                # Try bytes first, then fall back to str.
                try:
                    return bytes.__mul__(x, repeats)
                except TypeError:
                    return str.__mul__(x, repeats)

            return self._str_map(scalar_rep, dtype=str)
        else:
            from pandas.core.arrays.string_ import BaseStringArray

            def rep(x, r):
                if x is libmissing.NA:
                    return x
                try:
                    return bytes.__mul__(x, r)
                except TypeError:
                    return str.__mul__(x, r)

            repeats = np.asarray(repeats, dtype=object)
            result = libops.vec_binop(np.asarray(self), repeats, rep)
            if isinstance(self, BaseStringArray):
                # Not going through map, so we have to do this here.
                result = type(self)._from_sequence(result)
            return result

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        """Test whether regex ``pat`` matches at the start of each element."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.match(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_fullmatch(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | None = None,
    ):
        """Test whether regex ``pat`` matches each element in full."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.fullmatch(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_encode(self, encoding, errors="strict"):
        """Encode each element with ``encoding``."""
        f = lambda x: x.encode(encoding, errors=errors)
        return self._str_map(f, dtype=object)

    def _str_find(self, sub, start=0, end=None):
        """Return the lowest index of ``sub`` in each element (``find``)."""
        return self._str_find_(sub, start, end, side="left")

    def _str_rfind(self, sub, start=0, end=None):
        """Return the highest index of ``sub`` in each element (``rfind``)."""
        return self._str_find_(sub, start, end, side="right")

    def _str_find_(self, sub, start, end, side):
        """Shared implementation of ``_str_find`` and ``_str_rfind``."""
        if side == "left":
            method = "find"
        elif side == "right":
            method = "rfind"
        else:  # pragma: no cover
            raise ValueError("Invalid side")

        if end is None:
            f = lambda x: getattr(x, method)(sub, start)
        else:
            f = lambda x: getattr(x, method)(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_findall(self, pat, flags=0):
        """Return all matches of regex ``pat`` for each element."""
        regex = re.compile(pat, flags=flags)
        return self._str_map(regex.findall, dtype="object")

    def _str_get(self, i):
        """
        Extract item ``i`` from each element.

        Dict elements are read with ``dict.get``; other elements are indexed
        when ``i`` is within bounds, and yield the NA value otherwise.
        """

        def f(x):
            if isinstance(x, dict):
                return x.get(i)
            elif len(x) > i >= -len(x):
                return x[i]
            return self._str_na_value

        return self._str_map(f)

    def _str_index(self, sub, start=0, end=None):
        """Return the lowest index of ``sub`` in each element (``index``)."""
        # ``end`` is forwarded unchanged, so no branch on it is needed.
        f = lambda x: x.index(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_rindex(self, sub, start=0, end=None):
        """Return the highest index of ``sub`` in each element (``rindex``)."""
        # ``end`` is forwarded unchanged, so no branch on it is needed.
        f = lambda x: x.rindex(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_join(self, sep):
        """Join the items of each element with ``sep``."""
        return self._str_map(sep.join)

    def _str_partition(self, sep, expand):
        """Split each element at the first ``sep``; ``expand`` is unused here."""
        return self._str_map(lambda x: x.partition(sep), dtype="object")

    def _str_rpartition(self, sep, expand):
        """Split each element at the last ``sep``; ``expand`` is unused here."""
        return self._str_map(lambda x: x.rpartition(sep), dtype="object")

    def _str_len(self):
        """Return the length of each element."""
        return self._str_map(len, dtype="int64")

    def _str_slice(self, start=None, stop=None, step=None):
        """Slice each element with ``slice(start, stop, step)``."""
        obj = slice(start, stop, step)
        return self._str_map(lambda x: x[obj])

    def _str_slice_replace(self, start=None, stop=None, repl=None):
        """
        Replace the ``[start:stop]`` slice of each element with ``repl``.

        ``repl`` defaults to the empty string.
        """
        if repl is None:
            repl = ""

        def f(x):
            # When the slice is empty the tail resumes at ``start``.
            if x[start:stop] == "":
                local_stop = start
            else:
                local_stop = stop
            y = ""
            if start is not None:
                y += x[:start]
            y += repl
            if stop is not None:
                y += x[local_stop:]
            return y

        return self._str_map(f)

    def _str_split(
        self,
        pat: str | re.Pattern | None = None,
        n=-1,
        expand=False,
        regex: bool | None = None,
    ):
        """
        Split each element around ``pat``.

        ``pat`` is treated as a regular expression when ``regex`` is true or
        ``pat`` is already compiled. When ``regex`` is None, a string of
        length one is treated literally and anything longer as a regex.
        """
        if pat is None:
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            new_pat: str | re.Pattern
            if regex is True or isinstance(pat, re.Pattern):
                new_pat = re.compile(pat)
            elif regex is False:
                new_pat = pat
            # regex is None so link to old behavior #43563
            else:
                if len(pat) == 1:
                    new_pat = pat
                else:
                    new_pat = re.compile(pat)

            if isinstance(new_pat, re.Pattern):
                # None and -1 are mapped to 0 for ``maxsplit``.
                if n is None or n == -1:
                    n = 0
                f = lambda x: new_pat.split(x, maxsplit=n)
            else:
                # None and 0 are mapped to -1 for ``str.split``.
                if n is None or n == 0:
                    n = -1
                f = lambda x: x.split(pat, n)
        return self._str_map(f, dtype=object)

    def _str_rsplit(self, pat=None, n=-1):
        """Split each element around ``pat`` with ``str.rsplit``."""
        if n is None or n == 0:
            n = -1
        f = lambda x: x.rsplit(pat, n)
        return self._str_map(f, dtype="object")

    def _str_translate(self, table):
        """Apply the translation ``table`` to each element."""
        return self._str_map(lambda x: x.translate(table))

    def _str_wrap(self, width, **kwargs):
        """Wrap each element with ``textwrap.TextWrapper`` and join with newlines."""
        kwargs["width"] = width
        tw = textwrap.TextWrapper(**kwargs)
        return self._str_map(lambda s: "\n".join(tw.wrap(s)))

    def _str_get_dummies(self, sep="|"):
        """
        Build an indicator matrix from ``sep``-separated tags.

        Returns
        -------
        tuple
            ``(dummies, tags)``: an int64 array with one column per tag, and
            the sorted list of non-empty tags.
        """
        from pandas import Series

        arr = Series(self).fillna("")
        # Wrap every element in ``sep`` so each tag can be found as
        # ``sep + tag + sep``.
        try:
            arr = sep + arr + sep
        except (TypeError, NotImplementedError):
            arr = sep + arr.astype(str) + sep

        tags: set[str] = set()
        for ts in Series(arr).str.split(sep):
            tags.update(ts)
        tags2 = sorted(tags - {""})

        dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)

        for i, t in enumerate(tags2):
            pat = sep + t + sep
            dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x)
        return dummies, tags2

    def _str_upper(self):
        """Upper-case each element."""
        return self._str_map(lambda x: x.upper())

    def _str_isalnum(self):
        """Apply ``str.isalnum`` to each element."""
        return self._str_map(str.isalnum, dtype="bool")

    def _str_isalpha(self):
        """Apply ``str.isalpha`` to each element."""
        return self._str_map(str.isalpha, dtype="bool")

    def _str_isdecimal(self):
        """Apply ``str.isdecimal`` to each element."""
        return self._str_map(str.isdecimal, dtype="bool")

    def _str_isdigit(self):
        """Apply ``str.isdigit`` to each element."""
        return self._str_map(str.isdigit, dtype="bool")

    def _str_islower(self):
        """Apply ``str.islower`` to each element."""
        return self._str_map(str.islower, dtype="bool")

    def _str_isnumeric(self):
        """Apply ``str.isnumeric`` to each element."""
        return self._str_map(str.isnumeric, dtype="bool")

    def _str_isspace(self):
        """Apply ``str.isspace`` to each element."""
        return self._str_map(str.isspace, dtype="bool")

    def _str_istitle(self):
        """Apply ``str.istitle`` to each element."""
        return self._str_map(str.istitle, dtype="bool")

    def _str_isupper(self):
        """Apply ``str.isupper`` to each element."""
        return self._str_map(str.isupper, dtype="bool")

    def _str_capitalize(self):
        """Apply ``str.capitalize`` to each element."""
        return self._str_map(str.capitalize)

    def _str_casefold(self):
        """Apply ``str.casefold`` to each element."""
        return self._str_map(str.casefold)

    def _str_title(self):
        """Apply ``str.title`` to each element."""
        return self._str_map(str.title)

    def _str_swapcase(self):
        """Apply ``str.swapcase`` to each element."""
        return self._str_map(str.swapcase)

    def _str_lower(self):
        """Apply ``str.lower`` to each element."""
        return self._str_map(str.lower)

    def _str_normalize(self, form):
        """Normalize each element with ``unicodedata.normalize(form, ...)``."""
        f = lambda x: unicodedata.normalize(form, x)
        return self._str_map(f)

    def _str_strip(self, to_strip=None):
        """Strip ``to_strip`` from both ends of each element."""
        return self._str_map(lambda x: x.strip(to_strip))

    def _str_lstrip(self, to_strip=None):
        """Strip ``to_strip`` from the left of each element."""
        return self._str_map(lambda x: x.lstrip(to_strip))

    def _str_rstrip(self, to_strip=None):
        """Strip ``to_strip`` from the right of each element."""
        return self._str_map(lambda x: x.rstrip(to_strip))

    def _str_removeprefix(self, prefix: str) -> Series:
        """Remove ``prefix`` from the start of each element that has it."""
        # outstanding question on whether to use native methods for users on Python 3.9+
        # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
        # in which case we could do return self._str_map(str.removeprefix)

        def removeprefix(text: str) -> str:
            if text.startswith(prefix):
                return text[len(prefix) :]
            return text

        return self._str_map(removeprefix)

    def _str_removesuffix(self, suffix: str) -> Series:
        """
        Remove ``suffix`` from the end of each element that has it.

        An empty ``suffix`` leaves every element unchanged.
        """
        # A native ``str.removesuffix`` could be used on Python 3.9+.

        def removesuffix(text: str) -> str:
            # Guard against an empty suffix: ``text[:-0]`` is ``text[:0]``,
            # which would wipe the whole string.
            if suffix and text.endswith(suffix):
                return text[: -len(suffix)]
            return text

        return self._str_map(removesuffix)

    def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
        """
        Extract the capture groups of regex ``pat`` from each element.

        With ``expand=False`` only the first group is returned, through
        ``_str_map``. Otherwise a list with one row per element is returned,
        each row holding one entry per capture group.
        """
        regex = re.compile(pat, flags=flags)
        na_value = self._str_na_value

        if not expand:

            def g(x):
                m = regex.search(x)
                return m.groups()[0] if m else na_value

            return self._str_map(g, convert=False)

        empty_row = [na_value] * regex.groups

        def f(x):
            # Hand out a fresh copy each time so that rows never share one
            # list object.
            if not isinstance(x, str):
                return list(empty_row)
            m = regex.search(x)
            if m:
                return [na_value if item is None else item for item in m.groups()]
            else:
                return list(empty_row)

        return [f(val) for val in np.asarray(self)]