# Coverage-report artifact (coverage.py v6.4.4, 2023-07-17): this file is an
# extraction of pandas/core/strings/accessor.py, 25% of 573 statements covered.
1from __future__ import annotations
3import codecs
4from functools import wraps
5import re
6from typing import (
7 TYPE_CHECKING,
8 Callable,
9 Hashable,
10 cast,
11)
12import warnings
14import numpy as np
16import pandas._libs.lib as lib
17from pandas._typing import (
18 DtypeObj,
19 F,
20 Scalar,
21)
22from pandas.util._decorators import (
23 Appender,
24 deprecate_nonkeyword_arguments,
25)
26from pandas.util._exceptions import find_stack_level
28from pandas.core.dtypes.common import (
29 ensure_object,
30 is_bool_dtype,
31 is_categorical_dtype,
32 is_integer,
33 is_list_like,
34 is_object_dtype,
35 is_re,
36)
37from pandas.core.dtypes.generic import (
38 ABCDataFrame,
39 ABCIndex,
40 ABCMultiIndex,
41 ABCSeries,
42)
43from pandas.core.dtypes.missing import isna
45from pandas.core.base import NoNewAttributesMixin
46from pandas.core.construction import extract_array
if TYPE_CHECKING:
49 from pandas import (
50 DataFrame,
51 Index,
52 Series,
53 )
55_shared_docs: dict[str, str] = {}
56_cpython_optimized_encoders = (
57 "utf-8",
58 "utf8",
59 "latin-1",
60 "latin1",
61 "iso-8859-1",
62 "mbcs",
63 "ascii",
64)
65_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
68def forbid_nonstring_types(
69 forbidden: list[str] | None, name: str | None = None
70) -> Callable[[F], F]:
71 """
72 Decorator to forbid specific types for a method of StringMethods.
74 For calling `.str.{method}` on a Series or Index, it is necessary to first
75 initialize the :class:`StringMethods` object, and then call the method.
76 However, different methods allow different input types, and so this can not
77 be checked during :meth:`StringMethods.__init__`, but must be done on a
78 per-method basis. This decorator exists to facilitate this process, and
79 make it explicit which (inferred) types are disallowed by the method.
81 :meth:`StringMethods.__init__` allows the *union* of types its different
82 methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
83 namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].
85 The default string types ['string', 'empty'] are allowed for all methods.
86 For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
87 then needs to forbid the types it is not intended for.
89 Parameters
90 ----------
91 forbidden : list-of-str or None
92 List of forbidden non-string types, may be one or more of
93 `['bytes', 'mixed', 'mixed-integer']`.
94 name : str, default None
95 Name of the method to use in the error message. By default, this is
96 None, in which case the name from the method being wrapped will be
97 copied. However, for working with further wrappers (like _pat_wrapper
98 and _noarg_wrapper), it is necessary to specify the name.
100 Returns
101 -------
102 func : wrapper
103 The method to which the decorator is applied, with an added check that
104 enforces the inferred type to not be in the list of forbidden types.
106 Raises
107 ------
108 TypeError
109 If the inferred type of the underlying data is in `forbidden`.
110 """
111 # deal with None
112 forbidden = [] if forbidden is None else forbidden
114 allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(
115 forbidden
116 )
118 def _forbid_nonstring_types(func: F) -> F:
119 func_name = func.__name__ if name is None else name
121 @wraps(func)
122 def wrapper(self, *args, **kwargs):
123 if self._inferred_dtype not in allowed_types:
124 msg = (
125 f"Cannot use .str.{func_name} with values of "
126 f"inferred dtype '{self._inferred_dtype}'."
127 )
128 raise TypeError(msg)
129 return func(self, *args, **kwargs)
131 wrapper.__name__ = func_name
132 return cast(F, wrapper)
134 return _forbid_nonstring_types
137def _map_and_wrap(name, docstring):
138 @forbid_nonstring_types(["bytes"], name=name)
139 def wrapper(self):
140 result = getattr(self._data.array, f"_str_{name}")()
141 return self._wrap_result(result)
143 wrapper.__doc__ = docstring
144 return wrapper
147class StringMethods(NoNewAttributesMixin):
148 """
149 Vectorized string functions for Series and Index.
151 NAs stay NA unless handled otherwise by a particular method.
152 Patterned after Python's string methods, with some inspiration from
153 R's stringr package.
155 Examples
156 --------
157 >>> s = pd.Series(["A_Str_Series"])
158 >>> s
159 0 A_Str_Series
160 dtype: object
162 >>> s.str.split("_")
163 0 [A, Str, Series]
164 dtype: object
166 >>> s.str.replace("_", "")
167 0 AStrSeries
168 dtype: object
169 """
171 # Note: see the docstring in pandas.core.strings.__init__
172 # for an explanation of the implementation.
173 # TODO: Dispatch all the methods
174 # Currently the following are not dispatched to the array
175 # * cat
176 # * extractall
178 def __init__(self, data) -> None:
179 from pandas.core.arrays.string_ import StringDtype
181 self._inferred_dtype = self._validate(data)
182 self._is_categorical = is_categorical_dtype(data.dtype)
183 self._is_string = isinstance(data.dtype, StringDtype)
184 self._data = data
186 self._index = self._name = None
187 if isinstance(data, ABCSeries):
188 self._index = data.index
189 self._name = data.name
191 # ._values.categories works for both Series/Index
192 self._parent = data._values.categories if self._is_categorical else data
193 # save orig to blow up categoricals to the right type
194 self._orig = data
195 self._freeze()
197 @staticmethod
198 def _validate(data):
199 """
200 Auxiliary function for StringMethods, infers and checks dtype of data.
202 This is a "first line of defence" at the creation of the StringMethods-
203 object, and just checks that the dtype is in the
204 *union* of the allowed types over all string methods below; this
205 restriction is then refined on a per-method basis using the decorator
206 @forbid_nonstring_types (more info in the corresponding docstring).
208 This really should exclude all series/index with any non-string values,
209 but that isn't practical for performance reasons until we have a str
210 dtype (GH 9343 / 13877)
212 Parameters
213 ----------
214 data : The content of the Series
216 Returns
217 -------
218 dtype : inferred dtype of data
219 """
220 if isinstance(data, ABCMultiIndex):
221 raise AttributeError(
222 "Can only use .str accessor with Index, not MultiIndex"
223 )
225 # see _libs/lib.pyx for list of inferred types
226 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
228 data = extract_array(data)
230 values = getattr(data, "categories", data) # categorical / normal
232 inferred_dtype = lib.infer_dtype(values, skipna=True)
234 if inferred_dtype not in allowed_types:
235 raise AttributeError("Can only use .str accessor with string values!")
236 return inferred_dtype
238 def __getitem__(self, key):
239 result = self._data.array._str_getitem(key)
240 return self._wrap_result(result)
242 def __iter__(self):
243 warnings.warn(
244 "Columnar iteration over characters will be deprecated in future releases.",
245 FutureWarning,
246 stacklevel=find_stack_level(),
247 )
248 i = 0
249 g = self.get(i)
250 while g.notna().any():
251 yield g
252 i += 1
253 g = self.get(i)
255 def _wrap_result(
256 self,
257 result,
258 name=None,
259 expand: bool | None = None,
260 fill_value=np.nan,
261 returns_string=True,
262 returns_bool: bool = False,
263 ):
264 from pandas import (
265 Index,
266 MultiIndex,
267 )
269 if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
270 if isinstance(result, ABCDataFrame):
271 result = result.__finalize__(self._orig, name="str")
272 return result
273 assert result.ndim < 3
275 # We can be wrapping a string / object / categorical result, in which
276 # case we'll want to return the same dtype as the input.
277 # Or we can be wrapping a numeric output, in which case we don't want
278 # to return a StringArray.
279 # Ideally the array method returns the right array type.
280 if expand is None:
281 # infer from ndim if expand is not specified
282 expand = result.ndim != 1
284 elif (
285 expand is True
286 and is_object_dtype(result)
287 and not isinstance(self._orig, ABCIndex)
288 ):
289 # required when expand=True is explicitly specified
290 # not needed when inferred
292 def cons_row(x):
293 if is_list_like(x):
294 return x
295 else:
296 return [x]
298 result = [cons_row(x) for x in result]
299 if result and not self._is_string:
300 # propagate nan values to match longest sequence (GH 18450)
301 max_len = max(len(x) for x in result)
302 result = [
303 x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
304 ]
306 if not isinstance(expand, bool):
307 raise ValueError("expand must be True or False")
309 if expand is False:
310 # if expand is False, result should have the same name
311 # as the original otherwise specified
312 if name is None:
313 name = getattr(result, "name", None)
314 if name is None:
315 # do not use logical or, _orig may be a DataFrame
316 # which has "name" column
317 name = self._orig.name
319 # Wait until we are sure result is a Series or Index before
320 # checking attributes (GH 12180)
321 if isinstance(self._orig, ABCIndex):
322 # if result is a boolean np.array, return the np.array
323 # instead of wrapping it into a boolean Index (GH 8875)
324 if is_bool_dtype(result):
325 return result
327 if expand:
328 result = list(result)
329 out = MultiIndex.from_tuples(result, names=name)
330 if out.nlevels == 1:
331 # We had all tuples of length-one, which are
332 # better represented as a regular Index.
333 out = out.get_level_values(0)
334 return out
335 else:
336 return Index._with_infer(result, name=name)
337 else:
338 index = self._orig.index
339 # This is a mess.
340 dtype: DtypeObj | str | None
341 vdtype = getattr(result, "dtype", None)
342 if self._is_string:
343 if is_bool_dtype(vdtype):
344 dtype = result.dtype
345 elif returns_string:
346 dtype = self._orig.dtype
347 else:
348 dtype = vdtype
349 else:
350 dtype = vdtype
352 if expand:
353 cons = self._orig._constructor_expanddim
354 result = cons(result, columns=name, index=index, dtype=dtype)
355 else:
356 # Must be a Series
357 cons = self._orig._constructor
358 result = cons(result, name=name, index=index, dtype=dtype)
359 result = result.__finalize__(self._orig, method="str")
360 if name is not None and result.ndim == 1:
361 # __finalize__ might copy over the original name, but we may
362 # want the new name (e.g. str.extract).
363 result.name = name
364 return result
366 def _get_series_list(self, others):
367 """
368 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
369 into a list of Series (elements without an index must match the length
370 of the calling Series/Index).
372 Parameters
373 ----------
374 others : Series, DataFrame, np.ndarray, list-like or list-like of
375 Objects that are either Series, Index or np.ndarray (1-dim).
377 Returns
378 -------
379 list of Series
380 Others transformed into list of Series.
381 """
382 from pandas import (
383 DataFrame,
384 Series,
385 )
387 # self._orig is either Series or Index
388 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index
390 # Generally speaking, all objects without an index inherit the index
391 # `idx` of the calling Series/Index - i.e. must have matching length.
392 # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
393 if isinstance(others, ABCSeries):
394 return [others]
395 elif isinstance(others, ABCIndex):
396 return [Series(others._values, index=idx, dtype=others.dtype)]
397 elif isinstance(others, ABCDataFrame):
398 return [others[x] for x in others]
399 elif isinstance(others, np.ndarray) and others.ndim == 2:
400 others = DataFrame(others, index=idx)
401 return [others[x] for x in others]
402 elif is_list_like(others, allow_sets=False):
403 others = list(others) # ensure iterators do not get read twice etc
405 # in case of list-like `others`, all elements must be
406 # either Series/Index/np.ndarray (1-dim)...
407 if all(
408 isinstance(x, (ABCSeries, ABCIndex))
409 or (isinstance(x, np.ndarray) and x.ndim == 1)
410 for x in others
411 ):
412 los: list[Series] = []
413 while others: # iterate through list and append each element
414 los = los + self._get_series_list(others.pop(0))
415 return los
416 # ... or just strings
417 elif all(not is_list_like(x) for x in others):
418 return [Series(others, index=idx)]
419 raise TypeError(
420 "others must be Series, Index, DataFrame, np.ndarray "
421 "or list-like (either containing only strings or "
422 "containing only objects of type Series/Index/"
423 "np.ndarray[1-dim])"
424 )
426 @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
427 def cat(
428 self, others=None, sep=None, na_rep=None, join="left"
429 ) -> str | Series | Index:
430 """
431 Concatenate strings in the Series/Index with given separator.
433 If `others` is specified, this function concatenates the Series/Index
434 and elements of `others` element-wise.
435 If `others` is not passed, then all values in the Series/Index are
436 concatenated into a single string with a given `sep`.
438 Parameters
439 ----------
440 others : Series, Index, DataFrame, np.ndarray or list-like
441 Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
442 other list-likes of strings must have the same length as the
443 calling Series/Index, with the exception of indexed objects (i.e.
444 Series/Index/DataFrame) if `join` is not None.
446 If others is a list-like that contains a combination of Series,
447 Index or np.ndarray (1-dim), then all elements will be unpacked and
448 must satisfy the above criteria individually.
450 If others is None, the method returns the concatenation of all
451 strings in the calling Series/Index.
452 sep : str, default ''
453 The separator between the different elements/columns. By default
454 the empty string `''` is used.
455 na_rep : str or None, default None
456 Representation that is inserted for all missing values:
458 - If `na_rep` is None, and `others` is None, missing values in the
459 Series/Index are omitted from the result.
460 - If `na_rep` is None, and `others` is not None, a row containing a
461 missing value in any of the columns (before concatenation) will
462 have a missing value in the result.
463 join : {'left', 'right', 'outer', 'inner'}, default 'left'
464 Determines the join-style between the calling Series/Index and any
465 Series/Index/DataFrame in `others` (objects without an index need
466 to match the length of the calling Series/Index). To disable
467 alignment, use `.values` on any Series/Index/DataFrame in `others`.
469 .. versionadded:: 0.23.0
470 .. versionchanged:: 1.0.0
471 Changed default of `join` from None to `'left'`.
473 Returns
474 -------
475 str, Series or Index
476 If `others` is None, `str` is returned, otherwise a `Series/Index`
477 (same type as caller) of objects is returned.
479 See Also
480 --------
481 split : Split each string in the Series/Index.
482 join : Join lists contained as elements in the Series/Index.
484 Examples
485 --------
486 When not passing `others`, all values are concatenated into a single
487 string:
489 >>> s = pd.Series(['a', 'b', np.nan, 'd'])
490 >>> s.str.cat(sep=' ')
491 'a b d'
493 By default, NA values in the Series are ignored. Using `na_rep`, they
494 can be given a representation:
496 >>> s.str.cat(sep=' ', na_rep='?')
497 'a b ? d'
499 If `others` is specified, corresponding values are concatenated with
500 the separator. Result will be a Series of strings.
502 >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
503 0 a,A
504 1 b,B
505 2 NaN
506 3 d,D
507 dtype: object
509 Missing values will remain missing in the result, but can again be
510 represented using `na_rep`
512 >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
513 0 a,A
514 1 b,B
515 2 -,C
516 3 d,D
517 dtype: object
519 If `sep` is not specified, the values are concatenated without
520 separation.
522 >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
523 0 aA
524 1 bB
525 2 -C
526 3 dD
527 dtype: object
529 Series with different indexes can be aligned before concatenation. The
530 `join`-keyword works as in other methods.
532 >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
533 >>> s.str.cat(t, join='left', na_rep='-')
534 0 aa
535 1 b-
536 2 -c
537 3 dd
538 dtype: object
539 >>>
540 >>> s.str.cat(t, join='outer', na_rep='-')
541 0 aa
542 1 b-
543 2 -c
544 3 dd
545 4 -e
546 dtype: object
547 >>>
548 >>> s.str.cat(t, join='inner', na_rep='-')
549 0 aa
550 2 -c
551 3 dd
552 dtype: object
553 >>>
554 >>> s.str.cat(t, join='right', na_rep='-')
555 3 dd
556 0 aa
557 4 -e
558 2 -c
559 dtype: object
561 For more examples, see :ref:`here <text.concatenate>`.
562 """
563 # TODO: dispatch
564 from pandas import (
565 Index,
566 Series,
567 concat,
568 )
570 if isinstance(others, str):
571 raise ValueError("Did you mean to supply a `sep` keyword?")
572 if sep is None:
573 sep = ""
575 if isinstance(self._orig, ABCIndex):
576 data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
577 else: # Series
578 data = self._orig
580 # concatenate Series/Index with itself if no "others"
581 if others is None:
582 # error: Incompatible types in assignment (expression has type
583 # "ndarray", variable has type "Series")
584 data = ensure_object(data) # type: ignore[assignment]
585 na_mask = isna(data)
586 if na_rep is None and na_mask.any():
587 return sep.join(data[~na_mask])
588 elif na_rep is not None and na_mask.any():
589 return sep.join(np.where(na_mask, na_rep, data))
590 else:
591 return sep.join(data)
593 try:
594 # turn anything in "others" into lists of Series
595 others = self._get_series_list(others)
596 except ValueError as err: # do not catch TypeError raised by _get_series_list
597 raise ValueError(
598 "If `others` contains arrays or lists (or other "
599 "list-likes without an index), these must all be "
600 "of the same length as the calling Series/Index."
601 ) from err
603 # align if required
604 if any(not data.index.equals(x.index) for x in others):
605 # Need to add keys for uniqueness in case of duplicate columns
606 others = concat(
607 others,
608 axis=1,
609 join=(join if join == "inner" else "outer"),
610 keys=range(len(others)),
611 sort=False,
612 copy=False,
613 )
614 data, others = data.align(others, join=join)
615 others = [others[x] for x in others] # again list of Series
617 all_cols = [ensure_object(x) for x in [data] + others]
618 na_masks = np.array([isna(x) for x in all_cols])
619 union_mask = np.logical_or.reduce(na_masks, axis=0)
621 if na_rep is None and union_mask.any():
622 # no na_rep means NaNs for all rows where any column has a NaN
623 # only necessary if there are actually any NaNs
624 result = np.empty(len(data), dtype=object)
625 np.putmask(result, union_mask, np.nan)
627 not_masked = ~union_mask
628 result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
629 elif na_rep is not None and union_mask.any():
630 # fill NaNs with na_rep in case there are actually any NaNs
631 all_cols = [
632 np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
633 ]
634 result = cat_safe(all_cols, sep)
635 else:
636 # no NaNs - can just concatenate
637 result = cat_safe(all_cols, sep)
639 out: Index | Series
640 if isinstance(self._orig, ABCIndex):
641 # add dtype for case that result is all-NA
643 out = Index(result, dtype=object, name=self._orig.name)
644 else: # Series
645 if is_categorical_dtype(self._orig.dtype):
646 # We need to infer the new categories.
647 dtype = None
648 else:
649 dtype = self._orig.dtype
650 res_ser = Series(
651 result, dtype=dtype, index=data.index, name=self._orig.name
652 )
653 out = res_ser.__finalize__(self._orig, method="str_cat")
654 return out
656 _shared_docs[
657 "str_split"
658 ] = r"""
659 Split strings around given separator/delimiter.
661 Splits the string in the Series/Index from the %(side)s,
662 at the specified delimiter string.
664 Parameters
665 ----------
666 pat : str%(pat_regex)s, optional
667 %(pat_description)s.
668 If not specified, split on whitespace.
669 n : int, default -1 (all)
670 Limit number of splits in output.
671 ``None``, 0 and -1 will be interpreted as return all splits.
672 expand : bool, default False
673 Expand the split strings into separate columns.
675 - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
676 - If ``False``, return Series/Index, containing lists of strings.
677 %(regex_argument)s
678 Returns
679 -------
680 Series, Index, DataFrame or MultiIndex
681 Type matches caller unless ``expand=True`` (see Notes).
682 %(raises_split)s
683 See Also
684 --------
685 Series.str.split : Split strings around given separator/delimiter.
686 Series.str.rsplit : Splits string around given separator/delimiter,
687 starting from the right.
688 Series.str.join : Join lists contained as elements in the Series/Index
689 with passed delimiter.
690 str.split : Standard library version for split.
691 str.rsplit : Standard library version for rsplit.
693 Notes
694 -----
695 The handling of the `n` keyword depends on the number of found splits:
697 - If found splits > `n`, make first `n` splits only
698 - If found splits <= `n`, make all splits
699 - If for a certain row the number of found splits < `n`,
700 append `None` for padding up to `n` if ``expand=True``
702 If using ``expand=True``, Series and Index callers return DataFrame and
703 MultiIndex objects, respectively.
704 %(regex_pat_note)s
705 Examples
706 --------
707 >>> s = pd.Series(
708 ... [
709 ... "this is a regular sentence",
710 ... "https://docs.python.org/3/tutorial/index.html",
711 ... np.nan
712 ... ]
713 ... )
714 >>> s
715 0 this is a regular sentence
716 1 https://docs.python.org/3/tutorial/index.html
717 2 NaN
718 dtype: object
720 In the default setting, the string is split by whitespace.
722 >>> s.str.split()
723 0 [this, is, a, regular, sentence]
724 1 [https://docs.python.org/3/tutorial/index.html]
725 2 NaN
726 dtype: object
728 Without the `n` parameter, the outputs of `rsplit` and `split`
729 are identical.
731 >>> s.str.rsplit()
732 0 [this, is, a, regular, sentence]
733 1 [https://docs.python.org/3/tutorial/index.html]
734 2 NaN
735 dtype: object
737 The `n` parameter can be used to limit the number of splits on the
738 delimiter. The outputs of `split` and `rsplit` are different.
740 >>> s.str.split(n=2)
741 0 [this, is, a regular sentence]
742 1 [https://docs.python.org/3/tutorial/index.html]
743 2 NaN
744 dtype: object
746 >>> s.str.rsplit(n=2)
747 0 [this is a, regular, sentence]
748 1 [https://docs.python.org/3/tutorial/index.html]
749 2 NaN
750 dtype: object
752 The `pat` parameter can be used to split by other characters.
754 >>> s.str.split(pat="/")
755 0 [this is a regular sentence]
756 1 [https:, , docs.python.org, 3, tutorial, index...
757 2 NaN
758 dtype: object
760 When using ``expand=True``, the split elements will expand out into
761 separate columns. If NaN is present, it is propagated throughout
762 the columns during the split.
764 >>> s.str.split(expand=True)
765 0 1 2 3 4
766 0 this is a regular sentence
767 1 https://docs.python.org/3/tutorial/index.html None None None None
768 2 NaN NaN NaN NaN NaN
770 For slightly more complex use cases like splitting the html document name
771 from a url, a combination of parameter settings can be used.
773 >>> s.str.rsplit("/", n=1, expand=True)
774 0 1
775 0 this is a regular sentence None
776 1 https://docs.python.org/3/tutorial index.html
777 2 NaN NaN
778 %(regex_examples)s"""
780 @Appender(
781 _shared_docs["str_split"]
782 % {
783 "side": "beginning",
784 "pat_regex": " or compiled regex",
785 "pat_description": "String or regular expression to split on",
786 "regex_argument": """
787 regex : bool, default None
788 Determines if the passed-in pattern is a regular expression:
790 - If ``True``, assumes the passed-in pattern is a regular expression
791 - If ``False``, treats the pattern as a literal string.
792 - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
793 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
794 - Cannot be set to False if `pat` is a compiled regex
796 .. versionadded:: 1.4.0
797 """,
798 "raises_split": """
799 Raises
800 ------
801 ValueError
802 * if `regex` is False and `pat` is a compiled regex
803 """,
804 "regex_pat_note": """
805 Use of `regex =False` with a `pat` as a compiled regex will raise an error.
806 """,
807 "method": "split",
808 "regex_examples": r"""
809 Remember to escape special characters when explicitly using regular expressions.
811 >>> s = pd.Series(["foo and bar plus baz"])
812 >>> s.str.split(r"and|plus", expand=True)
813 0 1 2
814 0 foo bar baz
816 Regular expressions can be used to handle urls or file names.
817 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
818 as a regex only if ``len(pat) != 1``.
820 >>> s = pd.Series(['foojpgbar.jpg'])
821 >>> s.str.split(r".", expand=True)
822 0 1
823 0 foojpgbar jpg
825 >>> s.str.split(r"\.jpg", expand=True)
826 0 1
827 0 foojpgbar
829 When ``regex=True``, `pat` is interpreted as a regex
831 >>> s.str.split(r"\.jpg", regex=True, expand=True)
832 0 1
833 0 foojpgbar
835 A compiled regex can be passed as `pat`
837 >>> import re
838 >>> s.str.split(re.compile(r"\.jpg"), expand=True)
839 0 1
840 0 foojpgbar
842 When ``regex=False``, `pat` is interpreted as the string itself
844 >>> s.str.split(r"\.jpg", regex=False, expand=True)
845 0
846 0 foojpgbar.jpg
847 """,
848 }
849 )
850 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "pat"])
851 @forbid_nonstring_types(["bytes"])
852 def split(
853 self,
854 pat: str | re.Pattern | None = None,
855 n=-1,
856 expand=False,
857 *,
858 regex: bool | None = None,
859 ):
860 if regex is False and is_re(pat):
861 raise ValueError(
862 "Cannot use a compiled regex as replacement pattern with regex=False"
863 )
864 if is_re(pat):
865 regex = True
866 result = self._data.array._str_split(pat, n, expand, regex)
867 return self._wrap_result(result, returns_string=expand, expand=expand)
869 @Appender(
870 _shared_docs["str_split"]
871 % {
872 "side": "end",
873 "pat_regex": "",
874 "pat_description": "String to split on",
875 "regex_argument": "",
876 "raises_split": "",
877 "regex_pat_note": "",
878 "method": "rsplit",
879 "regex_examples": "",
880 }
881 )
882 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "pat"])
883 @forbid_nonstring_types(["bytes"])
884 def rsplit(self, pat=None, n=-1, expand=False):
885 result = self._data.array._str_rsplit(pat, n=n)
886 return self._wrap_result(result, expand=expand, returns_string=expand)
888 _shared_docs[
889 "str_partition"
890 ] = """
891 Split the string at the %(side)s occurrence of `sep`.
893 This method splits the string at the %(side)s occurrence of `sep`,
894 and returns 3 elements containing the part before the separator,
895 the separator itself, and the part after the separator.
896 If the separator is not found, return %(return)s.
898 Parameters
899 ----------
900 sep : str, default whitespace
901 String to split on.
902 expand : bool, default True
903 If True, return DataFrame/MultiIndex expanding dimensionality.
904 If False, return Series/Index.
906 Returns
907 -------
908 DataFrame/MultiIndex or Series/Index of objects
910 See Also
911 --------
912 %(also)s
913 Series.str.split : Split strings around given separators.
914 str.partition : Standard library version.
916 Examples
917 --------
919 >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
920 >>> s
921 0 Linda van der Berg
922 1 George Pitt-Rivers
923 dtype: object
925 >>> s.str.partition()
926 0 1 2
927 0 Linda van der Berg
928 1 George Pitt-Rivers
930 To partition by the last space instead of the first one:
932 >>> s.str.rpartition()
933 0 1 2
934 0 Linda van der Berg
935 1 George Pitt-Rivers
937 To partition by something different than a space:
939 >>> s.str.partition('-')
940 0 1 2
941 0 Linda van der Berg
942 1 George Pitt - Rivers
944 To return a Series containing tuples instead of a DataFrame:
946 >>> s.str.partition('-', expand=False)
947 0 (Linda van der Berg, , )
948 1 (George Pitt, -, Rivers)
949 dtype: object
951 Also available on indices:
953 >>> idx = pd.Index(['X 123', 'Y 999'])
954 >>> idx
955 Index(['X 123', 'Y 999'], dtype='object')
957 Which will create a MultiIndex:
959 >>> idx.str.partition()
960 MultiIndex([('X', ' ', '123'),
961 ('Y', ' ', '999')],
962 )
964 Or an index with tuples with ``expand=False``:
966 >>> idx.str.partition(expand=False)
967 Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
968 """
970 @Appender(
971 _shared_docs["str_partition"]
972 % {
973 "side": "first",
974 "return": "3 elements containing the string itself, followed by two "
975 "empty strings",
976 "also": "rpartition : Split the string at the last occurrence of `sep`.",
977 }
978 )
979 @forbid_nonstring_types(["bytes"])
980 def partition(self, sep=" ", expand=True):
981 result = self._data.array._str_partition(sep, expand)
982 return self._wrap_result(result, expand=expand, returns_string=expand)
984 @Appender(
985 _shared_docs["str_partition"]
986 % {
987 "side": "last",
988 "return": "3 elements containing two empty strings, followed by the "
989 "string itself",
990 "also": "partition : Split the string at the first occurrence of `sep`.",
991 }
992 )
993 @forbid_nonstring_types(["bytes"])
994 def rpartition(self, sep=" ", expand=True):
995 result = self._data.array._str_rpartition(sep, expand)
996 return self._wrap_result(result, expand=expand, returns_string=expand)
998 def get(self, i):
999 """
1000 Extract element from each component at specified position or with specified key.
1002 Extract element from lists, tuples, dict, or strings in each element in the
1003 Series/Index.
1005 Parameters
1006 ----------
1007 i : int or hashable dict label
1008 Position or key of element to extract.
1010 Returns
1011 -------
1012 Series or Index
1014 Examples
1015 --------
1016 >>> s = pd.Series(["String",
1017 ... (1, 2, 3),
1018 ... ["a", "b", "c"],
1019 ... 123,
1020 ... -456,
1021 ... {1: "Hello", "2": "World"}])
1022 >>> s
1023 0 String
1024 1 (1, 2, 3)
1025 2 [a, b, c]
1026 3 123
1027 4 -456
1028 5 {1: 'Hello', '2': 'World'}
1029 dtype: object
1031 >>> s.str.get(1)
1032 0 t
1033 1 2
1034 2 b
1035 3 NaN
1036 4 NaN
1037 5 Hello
1038 dtype: object
1040 >>> s.str.get(-1)
1041 0 g
1042 1 3
1043 2 c
1044 3 NaN
1045 4 NaN
1046 5 None
1047 dtype: object
1049 Return element with given key
1051 >>> s = pd.Series([{"name": "Hello", "value": "World"},
1052 ... {"name": "Goodbye", "value": "Planet"}])
1053 >>> s.str.get('name')
1054 0 Hello
1055 1 Goodbye
1056 dtype: object
1057 """
1058 result = self._data.array._str_get(i)
1059 return self._wrap_result(result)
1061 @forbid_nonstring_types(["bytes"])
1062 def join(self, sep):
1063 """
1064 Join lists contained as elements in the Series/Index with passed delimiter.
1066 If the elements of a Series are lists themselves, join the content of these
1067 lists using the delimiter passed to the function.
1068 This function is an equivalent to :meth:`str.join`.
1070 Parameters
1071 ----------
1072 sep : str
1073 Delimiter to use between list entries.
1075 Returns
1076 -------
1077 Series/Index: object
1078 The list entries concatenated by intervening occurrences of the
1079 delimiter.
1081 Raises
1082 ------
1083 AttributeError
1084 If the supplied Series contains neither strings nor lists.
1086 See Also
1087 --------
1088 str.join : Standard library version of this method.
1089 Series.str.split : Split strings around given separator/delimiter.
1091 Notes
1092 -----
1093 If any of the list items is not a string object, the result of the join
1094 will be `NaN`.
1096 Examples
1097 --------
1098 Example with a list that contains non-string elements.
1100 >>> s = pd.Series([['lion', 'elephant', 'zebra'],
1101 ... [1.1, 2.2, 3.3],
1102 ... ['cat', np.nan, 'dog'],
1103 ... ['cow', 4.5, 'goat'],
1104 ... ['duck', ['swan', 'fish'], 'guppy']])
1105 >>> s
1106 0 [lion, elephant, zebra]
1107 1 [1.1, 2.2, 3.3]
1108 2 [cat, nan, dog]
1109 3 [cow, 4.5, goat]
1110 4 [duck, [swan, fish], guppy]
1111 dtype: object
1113 Join all lists using a '-'. The lists containing object(s) of types other
1114 than str will produce a NaN.
1116 >>> s.str.join('-')
1117 0 lion-elephant-zebra
1118 1 NaN
1119 2 NaN
1120 3 NaN
1121 4 NaN
1122 dtype: object
1123 """
1124 result = self._data.array._str_join(sep)
1125 return self._wrap_result(result)
1127 @forbid_nonstring_types(["bytes"])
1128 def contains(self, pat, case=True, flags=0, na=None, regex=True):
1129 r"""
1130 Test if pattern or regex is contained within a string of a Series or Index.
1132 Return boolean Series or Index based on whether a given pattern or regex is
1133 contained within a string of a Series or Index.
1135 Parameters
1136 ----------
1137 pat : str
1138 Character sequence or regular expression.
1139 case : bool, default True
1140 If True, case sensitive.
1141 flags : int, default 0 (no flags)
1142 Flags to pass through to the re module, e.g. re.IGNORECASE.
1143 na : scalar, optional
1144 Fill value for missing values. The default depends on dtype of the
1145 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1146 ``pandas.NA`` is used.
1147 regex : bool, default True
1148 If True, assumes the pat is a regular expression.
1150 If False, treats the pat as a literal string.
1152 Returns
1153 -------
1154 Series or Index of boolean values
1155 A Series or Index of boolean values indicating whether the
1156 given pattern is contained within the string of each element
1157 of the Series or Index.
1159 See Also
1160 --------
1161 match : Analogous, but stricter, relying on re.match instead of re.search.
1162 Series.str.startswith : Test if the start of each string element matches a
1163 pattern.
1164 Series.str.endswith : Same as startswith, but tests the end of string.
1166 Examples
1167 --------
1168 Returning a Series of booleans using only a literal pattern.
1170 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
1171 >>> s1.str.contains('og', regex=False)
1172 0 False
1173 1 True
1174 2 False
1175 3 False
1176 4 NaN
1177 dtype: object
1179 Returning an Index of booleans using only a literal pattern.
1181 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
1182 >>> ind.str.contains('23', regex=False)
1183 Index([False, False, False, True, nan], dtype='object')
1185 Specifying case sensitivity using `case`.
1187 >>> s1.str.contains('oG', case=True, regex=True)
1188 0 False
1189 1 False
1190 2 False
1191 3 False
1192 4 NaN
1193 dtype: object
1195 Specifying `na` to be `False` instead of `NaN` replaces NaN values
1196 with `False`. If Series or Index does not contain NaN values
1197 the resultant dtype will be `bool`, otherwise, an `object` dtype.
1199 >>> s1.str.contains('og', na=False, regex=True)
1200 0 False
1201 1 True
1202 2 False
1203 3 False
1204 4 False
1205 dtype: bool
1207 Returning 'house' or 'dog' when either expression occurs in a string.
1209 >>> s1.str.contains('house|dog', regex=True)
1210 0 False
1211 1 True
1212 2 True
1213 3 False
1214 4 NaN
1215 dtype: object
1217 Ignoring case sensitivity using `flags` with regex.
1219 >>> import re
1220 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
1221 0 False
1222 1 False
1223 2 True
1224 3 False
1225 4 NaN
1226 dtype: object
1228 Returning any digit using regular expression.
1230 >>> s1.str.contains('\\d', regex=True)
1231 0 False
1232 1 False
1233 2 False
1234 3 True
1235 4 NaN
1236 dtype: object
1238 Ensure `pat` is a not a literal pattern when `regex` is set to True.
1239 Note in the following example one might expect only `s2[1]` and `s2[3]` to
1240 return `True`. However, '.0' as a regex matches any character
1241 followed by a 0.
1243 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
1244 >>> s2.str.contains('.0', regex=True)
1245 0 True
1246 1 True
1247 2 False
1248 3 True
1249 4 False
1250 dtype: bool
1251 """
1252 if regex and re.compile(pat).groups:
1253 warnings.warn(
1254 "This pattern is interpreted as a regular expression, and has "
1255 "match groups. To actually get the groups, use str.extract.",
1256 UserWarning,
1257 stacklevel=find_stack_level(),
1258 )
1260 result = self._data.array._str_contains(pat, case, flags, na, regex)
1261 return self._wrap_result(result, fill_value=na, returns_string=False)
1263 @forbid_nonstring_types(["bytes"])
1264 def match(self, pat, case=True, flags=0, na=None):
1265 """
1266 Determine if each string starts with a match of a regular expression.
1268 Parameters
1269 ----------
1270 pat : str
1271 Character sequence or regular expression.
1272 case : bool, default True
1273 If True, case sensitive.
1274 flags : int, default 0 (no flags)
1275 Regex module flags, e.g. re.IGNORECASE.
1276 na : scalar, optional
1277 Fill value for missing values. The default depends on dtype of the
1278 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1279 ``pandas.NA`` is used.
1281 Returns
1282 -------
1283 Series/Index/array of boolean values
1285 See Also
1286 --------
1287 fullmatch : Stricter matching that requires the entire string to match.
1288 contains : Analogous, but less strict, relying on re.search instead of
1289 re.match.
1290 extract : Extract matched groups.
1291 """
1292 result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
1293 return self._wrap_result(result, fill_value=na, returns_string=False)
1295 @forbid_nonstring_types(["bytes"])
1296 def fullmatch(self, pat, case=True, flags=0, na=None):
1297 """
1298 Determine if each string entirely matches a regular expression.
1300 .. versionadded:: 1.1.0
1302 Parameters
1303 ----------
1304 pat : str
1305 Character sequence or regular expression.
1306 case : bool, default True
1307 If True, case sensitive.
1308 flags : int, default 0 (no flags)
1309 Regex module flags, e.g. re.IGNORECASE.
1310 na : scalar, optional
1311 Fill value for missing values. The default depends on dtype of the
1312 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1313 ``pandas.NA`` is used.
1315 Returns
1316 -------
1317 Series/Index/array of boolean values
1319 See Also
1320 --------
1321 match : Similar, but also returns `True` when only a *prefix* of the string
1322 matches the regular expression.
1323 extract : Extract matched groups.
1324 """
1325 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
1326 return self._wrap_result(result, fill_value=na, returns_string=False)
1328 @forbid_nonstring_types(["bytes"])
1329 def replace(
1330 self,
1331 pat: str | re.Pattern,
1332 repl: str | Callable,
1333 n: int = -1,
1334 case: bool | None = None,
1335 flags: int = 0,
1336 regex: bool | None = None,
1337 ):
1338 r"""
1339 Replace each occurrence of pattern/regex in the Series/Index.
1341 Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
1342 the regex value.
1344 Parameters
1345 ----------
1346 pat : str or compiled regex
1347 String can be a character sequence or regular expression.
1348 repl : str or callable
1349 Replacement string or a callable. The callable is passed the regex
1350 match object and must return a replacement string to be used.
1351 See :func:`re.sub`.
1352 n : int, default -1 (all)
1353 Number of replacements to make from start.
1354 case : bool, default None
1355 Determines if replace is case sensitive:
1357 - If True, case sensitive (the default if `pat` is a string)
1358 - Set to False for case insensitive
1359 - Cannot be set if `pat` is a compiled regex.
1361 flags : int, default 0 (no flags)
1362 Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
1363 regex.
1364 regex : bool, default True
1365 Determines if the passed-in pattern is a regular expression:
1367 - If True, assumes the passed-in pattern is a regular expression.
1368 - If False, treats the pattern as a literal string
1369 - Cannot be set to False if `pat` is a compiled regex or `repl` is
1370 a callable.
1372 .. versionadded:: 0.23.0
1374 Returns
1375 -------
1376 Series or Index of object
1377 A copy of the object with all matching occurrences of `pat` replaced by
1378 `repl`.
1380 Raises
1381 ------
1382 ValueError
1383 * if `regex` is False and `repl` is a callable or `pat` is a compiled
1384 regex
1385 * if `pat` is a compiled regex and `case` or `flags` is set
1387 Notes
1388 -----
1389 When `pat` is a compiled regex, all flags should be included in the
1390 compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
1391 regex will raise an error.
1393 Examples
1394 --------
1395 When `pat` is a string and `regex` is True (the default), the given `pat`
1396 is compiled as a regex. When `repl` is a string, it replaces matching
1397 regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
1398 left as is:
1400 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
1401 0 bao
1402 1 baz
1403 2 NaN
1404 dtype: object
1406 When `pat` is a string and `regex` is False, every `pat` is replaced with
1407 `repl` as with :meth:`str.replace`:
1409 >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
1410 0 bao
1411 1 fuz
1412 2 NaN
1413 dtype: object
1415 When `repl` is a callable, it is called on every `pat` using
1416 :func:`re.sub`. The callable should expect one positional argument
1417 (a regex object) and return a string.
1419 To get the idea:
1421 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
1422 0 <re.Match object; span=(0, 1), match='f'>oo
1423 1 <re.Match object; span=(0, 1), match='f'>uz
1424 2 NaN
1425 dtype: object
1427 Reverse every lowercase alphabetic word:
1429 >>> repl = lambda m: m.group(0)[::-1]
1430 >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
1431 >>> ser.str.replace(r'[a-z]+', repl, regex=True)
1432 0 oof 123
1433 1 rab zab
1434 2 NaN
1435 dtype: object
1437 Using regex groups (extract second group and swap case):
1439 >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
1440 >>> repl = lambda m: m.group('two').swapcase()
1441 >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
1442 >>> ser.str.replace(pat, repl, regex=True)
1443 0 tWO
1444 1 bAR
1445 dtype: object
1447 Using a compiled regex with flags
1449 >>> import re
1450 >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
1451 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
1452 0 foo
1453 1 bar
1454 2 NaN
1455 dtype: object
1456 """
1457 if regex is None:
1458 if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"):
1459 # warn only in cases where regex behavior would differ from literal
1460 msg = (
1461 "The default value of regex will change from True to False "
1462 "in a future version."
1463 )
1464 if len(pat) == 1:
1465 msg += (
1466 " In addition, single character regular expressions will "
1467 "*not* be treated as literal strings when regex=True."
1468 )
1469 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
1471 # Check whether repl is valid (GH 13438, GH 15055)
1472 if not (isinstance(repl, str) or callable(repl)):
1473 raise TypeError("repl must be a string or callable")
1475 is_compiled_re = is_re(pat)
1476 if regex or regex is None:
1477 if is_compiled_re and (case is not None or flags != 0):
1478 raise ValueError(
1479 "case and flags cannot be set when pat is a compiled regex"
1480 )
1482 elif is_compiled_re:
1483 raise ValueError(
1484 "Cannot use a compiled regex as replacement pattern with regex=False"
1485 )
1486 elif callable(repl):
1487 raise ValueError("Cannot use a callable replacement when regex=False")
1489 # The current behavior is to treat single character patterns as literal strings,
1490 # even when ``regex`` is set to ``True``.
1491 if isinstance(pat, str) and len(pat) == 1:
1492 regex = False
1494 if regex is None:
1495 regex = True
1497 if case is None:
1498 case = True
1500 result = self._data.array._str_replace(
1501 pat, repl, n=n, case=case, flags=flags, regex=regex
1502 )
1503 return self._wrap_result(result)
1505 @forbid_nonstring_types(["bytes"])
1506 def repeat(self, repeats):
1507 """
1508 Duplicate each string in the Series or Index.
1510 Parameters
1511 ----------
1512 repeats : int or sequence of int
1513 Same value for all (int) or different value per (sequence).
1515 Returns
1516 -------
1517 Series or Index of object
1518 Series or Index of repeated string objects specified by
1519 input parameter repeats.
1521 Examples
1522 --------
1523 >>> s = pd.Series(['a', 'b', 'c'])
1524 >>> s
1525 0 a
1526 1 b
1527 2 c
1528 dtype: object
1530 Single int repeats string in Series
1532 >>> s.str.repeat(repeats=2)
1533 0 aa
1534 1 bb
1535 2 cc
1536 dtype: object
1538 Sequence of int repeats corresponding string in Series
1540 >>> s.str.repeat(repeats=[1, 2, 3])
1541 0 a
1542 1 bb
1543 2 ccc
1544 dtype: object
1545 """
1546 result = self._data.array._str_repeat(repeats)
1547 return self._wrap_result(result)
1549 @forbid_nonstring_types(["bytes"])
1550 def pad(self, width, side="left", fillchar=" "):
1551 """
1552 Pad strings in the Series/Index up to width.
1554 Parameters
1555 ----------
1556 width : int
1557 Minimum width of resulting string; additional characters will be filled
1558 with character defined in `fillchar`.
1559 side : {'left', 'right', 'both'}, default 'left'
1560 Side from which to fill resulting string.
1561 fillchar : str, default ' '
1562 Additional character for filling, default is whitespace.
1564 Returns
1565 -------
1566 Series or Index of object
1567 Returns Series or Index with minimum number of char in object.
1569 See Also
1570 --------
1571 Series.str.rjust : Fills the left side of strings with an arbitrary
1572 character. Equivalent to ``Series.str.pad(side='left')``.
1573 Series.str.ljust : Fills the right side of strings with an arbitrary
1574 character. Equivalent to ``Series.str.pad(side='right')``.
1575 Series.str.center : Fills both sides of strings with an arbitrary
1576 character. Equivalent to ``Series.str.pad(side='both')``.
1577 Series.str.zfill : Pad strings in the Series/Index by prepending '0'
1578 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
1580 Examples
1581 --------
1582 >>> s = pd.Series(["caribou", "tiger"])
1583 >>> s
1584 0 caribou
1585 1 tiger
1586 dtype: object
1588 >>> s.str.pad(width=10)
1589 0 caribou
1590 1 tiger
1591 dtype: object
1593 >>> s.str.pad(width=10, side='right', fillchar='-')
1594 0 caribou---
1595 1 tiger-----
1596 dtype: object
1598 >>> s.str.pad(width=10, side='both', fillchar='-')
1599 0 -caribou--
1600 1 --tiger---
1601 dtype: object
1602 """
1603 if not isinstance(fillchar, str):
1604 msg = f"fillchar must be a character, not {type(fillchar).__name__}"
1605 raise TypeError(msg)
1607 if len(fillchar) != 1:
1608 raise TypeError("fillchar must be a character, not str")
1610 if not is_integer(width):
1611 msg = f"width must be of integer type, not {type(width).__name__}"
1612 raise TypeError(msg)
1614 result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
1615 return self._wrap_result(result)
1617 _shared_docs[
1618 "str_pad"
1619 ] = """
1620 Pad %(side)s side of strings in the Series/Index.
1622 Equivalent to :meth:`str.%(method)s`.
1624 Parameters
1625 ----------
1626 width : int
1627 Minimum width of resulting string; additional characters will be filled
1628 with ``fillchar``.
1629 fillchar : str
1630 Additional character for filling, default is whitespace.
1632 Returns
1633 -------
1634 filled : Series/Index of objects.
1635 """
1637 @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
1638 @forbid_nonstring_types(["bytes"])
1639 def center(self, width, fillchar=" "):
1640 return self.pad(width, side="both", fillchar=fillchar)
1642 @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
1643 @forbid_nonstring_types(["bytes"])
1644 def ljust(self, width, fillchar=" "):
1645 return self.pad(width, side="right", fillchar=fillchar)
1647 @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
1648 @forbid_nonstring_types(["bytes"])
1649 def rjust(self, width, fillchar=" "):
1650 return self.pad(width, side="left", fillchar=fillchar)
1652 @forbid_nonstring_types(["bytes"])
1653 def zfill(self, width):
1654 """
1655 Pad strings in the Series/Index by prepending '0' characters.
1657 Strings in the Series/Index are padded with '0' characters on the
1658 left of the string to reach a total string length `width`. Strings
1659 in the Series/Index with length greater or equal to `width` are
1660 unchanged.
1662 Parameters
1663 ----------
1664 width : int
1665 Minimum length of resulting string; strings with length less
1666 than `width` be prepended with '0' characters.
1668 Returns
1669 -------
1670 Series/Index of objects.
1672 See Also
1673 --------
1674 Series.str.rjust : Fills the left side of strings with an arbitrary
1675 character.
1676 Series.str.ljust : Fills the right side of strings with an arbitrary
1677 character.
1678 Series.str.pad : Fills the specified sides of strings with an arbitrary
1679 character.
1680 Series.str.center : Fills both sides of strings with an arbitrary
1681 character.
1683 Notes
1684 -----
1685 Differs from :meth:`str.zfill` which has special handling
1686 for '+'/'-' in the string.
1688 Examples
1689 --------
1690 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
1691 >>> s
1692 0 -1
1693 1 1
1694 2 1000
1695 3 10
1696 4 NaN
1697 dtype: object
1699 Note that ``10`` and ``NaN`` are not strings, therefore they are
1700 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
1701 special character and the zero is added to the right of it
1702 (:meth:`str.zfill` would have moved it to the left). ``1000``
1703 remains unchanged as it is longer than `width`.
1705 >>> s.str.zfill(3)
1706 0 -01
1707 1 001
1708 2 1000
1709 3 NaN
1710 4 NaN
1711 dtype: object
1712 """
1713 if not is_integer(width):
1714 msg = f"width must be of integer type, not {type(width).__name__}"
1715 raise TypeError(msg)
1716 f = lambda x: x.zfill(width)
1717 result = self._data.array._str_map(f)
1718 return self._wrap_result(result)
1720 def slice(self, start=None, stop=None, step=None):
1721 """
1722 Slice substrings from each element in the Series or Index.
1724 Parameters
1725 ----------
1726 start : int, optional
1727 Start position for slice operation.
1728 stop : int, optional
1729 Stop position for slice operation.
1730 step : int, optional
1731 Step size for slice operation.
1733 Returns
1734 -------
1735 Series or Index of object
1736 Series or Index from sliced substring from original string object.
1738 See Also
1739 --------
1740 Series.str.slice_replace : Replace a slice with a string.
1741 Series.str.get : Return element at position.
1742 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
1743 being the position.
1745 Examples
1746 --------
1747 >>> s = pd.Series(["koala", "dog", "chameleon"])
1748 >>> s
1749 0 koala
1750 1 dog
1751 2 chameleon
1752 dtype: object
1754 >>> s.str.slice(start=1)
1755 0 oala
1756 1 og
1757 2 hameleon
1758 dtype: object
1760 >>> s.str.slice(start=-1)
1761 0 a
1762 1 g
1763 2 n
1764 dtype: object
1766 >>> s.str.slice(stop=2)
1767 0 ko
1768 1 do
1769 2 ch
1770 dtype: object
1772 >>> s.str.slice(step=2)
1773 0 kaa
1774 1 dg
1775 2 caeen
1776 dtype: object
1778 >>> s.str.slice(start=0, stop=5, step=3)
1779 0 kl
1780 1 d
1781 2 cm
1782 dtype: object
1784 Equivalent behaviour to:
1786 >>> s.str[0:5:3]
1787 0 kl
1788 1 d
1789 2 cm
1790 dtype: object
1791 """
1792 result = self._data.array._str_slice(start, stop, step)
1793 return self._wrap_result(result)
1795 @forbid_nonstring_types(["bytes"])
1796 def slice_replace(self, start=None, stop=None, repl=None):
1797 """
1798 Replace a positional slice of a string with another value.
1800 Parameters
1801 ----------
1802 start : int, optional
1803 Left index position to use for the slice. If not specified (None),
1804 the slice is unbounded on the left, i.e. slice from the start
1805 of the string.
1806 stop : int, optional
1807 Right index position to use for the slice. If not specified (None),
1808 the slice is unbounded on the right, i.e. slice until the
1809 end of the string.
1810 repl : str, optional
1811 String for replacement. If not specified (None), the sliced region
1812 is replaced with an empty string.
1814 Returns
1815 -------
1816 Series or Index
1817 Same type as the original object.
1819 See Also
1820 --------
1821 Series.str.slice : Just slicing without replacement.
1823 Examples
1824 --------
1825 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
1826 >>> s
1827 0 a
1828 1 ab
1829 2 abc
1830 3 abdc
1831 4 abcde
1832 dtype: object
1834 Specify just `start`, meaning replace `start` until the end of the
1835 string with `repl`.
1837 >>> s.str.slice_replace(1, repl='X')
1838 0 aX
1839 1 aX
1840 2 aX
1841 3 aX
1842 4 aX
1843 dtype: object
1845 Specify just `stop`, meaning the start of the string to `stop` is replaced
1846 with `repl`, and the rest of the string is included.
1848 >>> s.str.slice_replace(stop=2, repl='X')
1849 0 X
1850 1 X
1851 2 Xc
1852 3 Xdc
1853 4 Xcde
1854 dtype: object
1856 Specify `start` and `stop`, meaning the slice from `start` to `stop` is
1857 replaced with `repl`. Everything before or after `start` and `stop` is
1858 included as is.
1860 >>> s.str.slice_replace(start=1, stop=3, repl='X')
1861 0 aX
1862 1 aX
1863 2 aX
1864 3 aXc
1865 4 aXde
1866 dtype: object
1867 """
1868 result = self._data.array._str_slice_replace(start, stop, repl)
1869 return self._wrap_result(result)
1871 def decode(self, encoding, errors="strict"):
1872 """
1873 Decode character string in the Series/Index using indicated encoding.
1875 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
1876 python3.
1878 Parameters
1879 ----------
1880 encoding : str
1881 errors : str, optional
1883 Returns
1884 -------
1885 Series or Index
1886 """
1887 # TODO: Add a similar _bytes interface.
1888 if encoding in _cpython_optimized_decoders:
1889 # CPython optimized implementation
1890 f = lambda x: x.decode(encoding, errors)
1891 else:
1892 decoder = codecs.getdecoder(encoding)
1893 f = lambda x: decoder(x, errors)[0]
1894 arr = self._data.array
1895 # assert isinstance(arr, (StringArray,))
1896 result = arr._str_map(f)
1897 return self._wrap_result(result)
1899 @forbid_nonstring_types(["bytes"])
1900 def encode(self, encoding, errors="strict"):
1901 """
1902 Encode character string in the Series/Index using indicated encoding.
1904 Equivalent to :meth:`str.encode`.
1906 Parameters
1907 ----------
1908 encoding : str
1909 errors : str, optional
1911 Returns
1912 -------
1913 encoded : Series/Index of objects
1914 """
1915 result = self._data.array._str_encode(encoding, errors)
1916 return self._wrap_result(result, returns_string=False)
1918 _shared_docs[
1919 "str_strip"
1920 ] = r"""
1921 Remove %(position)s characters.
1923 Strip whitespaces (including newlines) or a set of specified characters
1924 from each string in the Series/Index from %(side)s.
1925 Replaces any non-strings in Series with NaNs.
1926 Equivalent to :meth:`str.%(method)s`.
1928 Parameters
1929 ----------
1930 to_strip : str or None, default None
1931 Specifying the set of characters to be removed.
1932 All combinations of this set of characters will be stripped.
1933 If None then whitespaces are removed.
1935 Returns
1936 -------
1937 Series or Index of object
1939 See Also
1940 --------
1941 Series.str.strip : Remove leading and trailing characters in Series/Index.
1942 Series.str.lstrip : Remove leading characters in Series/Index.
1943 Series.str.rstrip : Remove trailing characters in Series/Index.
1945 Examples
1946 --------
1947 >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
1948 >>> s
1949 0 1. Ant.
1950 1 2. Bee!\n
1951 2 3. Cat?\t
1952 3 NaN
1953 4 10
1954 5 True
1955 dtype: object
1957 >>> s.str.strip()
1958 0 1. Ant.
1959 1 2. Bee!
1960 2 3. Cat?
1961 3 NaN
1962 4 NaN
1963 5 NaN
1964 dtype: object
1966 >>> s.str.lstrip('123.')
1967 0 Ant.
1968 1 Bee!\n
1969 2 Cat?\t
1970 3 NaN
1971 4 NaN
1972 5 NaN
1973 dtype: object
1975 >>> s.str.rstrip('.!? \n\t')
1976 0 1. Ant
1977 1 2. Bee
1978 2 3. Cat
1979 3 NaN
1980 4 NaN
1981 5 NaN
1982 dtype: object
1984 >>> s.str.strip('123.!? \n\t')
1985 0 Ant
1986 1 Bee
1987 2 Cat
1988 3 NaN
1989 4 NaN
1990 5 NaN
1991 dtype: object
1992 """
1994 @Appender(
1995 _shared_docs["str_strip"]
1996 % {
1997 "side": "left and right sides",
1998 "method": "strip",
1999 "position": "leading and trailing",
2000 }
2001 )
2002 @forbid_nonstring_types(["bytes"])
2003 def strip(self, to_strip=None):
2004 result = self._data.array._str_strip(to_strip)
2005 return self._wrap_result(result)
2007 @Appender(
2008 _shared_docs["str_strip"]
2009 % {"side": "left side", "method": "lstrip", "position": "leading"}
2010 )
2011 @forbid_nonstring_types(["bytes"])
2012 def lstrip(self, to_strip=None):
2013 result = self._data.array._str_lstrip(to_strip)
2014 return self._wrap_result(result)
2016 @Appender(
2017 _shared_docs["str_strip"]
2018 % {"side": "right side", "method": "rstrip", "position": "trailing"}
2019 )
2020 @forbid_nonstring_types(["bytes"])
2021 def rstrip(self, to_strip=None):
2022 result = self._data.array._str_rstrip(to_strip)
2023 return self._wrap_result(result)
2025 _shared_docs[
2026 "str_removefix"
2027 ] = r"""
2028 Remove a %(side)s from an object series.
2030 If the %(side)s is not present, the original string will be returned.
2032 Parameters
2033 ----------
2034 %(side)s : str
2035 Remove the %(side)s of the string.
2037 Returns
2038 -------
2039 Series/Index: object
2040 The Series or Index with given %(side)s removed.
2042 See Also
2043 --------
2044 Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.
2046 Examples
2047 --------
2048 >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
2049 >>> s
2050 0 str_foo
2051 1 str_bar
2052 2 no_prefix
2053 dtype: object
2054 >>> s.str.removeprefix("str_")
2055 0 foo
2056 1 bar
2057 2 no_prefix
2058 dtype: object
2060 >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
2061 >>> s
2062 0 foo_str
2063 1 bar_str
2064 2 no_suffix
2065 dtype: object
2066 >>> s.str.removesuffix("_str")
2067 0 foo
2068 1 bar
2069 2 no_suffix
2070 dtype: object
2071 """
2073 @Appender(
2074 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
2075 )
2076 @forbid_nonstring_types(["bytes"])
2077 def removeprefix(self, prefix):
2078 result = self._data.array._str_removeprefix(prefix)
2079 return self._wrap_result(result)
2081 @Appender(
2082 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
2083 )
2084 @forbid_nonstring_types(["bytes"])
2085 def removesuffix(self, suffix):
2086 result = self._data.array._str_removesuffix(suffix)
2087 return self._wrap_result(result)
2089 @forbid_nonstring_types(["bytes"])
2090 def wrap(self, width, **kwargs):
2091 r"""
2092 Wrap strings in Series/Index at specified line width.
2094 This method has the same keyword parameters and defaults as
2095 :class:`textwrap.TextWrapper`.
2097 Parameters
2098 ----------
2099 width : int
2100 Maximum line width.
2101 expand_tabs : bool, optional
2102 If True, tab characters will be expanded to spaces (default: True).
2103 replace_whitespace : bool, optional
2104 If True, each whitespace character (as defined by string.whitespace)
2105 remaining after tab expansion will be replaced by a single space
2106 (default: True).
2107 drop_whitespace : bool, optional
2108 If True, whitespace that, after wrapping, happens to end up at the
2109 beginning or end of a line is dropped (default: True).
2110 break_long_words : bool, optional
2111 If True, then words longer than width will be broken in order to ensure
2112 that no lines are longer than width. If it is false, long words will
2113 not be broken, and some lines may be longer than width (default: True).
2114 break_on_hyphens : bool, optional
2115 If True, wrapping will occur preferably on whitespace and right after
2116 hyphens in compound words, as it is customary in English. If false,
2117 only whitespaces will be considered as potentially good places for line
2118 breaks, but you need to set break_long_words to false if you want truly
2119 insecable words (default: True).
2121 Returns
2122 -------
2123 Series or Index
2125 Notes
2126 -----
2127 Internally, this method uses a :class:`textwrap.TextWrapper` instance with
2128 default settings. To achieve behavior matching R's stringr library str_wrap
2129 function, use the arguments:
2131 - expand_tabs = False
2132 - replace_whitespace = True
2133 - drop_whitespace = True
2134 - break_long_words = False
2135 - break_on_hyphens = False
2137 Examples
2138 --------
2139 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
2140 >>> s.str.wrap(12)
2141 0 line to be\nwrapped
2142 1 another line\nto be\nwrapped
2143 dtype: object
2144 """
2145 result = self._data.array._str_wrap(width, **kwargs)
2146 return self._wrap_result(result)
2148 @forbid_nonstring_types(["bytes"])
2149 def get_dummies(self, sep="|"):
2150 """
2151 Return DataFrame of dummy/indicator variables for Series.
2153 Each string in Series is split by sep and returned as a DataFrame
2154 of dummy/indicator variables.
2156 Parameters
2157 ----------
2158 sep : str, default "|"
2159 String to split on.
2161 Returns
2162 -------
2163 DataFrame
2164 Dummy variables corresponding to values of the Series.
2166 See Also
2167 --------
2168 get_dummies : Convert categorical variable into dummy/indicator
2169 variables.
2171 Examples
2172 --------
2173 >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
2174 a b c
2175 0 1 1 0
2176 1 1 0 0
2177 2 1 0 1
2179 >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
2180 a b c
2181 0 1 1 0
2182 1 0 0 0
2183 2 1 0 1
2184 """
2185 # we need to cast to Series of strings as only that has all
2186 # methods available for making the dummies...
2187 result, name = self._data.array._str_get_dummies(sep)
2188 return self._wrap_result(
2189 result,
2190 name=name,
2191 expand=True,
2192 returns_string=False,
2193 )
2195 @forbid_nonstring_types(["bytes"])
2196 def translate(self, table):
2197 """
2198 Map all characters in the string through the given mapping table.
2200 Equivalent to standard :meth:`str.translate`.
2202 Parameters
2203 ----------
2204 table : dict
2205 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
2206 None. Unmapped characters are left untouched.
2207 Characters mapped to None are deleted. :meth:`str.maketrans` is a
2208 helper function for making translation tables.
2210 Returns
2211 -------
2212 Series or Index
2213 """
2214 result = self._data.array._str_translate(table)
2215 return self._wrap_result(result)
2217 @forbid_nonstring_types(["bytes"])
2218 def count(self, pat, flags=0):
2219 r"""
2220 Count occurrences of pattern in each string of the Series/Index.
2222 This function is used to count the number of times a particular regex
2223 pattern is repeated in each of the string elements of the
2224 :class:`~pandas.Series`.
2226 Parameters
2227 ----------
2228 pat : str
2229 Valid regular expression.
2230 flags : int, default 0, meaning no flags
2231 Flags for the `re` module. For a complete list, `see here
2232 <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
2233 **kwargs
2234 For compatibility with other string methods. Not used.
2236 Returns
2237 -------
2238 Series or Index
2239 Same type as the calling object containing the integer counts.
2241 See Also
2242 --------
2243 re : Standard library module for regular expressions.
2244 str.count : Standard library version, without regular expression support.
2246 Notes
2247 -----
2248 Some characters need to be escaped when passing in `pat`.
2249 eg. ``'$'`` has a special meaning in regex and must be escaped when
2250 finding this literal character.
2252 Examples
2253 --------
2254 >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
2255 >>> s.str.count('a')
2256 0 0.0
2257 1 0.0
2258 2 2.0
2259 3 2.0
2260 4 NaN
2261 5 0.0
2262 6 1.0
2263 dtype: float64
2265 Escape ``'$'`` to find the literal dollar sign.
2267 >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
2268 >>> s.str.count('\\$')
2269 0 1
2270 1 0
2271 2 1
2272 3 2
2273 4 2
2274 5 0
2275 dtype: int64
2277 This is also available on Index
2279 >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
2280 Int64Index([0, 0, 2, 1], dtype='int64')
2281 """
2282 result = self._data.array._str_count(pat, flags)
2283 return self._wrap_result(result, returns_string=False)
2285 @forbid_nonstring_types(["bytes"])
2286 def startswith(
2287 self, pat: str | tuple[str, ...], na: Scalar | None = None
2288 ) -> Series | Index:
2289 """
2290 Test if the start of each string element matches a pattern.
2292 Equivalent to :meth:`str.startswith`.
2294 Parameters
2295 ----------
2296 pat : str or tuple[str, ...]
2297 Character sequence or tuple of strings. Regular expressions are not
2298 accepted.
2299 na : object, default NaN
2300 Object shown if element tested is not a string. The default depends
2301 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2302 For ``StringDtype``, ``pandas.NA`` is used.
2304 Returns
2305 -------
2306 Series or Index of bool
2307 A Series of booleans indicating whether the given pattern matches
2308 the start of each string element.
2310 See Also
2311 --------
2312 str.startswith : Python standard library string method.
2313 Series.str.endswith : Same as startswith, but tests the end of string.
2314 Series.str.contains : Tests if string element contains a pattern.
2316 Examples
2317 --------
2318 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
2319 >>> s
2320 0 bat
2321 1 Bear
2322 2 cat
2323 3 NaN
2324 dtype: object
2326 >>> s.str.startswith('b')
2327 0 True
2328 1 False
2329 2 False
2330 3 NaN
2331 dtype: object
2333 >>> s.str.startswith(('b', 'B'))
2334 0 True
2335 1 True
2336 2 False
2337 3 NaN
2338 dtype: object
2340 Specifying `na` to be `False` instead of `NaN`.
2342 >>> s.str.startswith('b', na=False)
2343 0 True
2344 1 False
2345 2 False
2346 3 False
2347 dtype: bool
2348 """
2349 if not isinstance(pat, (str, tuple)):
2350 msg = f"expected a string or tuple, not {type(pat).__name__}"
2351 raise TypeError(msg)
2352 result = self._data.array._str_startswith(pat, na=na)
2353 return self._wrap_result(result, returns_string=False)
2355 @forbid_nonstring_types(["bytes"])
2356 def endswith(
2357 self, pat: str | tuple[str, ...], na: Scalar | None = None
2358 ) -> Series | Index:
2359 """
2360 Test if the end of each string element matches a pattern.
2362 Equivalent to :meth:`str.endswith`.
2364 Parameters
2365 ----------
2366 pat : str or tuple[str, ...]
2367 Character sequence or tuple of strings. Regular expressions are not
2368 accepted.
2369 na : object, default NaN
2370 Object shown if element tested is not a string. The default depends
2371 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2372 For ``StringDtype``, ``pandas.NA`` is used.
2374 Returns
2375 -------
2376 Series or Index of bool
2377 A Series of booleans indicating whether the given pattern matches
2378 the end of each string element.
2380 See Also
2381 --------
2382 str.endswith : Python standard library string method.
2383 Series.str.startswith : Same as endswith, but tests the start of string.
2384 Series.str.contains : Tests if string element contains a pattern.
2386 Examples
2387 --------
2388 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
2389 >>> s
2390 0 bat
2391 1 bear
2392 2 caT
2393 3 NaN
2394 dtype: object
2396 >>> s.str.endswith('t')
2397 0 True
2398 1 False
2399 2 False
2400 3 NaN
2401 dtype: object
2403 >>> s.str.endswith(('t', 'T'))
2404 0 True
2405 1 False
2406 2 True
2407 3 NaN
2408 dtype: object
2410 Specifying `na` to be `False` instead of `NaN`.
2412 >>> s.str.endswith('t', na=False)
2413 0 True
2414 1 False
2415 2 False
2416 3 False
2417 dtype: bool
2418 """
2419 if not isinstance(pat, (str, tuple)):
2420 msg = f"expected a string or tuple, not {type(pat).__name__}"
2421 raise TypeError(msg)
2422 result = self._data.array._str_endswith(pat, na=na)
2423 return self._wrap_result(result, returns_string=False)
2425 @forbid_nonstring_types(["bytes"])
2426 def findall(self, pat, flags=0):
2427 """
2428 Find all occurrences of pattern or regular expression in the Series/Index.
2430 Equivalent to applying :func:`re.findall` to all the elements in the
2431 Series/Index.
2433 Parameters
2434 ----------
2435 pat : str
2436 Pattern or regular expression.
2437 flags : int, default 0
2438 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
2439 means no flags).
2441 Returns
2442 -------
2443 Series/Index of lists of strings
2444 All non-overlapping matches of pattern or regular expression in each
2445 string of this Series/Index.
2447 See Also
2448 --------
2449 count : Count occurrences of pattern or regular expression in each string
2450 of the Series/Index.
2451 extractall : For each string in the Series, extract groups from all matches
2452 of regular expression and return a DataFrame with one row for each
2453 match and one column for each group.
2454 re.findall : The equivalent ``re`` function to all non-overlapping matches
2455 of pattern or regular expression in string, as a list of strings.
2457 Examples
2458 --------
2459 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
2461 The search for the pattern 'Monkey' returns one match:
2463 >>> s.str.findall('Monkey')
2464 0 []
2465 1 [Monkey]
2466 2 []
2467 dtype: object
2469 On the other hand, the search for the pattern 'MONKEY' doesn't return any
2470 match:
2472 >>> s.str.findall('MONKEY')
2473 0 []
2474 1 []
2475 2 []
2476 dtype: object
2478 Flags can be added to the pattern or regular expression. For instance,
2479 to find the pattern 'MONKEY' ignoring the case:
2481 >>> import re
2482 >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
2483 0 []
2484 1 [Monkey]
2485 2 []
2486 dtype: object
2488 When the pattern matches more than one string in the Series, all matches
2489 are returned:
2491 >>> s.str.findall('on')
2492 0 [on]
2493 1 [on]
2494 2 []
2495 dtype: object
2497 Regular expressions are supported too. For instance, the search for all the
2498 strings ending with the word 'on' is shown next:
2500 >>> s.str.findall('on$')
2501 0 [on]
2502 1 []
2503 2 []
2504 dtype: object
2506 If the pattern is found more than once in the same string, then a list of
2507 multiple strings is returned:
2509 >>> s.str.findall('b')
2510 0 []
2511 1 []
2512 2 [b, b]
2513 dtype: object
2514 """
2515 result = self._data.array._str_findall(pat, flags)
2516 return self._wrap_result(result, returns_string=False)
2518 @forbid_nonstring_types(["bytes"])
2519 def extract(
2520 self, pat: str, flags: int = 0, expand: bool = True
2521 ) -> DataFrame | Series | Index:
2522 r"""
2523 Extract capture groups in the regex `pat` as columns in a DataFrame.
2525 For each subject string in the Series, extract groups from the
2526 first match of regular expression `pat`.
2528 Parameters
2529 ----------
2530 pat : str
2531 Regular expression pattern with capturing groups.
2532 flags : int, default 0 (no flags)
2533 Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
2534 modify regular expression matching for things like case,
2535 spaces, etc. For more details, see :mod:`re`.
2536 expand : bool, default True
2537 If True, return DataFrame with one column per capture group.
2538 If False, return a Series/Index if there is one capture group
2539 or DataFrame if there are multiple capture groups.
2541 Returns
2542 -------
2543 DataFrame or Series or Index
2544 A DataFrame with one row for each subject string, and one
2545 column for each group. Any capture group names in regular
2546 expression pat will be used for column names; otherwise
2547 capture group numbers will be used. The dtype of each result
2548 column is always object, even when no match is found. If
2549 ``expand=False`` and pat has only one capture group, then
2550 return a Series (if subject is a Series) or Index (if subject
2551 is an Index).
2553 See Also
2554 --------
2555 extractall : Returns all matches (not just the first match).
2557 Examples
2558 --------
2559 A pattern with two groups will return a DataFrame with two columns.
2560 Non-matches will be NaN.
2562 >>> s = pd.Series(['a1', 'b2', 'c3'])
2563 >>> s.str.extract(r'([ab])(\d)')
2564 0 1
2565 0 a 1
2566 1 b 2
2567 2 NaN NaN
2569 A pattern may contain optional groups.
2571 >>> s.str.extract(r'([ab])?(\d)')
2572 0 1
2573 0 a 1
2574 1 b 2
2575 2 NaN 3
2577 Named groups will become column names in the result.
2579 >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
2580 letter digit
2581 0 a 1
2582 1 b 2
2583 2 NaN NaN
2585 A pattern with one group will return a DataFrame with one column
2586 if expand=True.
2588 >>> s.str.extract(r'[ab](\d)', expand=True)
2589 0
2590 0 1
2591 1 2
2592 2 NaN
2594 A pattern with one group will return a Series if expand=False.
2596 >>> s.str.extract(r'[ab](\d)', expand=False)
2597 0 1
2598 1 2
2599 2 NaN
2600 dtype: object
2601 """
2602 from pandas import DataFrame
2604 if not isinstance(expand, bool):
2605 raise ValueError("expand must be True or False")
2607 regex = re.compile(pat, flags=flags)
2608 if regex.groups == 0:
2609 raise ValueError("pattern contains no capture groups")
2611 if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
2612 raise ValueError("only one regex group is supported with Index")
2614 obj = self._data
2615 result_dtype = _result_dtype(obj)
2617 returns_df = regex.groups > 1 or expand
2619 if returns_df:
2620 name = None
2621 columns = _get_group_names(regex)
2623 if obj.array.size == 0:
2624 result = DataFrame(columns=columns, dtype=result_dtype)
2626 else:
2627 result_list = self._data.array._str_extract(
2628 pat, flags=flags, expand=returns_df
2629 )
2631 result_index: Index | None
2632 if isinstance(obj, ABCSeries):
2633 result_index = obj.index
2634 else:
2635 result_index = None
2637 result = DataFrame(
2638 result_list, columns=columns, index=result_index, dtype=result_dtype
2639 )
2641 else:
2642 name = _get_single_group_name(regex)
2643 result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
2644 return self._wrap_result(result, name=name)
2646 @forbid_nonstring_types(["bytes"])
2647 def extractall(self, pat, flags=0):
2648 r"""
2649 Extract capture groups in the regex `pat` as columns in DataFrame.
2651 For each subject string in the Series, extract groups from all
2652 matches of regular expression pat. When each subject string in the
2653 Series has exactly one match, extractall(pat).xs(0, level='match')
2654 is the same as extract(pat).
2656 Parameters
2657 ----------
2658 pat : str
2659 Regular expression pattern with capturing groups.
2660 flags : int, default 0 (no flags)
2661 A ``re`` module flag, for example ``re.IGNORECASE``. These allow
2662 to modify regular expression matching for things like case, spaces,
2663 etc. Multiple flags can be combined with the bitwise OR operator,
2664 for example ``re.IGNORECASE | re.MULTILINE``.
2666 Returns
2667 -------
2668 DataFrame
2669 A ``DataFrame`` with one row for each match, and one column for each
2670 group. Its rows have a ``MultiIndex`` with first levels that come from
2671 the subject ``Series``. The last level is named 'match' and indexes the
2672 matches in each item of the ``Series``. Any capture group names in
2673 regular expression pat will be used for column names; otherwise capture
2674 group numbers will be used.
2676 See Also
2677 --------
2678 extract : Returns first match only (not all matches).
2680 Examples
2681 --------
2682 A pattern with one group will return a DataFrame with one column.
2683 Indices with no matches will not appear in the result.
2685 >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
2686 >>> s.str.extractall(r"[ab](\d)")
2687 0
2688 match
2689 A 0 1
2690 1 2
2691 B 0 1
2693 Capture group names are used for column names of the result.
2695 >>> s.str.extractall(r"[ab](?P<digit>\d)")
2696 digit
2697 match
2698 A 0 1
2699 1 2
2700 B 0 1
2702 A pattern with two groups will return a DataFrame with two columns.
2704 >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
2705 letter digit
2706 match
2707 A 0 a 1
2708 1 a 2
2709 B 0 b 1
2711 Optional groups that do not match are NaN in the result.
2713 >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
2714 letter digit
2715 match
2716 A 0 a 1
2717 1 a 2
2718 B 0 b 1
2719 C 0 NaN 1
2720 """
2721 # TODO: dispatch
2722 return str_extractall(self._orig, pat, flags)
    # %-template docstring shared by .find/.rfind; the Appender decorators
    # below substitute "side", "method" and "also" from a per-method dict.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s
    """
2751 @Appender(
2752 _shared_docs["find"]
2753 % {
2754 "side": "lowest",
2755 "method": "find",
2756 "also": "rfind : Return highest indexes in each strings.",
2757 }
2758 )
2759 @forbid_nonstring_types(["bytes"])
2760 def find(self, sub, start=0, end=None):
2761 if not isinstance(sub, str):
2762 msg = f"expected a string object, not {type(sub).__name__}"
2763 raise TypeError(msg)
2765 result = self._data.array._str_find(sub, start, end)
2766 return self._wrap_result(result, returns_string=False)
2768 @Appender(
2769 _shared_docs["find"]
2770 % {
2771 "side": "highest",
2772 "method": "rfind",
2773 "also": "find : Return lowest indexes in each strings.",
2774 }
2775 )
2776 @forbid_nonstring_types(["bytes"])
2777 def rfind(self, sub, start=0, end=None):
2778 if not isinstance(sub, str):
2779 msg = f"expected a string object, not {type(sub).__name__}"
2780 raise TypeError(msg)
2782 result = self._data.array._str_rfind(sub, start=start, end=end)
2783 return self._wrap_result(result, returns_string=False)
2785 @forbid_nonstring_types(["bytes"])
2786 def normalize(self, form):
2787 """
2788 Return the Unicode normal form for the strings in the Series/Index.
2790 For more information on the forms, see the
2791 :func:`unicodedata.normalize`.
2793 Parameters
2794 ----------
2795 form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
2796 Unicode form.
2798 Returns
2799 -------
2800 normalized : Series/Index of objects
2801 """
2802 result = self._data.array._str_normalize(form)
2803 return self._wrap_result(result)
    # %-template docstring shared by .index/.rindex; the Appender decorators
    # below substitute "side", "similar", "method" and "also" from a dict.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s
    """
2834 @Appender(
2835 _shared_docs["index"]
2836 % {
2837 "side": "lowest",
2838 "similar": "find",
2839 "method": "index",
2840 "also": "rindex : Return highest indexes in each strings.",
2841 }
2842 )
2843 @forbid_nonstring_types(["bytes"])
2844 def index(self, sub, start=0, end=None):
2845 if not isinstance(sub, str):
2846 msg = f"expected a string object, not {type(sub).__name__}"
2847 raise TypeError(msg)
2849 result = self._data.array._str_index(sub, start=start, end=end)
2850 return self._wrap_result(result, returns_string=False)
2852 @Appender(
2853 _shared_docs["index"]
2854 % {
2855 "side": "highest",
2856 "similar": "rfind",
2857 "method": "rindex",
2858 "also": "index : Return lowest indexes in each strings.",
2859 }
2860 )
2861 @forbid_nonstring_types(["bytes"])
2862 def rindex(self, sub, start=0, end=None):
2863 if not isinstance(sub, str):
2864 msg = f"expected a string object, not {type(sub).__name__}"
2865 raise TypeError(msg)
2867 result = self._data.array._str_rindex(sub, start=start, end=end)
2868 return self._wrap_result(result, returns_string=False)
2870 def len(self):
2871 """
2872 Compute the length of each element in the Series/Index.
2874 The element may be a sequence (such as a string, tuple or list) or a collection
2875 (such as a dictionary).
2877 Returns
2878 -------
2879 Series or Index of int
2880 A Series or Index of integer values indicating the length of each
2881 element in the Series or Index.
2883 See Also
2884 --------
2885 str.len : Python built-in function returning the length of an object.
2886 Series.size : Returns the length of the Series.
2888 Examples
2889 --------
2890 Returns the length (number of characters) in a string. Returns the
2891 number of entries for dictionaries, lists or tuples.
2893 >>> s = pd.Series(['dog',
2894 ... '',
2895 ... 5,
2896 ... {'foo' : 'bar'},
2897 ... [2, 3, 5, 7],
2898 ... ('one', 'two', 'three')])
2899 >>> s
2900 0 dog
2901 1
2902 2 5
2903 3 {'foo': 'bar'}
2904 4 [2, 3, 5, 7]
2905 5 (one, two, three)
2906 dtype: object
2907 >>> s.str.len()
2908 0 3.0
2909 1 0.0
2910 2 NaN
2911 3 1.0
2912 4 4.0
2913 5 3.0
2914 dtype: float64
2915 """
2916 result = self._data.array._str_len()
2917 return self._wrap_result(result, returns_string=False)
    # %-template docstring shared by the case-conversion methods
    # (lower/upper/title/capitalize/swapcase/casefold); formatted with the
    # matching _doc_args entry by the Appender decorators below.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """
    # Types:
    #   cases:
    #       upper, lower, title, capitalize, swapcase, casefold
    #   boolean:
    #       isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
    # _doc_args holds dict of strings to use in substituting casemethod docs
    _doc_args: dict[str, dict[str, str]] = {}
    _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
    _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
    _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
    _doc_args["capitalize"] = {
        "type": "be capitalized",
        "method": "capitalize",
        "version": "",
    }
    _doc_args["swapcase"] = {
        "type": "be swapcased",
        "method": "swapcase",
        "version": "",
    }
    _doc_args["casefold"] = {
        "type": "be casefolded",
        "method": "casefold",
        "version": "\n    .. versionadded:: 0.25.0\n",
    }
3013 @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
3014 @forbid_nonstring_types(["bytes"])
3015 def lower(self):
3016 result = self._data.array._str_lower()
3017 return self._wrap_result(result)
3019 @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
3020 @forbid_nonstring_types(["bytes"])
3021 def upper(self):
3022 result = self._data.array._str_upper()
3023 return self._wrap_result(result)
3025 @Appender(_shared_docs["casemethods"] % _doc_args["title"])
3026 @forbid_nonstring_types(["bytes"])
3027 def title(self):
3028 result = self._data.array._str_title()
3029 return self._wrap_result(result)
3031 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
3032 @forbid_nonstring_types(["bytes"])
3033 def capitalize(self):
3034 result = self._data.array._str_capitalize()
3035 return self._wrap_result(result)
3037 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
3038 @forbid_nonstring_types(["bytes"])
3039 def swapcase(self):
3040 result = self._data.array._str_swapcase()
3041 return self._wrap_result(result)
3043 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
3044 @forbid_nonstring_types(["bytes"])
3045 def casefold(self):
3046 result = self._data.array._str_casefold()
3047 return self._wrap_result(result)
    # %-template docstring shared by the is* predicate methods; formatted
    # with the matching _doc_args entry and attached via _map_and_wrap below.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )
def cat_safe(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation,
    which happen if the arrays in list_of_columns have the wrong dtypes or
    content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If a column holds non-string (and non-missing) values.
    """
    try:
        result = cat_core(list_of_columns, sep)
    except TypeError:
        # np.sum fails when any value is not a string (wrong dtype or hidden
        # behind object dtype); re-raise with a message naming the offender.
        for column in list_of_columns:
            inferred = lib.infer_dtype(column, skipna=True)
            if inferred not in ("string", "empty"):
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {inferred}"
                ) from None
    return result
def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # No separator to interleave: just sum the columns elementwise.
        stacked = np.asarray(list_of_columns, dtype=object)
        return np.sum(stacked, axis=0)
    # Interleave sep between the columns, then sum elementwise:
    # [col0, sep, col1, sep, ..., colN].
    interleaved = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    stacked_with_sep = np.asarray(interleaved, dtype=object)
    return np.sum(stacked_with_sep, axis=0)
3294def _result_dtype(arr):
3295 # workaround #27953
3296 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
3297 # when the list of values is empty.
3298 from pandas.core.arrays.string_ import StringDtype
3300 if isinstance(arr.dtype, StringDtype):
3301 return arr.dtype
3302 else:
3303 return object
3306def _get_single_group_name(regex: re.Pattern) -> Hashable:
3307 if regex.groupindex:
3308 return next(iter(regex.groupindex))
3309 else:
3310 return None
3313def _get_group_names(regex: re.Pattern) -> list[Hashable]:
3314 """
3315 Get named groups from compiled regex.
3317 Unnamed groups are numbered.
3319 Parameters
3320 ----------
3321 regex : compiled regex
3323 Returns
3324 -------
3325 list of column labels
3326 """
3327 names = {v: k for k, v in regex.groupindex.items()}
3328 return [names.get(1 + i, i) for i in range(regex.groups)]
def str_extractall(arr, pat, flags=0):
    """
    Implementation backing :meth:`StringMethods.extractall`.

    Extract all matches of `pat` from each string in `arr`, returning a
    DataFrame with one row per match and one column per capture group. Rows
    carry a MultiIndex made of the subject's index levels plus a final
    integer level named 'match'.

    Parameters
    ----------
    arr : Series or Index
        Subject strings; an Index is converted to a Series first.
    pat : str
        Regular expression with at least one capture group.
    flags : int, default 0
        Flags from the ``re`` module.

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If `pat` contains no capture groups.
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True)

    columns = _get_group_names(regex)
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        # non-string elements (e.g. NaN) contribute no rows
        if isinstance(subject, str):

            if not is_mi:
                # normalize to a tuple so the match number can be appended
                subject_key = (subject_key,)

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, str):
                    match_tuple = (match_tuple,)
                # re.findall reports a non-participating group as "", which
                # we surface as missing. Use np.nan: the np.NaN alias was
                # removed in NumPy 2.0 (same object on NumPy 1.x).
                na_tuple = [np.nan if group == "" else group for group in match_tuple]
                match_list.append(na_tuple)
                # subject_key is already a tuple; no need to re-wrap it
                result_key = subject_key + (match_i,)
                index_list.append(result_key)

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
    dtype = _result_dtype(arr)

    result = arr._constructor_expanddim(
        match_list, index=index, columns=columns, dtype=dtype
    )
    return result