# Coverage-report artifact (coverage.py v6.4.4, 2023-07-17): this file is an
# extraction of pandas/core/strings/accessor.py, 25% of 573 statements covered.
1from __future__ import annotations
3import codecs
4from functools import wraps
5import re
6from typing import (
7 TYPE_CHECKING,
8 Callable,
9 Hashable,
10 cast,
11)
12import warnings
14import numpy as np
16import pandas._libs.lib as lib
17from pandas._typing import (
18 DtypeObj,
19 F,
20 Scalar,
21)
22from pandas.util._decorators import (
23 Appender,
24 deprecate_nonkeyword_arguments,
25)
26from pandas.util._exceptions import find_stack_level
28from pandas.core.dtypes.common import (
29 ensure_object,
30 is_bool_dtype,
31 is_categorical_dtype,
32 is_integer,
33 is_list_like,
34 is_object_dtype,
35 is_re,
36)
37from pandas.core.dtypes.generic import (
38 ABCDataFrame,
39 ABCIndex,
40 ABCMultiIndex,
41 ABCSeries,
42)
43from pandas.core.dtypes.missing import isna
45from pandas.core.base import NoNewAttributesMixin
46from pandas.core.construction import extract_array
if TYPE_CHECKING:
49 from pandas import (
50 DataFrame,
51 Index,
52 Series,
53 )
55_shared_docs: dict[str, str] = {}
56_cpython_optimized_encoders = (
57 "utf-8",
58 "utf8",
59 "latin-1",
60 "latin1",
61 "iso-8859-1",
62 "mbcs",
63 "ascii",
64)
65_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
68def forbid_nonstring_types(
69 forbidden: list[str] | None, name: str | None = None
70) -> Callable[[F], F]:
71 """
72 Decorator to forbid specific types for a method of StringMethods.
74 For calling `.str.{method}` on a Series or Index, it is necessary to first
75 initialize the :class:`StringMethods` object, and then call the method.
76 However, different methods allow different input types, and so this can not
77 be checked during :meth:`StringMethods.__init__`, but must be done on a
78 per-method basis. This decorator exists to facilitate this process, and
79 make it explicit which (inferred) types are disallowed by the method.
81 :meth:`StringMethods.__init__` allows the *union* of types its different
82 methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
83 namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].
85 The default string types ['string', 'empty'] are allowed for all methods.
86 For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
87 then needs to forbid the types it is not intended for.
89 Parameters
90 ----------
91 forbidden : list-of-str or None
92 List of forbidden non-string types, may be one or more of
93 `['bytes', 'mixed', 'mixed-integer']`.
94 name : str, default None
95 Name of the method to use in the error message. By default, this is
96 None, in which case the name from the method being wrapped will be
97 copied. However, for working with further wrappers (like _pat_wrapper
98 and _noarg_wrapper), it is necessary to specify the name.
100 Returns
101 -------
102 func : wrapper
103 The method to which the decorator is applied, with an added check that
104 enforces the inferred type to not be in the list of forbidden types.
106 Raises
107 ------
108 TypeError
109 If the inferred type of the underlying data is in `forbidden`.
110 """
111 # deal with None
112 forbidden = [] if forbidden is None else forbidden
114 allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(
115 forbidden
116 )
118 def _forbid_nonstring_types(func: F) -> F:
119 func_name = func.__name__ if name is None else name
121 @wraps(func)
122 def wrapper(self, *args, **kwargs):
123 if self._inferred_dtype not in allowed_types:
124 msg = (
125 f"Cannot use .str.{func_name} with values of "
126 f"inferred dtype '{self._inferred_dtype}'."
127 )
128 raise TypeError(msg)
129 return func(self, *args, **kwargs)
131 wrapper.__name__ = func_name
132 return cast(F, wrapper)
134 return _forbid_nonstring_types
137def _map_and_wrap(name, docstring):
138 @forbid_nonstring_types(["bytes"], name=name)
139 def wrapper(self):
140 result = getattr(self._data.array, f"_str_{name}")()
141 return self._wrap_result(result)
143 wrapper.__doc__ = docstring
144 return wrapper
147class StringMethods(NoNewAttributesMixin):
148 """
149 Vectorized string functions for Series and Index.
151 NAs stay NA unless handled otherwise by a particular method.
152 Patterned after Python's string methods, with some inspiration from
153 R's stringr package.
155 Examples
156 --------
157 >>> s = pd.Series(["A_Str_Series"])
158 >>> s
159 0 A_Str_Series
160 dtype: object
162 >>> s.str.split("_")
163 0 [A, Str, Series]
164 dtype: object
166 >>> s.str.replace("_", "")
167 0 AStrSeries
168 dtype: object
169 """
171 # Note: see the docstring in pandas.core.strings.__init__
172 # for an explanation of the implementation.
173 # TODO: Dispatch all the methods
174 # Currently the following are not dispatched to the array
175 # * cat
176 # * extractall
178 def __init__(self, data) -> None:
179 from pandas.core.arrays.string_ import StringDtype
181 self._inferred_dtype = self._validate(data)
182 self._is_categorical = is_categorical_dtype(data.dtype)
183 self._is_string = isinstance(data.dtype, StringDtype)
184 self._data = data
186 self._index = self._name = None
187 if isinstance(data, ABCSeries):
188 self._index = data.index
189 self._name = data.name
191 # ._values.categories works for both Series/Index
192 self._parent = data._values.categories if self._is_categorical else data
193 # save orig to blow up categoricals to the right type
194 self._orig = data
195 self._freeze()
197 @staticmethod
198 def _validate(data):
199 """
200 Auxiliary function for StringMethods, infers and checks dtype of data.
202 This is a "first line of defence" at the creation of the StringMethods-
203 object, and just checks that the dtype is in the
204 *union* of the allowed types over all string methods below; this
205 restriction is then refined on a per-method basis using the decorator
206 @forbid_nonstring_types (more info in the corresponding docstring).
208 This really should exclude all series/index with any non-string values,
209 but that isn't practical for performance reasons until we have a str
210 dtype (GH 9343 / 13877)
212 Parameters
213 ----------
214 data : The content of the Series
216 Returns
217 -------
218 dtype : inferred dtype of data
219 """
220 if isinstance(data, ABCMultiIndex):
221 raise AttributeError(
222 "Can only use .str accessor with Index, not MultiIndex"
223 )
225 # see _libs/lib.pyx for list of inferred types
226 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
228 data = extract_array(data)
230 values = getattr(data, "categories", data) # categorical / normal
232 inferred_dtype = lib.infer_dtype(values, skipna=True)
234 if inferred_dtype not in allowed_types:
235 raise AttributeError("Can only use .str accessor with string values!")
236 return inferred_dtype
238 def __getitem__(self, key):
239 result = self._data.array._str_getitem(key)
240 return self._wrap_result(result)
242 def __iter__(self):
243 warnings.warn(
244 "Columnar iteration over characters will be deprecated in future releases.",
245 FutureWarning,
246 stacklevel=find_stack_level(),
247 )
248 i = 0
249 g = self.get(i)
250 while g.notna().any():
251 yield g
252 i += 1
253 g = self.get(i)
255 def _wrap_result(
256 self,
257 result,
258 name=None,
259 expand: bool | None = None,
260 fill_value=np.nan,
261 returns_string=True,
262 returns_bool: bool = False,
263 ):
264 from pandas import (
265 Index,
266 MultiIndex,
267 )
269 if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
270 if isinstance(result, ABCDataFrame):
271 result = result.__finalize__(self._orig, name="str")
272 return result
273 assert result.ndim < 3
275 # We can be wrapping a string / object / categorical result, in which
276 # case we'll want to return the same dtype as the input.
277 # Or we can be wrapping a numeric output, in which case we don't want
278 # to return a StringArray.
279 # Ideally the array method returns the right array type.
280 if expand is None:
281 # infer from ndim if expand is not specified
282 expand = result.ndim != 1
284 elif (
285 expand is True
286 and is_object_dtype(result)
287 and not isinstance(self._orig, ABCIndex)
288 ):
289 # required when expand=True is explicitly specified
290 # not needed when inferred
292 def cons_row(x):
293 if is_list_like(x):
294 return x
295 else:
296 return [x]
298 result = [cons_row(x) for x in result]
299 if result and not self._is_string:
300 # propagate nan values to match longest sequence (GH 18450)
301 max_len = max(len(x) for x in result)
302 result = [
303 x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
304 ]
306 if not isinstance(expand, bool):
307 raise ValueError("expand must be True or False")
309 if expand is False:
310 # if expand is False, result should have the same name
311 # as the original otherwise specified
312 if name is None:
313 name = getattr(result, "name", None)
314 if name is None:
315 # do not use logical or, _orig may be a DataFrame
316 # which has "name" column
317 name = self._orig.name
319 # Wait until we are sure result is a Series or Index before
320 # checking attributes (GH 12180)
321 if isinstance(self._orig, ABCIndex):
322 # if result is a boolean np.array, return the np.array
323 # instead of wrapping it into a boolean Index (GH 8875)
324 if is_bool_dtype(result):
325 return result
327 if expand:
328 result = list(result)
329 out = MultiIndex.from_tuples(result, names=name)
330 if out.nlevels == 1:
331 # We had all tuples of length-one, which are
332 # better represented as a regular Index.
333 out = out.get_level_values(0)
334 return out
335 else:
336 return Index._with_infer(result, name=name)
337 else:
338 index = self._orig.index
339 # This is a mess.
340 dtype: DtypeObj | str | None
341 vdtype = getattr(result, "dtype", None)
342 if self._is_string:
343 if is_bool_dtype(vdtype):
344 dtype = result.dtype
345 elif returns_string:
346 dtype = self._orig.dtype
347 else:
348 dtype = vdtype
349 else:
350 dtype = vdtype
352 if expand:
353 cons = self._orig._constructor_expanddim
354 result = cons(result, columns=name, index=index, dtype=dtype)
355 else:
356 # Must be a Series
357 cons = self._orig._constructor
358 result = cons(result, name=name, index=index, dtype=dtype)
359 result = result.__finalize__(self._orig, method="str")
360 if name is not None and result.ndim == 1:
361 # __finalize__ might copy over the original name, but we may
362 # want the new name (e.g. str.extract).
363 result.name = name
364 return result
366 def _get_series_list(self, others):
367 """
368 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
369 into a list of Series (elements without an index must match the length
370 of the calling Series/Index).
372 Parameters
373 ----------
374 others : Series, DataFrame, np.ndarray, list-like or list-like of
375 Objects that are either Series, Index or np.ndarray (1-dim).
377 Returns
378 -------
379 list of Series
380 Others transformed into list of Series.
381 """
382 from pandas import (
383 DataFrame,
384 Series,
385 )
387 # self._orig is either Series or Index
388 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index
390 # Generally speaking, all objects without an index inherit the index
391 # `idx` of the calling Series/Index - i.e. must have matching length.
392 # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
393 if isinstance(others, ABCSeries):
394 return [others]
395 elif isinstance(others, ABCIndex):
396 return [Series(others._values, index=idx, dtype=others.dtype)]
397 elif isinstance(others, ABCDataFrame):
398 return [others[x] for x in others]
399 elif isinstance(others, np.ndarray) and others.ndim == 2:
400 others = DataFrame(others, index=idx)
401 return [others[x] for x in others]
402 elif is_list_like(others, allow_sets=False):
403 others = list(others) # ensure iterators do not get read twice etc
405 # in case of list-like `others`, all elements must be
406 # either Series/Index/np.ndarray (1-dim)...
407 if all(
408 isinstance(x, (ABCSeries, ABCIndex))
409 or (isinstance(x, np.ndarray) and x.ndim == 1)
410 for x in others
411 ):
412 los: list[Series] = []
413 while others: # iterate through list and append each element
414 los = los + self._get_series_list(others.pop(0))
415 return los
416 # ... or just strings
417 elif all(not is_list_like(x) for x in others):
418 return [Series(others, index=idx)]
419 raise TypeError(
420 "others must be Series, Index, DataFrame, np.ndarray "
421 "or list-like (either containing only strings or "
422 "containing only objects of type Series/Index/"
423 "np.ndarray[1-dim])"
424 )
426 @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
427 def cat(
428 self, others=None, sep=None, na_rep=None, join="left"
429 ) -> str | Series | Index:
430 """
431 Concatenate strings in the Series/Index with given separator.
433 If `others` is specified, this function concatenates the Series/Index
434 and elements of `others` element-wise.
435 If `others` is not passed, then all values in the Series/Index are
436 concatenated into a single string with a given `sep`.
438 Parameters
439 ----------
440 others : Series, Index, DataFrame, np.ndarray or list-like
441 Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
442 other list-likes of strings must have the same length as the
443 calling Series/Index, with the exception of indexed objects (i.e.
444 Series/Index/DataFrame) if `join` is not None.
446 If others is a list-like that contains a combination of Series,
447 Index or np.ndarray (1-dim), then all elements will be unpacked and
448 must satisfy the above criteria individually.
450 If others is None, the method returns the concatenation of all
451 strings in the calling Series/Index.
452 sep : str, default ''
453 The separator between the different elements/columns. By default
454 the empty string `''` is used.
455 na_rep : str or None, default None
456 Representation that is inserted for all missing values:
458 - If `na_rep` is None, and `others` is None, missing values in the
459 Series/Index are omitted from the result.
460 - If `na_rep` is None, and `others` is not None, a row containing a
461 missing value in any of the columns (before concatenation) will
462 have a missing value in the result.
463 join : {'left', 'right', 'outer', 'inner'}, default 'left'
464 Determines the join-style between the calling Series/Index and any
465 Series/Index/DataFrame in `others` (objects without an index need
466 to match the length of the calling Series/Index). To disable
467 alignment, use `.values` on any Series/Index/DataFrame in `others`.
469 .. versionadded:: 0.23.0
470 .. versionchanged:: 1.0.0
471 Changed default of `join` from None to `'left'`.
473 Returns
474 -------
475 str, Series or Index
476 If `others` is None, `str` is returned, otherwise a `Series/Index`
477 (same type as caller) of objects is returned.
479 See Also
480 --------
481 split : Split each string in the Series/Index.
482 join : Join lists contained as elements in the Series/Index.
484 Examples
485 --------
486 When not passing `others`, all values are concatenated into a single
487 string:
489 >>> s = pd.Series(['a', 'b', np.nan, 'd'])
490 >>> s.str.cat(sep=' ')
491 'a b d'
493 By default, NA values in the Series are ignored. Using `na_rep`, they
494 can be given a representation:
496 >>> s.str.cat(sep=' ', na_rep='?')
497 'a b ? d'
499 If `others` is specified, corresponding values are concatenated with
500 the separator. Result will be a Series of strings.
502 >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
503 0 a,A
504 1 b,B
505 2 NaN
506 3 d,D
507 dtype: object
509 Missing values will remain missing in the result, but can again be
510 represented using `na_rep`
512 >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
513 0 a,A
514 1 b,B
515 2 -,C
516 3 d,D
517 dtype: object
519 If `sep` is not specified, the values are concatenated without
520 separation.
522 >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
523 0 aA
524 1 bB
525 2 -C
526 3 dD
527 dtype: object
529 Series with different indexes can be aligned before concatenation. The
530 `join`-keyword works as in other methods.
532 >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
533 >>> s.str.cat(t, join='left', na_rep='-')
534 0 aa
535 1 b-
536 2 -c
537 3 dd
538 dtype: object
539 >>>
540 >>> s.str.cat(t, join='outer', na_rep='-')
541 0 aa
542 1 b-
543 2 -c
544 3 dd
545 4 -e
546 dtype: object
547 >>>
548 >>> s.str.cat(t, join='inner', na_rep='-')
549 0 aa
550 2 -c
551 3 dd
552 dtype: object
553 >>>
554 >>> s.str.cat(t, join='right', na_rep='-')
555 3 dd
556 0 aa
557 4 -e
558 2 -c
559 dtype: object
561 For more examples, see :ref:`here <text.concatenate>`.
562 """
563 # TODO: dispatch
564 from pandas import (
565 Index,
566 Series,
567 concat,
568 )
570 if isinstance(others, str):
571 raise ValueError("Did you mean to supply a `sep` keyword?")
572 if sep is None:
573 sep = ""
575 if isinstance(self._orig, ABCIndex):
576 data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
577 else: # Series
578 data = self._orig
580 # concatenate Series/Index with itself if no "others"
581 if others is None:
582 # error: Incompatible types in assignment (expression has type
583 # "ndarray", variable has type "Series")
584 data = ensure_object(data) # type: ignore[assignment]
585 na_mask = isna(data)
586 if na_rep is None and na_mask.any():
587 return sep.join(data[~na_mask])
588 elif na_rep is not None and na_mask.any():
589 return sep.join(np.where(na_mask, na_rep, data))
590 else:
591 return sep.join(data)
593 try:
594 # turn anything in "others" into lists of Series
595 others = self._get_series_list(others)
596 except ValueError as err: # do not catch TypeError raised by _get_series_list
597 raise ValueError(
598 "If `others` contains arrays or lists (or other "
599 "list-likes without an index), these must all be "
600 "of the same length as the calling Series/Index."
601 ) from err
603 # align if required
604 if any(not data.index.equals(x.index) for x in others):
605 # Need to add keys for uniqueness in case of duplicate columns
606 others = concat(
607 others,
608 axis=1,
609 join=(join if join == "inner" else "outer"),
610 keys=range(len(others)),
611 sort=False,
612 copy=False,
613 )
614 data, others = data.align(others, join=join)
615 others = [others[x] for x in others] # again list of Series
617 all_cols = [ensure_object(x) for x in [data] + others]
618 na_masks = np.array([isna(x) for x in all_cols])
619 union_mask = np.logical_or.reduce(na_masks, axis=0)
621 if na_rep is None and union_mask.any():
622 # no na_rep means NaNs for all rows where any column has a NaN
623 # only necessary if there are actually any NaNs
624 result = np.empty(len(data), dtype=object)
625 np.putmask(result, union_mask, np.nan)
627 not_masked = ~union_mask
628 result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
629 elif na_rep is not None and union_mask.any():
630 # fill NaNs with na_rep in case there are actually any NaNs
631 all_cols = [
632 np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
633 ]
634 result = cat_safe(all_cols, sep)
635 else:
636 # no NaNs - can just concatenate
637 result = cat_safe(all_cols, sep)
639 out: Index | Series
640 if isinstance(self._orig, ABCIndex):
641 # add dtype for case that result is all-NA
643 out = Index(result, dtype=object, name=self._orig.name)
644 else: # Series
645 if is_categorical_dtype(self._orig.dtype):
646 # We need to infer the new categories.
647 dtype = None
648 else:
649 dtype = self._orig.dtype
650 res_ser = Series(
651 result, dtype=dtype, index=data.index, name=self._orig.name
652 )
653 out = res_ser.__finalize__(self._orig, method="str_cat")
654 return out
656 _shared_docs[
657 "str_split"
658 ] = r"""
659 Split strings around given separator/delimiter.
661 Splits the string in the Series/Index from the %(side)s,
662 at the specified delimiter string.
664 Parameters
665 ----------
666 pat : str%(pat_regex)s, optional
667 %(pat_description)s.
668 If not specified, split on whitespace.
669 n : int, default -1 (all)
670 Limit number of splits in output.
671 ``None``, 0 and -1 will be interpreted as return all splits.
672 expand : bool, default False
673 Expand the split strings into separate columns.
675 - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
676 - If ``False``, return Series/Index, containing lists of strings.
677 %(regex_argument)s
678 Returns
679 -------
680 Series, Index, DataFrame or MultiIndex
681 Type matches caller unless ``expand=True`` (see Notes).
682 %(raises_split)s
683 See Also
684 --------
685 Series.str.split : Split strings around given separator/delimiter.
686 Series.str.rsplit : Splits string around given separator/delimiter,
687 starting from the right.
688 Series.str.join : Join lists contained as elements in the Series/Index
689 with passed delimiter.
690 str.split : Standard library version for split.
691 str.rsplit : Standard library version for rsplit.
693 Notes
694 -----
695 The handling of the `n` keyword depends on the number of found splits:
697 - If found splits > `n`, make first `n` splits only
698 - If found splits <= `n`, make all splits
699 - If for a certain row the number of found splits < `n`,
700 append `None` for padding up to `n` if ``expand=True``
702 If using ``expand=True``, Series and Index callers return DataFrame and
703 MultiIndex objects, respectively.
704 %(regex_pat_note)s
705 Examples
706 --------
707 >>> s = pd.Series(
708 ... [
709 ... "this is a regular sentence",
710 ... "https://docs.python.org/3/tutorial/index.html",
711 ... np.nan
712 ... ]
713 ... )
714 >>> s
715 0 this is a regular sentence
716 1 https://docs.python.org/3/tutorial/index.html
717 2 NaN
718 dtype: object
720 In the default setting, the string is split by whitespace.
722 >>> s.str.split()
723 0 [this, is, a, regular, sentence]
724 1 [https://docs.python.org/3/tutorial/index.html]
725 2 NaN
726 dtype: object
728 Without the `n` parameter, the outputs of `rsplit` and `split`
729 are identical.
731 >>> s.str.rsplit()
732 0 [this, is, a, regular, sentence]
733 1 [https://docs.python.org/3/tutorial/index.html]
734 2 NaN
735 dtype: object
737 The `n` parameter can be used to limit the number of splits on the
738 delimiter. The outputs of `split` and `rsplit` are different.
740 >>> s.str.split(n=2)
741 0 [this, is, a regular sentence]
742 1 [https://docs.python.org/3/tutorial/index.html]
743 2 NaN
744 dtype: object
746 >>> s.str.rsplit(n=2)
747 0 [this is a, regular, sentence]
748 1 [https://docs.python.org/3/tutorial/index.html]
749 2 NaN
750 dtype: object
752 The `pat` parameter can be used to split by other characters.
754 >>> s.str.split(pat="/")
755 0 [this is a regular sentence]
756 1 [https:, , docs.python.org, 3, tutorial, index...
757 2 NaN
758 dtype: object
760 When using ``expand=True``, the split elements will expand out into
761 separate columns. If NaN is present, it is propagated throughout
762 the columns during the split.
764 >>> s.str.split(expand=True)
765 0 1 2 3 4
766 0 this is a regular sentence
767 1 https://docs.python.org/3/tutorial/index.html None None None None
768 2 NaN NaN NaN NaN NaN
770 For slightly more complex use cases like splitting the html document name
771 from a url, a combination of parameter settings can be used.
773 >>> s.str.rsplit("/", n=1, expand=True)
774 0 1
775 0 this is a regular sentence None
776 1 https://docs.python.org/3/tutorial index.html
777 2 NaN NaN
778 %(regex_examples)s"""
780 @Appender(
781 _shared_docs["str_split"]
782 % {
783 "side": "beginning",
784 "pat_regex": " or compiled regex",
785 "pat_description": "String or regular expression to split on",
786 "regex_argument": """
787 regex : bool, default None
788 Determines if the passed-in pattern is a regular expression:
790 - If ``True``, assumes the passed-in pattern is a regular expression
791 - If ``False``, treats the pattern as a literal string.
792 - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
793 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
794 - Cannot be set to False if `pat` is a compiled regex
796 .. versionadded:: 1.4.0
797 """,
798 "raises_split": """
799 Raises
800 ------
801 ValueError
802 * if `regex` is False and `pat` is a compiled regex
803 """,
804 "regex_pat_note": """
805 Use of `regex =False` with a `pat` as a compiled regex will raise an error.
806 """,
807 "method": "split",
808 "regex_examples": r"""
809 Remember to escape special characters when explicitly using regular expressions.
811 >>> s = pd.Series(["foo and bar plus baz"])
812 >>> s.str.split(r"and|plus", expand=True)
813 0 1 2
814 0 foo bar baz
816 Regular expressions can be used to handle urls or file names.
817 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
818 as a regex only if ``len(pat) != 1``.
820 >>> s = pd.Series(['foojpgbar.jpg'])
821 >>> s.str.split(r".", expand=True)
822 0 1
823 0 foojpgbar jpg
825 >>> s.str.split(r"\.jpg", expand=True)
826 0 1
827 0 foojpgbar
829 When ``regex=True``, `pat` is interpreted as a regex
831 >>> s.str.split(r"\.jpg", regex=True, expand=True)
832 0 1
833 0 foojpgbar
835 A compiled regex can be passed as `pat`
837 >>> import re
838 >>> s.str.split(re.compile(r"\.jpg"), expand=True)
839 0 1
840 0 foojpgbar
842 When ``regex=False``, `pat` is interpreted as the string itself
844 >>> s.str.split(r"\.jpg", regex=False, expand=True)
845 0
846 0 foojpgbar.jpg
847 """,
848 }
849 )
850 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "pat"])
851 @forbid_nonstring_types(["bytes"])
852 def split(
853 self,
854 pat: str | re.Pattern | None = None,
855 n=-1,
856 expand=False,
857 *,
858 regex: bool | None = None,
859 ):
860 if regex is False and is_re(pat):
861 raise ValueError(
862 "Cannot use a compiled regex as replacement pattern with regex=False"
863 )
864 if is_re(pat):
865 regex = True
866 result = self._data.array._str_split(pat, n, expand, regex)
867 return self._wrap_result(result, returns_string=expand, expand=expand)
869 @Appender(
870 _shared_docs["str_split"]
871 % {
872 "side": "end",
873 "pat_regex": "",
874 "pat_description": "String to split on",
875 "regex_argument": "",
876 "raises_split": "",
877 "regex_pat_note": "",
878 "method": "rsplit",
879 "regex_examples": "",
880 }
881 )
882 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "pat"])
883 @forbid_nonstring_types(["bytes"])
884 def rsplit(self, pat=None, n=-1, expand=False):
885 result = self._data.array._str_rsplit(pat, n=n)
886 return self._wrap_result(result, expand=expand, returns_string=expand)
888 _shared_docs[
889 "str_partition"
890 ] = """
891 Split the string at the %(side)s occurrence of `sep`.
893 This method splits the string at the %(side)s occurrence of `sep`,
894 and returns 3 elements containing the part before the separator,
895 the separator itself, and the part after the separator.
896 If the separator is not found, return %(return)s.
898 Parameters
899 ----------
900 sep : str, default whitespace
901 String to split on.
902 expand : bool, default True
903 If True, return DataFrame/MultiIndex expanding dimensionality.
904 If False, return Series/Index.
906 Returns
907 -------
908 DataFrame/MultiIndex or Series/Index of objects
910 See Also
911 --------
912 %(also)s
913 Series.str.split : Split strings around given separators.
914 str.partition : Standard library version.
916 Examples
917 --------
919 >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
920 >>> s
921 0 Linda van der Berg
922 1 George Pitt-Rivers
923 dtype: object
925 >>> s.str.partition()
926 0 1 2
927 0 Linda van der Berg
928 1 George Pitt-Rivers
930 To partition by the last space instead of the first one:
932 >>> s.str.rpartition()
933 0 1 2
934 0 Linda van der Berg
935 1 George Pitt-Rivers
937 To partition by something different than a space:
939 >>> s.str.partition('-')
940 0 1 2
941 0 Linda van der Berg
942 1 George Pitt - Rivers
944 To return a Series containing tuples instead of a DataFrame:
946 >>> s.str.partition('-', expand=False)
947 0 (Linda van der Berg, , )
948 1 (George Pitt, -, Rivers)
949 dtype: object
951 Also available on indices:
953 >>> idx = pd.Index(['X 123', 'Y 999'])
954 >>> idx
955 Index(['X 123', 'Y 999'], dtype='object')
957 Which will create a MultiIndex:
959 >>> idx.str.partition()
960 MultiIndex([('X', ' ', '123'),
961 ('Y', ' ', '999')],
962 )
964 Or an index with tuples with ``expand=False``:
966 >>> idx.str.partition(expand=False)
967 Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
968 """
970 @Appender(
971 _shared_docs["str_partition"]
972 % {
973 "side": "first",
974 "return": "3 elements containing the string itself, followed by two "
975 "empty strings",
976 "also": "rpartition : Split the string at the last occurrence of `sep`.",
977 }
978 )
979 @forbid_nonstring_types(["bytes"])
980 def partition(self, sep=" ", expand=True):
981 result = self._data.array._str_partition(sep, expand)
982 return self._wrap_result(result, expand=expand, returns_string=expand)
984 @Appender(
985 _shared_docs["str_partition"]
986 % {
987 "side": "last",
988 "return": "3 elements containing two empty strings, followed by the "
989 "string itself",
990 "also": "partition : Split the string at the first occurrence of `sep`.",
991 }
992 )
993 @forbid_nonstring_types(["bytes"])
994 def rpartition(self, sep=" ", expand=True):
995 result = self._data.array._str_rpartition(sep, expand)
996 return self._wrap_result(result, expand=expand, returns_string=expand)
998 def get(self, i):
999 """
1000 Extract element from each component at specified position or with specified key.
1002 Extract element from lists, tuples, dict, or strings in each element in the
1003 Series/Index.
1005 Parameters
1006 ----------
1007 i : int or hashable dict label
1008 Position or key of element to extract.
1010 Returns
1011 -------
1012 Series or Index
1014 Examples
1015 --------
1016 >>> s = pd.Series(["String",
1017 ... (1, 2, 3),
1018 ... ["a", "b", "c"],
1019 ... 123,
1020 ... -456,
1021 ... {1: "Hello", "2": "World"}])
1022 >>> s
1023 0 String
1024 1 (1, 2, 3)
1025 2 [a, b, c]
1026 3 123
1027 4 -456
1028 5 {1: 'Hello', '2': 'World'}
1029 dtype: object
1031 >>> s.str.get(1)
1032 0 t
1033 1 2
1034 2 b
1035 3 NaN
1036 4 NaN
1037 5 Hello
1038 dtype: object
1040 >>> s.str.get(-1)
1041 0 g
1042 1 3
1043 2 c
1044 3 NaN
1045 4 NaN
1046 5 None
1047 dtype: object
1049 Return element with given key
1051 >>> s = pd.Series([{"name": "Hello", "value": "World"},
1052 ... {"name": "Goodbye", "value": "Planet"}])
1053 >>> s.str.get('name')
1054 0 Hello
1055 1 Goodbye
1056 dtype: object
1057 """
1058 result = self._data.array._str_get(i)
1059 return self._wrap_result(result)
1061 @forbid_nonstring_types(["bytes"])
1062 def join(self, sep):
1063 """
1064 Join lists contained as elements in the Series/Index with passed delimiter.
1066 If the elements of a Series are lists themselves, join the content of these
1067 lists using the delimiter passed to the function.
1068 This function is an equivalent to :meth:`str.join`.
1070 Parameters
1071 ----------
1072 sep : str
1073 Delimiter to use between list entries.
1075 Returns
1076 -------
1077 Series/Index: object
1078 The list entries concatenated by intervening occurrences of the
1079 delimiter.
1081 Raises
1082 ------
1083 AttributeError
1084 If the supplied Series contains neither strings nor lists.
1086 See Also
1087 --------
1088 str.join : Standard library version of this method.
1089 Series.str.split : Split strings around given separator/delimiter.
1091 Notes
1092 -----
1093 If any of the list items is not a string object, the result of the join
1094 will be `NaN`.
1096 Examples
1097 --------
1098 Example with a list that contains non-string elements.
1100 >>> s = pd.Series([['lion', 'elephant', 'zebra'],
1101 ... [1.1, 2.2, 3.3],
1102 ... ['cat', np.nan, 'dog'],
1103 ... ['cow', 4.5, 'goat'],
1104 ... ['duck', ['swan', 'fish'], 'guppy']])
1105 >>> s
1106 0 [lion, elephant, zebra]
1107 1 [1.1, 2.2, 3.3]
1108 2 [cat, nan, dog]
1109 3 [cow, 4.5, goat]
1110 4 [duck, [swan, fish], guppy]
1111 dtype: object
1113 Join all lists using a '-'. The lists containing object(s) of types other
1114 than str will produce a NaN.
1116 >>> s.str.join('-')
1117 0 lion-elephant-zebra
1118 1 NaN
1119 2 NaN
1120 3 NaN
1121 4 NaN
1122 dtype: object
1123 """
1124 result = self._data.array._str_join(sep)
1125 return self._wrap_result(result)
1127 @forbid_nonstring_types(["bytes"])
1128 def contains(self, pat, case=True, flags=0, na=None, regex=True):
1129 r"""
1130 Test if pattern or regex is contained within a string of a Series or Index.
1132 Return boolean Series or Index based on whether a given pattern or regex is
1133 contained within a string of a Series or Index.
1135 Parameters
1136 ----------
1137 pat : str
1138 Character sequence or regular expression.
1139 case : bool, default True
1140 If True, case sensitive.
1141 flags : int, default 0 (no flags)
1142 Flags to pass through to the re module, e.g. re.IGNORECASE.
1143 na : scalar, optional
1144 Fill value for missing values. The default depends on dtype of the
1145 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1146 ``pandas.NA`` is used.
1147 regex : bool, default True
1148 If True, assumes the pat is a regular expression.
1150 If False, treats the pat as a literal string.
1152 Returns
1153 -------
1154 Series or Index of boolean values
1155 A Series or Index of boolean values indicating whether the
1156 given pattern is contained within the string of each element
1157 of the Series or Index.
1159 See Also
1160 --------
1161 match : Analogous, but stricter, relying on re.match instead of re.search.
1162 Series.str.startswith : Test if the start of each string element matches a
1163 pattern.
1164 Series.str.endswith : Same as startswith, but tests the end of string.
1166 Examples
1167 --------
1168 Returning a Series of booleans using only a literal pattern.
1170 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
1171 >>> s1.str.contains('og', regex=False)
1172 0 False
1173 1 True
1174 2 False
1175 3 False
1176 4 NaN
1177 dtype: object
1179 Returning an Index of booleans using only a literal pattern.
1181 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
1182 >>> ind.str.contains('23', regex=False)
1183 Index([False, False, False, True, nan], dtype='object')
1185 Specifying case sensitivity using `case`.
1187 >>> s1.str.contains('oG', case=True, regex=True)
1188 0 False
1189 1 False
1190 2 False
1191 3 False
1192 4 NaN
1193 dtype: object
1195 Specifying `na` to be `False` instead of `NaN` replaces NaN values
1196 with `False`. If Series or Index does not contain NaN values
1197 the resultant dtype will be `bool`, otherwise, an `object` dtype.
1199 >>> s1.str.contains('og', na=False, regex=True)
1200 0 False
1201 1 True
1202 2 False
1203 3 False
1204 4 False
1205 dtype: bool
1207 Returning 'house' or 'dog' when either expression occurs in a string.
1209 >>> s1.str.contains('house|dog', regex=True)
1210 0 False
1211 1 True
1212 2 True
1213 3 False
1214 4 NaN
1215 dtype: object
1217 Ignoring case sensitivity using `flags` with regex.
1219 >>> import re
1220 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
1221 0 False
1222 1 False
1223 2 True
1224 3 False
1225 4 NaN
1226 dtype: object
1228 Returning any digit using regular expression.
1230 >>> s1.str.contains('\\d', regex=True)
1231 0 False
1232 1 False
1233 2 False
1234 3 True
1235 4 NaN
1236 dtype: object
1238 Ensure `pat` is a not a literal pattern when `regex` is set to True.
1239 Note in the following example one might expect only `s2[1]` and `s2[3]` to
1240 return `True`. However, '.0' as a regex matches any character
1241 followed by a 0.
1243 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
1244 >>> s2.str.contains('.0', regex=True)
1245 0 True
1246 1 True
1247 2 False
1248 3 True
1249 4 False
1250 dtype: bool
1251 """
1252 if regex and re.compile(pat).groups:
1253 warnings.warn(
1254 "This pattern is interpreted as a regular expression, and has "
1255 "match groups. To actually get the groups, use str.extract.",
1256 UserWarning,
1257 stacklevel=find_stack_level(),
1258 )
1260 result = self._data.array._str_contains(pat, case, flags, na, regex)
1261 return self._wrap_result(result, fill_value=na, returns_string=False)
1263 @forbid_nonstring_types(["bytes"])
1264 def match(self, pat, case=True, flags=0, na=None):
1265 """
1266 Determine if each string starts with a match of a regular expression.
1268 Parameters
1269 ----------
1270 pat : str
1271 Character sequence or regular expression.
1272 case : bool, default True
1273 If True, case sensitive.
1274 flags : int, default 0 (no flags)
1275 Regex module flags, e.g. re.IGNORECASE.
1276 na : scalar, optional
1277 Fill value for missing values. The default depends on dtype of the
1278 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1279 ``pandas.NA`` is used.
1281 Returns
1282 -------
1283 Series/Index/array of boolean values
1285 See Also
1286 --------
1287 fullmatch : Stricter matching that requires the entire string to match.
1288 contains : Analogous, but less strict, relying on re.search instead of
1289 re.match.
1290 extract : Extract matched groups.
1291 """
1292 result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
1293 return self._wrap_result(result, fill_value=na, returns_string=False)
1295 @forbid_nonstring_types(["bytes"])
1296 def fullmatch(self, pat, case=True, flags=0, na=None):
1297 """
1298 Determine if each string entirely matches a regular expression.
1300 .. versionadded:: 1.1.0
1302 Parameters
1303 ----------
1304 pat : str
1305 Character sequence or regular expression.
1306 case : bool, default True
1307 If True, case sensitive.
1308 flags : int, default 0 (no flags)
1309 Regex module flags, e.g. re.IGNORECASE.
1310 na : scalar, optional
1311 Fill value for missing values. The default depends on dtype of the
1312 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1313 ``pandas.NA`` is used.
1315 Returns
1316 -------
1317 Series/Index/array of boolean values
1319 See Also
1320 --------
1321 match : Similar, but also returns `True` when only a *prefix* of the string
1322 matches the regular expression.
1323 extract : Extract matched groups.
1324 """
1325 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
1326 return self._wrap_result(result, fill_value=na, returns_string=False)
1328 @forbid_nonstring_types(["bytes"])
1329 def replace(
1330 self,
1331 pat: str | re.Pattern,
1332 repl: str | Callable,
1333 n: int = -1,
1334 case: bool | None = None,
1335 flags: int = 0,
1336 regex: bool | None = None,
1337 ):
1338 r"""
1339 Replace each occurrence of pattern/regex in the Series/Index.
1341 Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
1342 the regex value.
1344 Parameters
1345 ----------
1346 pat : str or compiled regex
1347 String can be a character sequence or regular expression.
1348 repl : str or callable
1349 Replacement string or a callable. The callable is passed the regex
1350 match object and must return a replacement string to be used.
1351 See :func:`re.sub`.
1352 n : int, default -1 (all)
1353 Number of replacements to make from start.
1354 case : bool, default None
1355 Determines if replace is case sensitive:
1357 - If True, case sensitive (the default if `pat` is a string)
1358 - Set to False for case insensitive
1359 - Cannot be set if `pat` is a compiled regex.
1361 flags : int, default 0 (no flags)
1362 Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
1363 regex.
1364 regex : bool, default True
1365 Determines if the passed-in pattern is a regular expression:
1367 - If True, assumes the passed-in pattern is a regular expression.
1368 - If False, treats the pattern as a literal string
1369 - Cannot be set to False if `pat` is a compiled regex or `repl` is
1370 a callable.
1372 .. versionadded:: 0.23.0
1374 Returns
1375 -------
1376 Series or Index of object
1377 A copy of the object with all matching occurrences of `pat` replaced by
1378 `repl`.
1380 Raises
1381 ------
1382 ValueError
1383 * if `regex` is False and `repl` is a callable or `pat` is a compiled
1384 regex
1385 * if `pat` is a compiled regex and `case` or `flags` is set
1387 Notes
1388 -----
1389 When `pat` is a compiled regex, all flags should be included in the
1390 compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
1391 regex will raise an error.
1393 Examples
1394 --------
1395 When `pat` is a string and `regex` is True (the default), the given `pat`
1396 is compiled as a regex. When `repl` is a string, it replaces matching
1397 regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
1398 left as is:
1400 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
1401 0 bao
1402 1 baz
1403 2 NaN
1404 dtype: object
1406 When `pat` is a string and `regex` is False, every `pat` is replaced with
1407 `repl` as with :meth:`str.replace`:
1409 >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
1410 0 bao
1411 1 fuz
1412 2 NaN
1413 dtype: object
1415 When `repl` is a callable, it is called on every `pat` using
1416 :func:`re.sub`. The callable should expect one positional argument
1417 (a regex object) and return a string.
1419 To get the idea:
1421 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
1422 0 <re.Match object; span=(0, 1), match='f'>oo
1423 1 <re.Match object; span=(0, 1), match='f'>uz
1424 2 NaN
1425 dtype: object
1427 Reverse every lowercase alphabetic word:
1429 >>> repl = lambda m: m.group(0)[::-1]
1430 >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
1431 >>> ser.str.replace(r'[a-z]+', repl, regex=True)
1432 0 oof 123
1433 1 rab zab
1434 2 NaN
1435 dtype: object
1437 Using regex groups (extract second group and swap case):
1439 >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
1440 >>> repl = lambda m: m.group('two').swapcase()
1441 >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
1442 >>> ser.str.replace(pat, repl, regex=True)
1443 0 tWO
1444 1 bAR
1445 dtype: object
1447 Using a compiled regex with flags
1449 >>> import re
1450 >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
1451 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
1452 0 foo
1453 1 bar
1454 2 NaN
1455 dtype: object
1456 """
1457 if regex is None:
1458 if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"):
1459 # warn only in cases where regex behavior would differ from literal
1460 msg = (
1461 "The default value of regex will change from True to False "
1462 "in a future version."
1463 )
1464 if len(pat) == 1:
1465 msg += (
1466 " In addition, single character regular expressions will "
1467 "*not* be treated as literal strings when regex=True."
1468 )
1469 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
1471 # Check whether repl is valid (GH 13438, GH 15055)
1472 if not (isinstance(repl, str) or callable(repl)):
1473 raise TypeError("repl must be a string or callable")
1475 is_compiled_re = is_re(pat)
1476 if regex or regex is None:
1477 if is_compiled_re and (case is not None or flags != 0):
1478 raise ValueError(
1479 "case and flags cannot be set when pat is a compiled regex"
1480 )
1482 elif is_compiled_re:
1483 raise ValueError(
1484 "Cannot use a compiled regex as replacement pattern with regex=False"
1485 )
1486 elif callable(repl):
1487 raise ValueError("Cannot use a callable replacement when regex=False")
1489 # The current behavior is to treat single character patterns as literal strings,
1490 # even when ``regex`` is set to ``True``.
1491 if isinstance(pat, str) and len(pat) == 1:
1492 regex = False
1494 if regex is None:
1495 regex = True
1497 if case is None:
1498 case = True
1500 result = self._data.array._str_replace(
1501 pat, repl, n=n, case=case, flags=flags, regex=regex
1502 )
1503 return self._wrap_result(result)
1505 @forbid_nonstring_types(["bytes"])
1506 def repeat(self, repeats):
1507 """
1508 Duplicate each string in the Series or Index.
1510 Parameters
1511 ----------
1512 repeats : int or sequence of int
1513 Same value for all (int) or different value per (sequence).
1515 Returns
1516 -------
1517 Series or Index of object
1518 Series or Index of repeated string objects specified by
1519 input parameter repeats.
1521 Examples
1522 --------
1523 >>> s = pd.Series(['a', 'b', 'c'])
1524 >>> s
1525 0 a
1526 1 b
1527 2 c
1528 dtype: object
1530 Single int repeats string in Series
1532 >>> s.str.repeat(repeats=2)
1533 0 aa
1534 1 bb
1535 2 cc
1536 dtype: object
1538 Sequence of int repeats corresponding string in Series
1540 >>> s.str.repeat(repeats=[1, 2, 3])
1541 0 a
1542 1 bb
1543 2 ccc
1544 dtype: object
1545 """
1546 result = self._data.array._str_repeat(repeats)
1547 return self._wrap_result(result)
1549 @forbid_nonstring_types(["bytes"])
1550 def pad(self, width, side="left", fillchar=" "):
1551 """
1552 Pad strings in the Series/Index up to width.
1554 Parameters
1555 ----------
1556 width : int
1557 Minimum width of resulting string; additional characters will be filled
1558 with character defined in `fillchar`.
1559 side : {'left', 'right', 'both'}, default 'left'
1560 Side from which to fill resulting string.
1561 fillchar : str, default ' '
1562 Additional character for filling, default is whitespace.
1564 Returns
1565 -------
1566 Series or Index of object
1567 Returns Series or Index with minimum number of char in object.
1569 See Also
1570 --------
1571 Series.str.rjust : Fills the left side of strings with an arbitrary
1572 character. Equivalent to ``Series.str.pad(side='left')``.
1573 Series.str.ljust : Fills the right side of strings with an arbitrary
1574 character. Equivalent to ``Series.str.pad(side='right')``.
1575 Series.str.center : Fills both sides of strings with an arbitrary
1576 character. Equivalent to ``Series.str.pad(side='both')``.
1577 Series.str.zfill : Pad strings in the Series/Index by prepending '0'
1578 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
1580 Examples
1581 --------
1582 >>> s = pd.Series(["caribou", "tiger"])
1583 >>> s
1584 0 caribou
1585 1 tiger
1586 dtype: object
1588 >>> s.str.pad(width=10)
1589 0 caribou
1590 1 tiger
1591 dtype: object
1593 >>> s.str.pad(width=10, side='right', fillchar='-')
1594 0 caribou---
1595 1 tiger-----
1596 dtype: object
1598 >>> s.str.pad(width=10, side='both', fillchar='-')
1599 0 -caribou--
1600 1 --tiger---
1601 dtype: object
1602 """
1603 if not isinstance(fillchar, str):
1604 msg = f"fillchar must be a character, not {type(fillchar).__name__}"
1605 raise TypeError(msg)
1607 if len(fillchar) != 1:
1608 raise TypeError("fillchar must be a character, not str")
1610 if not is_integer(width):
1611 msg = f"width must be of integer type, not {type(width).__name__}"
1612 raise TypeError(msg)
1614 result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
1615 return self._wrap_result(result)
1617 _shared_docs[
1618 "str_pad"
1619 ] = """
1620 Pad %(side)s side of strings in the Series/Index.
1622 Equivalent to :meth:`str.%(method)s`.
1624 Parameters
1625 ----------
1626 width : int
1627 Minimum width of resulting string; additional characters will be filled
1628 with ``fillchar``.
1629 fillchar : str
1630 Additional character for filling, default is whitespace.
1632 Returns
1633 -------
1634 filled : Series/Index of objects.
1635 """
1637 @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
1638 @forbid_nonstring_types(["bytes"])
1639 def center(self, width, fillchar=" "):
1640 return self.pad(width, side="both", fillchar=fillchar)
1642 @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
1643 @forbid_nonstring_types(["bytes"])
1644 def ljust(self, width, fillchar=" "):
1645 return self.pad(width, side="right", fillchar=fillchar)
1647 @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
1648 @forbid_nonstring_types(["bytes"])
1649 def rjust(self, width, fillchar=" "):
1650 return self.pad(width, side="left", fillchar=fillchar)
1652 @forbid_nonstring_types(["bytes"])
1653 def zfill(self, width):
1654 """
1655 Pad strings in the Series/Index by prepending '0' characters.
1657 Strings in the Series/Index are padded with '0' characters on the
1658 left of the string to reach a total string length `width`. Strings
1659 in the Series/Index with length greater or equal to `width` are
1660 unchanged.
1662 Parameters
1663 ----------
1664 width : int
1665 Minimum length of resulting string; strings with length less
1666 than `width` be prepended with '0' characters.
1668 Returns
1669 -------
1670 Series/Index of objects.
1672 See Also
1673 --------
1674 Series.str.rjust : Fills the left side of strings with an arbitrary
1675 character.
1676 Series.str.ljust : Fills the right side of strings with an arbitrary
1677 character.
1678 Series.str.pad : Fills the specified sides of strings with an arbitrary
1679 character.
1680 Series.str.center : Fills both sides of strings with an arbitrary
1681 character.
1683 Notes
1684 -----
1685 Differs from :meth:`str.zfill` which has special handling
1686 for '+'/'-' in the string.
1688 Examples
1689 --------
1690 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
1691 >>> s
1692 0 -1
1693 1 1
1694 2 1000
1695 3 10
1696 4 NaN
1697 dtype: object
1699 Note that ``10`` and ``NaN`` are not strings, therefore they are
1700 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
1701 special character and the zero is added to the right of it
1702 (:meth:`str.zfill` would have moved it to the left). ``1000``
1703 remains unchanged as it is longer than `width`.
1705 >>> s.str.zfill(3)
1706 0 -01
1707 1 001
1708 2 1000
1709 3 NaN
1710 4 NaN
1711 dtype: object
1712 """
1713 if not is_integer(width):
1714 msg = f"width must be of integer type, not {type(width).__name__}"
1715 raise TypeError(msg)
1716 f = lambda x: x.zfill(width)
1717 result = self._data.array._str_map(f)
1718 return self._wrap_result(result)
1720 def slice(self, start=None, stop=None, step=None):
1721 """
1722 Slice substrings from each element in the Series or Index.
1724 Parameters
1725 ----------
1726 start : int, optional
1727 Start position for slice operation.
1728 stop : int, optional
1729 Stop position for slice operation.
1730 step : int, optional
1731 Step size for slice operation.
1733 Returns
1734 -------
1735 Series or Index of object
1736 Series or Index from sliced substring from original string object.
1738 See Also
1739 --------
1740 Series.str.slice_replace : Replace a slice with a string.
1741 Series.str.get : Return element at position.
1742 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
1743 being the position.
1745 Examples
1746 --------
1747 >>> s = pd.Series(["koala", "dog", "chameleon"])
1748 >>> s
1749 0 koala
1750 1 dog
1751 2 chameleon
1752 dtype: object
1754 >>> s.str.slice(start=1)
1755 0 oala
1756 1 og
1757 2 hameleon
1758 dtype: object
1760 >>> s.str.slice(start=-1)
1761 0 a
1762 1 g
1763 2 n
1764 dtype: object
1766 >>> s.str.slice(stop=2)
1767 0 ko
1768 1 do
1769 2 ch
1770 dtype: object
1772 >>> s.str.slice(step=2)
1773 0 kaa
1774 1 dg
1775 2 caeen
1776 dtype: object
1778 >>> s.str.slice(start=0, stop=5, step=3)
1779 0 kl
1780 1 d
1781 2 cm
1782 dtype: object
1784 Equivalent behaviour to:
1786 >>> s.str[0:5:3]
1787 0 kl
1788 1 d
1789 2 cm
1790 dtype: object
1791 """
1792 result = self._data.array._str_slice(start, stop, step)
1793 return self._wrap_result(result)
1795 @forbid_nonstring_types(["bytes"])
1796 def slice_replace(self, start=None, stop=None, repl=None):
1797 """
1798 Replace a positional slice of a string with another value.
1800 Parameters
1801 ----------
1802 start : int, optional
1803 Left index position to use for the slice. If not specified (None),
1804 the slice is unbounded on the left, i.e. slice from the start
1805 of the string.
1806 stop : int, optional
1807 Right index position to use for the slice. If not specified (None),
1808 the slice is unbounded on the right, i.e. slice until the
1809 end of the string.
1810 repl : str, optional
1811 String for replacement. If not specified (None), the sliced region
1812 is replaced with an empty string.
1814 Returns
1815 -------
1816 Series or Index
1817 Same type as the original object.
1819 See Also
1820 --------
1821 Series.str.slice : Just slicing without replacement.
1823 Examples
1824 --------
1825 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
1826 >>> s
1827 0 a
1828 1 ab
1829 2 abc
1830 3 abdc
1831 4 abcde
1832 dtype: object
1834 Specify just `start`, meaning replace `start` until the end of the
1835 string with `repl`.
1837 >>> s.str.slice_replace(1, repl='X')
1838 0 aX
1839 1 aX
1840 2 aX
1841 3 aX
1842 4 aX
1843 dtype: object
1845 Specify just `stop`, meaning the start of the string to `stop` is replaced
1846 with `repl`, and the rest of the string is included.
1848 >>> s.str.slice_replace(stop=2, repl='X')
1849 0 X
1850 1 X
1851 2 Xc
1852 3 Xdc
1853 4 Xcde
1854 dtype: object
1856 Specify `start` and `stop`, meaning the slice from `start` to `stop` is
1857 replaced with `repl`. Everything before or after `start` and `stop` is
1858 included as is.
1860 >>> s.str.slice_replace(start=1, stop=3, repl='X')
1861 0 aX
1862 1 aX
1863 2 aX
1864 3 aXc
1865 4 aXde
1866 dtype: object
1867 """
1868 result = self._data.array._str_slice_replace(start, stop, repl)
1869 return self._wrap_result(result)
1871 def decode(self, encoding, errors="strict"):
1872 """
1873 Decode character string in the Series/Index using indicated encoding.
1875 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
1876 python3.
1878 Parameters
1879 ----------
1880 encoding : str
1881 errors : str, optional
1883 Returns
1884 -------
1885 Series or Index
1886 """
1887 # TODO: Add a similar _bytes interface.
1888 if encoding in _cpython_optimized_decoders:
1889 # CPython optimized implementation
1890 f = lambda x: x.decode(encoding, errors)
1891 else:
1892 decoder = codecs.getdecoder(encoding)
1893 f = lambda x: decoder(x, errors)[0]
1894 arr = self._data.array
1895 # assert isinstance(arr, (StringArray,))
1896 result = arr._str_map(f)
1897 return self._wrap_result(result)
1899 @forbid_nonstring_types(["bytes"])
1900 def encode(self, encoding, errors="strict"):
1901 """
1902 Encode character string in the Series/Index using indicated encoding.
1904 Equivalent to :meth:`str.encode`.
1906 Parameters
1907 ----------
1908 encoding : str
1909 errors : str, optional
1911 Returns
1912 -------
1913 encoded : Series/Index of objects
1914 """
1915 result = self._data.array._str_encode(encoding, errors)
1916 return self._wrap_result(result, returns_string=False)
1918 _shared_docs[
1919 "str_strip"
1920 ] = r"""
1921 Remove %(position)s characters.
1923 Strip whitespaces (including newlines) or a set of specified characters
1924 from each string in the Series/Index from %(side)s.
1925 Replaces any non-strings in Series with NaNs.
1926 Equivalent to :meth:`str.%(method)s`.
1928 Parameters
1929 ----------
1930 to_strip : str or None, default None
1931 Specifying the set of characters to be removed.
1932 All combinations of this set of characters will be stripped.
1933 If None then whitespaces are removed.
1935 Returns
1936 -------
1937 Series or Index of object
1939 See Also
1940 --------
1941 Series.str.strip : Remove leading and trailing characters in Series/Index.
1942 Series.str.lstrip : Remove leading characters in Series/Index.
1943 Series.str.rstrip : Remove trailing characters in Series/Index.
1945 Examples
1946 --------
1947 >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
1948 >>> s
1949 0 1. Ant.
1950 1 2. Bee!\n
1951 2 3. Cat?\t
1952 3 NaN
1953 4 10
1954 5 True
1955 dtype: object
1957 >>> s.str.strip()
1958 0 1. Ant.
1959 1 2. Bee!
1960 2 3. Cat?
1961 3 NaN
1962 4 NaN
1963 5 NaN
1964 dtype: object
1966 >>> s.str.lstrip('123.')
1967 0 Ant.
1968 1 Bee!\n
1969 2 Cat?\t
1970 3 NaN
1971 4 NaN
1972 5 NaN
1973 dtype: object
1975 >>> s.str.rstrip('.!? \n\t')
1976 0 1. Ant
1977 1 2. Bee
1978 2 3. Cat
1979 3 NaN
1980 4 NaN
1981 5 NaN
1982 dtype: object
1984 >>> s.str.strip('123.!? \n\t')
1985 0 Ant
1986 1 Bee
1987 2 Cat
1988 3 NaN
1989 4 NaN
1990 5 NaN
1991 dtype: object
1992 """
1994 @Appender(
1995 _shared_docs["str_strip"]
1996 % {
1997 "side": "left and right sides",
1998 "method": "strip",
1999 "position": "leading and trailing",
2000 }
2001 )
2002 @forbid_nonstring_types(["bytes"])
2003 def strip(self, to_strip=None):
2004 result = self._data.array._str_strip(to_strip)
2005 return self._wrap_result(result)
2007 @Appender(
2008 _shared_docs["str_strip"]
2009 % {"side": "left side", "method": "lstrip", "position": "leading"}
2010 )
2011 @forbid_nonstring_types(["bytes"])
2012 def lstrip(self, to_strip=None):
2013 result = self._data.array._str_lstrip(to_strip)
2014 return self._wrap_result(result)
2016 @Appender(
2017 _shared_docs["str_strip"]
2018 % {"side": "right side", "method": "rstrip", "position": "trailing"}
2019 )
2020 @forbid_nonstring_types(["bytes"])
2021 def rstrip(self, to_strip=None):
2022 result = self._data.array._str_rstrip(to_strip)
2023 return self._wrap_result(result)
2025 _shared_docs[
2026 "str_removefix"
2027 ] = r"""
2028 Remove a %(side)s from an object series.
2030 If the %(side)s is not present, the original string will be returned.
2032 Parameters
2033 ----------
2034 %(side)s : str
2035 Remove the %(side)s of the string.
2037 Returns
2038 -------
2039 Series/Index: object
2040 The Series or Index with given %(side)s removed.
2042 See Also
2043 --------
2044 Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.
2046 Examples
2047 --------
2048 >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
2049 >>> s
2050 0 str_foo
2051 1 str_bar
2052 2 no_prefix
2053 dtype: object
2054 >>> s.str.removeprefix("str_")
2055 0 foo
2056 1 bar
2057 2 no_prefix
2058 dtype: object
2060 >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
2061 >>> s
2062 0 foo_str
2063 1 bar_str
2064 2 no_suffix
2065 dtype: object
2066 >>> s.str.removesuffix("_str")
2067 0 foo
2068 1 bar
2069 2 no_suffix
2070 dtype: object
2071 """
2073 @Appender(
2074 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
2075 )
2076 @forbid_nonstring_types(["bytes"])
2077 def removeprefix(self, prefix):
2078 result = self._data.array._str_removeprefix(prefix)
2079 return self._wrap_result(result)
2081 @Appender(
2082 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
2083 )
2084 @forbid_nonstring_types(["bytes"])
2085 def removesuffix(self, suffix):
2086 result = self._data.array._str_removesuffix(suffix)
2087 return self._wrap_result(result)
2089 @forbid_nonstring_types(["bytes"])
2090 def wrap(self, width, **kwargs):
2091 r"""
2092 Wrap strings in Series/Index at specified line width.
2094 This method has the same keyword parameters and defaults as
2095 :class:`textwrap.TextWrapper`.
2097 Parameters
2098 ----------
2099 width : int
2100 Maximum line width.
2101 expand_tabs : bool, optional
2102 If True, tab characters will be expanded to spaces (default: True).
2103 replace_whitespace : bool, optional
2104 If True, each whitespace character (as defined by string.whitespace)
2105 remaining after tab expansion will be replaced by a single space
2106 (default: True).
2107 drop_whitespace : bool, optional
2108 If True, whitespace that, after wrapping, happens to end up at the
2109 beginning or end of a line is dropped (default: True).
2110 break_long_words : bool, optional
2111 If True, then words longer than width will be broken in order to ensure
2112 that no lines are longer than width. If it is false, long words will
2113 not be broken, and some lines may be longer than width (default: True).
2114 break_on_hyphens : bool, optional
2115 If True, wrapping will occur preferably on whitespace and right after
2116 hyphens in compound words, as it is customary in English. If false,
2117 only whitespaces will be considered as potentially good places for line
2118 breaks, but you need to set break_long_words to false if you want truly
2119 insecable words (default: True).
2121 Returns
2122 -------
2123 Series or Index
2125 Notes
2126 -----
2127 Internally, this method uses a :class:`textwrap.TextWrapper` instance with
2128 default settings. To achieve behavior matching R's stringr library str_wrap
2129 function, use the arguments:
2131 - expand_tabs = False
2132 - replace_whitespace = True
2133 - drop_whitespace = True
2134 - break_long_words = False
2135 - break_on_hyphens = False
2137 Examples
2138 --------
2139 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
2140 >>> s.str.wrap(12)
2141 0 line to be\nwrapped
2142 1 another line\nto be\nwrapped
2143 dtype: object
2144 """
2145 result = self._data.array._str_wrap(width, **kwargs)
2146 return self._wrap_result(result)
2148 @forbid_nonstring_types(["bytes"])
2149 def get_dummies(self, sep="|"):
2150 """
2151 Return DataFrame of dummy/indicator variables for Series.
2153 Each string in Series is split by sep and returned as a DataFrame
2154 of dummy/indicator variables.
2156 Parameters
2157 ----------
2158 sep : str, default "|"
2159 String to split on.
2161 Returns
2162 -------
2163 DataFrame
2164 Dummy variables corresponding to values of the Series.
2166 See Also
2167 --------
2168 get_dummies : Convert categorical variable into dummy/indicator
2169 variables.
2171 Examples
2172 --------
2173 >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
2174 a b c
2175 0 1 1 0
2176 1 1 0 0
2177 2 1 0 1
2179 >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
2180 a b c
2181 0 1 1 0
2182 1 0 0 0
2183 2 1 0 1
2184 """
2185 # we need to cast to Series of strings as only that has all
2186 # methods available for making the dummies...
2187 result, name = self._data.array._str_get_dummies(sep)
2188 return self._wrap_result(
2189 result,
2190 name=name,
2191 expand=True,
2192 returns_string=False,
2193 )
2195 @forbid_nonstring_types(["bytes"])
2196 def translate(self, table):
2197 """
2198 Map all characters in the string through the given mapping table.
2200 Equivalent to standard :meth:`str.translate`.
2202 Parameters
2203 ----------
2204 table : dict
2205 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
2206 None. Unmapped characters are left untouched.
2207 Characters mapped to None are deleted. :meth:`str.maketrans` is a
2208 helper function for making translation tables.
2210 Returns
2211 -------
2212 Series or Index
2213 """
2214 result = self._data.array._str_translate(table)
2215 return self._wrap_result(result)
2217 @forbid_nonstring_types(["bytes"])
2218 def count(self, pat, flags=0):
2219 r"""
2220 Count occurrences of pattern in each string of the Series/Index.
2222 This function is used to count the number of times a particular regex
2223 pattern is repeated in each of the string elements of the
2224 :class:`~pandas.Series`.
2226 Parameters
2227 ----------
2228 pat : str
2229 Valid regular expression.
2230 flags : int, default 0, meaning no flags
2231 Flags for the `re` module. For a complete list, `see here
2232 <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
2233 **kwargs
2234 For compatibility with other string methods. Not used.
2236 Returns
2237 -------
2238 Series or Index
2239 Same type as the calling object containing the integer counts.
2241 See Also
2242 --------
2243 re : Standard library module for regular expressions.
2244 str.count : Standard library version, without regular expression support.
2246 Notes
2247 -----
2248 Some characters need to be escaped when passing in `pat`.
2249 eg. ``'$'`` has a special meaning in regex and must be escaped when
2250 finding this literal character.
2252 Examples
2253 --------
2254 >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
2255 >>> s.str.count('a')
2256 0 0.0
2257 1 0.0
2258 2 2.0
2259 3 2.0
2260 4 NaN
2261 5 0.0
2262 6 1.0
2263 dtype: float64
2265 Escape ``'$'`` to find the literal dollar sign.
2267 >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
2268 >>> s.str.count('\\$')
2269 0 1
2270 1 0
2271 2 1
2272 3 2
2273 4 2
2274 5 0
2275 dtype: int64
2277 This is also available on Index
2279 >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
2280 Int64Index([0, 0, 2, 1], dtype='int64')
2281 """
2282 result = self._data.array._str_count(pat, flags)
2283 return self._wrap_result(result, returns_string=False)
2285 @forbid_nonstring_types(["bytes"])
2286 def startswith(
2287 self, pat: str | tuple[str, ...], na: Scalar | None = None
2288 ) -> Series | Index:
2289 """
2290 Test if the start of each string element matches a pattern.
2292 Equivalent to :meth:`str.startswith`.
2294 Parameters
2295 ----------
2296 pat : str or tuple[str, ...]
2297 Character sequence or tuple of strings. Regular expressions are not
2298 accepted.
2299 na : object, default NaN
2300 Object shown if element tested is not a string. The default depends
2301 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2302 For ``StringDtype``, ``pandas.NA`` is used.
2304 Returns
2305 -------
2306 Series or Index of bool
2307 A Series of booleans indicating whether the given pattern matches
2308 the start of each string element.
2310 See Also
2311 --------
2312 str.startswith : Python standard library string method.
2313 Series.str.endswith : Same as startswith, but tests the end of string.
2314 Series.str.contains : Tests if string element contains a pattern.
2316 Examples
2317 --------
2318 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
2319 >>> s
2320 0 bat
2321 1 Bear
2322 2 cat
2323 3 NaN
2324 dtype: object
2326 >>> s.str.startswith('b')
2327 0 True
2328 1 False
2329 2 False
2330 3 NaN
2331 dtype: object
2333 >>> s.str.startswith(('b', 'B'))
2334 0 True
2335 1 True
2336 2 False
2337 3 NaN
2338 dtype: object
2340 Specifying `na` to be `False` instead of `NaN`.
2342 >>> s.str.startswith('b', na=False)
2343 0 True
2344 1 False
2345 2 False
2346 3 False
2347 dtype: bool
2348 """
2349 if not isinstance(pat, (str, tuple)):
2350 msg = f"expected a string or tuple, not {type(pat).__name__}"
2351 raise TypeError(msg)
2352 result = self._data.array._str_startswith(pat, na=na)
2353 return self._wrap_result(result, returns_string=False)
2355 @forbid_nonstring_types(["bytes"])
2356 def endswith(
2357 self, pat: str | tuple[str, ...], na: Scalar | None = None
2358 ) -> Series | Index:
2359 """
2360 Test if the end of each string element matches a pattern.
2362 Equivalent to :meth:`str.endswith`.
2364 Parameters
2365 ----------
2366 pat : str or tuple[str, ...]
2367 Character sequence or tuple of strings. Regular expressions are not
2368 accepted.
2369 na : object, default NaN
2370 Object shown if element tested is not a string. The default depends
2371 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2372 For ``StringDtype``, ``pandas.NA`` is used.
2374 Returns
2375 -------
2376 Series or Index of bool
2377 A Series of booleans indicating whether the given pattern matches
2378 the end of each string element.
2380 See Also
2381 --------
2382 str.endswith : Python standard library string method.
2383 Series.str.startswith : Same as endswith, but tests the start of string.
2384 Series.str.contains : Tests if string element contains a pattern.
2386 Examples
2387 --------
2388 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
2389 >>> s
2390 0 bat
2391 1 bear
2392 2 caT
2393 3 NaN
2394 dtype: object
2396 >>> s.str.endswith('t')
2397 0 True
2398 1 False
2399 2 False
2400 3 NaN
2401 dtype: object
2403 >>> s.str.endswith(('t', 'T'))
2404 0 True
2405 1 False
2406 2 True
2407 3 NaN
2408 dtype: object
2410 Specifying `na` to be `False` instead of `NaN`.
2412 >>> s.str.endswith('t', na=False)
2413 0 True
2414 1 False
2415 2 False
2416 3 False
2417 dtype: bool
2418 """
2419 if not isinstance(pat, (str, tuple)):
2420 msg = f"expected a string or tuple, not {type(pat).__name__}"
2421 raise TypeError(msg)
2422 result = self._data.array._str_endswith(pat, na=na)
2423 return self._wrap_result(result, returns_string=False)
2425 @forbid_nonstring_types(["bytes"])
2426 def findall(self, pat, flags=0):
2427 """
2428 Find all occurrences of pattern or regular expression in the Series/Index.
2430 Equivalent to applying :func:`re.findall` to all the elements in the
2431 Series/Index.
2433 Parameters
2434 ----------
2435 pat : str
2436 Pattern or regular expression.
2437 flags : int, default 0
2438 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
2439 means no flags).
2441 Returns
2442 -------
2443 Series/Index of lists of strings
2444 All non-overlapping matches of pattern or regular expression in each
2445 string of this Series/Index.
2447 See Also
2448 --------
2449 count : Count occurrences of pattern or regular expression in each string
2450 of the Series/Index.
2451 extractall : For each string in the Series, extract groups from all matches
2452 of regular expression and return a DataFrame with one row for each
2453 match and one column for each group.
2454 re.findall : The equivalent ``re`` function to all non-overlapping matches
2455 of pattern or regular expression in string, as a list of strings.
2457 Examples
2458 --------
2459 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
2461 The search for the pattern 'Monkey' returns one match:
2463 >>> s.str.findall('Monkey')
2464 0 []
2465 1 [Monkey]
2466 2 []
2467 dtype: object
2469 On the other hand, the search for the pattern 'MONKEY' doesn't return any
2470 match:
2472 >>> s.str.findall('MONKEY')
2473 0 []
2474 1 []
2475 2 []
2476 dtype: object
2478 Flags can be added to the pattern or regular expression. For instance,
2479 to find the pattern 'MONKEY' ignoring the case:
2481 >>> import re
2482 >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
2483 0 []
2484 1 [Monkey]
2485 2 []
2486 dtype: object
2488 When the pattern matches more than one string in the Series, all matches
2489 are returned:
2491 >>> s.str.findall('on')
2492 0 [on]
2493 1 [on]
2494 2 []
2495 dtype: object
2497 Regular expressions are supported too. For instance, the search for all the
2498 strings ending with the word 'on' is shown next:
2500 >>> s.str.findall('on$')
2501 0 [on]
2502 1 []
2503 2 []
2504 dtype: object
2506 If the pattern is found more than once in the same string, then a list of
2507 multiple strings is returned:
2509 >>> s.str.findall('b')
2510 0 []
2511 1 []
2512 2 [b, b]
2513 dtype: object
2514 """
2515 result = self._data.array._str_findall(pat, flags)
2516 return self._wrap_result(result, returns_string=False)
2518 @forbid_nonstring_types(["bytes"])
2519 def extract(
2520 self, pat: str, flags: int = 0, expand: bool = True
2521 ) -> DataFrame | Series | Index:
2522 r"""
2523 Extract capture groups in the regex `pat` as columns in a DataFrame.
2525 For each subject string in the Series, extract groups from the
2526 first match of regular expression `pat`.
2528 Parameters
2529 ----------
2530 pat : str
2531 Regular expression pattern with capturing groups.
2532 flags : int, default 0 (no flags)
2533 Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
2534 modify regular expression matching for things like case,
2535 spaces, etc. For more details, see :mod:`re`.
2536 expand : bool, default True
2537 If True, return DataFrame with one column per capture group.
2538 If False, return a Series/Index if there is one capture group
2539 or DataFrame if there are multiple capture groups.
2541 Returns
2542 -------
2543 DataFrame or Series or Index
2544 A DataFrame with one row for each subject string, and one
2545 column for each group. Any capture group names in regular
2546 expression pat will be used for column names; otherwise
2547 capture group numbers will be used. The dtype of each result
2548 column is always object, even when no match is found. If
2549 ``expand=False`` and pat has only one capture group, then
2550 return a Series (if subject is a Series) or Index (if subject
2551 is an Index).
2553 See Also
2554 --------
2555 extractall : Returns all matches (not just the first match).
2557 Examples
2558 --------
2559 A pattern with two groups will return a DataFrame with two columns.
2560 Non-matches will be NaN.
2562 >>> s = pd.Series(['a1', 'b2', 'c3'])
2563 >>> s.str.extract(r'([ab])(\d)')
2564 0 1
2565 0 a 1
2566 1 b 2
2567 2 NaN NaN
2569 A pattern may contain optional groups.
2571 >>> s.str.extract(r'([ab])?(\d)')
2572 0 1
2573 0 a 1
2574 1 b 2
2575 2 NaN 3
2577 Named groups will become column names in the result.
2579 >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
2580 letter digit
2581 0 a 1
2582 1 b 2
2583 2 NaN NaN
2585 A pattern with one group will return a DataFrame with one column
2586 if expand=True.
2588 >>> s.str.extract(r'[ab](\d)', expand=True)
2589 0
2590 0 1
2591 1 2
2592 2 NaN
2594 A pattern with one group will return a Series if expand=False.
2596 >>> s.str.extract(r'[ab](\d)', expand=False)
2597 0 1
2598 1 2
2599 2 NaN
2600 dtype: object
2601 """
2602 from pandas import DataFrame
2604 if not isinstance(expand, bool):
2605 raise ValueError("expand must be True or False")
2607 regex = re.compile(pat, flags=flags)
2608 if regex.groups == 0:
2609 raise ValueError("pattern contains no capture groups")
2611 if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
2612 raise ValueError("only one regex group is supported with Index")
2614 obj = self._data
2615 result_dtype = _result_dtype(obj)
2617 returns_df = regex.groups > 1 or expand
2619 if returns_df:
2620 name = None
2621 columns = _get_group_names(regex)
2623 if obj.array.size == 0:
2624 result = DataFrame(columns=columns, dtype=result_dtype)
2626 else:
2627 result_list = self._data.array._str_extract(
2628 pat, flags=flags, expand=returns_df
2629 )
2631 result_index: Index | None
2632 if isinstance(obj, ABCSeries):
2633 result_index = obj.index
2634 else:
2635 result_index = None
2637 result = DataFrame(
2638 result_list, columns=columns, index=result_index, dtype=result_dtype
2639 )
2641 else:
2642 name = _get_single_group_name(regex)
2643 result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
2644 return self._wrap_result(result, name=name)
2646 @forbid_nonstring_types(["bytes"])
2647 def extractall(self, pat, flags=0):
2648 r"""
2649 Extract capture groups in the regex `pat` as columns in DataFrame.
2651 For each subject string in the Series, extract groups from all
2652 matches of regular expression pat. When each subject string in the
2653 Series has exactly one match, extractall(pat).xs(0, level='match')
2654 is the same as extract(pat).
2656 Parameters
2657 ----------
2658 pat : str
2659 Regular expression pattern with capturing groups.
2660 flags : int, default 0 (no flags)
2661 A ``re`` module flag, for example ``re.IGNORECASE``. These allow
2662 to modify regular expression matching for things like case, spaces,
2663 etc. Multiple flags can be combined with the bitwise OR operator,
2664 for example ``re.IGNORECASE | re.MULTILINE``.
2666 Returns
2667 -------
2668 DataFrame
2669 A ``DataFrame`` with one row for each match, and one column for each
2670 group. Its rows have a ``MultiIndex`` with first levels that come from
2671 the subject ``Series``. The last level is named 'match' and indexes the
2672 matches in each item of the ``Series``. Any capture group names in
2673 regular expression pat will be used for column names; otherwise capture
2674 group numbers will be used.
2676 See Also
2677 --------
2678 extract : Returns first match only (not all matches).
2680 Examples
2681 --------
2682 A pattern with one group will return a DataFrame with one column.
2683 Indices with no matches will not appear in the result.
2685 >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
2686 >>> s.str.extractall(r"[ab](\d)")
2687 0
2688 match
2689 A 0 1
2690 1 2
2691 B 0 1
2693 Capture group names are used for column names of the result.
2695 >>> s.str.extractall(r"[ab](?P<digit>\d)")
2696 digit
2697 match
2698 A 0 1
2699 1 2
2700 B 0 1
2702 A pattern with two groups will return a DataFrame with two columns.
2704 >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
2705 letter digit
2706 match
2707 A 0 a 1
2708 1 a 2
2709 B 0 b 1
2711 Optional groups that do not match are NaN in the result.
2713 >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
2714 letter digit
2715 match
2716 A 0 a 1
2717 1 a 2
2718 B 0 b 1
2719 C 0 NaN 1
2720 """
2721 # TODO: dispatch
2722 return str_extractall(self._orig, pat, flags)
    # %-template docstring shared by .find/.rfind; the Appender decorators
    # below substitute "side", "method" and "also" from a per-method dict.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s
    """
2751 @Appender(
2752 _shared_docs["find"]
2753 % {
2754 "side": "lowest",
2755 "method": "find",
2756 "also": "rfind : Return highest indexes in each strings.",
2757 }
2758 )
2759 @forbid_nonstring_types(["bytes"])
2760 def find(self, sub, start=0, end=None):
2761 if not isinstance(sub, str):
2762 msg = f"expected a string object, not {type(sub).__name__}"
2763 raise TypeError(msg)
2765 result = self._data.array._str_find(sub, start, end)
2766 return self._wrap_result(result, returns_string=False)
2768 @Appender(
2769 _shared_docs["find"]
2770 % {
2771 "side": "highest",
2772 "method": "rfind",
2773 "also": "find : Return lowest indexes in each strings.",
2774 }
2775 )
2776 @forbid_nonstring_types(["bytes"])
2777 def rfind(self, sub, start=0, end=None):
2778 if not isinstance(sub, str):
2779 msg = f"expected a string object, not {type(sub).__name__}"
2780 raise TypeError(msg)
2782 result = self._data.array._str_rfind(sub, start=start, end=end)
2783 return self._wrap_result(result, returns_string=False)
2785 @forbid_nonstring_types(["bytes"])
2786 def normalize(self, form):
2787 """
2788 Return the Unicode normal form for the strings in the Series/Index.
2790 For more information on the forms, see the
2791 :func:`unicodedata.normalize`.
2793 Parameters
2794 ----------
2795 form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
2796 Unicode form.
2798 Returns
2799 -------
2800 normalized : Series/Index of objects
2801 """
2802 result = self._data.array._str_normalize(form)
2803 return self._wrap_result(result)
    # %-template docstring shared by .index/.rindex; the Appender decorators
    # below substitute "side", "similar", "method" and "also" from a dict.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s
    """
2834 @Appender(
2835 _shared_docs["index"]
2836 % {
2837 "side": "lowest",
2838 "similar": "find",
2839 "method": "index",
2840 "also": "rindex : Return highest indexes in each strings.",
2841 }
2842 )
2843 @forbid_nonstring_types(["bytes"])
2844 def index(self, sub, start=0, end=None):
2845 if not isinstance(sub, str):
2846 msg = f"expected a string object, not {type(sub).__name__}"
2847 raise TypeError(msg)
2849 result = self._data.array._str_index(sub, start=start, end=end)
2850 return self._wrap_result(result, returns_string=False)
2852 @Appender(
2853 _shared_docs["index"]
2854 % {
2855 "side": "highest",
2856 "similar": "rfind",
2857 "method": "rindex",
2858 "also": "index : Return lowest indexes in each strings.",
2859 }
2860 )
2861 @forbid_nonstring_types(["bytes"])
2862 def rindex(self, sub, start=0, end=None):
2863 if not isinstance(sub, str):
2864 msg = f"expected a string object, not {type(sub).__name__}"
2865 raise TypeError(msg)
2867 result = self._data.array._str_rindex(sub, start=start, end=end)
2868 return self._wrap_result(result, returns_string=False)
2870 def len(self):
2871 """
2872 Compute the length of each element in the Series/Index.
2874 The element may be a sequence (such as a string, tuple or list) or a collection
2875 (such as a dictionary).
2877 Returns
2878 -------
2879 Series or Index of int
2880 A Series or Index of integer values indicating the length of each
2881 element in the Series or Index.
2883 See Also
2884 --------
2885 str.len : Python built-in function returning the length of an object.
2886 Series.size : Returns the length of the Series.
2888 Examples
2889 --------
2890 Returns the length (number of characters) in a string. Returns the
2891 number of entries for dictionaries, lists or tuples.
2893 >>> s = pd.Series(['dog',
2894 ... '',
2895 ... 5,
2896 ... {'foo' : 'bar'},
2897 ... [2, 3, 5, 7],
2898 ... ('one', 'two', 'three')])
2899 >>> s
2900 0 dog
2901 1
2902 2 5
2903 3 {'foo': 'bar'}
2904 4 [2, 3, 5, 7]
2905 5 (one, two, three)
2906 dtype: object
2907 >>> s.str.len()
2908 0 3.0
2909 1 0.0
2910 2 NaN
2911 3 1.0
2912 4 4.0
2913 5 3.0
2914 dtype: float64
2915 """
2916 result = self._data.array._str_len()
2917 return self._wrap_result(result, returns_string=False)
    # %-template docstring shared by the case-conversion methods
    # (lower/upper/title/capitalize/swapcase/casefold); formatted with the
    # matching _doc_args entry by the Appender decorators below.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """
    # Types:
    #   cases:
    #       upper, lower, title, capitalize, swapcase, casefold
    #   boolean:
    #       isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
    # _doc_args holds dict of strings to use in substituting casemethod docs
    _doc_args: dict[str, dict[str, str]] = {}
    _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
    _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
    _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
    _doc_args["capitalize"] = {
        "type": "be capitalized",
        "method": "capitalize",
        "version": "",
    }
    _doc_args["swapcase"] = {
        "type": "be swapcased",
        "method": "swapcase",
        "version": "",
    }
    _doc_args["casefold"] = {
        "type": "be casefolded",
        "method": "casefold",
        "version": "\n    .. versionadded:: 0.25.0\n",
    }
3013 @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
3014 @forbid_nonstring_types(["bytes"])
3015 def lower(self):
3016 result = self._data.array._str_lower()
3017 return self._wrap_result(result)
3019 @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
3020 @forbid_nonstring_types(["bytes"])
3021 def upper(self):
3022 result = self._data.array._str_upper()
3023 return self._wrap_result(result)
3025 @Appender(_shared_docs["casemethods"] % _doc_args["title"])
3026 @forbid_nonstring_types(["bytes"])
3027 def title(self):
3028 result = self._data.array._str_title()
3029 return self._wrap_result(result)
3031 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
3032 @forbid_nonstring_types(["bytes"])
3033 def capitalize(self):
3034 result = self._data.array._str_capitalize()
3035 return self._wrap_result(result)
3037 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
3038 @forbid_nonstring_types(["bytes"])
3039 def swapcase(self):
3040 result = self._data.array._str_swapcase()
3041 return self._wrap_result(result)
3043 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
3044 @forbid_nonstring_types(["bytes"])
3045 def casefold(self):
3046 result = self._data.array._str_casefold()
3047 return self._wrap_result(result)
    # %-template docstring shared by the is* predicate methods; formatted
    # with the matching _doc_args entry and attached via _map_and_wrap below.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )
def cat_safe(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation,
    which happen if the arrays in list_of_columns have the wrong dtypes or
    content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If a column holds non-string (and non-missing) values.
    """
    try:
        result = cat_core(list_of_columns, sep)
    except TypeError:
        # np.sum fails when any value is not a string (wrong dtype or hidden
        # behind object dtype); re-raise with a message naming the offender.
        for column in list_of_columns:
            inferred = lib.infer_dtype(column, skipna=True)
            if inferred not in ("string", "empty"):
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {inferred}"
                ) from None
    return result
def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # No separator to interleave: just sum the columns elementwise.
        stacked = np.asarray(list_of_columns, dtype=object)
        return np.sum(stacked, axis=0)
    # Interleave sep between the columns, then sum elementwise:
    # [col0, sep, col1, sep, ..., colN].
    interleaved = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    stacked_with_sep = np.asarray(interleaved, dtype=object)
    return np.sum(stacked_with_sep, axis=0)
3294def _result_dtype(arr):
3295 # workaround #27953
3296 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
3297 # when the list of values is empty.
3298 from pandas.core.arrays.string_ import StringDtype
3300 if isinstance(arr.dtype, StringDtype):
3301 return arr.dtype
3302 else:
3303 return object
3306def _get_single_group_name(regex: re.Pattern) -> Hashable:
3307 if regex.groupindex:
3308 return next(iter(regex.groupindex))
3309 else:
3310 return None
3313def _get_group_names(regex: re.Pattern) -> list[Hashable]:
3314 """
3315 Get named groups from compiled regex.
3317 Unnamed groups are numbered.
3319 Parameters
3320 ----------
3321 regex : compiled regex
3323 Returns
3324 -------
3325 list of column labels
3326 """
3327 names = {v: k for k, v in regex.groupindex.items()}
3328 return [names.get(1 + i, i) for i in range(regex.groups)]
def str_extractall(arr, pat, flags=0):
    """
    Implementation backing :meth:`StringMethods.extractall`.

    Extract all matches of `pat` from each string in `arr`, returning a
    DataFrame with one row per match and one column per capture group. Rows
    carry a MultiIndex made of the subject's index levels plus a final
    integer level named 'match'.

    Parameters
    ----------
    arr : Series or Index
        Subject strings; an Index is converted to a Series first.
    pat : str
        Regular expression with at least one capture group.
    flags : int, default 0
        Flags from the ``re`` module.

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If `pat` contains no capture groups.
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True)

    columns = _get_group_names(regex)
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        # non-string elements (e.g. NaN) contribute no rows
        if isinstance(subject, str):

            if not is_mi:
                # normalize to a tuple so the match number can be appended
                subject_key = (subject_key,)

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, str):
                    match_tuple = (match_tuple,)
                # re.findall reports a non-participating group as "", which
                # we surface as missing. Use np.nan: the np.NaN alias was
                # removed in NumPy 2.0 (same object on NumPy 1.x).
                na_tuple = [np.nan if group == "" else group for group in match_tuple]
                match_list.append(na_tuple)
                # subject_key is already a tuple; no need to re-wrap it
                result_key = subject_key + (match_i,)
                index_list.append(result_key)

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
    dtype = _result_dtype(arr)

    result = arr._constructor_expanddim(
        match_list, index=index, columns=columns, dtype=dtype
    )
    return result