Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/strings/accessor.py: 25%

573 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3import codecs 

4from functools import wraps 

5import re 

6from typing import ( 

7 TYPE_CHECKING, 

8 Callable, 

9 Hashable, 

10 cast, 

11) 

12import warnings 

13 

14import numpy as np 

15 

16import pandas._libs.lib as lib 

17from pandas._typing import ( 

18 DtypeObj, 

19 F, 

20 Scalar, 

21) 

22from pandas.util._decorators import ( 

23 Appender, 

24 deprecate_nonkeyword_arguments, 

25) 

26from pandas.util._exceptions import find_stack_level 

27 

28from pandas.core.dtypes.common import ( 

29 ensure_object, 

30 is_bool_dtype, 

31 is_categorical_dtype, 

32 is_integer, 

33 is_list_like, 

34 is_object_dtype, 

35 is_re, 

36) 

37from pandas.core.dtypes.generic import ( 

38 ABCDataFrame, 

39 ABCIndex, 

40 ABCMultiIndex, 

41 ABCSeries, 

42) 

43from pandas.core.dtypes.missing import isna 

44 

45from pandas.core.base import NoNewAttributesMixin 

46from pandas.core.construction import extract_array 

47 

48if TYPE_CHECKING: 48 ↛ 49line 48 didn't jump to line 49, because the condition on line 48 was never true

49 from pandas import ( 

50 DataFrame, 

51 Index, 

52 Series, 

53 ) 

54 

# Docstring templates shared between related methods (e.g. split/rsplit,
# partition/rpartition); filled in per-method via %-substitution below.
_shared_docs: dict[str, str] = {}
# Encodings for which, judging by the names, an optimized CPython fast path
# exists — presumably checked at the encode/decode call sites to skip the
# codecs machinery. NOTE(review): confirm semantics at the use sites; they
# are not visible in this chunk.
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
# Decoding additionally covers the UTF-16/UTF-32 family.
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")

66 

67 

def forbid_nonstring_types(
    forbidden: list[str] | None, name: str | None = None
) -> Callable[[F], F]:
    """
    Decorator to forbid specific types for a method of StringMethods.

    For calling `.str.{method}` on a Series or Index, it is necessary to first
    initialize the :class:`StringMethods` object, and then call the method.
    However, different methods allow different input types, and so this can not
    be checked during :meth:`StringMethods.__init__`, but must be done on a
    per-method basis. This decorator exists to facilitate this process, and
    make it explicit which (inferred) types are disallowed by the method.

    :meth:`StringMethods.__init__` allows the *union* of types its different
    methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
    namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].

    The default string types ['string', 'empty'] are allowed for all methods.
    For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
    then needs to forbid the types it is not intended for.

    Parameters
    ----------
    forbidden : list-of-str or None
        List of forbidden non-string types, may be one or more of
        `['bytes', 'mixed', 'mixed-integer']`.
    name : str, default None
        Name of the method to use in the error message. By default, this is
        None, in which case the name from the method being wrapped will be
        copied. However, for working with further wrappers (like _pat_wrapper
        and _noarg_wrapper), it is necessary to specify the name.

    Returns
    -------
    func : wrapper
        The method to which the decorator is applied, with an added check that
        enforces the inferred type to not be in the list of forbidden types.

    Raises
    ------
    TypeError
        If the inferred type of the underlying data is in `forbidden`.
    """
    # Treat None as "nothing forbidden": every dtype in the union is allowed.
    allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"}
    if forbidden is not None:
        allowed_types -= set(forbidden)

    def _forbid_nonstring_types(func: F) -> F:
        # `name` overrides the wrapped function's own name in error messages.
        func_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            # `_inferred_dtype` is set by StringMethods.__init__ via _validate.
            if self._inferred_dtype in allowed_types:
                return func(self, *args, **kwargs)
            raise TypeError(
                f"Cannot use .str.{func_name} with values of "
                f"inferred dtype '{self._inferred_dtype}'."
            )

        wrapper.__name__ = func_name
        return cast(F, wrapper)

    return _forbid_nonstring_types

135 

136 

137def _map_and_wrap(name, docstring): 

138 @forbid_nonstring_types(["bytes"], name=name) 

139 def wrapper(self): 

140 result = getattr(self._data.array, f"_str_{name}")() 

141 return self._wrap_result(result) 

142 

143 wrapper.__doc__ = docstring 

144 return wrapper 

145 

146 

147class StringMethods(NoNewAttributesMixin): 

148 """ 

149 Vectorized string functions for Series and Index. 

150 

151 NAs stay NA unless handled otherwise by a particular method. 

152 Patterned after Python's string methods, with some inspiration from 

153 R's stringr package. 

154 

155 Examples 

156 -------- 

157 >>> s = pd.Series(["A_Str_Series"]) 

158 >>> s 

159 0 A_Str_Series 

160 dtype: object 

161 

162 >>> s.str.split("_") 

163 0 [A, Str, Series] 

164 dtype: object 

165 

166 >>> s.str.replace("_", "") 

167 0 AStrSeries 

168 dtype: object 

169 """ 

170 

171 # Note: see the docstring in pandas.core.strings.__init__ 

172 # for an explanation of the implementation. 

173 # TODO: Dispatch all the methods 

174 # Currently the following are not dispatched to the array 

175 # * cat 

176 # * extractall 

177 

178 def __init__(self, data) -> None: 

179 from pandas.core.arrays.string_ import StringDtype 

180 

181 self._inferred_dtype = self._validate(data) 

182 self._is_categorical = is_categorical_dtype(data.dtype) 

183 self._is_string = isinstance(data.dtype, StringDtype) 

184 self._data = data 

185 

186 self._index = self._name = None 

187 if isinstance(data, ABCSeries): 

188 self._index = data.index 

189 self._name = data.name 

190 

191 # ._values.categories works for both Series/Index 

192 self._parent = data._values.categories if self._is_categorical else data 

193 # save orig to blow up categoricals to the right type 

194 self._orig = data 

195 self._freeze() 

196 

197 @staticmethod 

198 def _validate(data): 

199 """ 

200 Auxiliary function for StringMethods, infers and checks dtype of data. 

201 

202 This is a "first line of defence" at the creation of the StringMethods- 

203 object, and just checks that the dtype is in the 

204 *union* of the allowed types over all string methods below; this 

205 restriction is then refined on a per-method basis using the decorator 

206 @forbid_nonstring_types (more info in the corresponding docstring). 

207 

208 This really should exclude all series/index with any non-string values, 

209 but that isn't practical for performance reasons until we have a str 

210 dtype (GH 9343 / 13877) 

211 

212 Parameters 

213 ---------- 

214 data : The content of the Series 

215 

216 Returns 

217 ------- 

218 dtype : inferred dtype of data 

219 """ 

220 if isinstance(data, ABCMultiIndex): 

221 raise AttributeError( 

222 "Can only use .str accessor with Index, not MultiIndex" 

223 ) 

224 

225 # see _libs/lib.pyx for list of inferred types 

226 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] 

227 

228 data = extract_array(data) 

229 

230 values = getattr(data, "categories", data) # categorical / normal 

231 

232 inferred_dtype = lib.infer_dtype(values, skipna=True) 

233 

234 if inferred_dtype not in allowed_types: 

235 raise AttributeError("Can only use .str accessor with string values!") 

236 return inferred_dtype 

237 

238 def __getitem__(self, key): 

239 result = self._data.array._str_getitem(key) 

240 return self._wrap_result(result) 

241 

242 def __iter__(self): 

243 warnings.warn( 

244 "Columnar iteration over characters will be deprecated in future releases.", 

245 FutureWarning, 

246 stacklevel=find_stack_level(), 

247 ) 

248 i = 0 

249 g = self.get(i) 

250 while g.notna().any(): 

251 yield g 

252 i += 1 

253 g = self.get(i) 

254 

    def _wrap_result(
        self,
        result,
        name=None,
        expand: bool | None = None,
        fill_value=np.nan,
        returns_string=True,
        returns_bool: bool = False,
    ):
        """
        Wrap a raw string-method result back into the caller's container type
        (Series, Index, DataFrame or MultiIndex).

        Parameters
        ----------
        result : array-like, DataFrame or scalar-like
            Raw output of an ``_str_*`` array method.
        name : hashable or list-like, optional
            Name(s) for the wrapped result (column names when expanding).
        expand : bool or None
            If None, inferred from ``result.ndim``; if True, expand into
            multiple columns/levels.
        fill_value : object, default np.nan
            NOTE(review): accepted but never referenced in this body.
        returns_string : bool, default True
            Whether the wrapped values should keep the original string dtype.
        returns_bool : bool, default False
            NOTE(review): accepted but never referenced in this body.

        Raises
        ------
        ValueError
            If `expand` is not a bool (or None).
        """
        from pandas import (
            Index,
            MultiIndex,
        )

        # Objects without ndim/dtype (e.g. scalars from str.cat, or an
        # already-built DataFrame) pass through, only picking up metadata.
        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            if isinstance(result, ABCDataFrame):
                result = result.__finalize__(self._orig, name="str")
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        # Ideally the array method returns the right array type.
        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1

        elif (
            expand is True
            and is_object_dtype(result)
            and not isinstance(self._orig, ABCIndex)
        ):
            # required when expand=True is explicitly specified
            # not needed when inferred

            def cons_row(x):
                # Box scalars so every row is list-like before expanding.
                if is_list_like(x):
                    return x
                else:
                    return [x]

            result = [cons_row(x) for x in result]
            if result and not self._is_string:
                # propagate nan values to match longest sequence (GH 18450)
                max_len = max(len(x) for x in result)
                result = [
                    x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
                ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndex):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index._with_infer(result, name=name)
        else:
            index = self._orig.index
            # This is a mess.
            dtype: DtypeObj | str | None
            vdtype = getattr(result, "dtype", None)
            if self._is_string:
                if is_bool_dtype(vdtype):
                    # boolean results keep their own dtype, not the string one
                    dtype = result.dtype
                elif returns_string:
                    dtype = self._orig.dtype
                else:
                    dtype = vdtype
            else:
                dtype = vdtype

            if expand:
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=dtype)
            result = result.__finalize__(self._orig, method="str")
            if name is not None and result.ndim == 1:
                # __finalize__ might copy over the original name, but we may
                # want the new name (e.g. str.extract).
                result.name = name
            return result

365 

366 def _get_series_list(self, others): 

367 """ 

368 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input 

369 into a list of Series (elements without an index must match the length 

370 of the calling Series/Index). 

371 

372 Parameters 

373 ---------- 

374 others : Series, DataFrame, np.ndarray, list-like or list-like of 

375 Objects that are either Series, Index or np.ndarray (1-dim). 

376 

377 Returns 

378 ------- 

379 list of Series 

380 Others transformed into list of Series. 

381 """ 

382 from pandas import ( 

383 DataFrame, 

384 Series, 

385 ) 

386 

387 # self._orig is either Series or Index 

388 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index 

389 

390 # Generally speaking, all objects without an index inherit the index 

391 # `idx` of the calling Series/Index - i.e. must have matching length. 

392 # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 

393 if isinstance(others, ABCSeries): 

394 return [others] 

395 elif isinstance(others, ABCIndex): 

396 return [Series(others._values, index=idx, dtype=others.dtype)] 

397 elif isinstance(others, ABCDataFrame): 

398 return [others[x] for x in others] 

399 elif isinstance(others, np.ndarray) and others.ndim == 2: 

400 others = DataFrame(others, index=idx) 

401 return [others[x] for x in others] 

402 elif is_list_like(others, allow_sets=False): 

403 others = list(others) # ensure iterators do not get read twice etc 

404 

405 # in case of list-like `others`, all elements must be 

406 # either Series/Index/np.ndarray (1-dim)... 

407 if all( 

408 isinstance(x, (ABCSeries, ABCIndex)) 

409 or (isinstance(x, np.ndarray) and x.ndim == 1) 

410 for x in others 

411 ): 

412 los: list[Series] = [] 

413 while others: # iterate through list and append each element 

414 los = los + self._get_series_list(others.pop(0)) 

415 return los 

416 # ... or just strings 

417 elif all(not is_list_like(x) for x in others): 

418 return [Series(others, index=idx)] 

419 raise TypeError( 

420 "others must be Series, Index, DataFrame, np.ndarray " 

421 "or list-like (either containing only strings or " 

422 "containing only objects of type Series/Index/" 

423 "np.ndarray[1-dim])" 

424 ) 

425 

    @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
    def cat(
        self, others=None, sep=None, na_rep=None, join="left"
    ) -> str | Series | Index:
        """
        Concatenate strings in the Series/Index with given separator.

        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.

        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarray or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.

            If others is a list-like that contains a combination of Series,
            Index or np.ndarray (1-dim), then all elements will be unpacked and
            must satisfy the above criteria individually.

            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : str, default ''
            The separator between the different elements/columns. By default
            the empty string `''` is used.
        na_rep : str or None, default None
            Representation that is inserted for all missing values:

            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default 'left'
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). To disable
            alignment, use `.values` on any Series/Index/DataFrame in `others`.

            .. versionadded:: 0.23.0
            .. versionchanged:: 1.0.0
                Changed default of `join` from None to `'left'`.

        Returns
        -------
        str, Series or Index
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.

        See Also
        --------
        split : Split each string in the Series/Index.
        join : Join lists contained as elements in the Series/Index.

        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:

        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'

        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:

        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'

        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object

        Missing values will remain missing in the result, but can again be
        represented using `na_rep`

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object

        If `sep` is not specified, the values are concatenated without
        separation.

        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object

        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.

        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object

        For more examples, see :ref:`here <text.concatenate>`.
        """
        # TODO: dispatch
        from pandas import (
            Index,
            Series,
            concat,
        )

        if isinstance(others, str):
            raise ValueError("Did you mean to supply a `sep` keyword?")
        if sep is None:
            sep = ""

        # Work on a Series internally; an Index caller is re-wrapped as a
        # Series indexed by itself so alignment below behaves uniformly.
        if isinstance(self._orig, ABCIndex):
            data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
        else:  # Series
            data = self._orig

        # concatenate Series/Index with itself if no "others"
        if others is None:
            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Series")
            data = ensure_object(data)  # type: ignore[assignment]
            na_mask = isna(data)
            if na_rep is None and na_mask.any():
                return sep.join(data[~na_mask])
            elif na_rep is not None and na_mask.any():
                return sep.join(np.where(na_mask, na_rep, data))
            else:
                return sep.join(data)

        try:
            # turn anything in "others" into lists of Series
            others = self._get_series_list(others)
        except ValueError as err:  # do not catch TypeError raised by _get_series_list
            raise ValueError(
                "If `others` contains arrays or lists (or other "
                "list-likes without an index), these must all be "
                "of the same length as the calling Series/Index."
            ) from err

        # align if required
        if any(not data.index.equals(x.index) for x in others):
            # Need to add keys for uniqueness in case of duplicate columns
            others = concat(
                others,
                axis=1,
                join=(join if join == "inner" else "outer"),
                keys=range(len(others)),
                sort=False,
                copy=False,
            )
            data, others = data.align(others, join=join)
            others = [others[x] for x in others]  # again list of Series

        all_cols = [ensure_object(x) for x in [data] + others]
        na_masks = np.array([isna(x) for x in all_cols])
        # rows where ANY column is missing
        union_mask = np.logical_or.reduce(na_masks, axis=0)

        if na_rep is None and union_mask.any():
            # no na_rep means NaNs for all rows where any column has a NaN
            # only necessary if there are actually any NaNs
            result = np.empty(len(data), dtype=object)
            np.putmask(result, union_mask, np.nan)

            not_masked = ~union_mask
            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
        elif na_rep is not None and union_mask.any():
            # fill NaNs with na_rep in case there are actually any NaNs
            all_cols = [
                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
            ]
            result = cat_safe(all_cols, sep)
        else:
            # no NaNs - can just concatenate
            result = cat_safe(all_cols, sep)

        out: Index | Series
        if isinstance(self._orig, ABCIndex):
            # add dtype for case that result is all-NA

            out = Index(result, dtype=object, name=self._orig.name)
        else:  # Series
            if is_categorical_dtype(self._orig.dtype):
                # We need to infer the new categories.
                dtype = None
            else:
                dtype = self._orig.dtype
            res_ser = Series(
                result, dtype=dtype, index=data.index, name=self._orig.name
            )
            out = res_ser.__finalize__(self._orig, method="str_cat")
        return out

655 

    # Shared docstring template for split/rsplit; the %(...)s placeholders are
    # substituted per-method in the @Appender decorators below.
    _shared_docs[
        "str_split"
    ] = r"""
    Split strings around given separator/delimiter.

    Splits the string in the Series/Index from the %(side)s,
    at the specified delimiter string.

    Parameters
    ----------
    pat : str%(pat_regex)s, optional
        %(pat_description)s.
        If not specified, split on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output.
        ``None``, 0 and -1 will be interpreted as return all splits.
    expand : bool, default False
        Expand the split strings into separate columns.

        - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
        - If ``False``, return Series/Index, containing lists of strings.
    %(regex_argument)s
    Returns
    -------
    Series, Index, DataFrame or MultiIndex
        Type matches caller unless ``expand=True`` (see Notes).
    %(raises_split)s
    See Also
    --------
    Series.str.split : Split strings around given separator/delimiter.
    Series.str.rsplit : Splits string around given separator/delimiter,
        starting from the right.
    Series.str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.
    str.split : Standard library version for split.
    str.rsplit : Standard library version for rsplit.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`,  make first `n` splits only
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series and Index callers return DataFrame and
    MultiIndex objects, respectively.
    %(regex_pat_note)s
    Examples
    --------
    >>> s = pd.Series(
    ...     [
    ...         "this is a regular sentence",
    ...         "https://docs.python.org/3/tutorial/index.html",
    ...         np.nan
    ...     ]
    ... )
    >>> s
    0                       this is a regular sentence
    1    https://docs.python.org/3/tutorial/index.html
    2                                              NaN
    dtype: object

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    Without the `n` parameter, the outputs of `rsplit` and `split`
    are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `n` parameter can be used to limit the number of splits on the
    delimiter. The outputs of `split` and `rsplit` are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `pat` parameter can be used to split by other characters.

    >>> s.str.split(pat="/")
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                  NaN
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(expand=True)
                                                   0     1     2        3         4
    0                                           this    is     a  regular  sentence
    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
    2                                            NaN   NaN   NaN      NaN       NaN

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                 NaN         NaN
    %(regex_examples)s"""

779 

780 @Appender( 

781 _shared_docs["str_split"] 

782 % { 

783 "side": "beginning", 

784 "pat_regex": " or compiled regex", 

785 "pat_description": "String or regular expression to split on", 

786 "regex_argument": """ 

787 regex : bool, default None 

788 Determines if the passed-in pattern is a regular expression: 

789 

790 - If ``True``, assumes the passed-in pattern is a regular expression 

791 - If ``False``, treats the pattern as a literal string. 

792 - If ``None`` and `pat` length is 1, treats `pat` as a literal string. 

793 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression. 

794 - Cannot be set to False if `pat` is a compiled regex 

795 

796 .. versionadded:: 1.4.0 

797 """, 

798 "raises_split": """ 

799 Raises 

800 ------ 

801 ValueError 

802 * if `regex` is False and `pat` is a compiled regex 

803 """, 

804 "regex_pat_note": """ 

805 Use of `regex =False` with a `pat` as a compiled regex will raise an error. 

806 """, 

807 "method": "split", 

808 "regex_examples": r""" 

809 Remember to escape special characters when explicitly using regular expressions. 

810 

811 >>> s = pd.Series(["foo and bar plus baz"]) 

812 >>> s.str.split(r"and|plus", expand=True) 

813 0 1 2 

814 0 foo bar baz 

815 

816 Regular expressions can be used to handle urls or file names. 

817 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled 

818 as a regex only if ``len(pat) != 1``. 

819 

820 >>> s = pd.Series(['foojpgbar.jpg']) 

821 >>> s.str.split(r".", expand=True) 

822 0 1 

823 0 foojpgbar jpg 

824 

825 >>> s.str.split(r"\.jpg", expand=True) 

826 0 1 

827 0 foojpgbar 

828 

829 When ``regex=True``, `pat` is interpreted as a regex 

830 

831 >>> s.str.split(r"\.jpg", regex=True, expand=True) 

832 0 1 

833 0 foojpgbar 

834 

835 A compiled regex can be passed as `pat` 

836 

837 >>> import re 

838 >>> s.str.split(re.compile(r"\.jpg"), expand=True) 

839 0 1 

840 0 foojpgbar 

841 

842 When ``regex=False``, `pat` is interpreted as the string itself 

843 

844 >>> s.str.split(r"\.jpg", regex=False, expand=True) 

845 0 

846 0 foojpgbar.jpg 

847 """, 

848 } 

849 ) 

850 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "pat"]) 

851 @forbid_nonstring_types(["bytes"]) 

852 def split( 

853 self, 

854 pat: str | re.Pattern | None = None, 

855 n=-1, 

856 expand=False, 

857 *, 

858 regex: bool | None = None, 

859 ): 

860 if regex is False and is_re(pat): 

861 raise ValueError( 

862 "Cannot use a compiled regex as replacement pattern with regex=False" 

863 ) 

864 if is_re(pat): 

865 regex = True 

866 result = self._data.array._str_split(pat, n, expand, regex) 

867 return self._wrap_result(result, returns_string=expand, expand=expand) 

868 

869 @Appender( 

870 _shared_docs["str_split"] 

871 % { 

872 "side": "end", 

873 "pat_regex": "", 

874 "pat_description": "String to split on", 

875 "regex_argument": "", 

876 "raises_split": "", 

877 "regex_pat_note": "", 

878 "method": "rsplit", 

879 "regex_examples": "", 

880 } 

881 ) 

882 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "pat"]) 

883 @forbid_nonstring_types(["bytes"]) 

884 def rsplit(self, pat=None, n=-1, expand=False): 

885 result = self._data.array._str_rsplit(pat, n=n) 

886 return self._wrap_result(result, expand=expand, returns_string=expand) 

887 

    # Shared docstring template for partition/rpartition; the %(...)s
    # placeholders are substituted per-method in the @Appender decorators below.
    _shared_docs[
        "str_partition"
    ] = """
    Split the string at the %(side)s occurrence of `sep`.

    This method splits the string at the %(side)s occurrence of `sep`,
    and returns 3 elements containing the part before the separator,
    the separator itself, and the part after the separator.
    If the separator is not found, return %(return)s.

    Parameters
    ----------
    sep : str, default whitespace
        String to split on.
    expand : bool, default True
        If True, return DataFrame/MultiIndex expanding dimensionality.
        If False, return Series/Index.

    Returns
    -------
    DataFrame/MultiIndex or Series/Index of objects

    See Also
    --------
    %(also)s
    Series.str.split : Split strings around given separators.
    str.partition : Standard library version.

    Examples
    --------

    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
    >>> s
    0    Linda van der Berg
    1    George Pitt-Rivers
    dtype: object

    >>> s.str.partition()
            0  1             2
    0   Linda     van der Berg
    1  George      Pitt-Rivers

    To partition by the last space instead of the first one:

    >>> s.str.rpartition()
                   0  1            2
    0  Linda van der            Berg
    1         George     Pitt-Rivers

    To partition by something different than a space:

    >>> s.str.partition('-')
                        0  1       2
    0  Linda van der Berg
    1         George Pitt  -  Rivers

    To return a Series containing tuples instead of a DataFrame:

    >>> s.str.partition('-', expand=False)
    0    (Linda van der Berg, , )
    1    (George Pitt, -, Rivers)
    dtype: object

    Also available on indices:

    >>> idx = pd.Index(['X 123', 'Y 999'])
    >>> idx
    Index(['X 123', 'Y 999'], dtype='object')

    Which will create a MultiIndex:

    >>> idx.str.partition()
    MultiIndex([('X', ' ', '123'),
                ('Y', ' ', '999')],
               )

    Or an index with tuples with ``expand=False``:

    >>> idx.str.partition(expand=False)
    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
    """

969 

970 @Appender( 

971 _shared_docs["str_partition"] 

972 % { 

973 "side": "first", 

974 "return": "3 elements containing the string itself, followed by two " 

975 "empty strings", 

976 "also": "rpartition : Split the string at the last occurrence of `sep`.", 

977 } 

978 ) 

979 @forbid_nonstring_types(["bytes"]) 

980 def partition(self, sep=" ", expand=True): 

981 result = self._data.array._str_partition(sep, expand) 

982 return self._wrap_result(result, expand=expand, returns_string=expand) 

983 

984 @Appender( 

985 _shared_docs["str_partition"] 

986 % { 

987 "side": "last", 

988 "return": "3 elements containing two empty strings, followed by the " 

989 "string itself", 

990 "also": "partition : Split the string at the first occurrence of `sep`.", 

991 } 

992 ) 

993 @forbid_nonstring_types(["bytes"]) 

994 def rpartition(self, sep=" ", expand=True): 

995 result = self._data.array._str_rpartition(sep, expand) 

996 return self._wrap_result(result, expand=expand, returns_string=expand) 

997 

998 def get(self, i): 

999 """ 

1000 Extract element from each component at specified position or with specified key. 

1001 

1002 Extract element from lists, tuples, dict, or strings in each element in the 

1003 Series/Index. 

1004 

1005 Parameters 

1006 ---------- 

1007 i : int or hashable dict label 

1008 Position or key of element to extract. 

1009 

1010 Returns 

1011 ------- 

1012 Series or Index 

1013 

1014 Examples 

1015 -------- 

1016 >>> s = pd.Series(["String", 

1017 ... (1, 2, 3), 

1018 ... ["a", "b", "c"], 

1019 ... 123, 

1020 ... -456, 

1021 ... {1: "Hello", "2": "World"}]) 

1022 >>> s 

1023 0 String 

1024 1 (1, 2, 3) 

1025 2 [a, b, c] 

1026 3 123 

1027 4 -456 

1028 5 {1: 'Hello', '2': 'World'} 

1029 dtype: object 

1030 

1031 >>> s.str.get(1) 

1032 0 t 

1033 1 2 

1034 2 b 

1035 3 NaN 

1036 4 NaN 

1037 5 Hello 

1038 dtype: object 

1039 

1040 >>> s.str.get(-1) 

1041 0 g 

1042 1 3 

1043 2 c 

1044 3 NaN 

1045 4 NaN 

1046 5 None 

1047 dtype: object 

1048 

1049 Return element with given key 

1050 

1051 >>> s = pd.Series([{"name": "Hello", "value": "World"}, 

1052 ... {"name": "Goodbye", "value": "Planet"}]) 

1053 >>> s.str.get('name') 

1054 0 Hello 

1055 1 Goodbye 

1056 dtype: object 

1057 """ 

1058 result = self._data.array._str_get(i) 

1059 return self._wrap_result(result) 

1060 

1061 @forbid_nonstring_types(["bytes"]) 

1062 def join(self, sep): 

1063 """ 

1064 Join lists contained as elements in the Series/Index with passed delimiter. 

1065 

1066 If the elements of a Series are lists themselves, join the content of these 

1067 lists using the delimiter passed to the function. 

1068 This function is an equivalent to :meth:`str.join`. 

1069 

1070 Parameters 

1071 ---------- 

1072 sep : str 

1073 Delimiter to use between list entries. 

1074 

1075 Returns 

1076 ------- 

1077 Series/Index: object 

1078 The list entries concatenated by intervening occurrences of the 

1079 delimiter. 

1080 

1081 Raises 

1082 ------ 

1083 AttributeError 

1084 If the supplied Series contains neither strings nor lists. 

1085 

1086 See Also 

1087 -------- 

1088 str.join : Standard library version of this method. 

1089 Series.str.split : Split strings around given separator/delimiter. 

1090 

1091 Notes 

1092 ----- 

1093 If any of the list items is not a string object, the result of the join 

1094 will be `NaN`. 

1095 

1096 Examples 

1097 -------- 

1098 Example with a list that contains non-string elements. 

1099 

1100 >>> s = pd.Series([['lion', 'elephant', 'zebra'], 

1101 ... [1.1, 2.2, 3.3], 

1102 ... ['cat', np.nan, 'dog'], 

1103 ... ['cow', 4.5, 'goat'], 

1104 ... ['duck', ['swan', 'fish'], 'guppy']]) 

1105 >>> s 

1106 0 [lion, elephant, zebra] 

1107 1 [1.1, 2.2, 3.3] 

1108 2 [cat, nan, dog] 

1109 3 [cow, 4.5, goat] 

1110 4 [duck, [swan, fish], guppy] 

1111 dtype: object 

1112 

1113 Join all lists using a '-'. The lists containing object(s) of types other 

1114 than str will produce a NaN. 

1115 

1116 >>> s.str.join('-') 

1117 0 lion-elephant-zebra 

1118 1 NaN 

1119 2 NaN 

1120 3 NaN 

1121 4 NaN 

1122 dtype: object 

1123 """ 

1124 result = self._data.array._str_join(sep) 

1125 return self._wrap_result(result) 

1126 

1127 @forbid_nonstring_types(["bytes"]) 

1128 def contains(self, pat, case=True, flags=0, na=None, regex=True): 

1129 r""" 

1130 Test if pattern or regex is contained within a string of a Series or Index. 

1131 

1132 Return boolean Series or Index based on whether a given pattern or regex is 

1133 contained within a string of a Series or Index. 

1134 

1135 Parameters 

1136 ---------- 

1137 pat : str 

1138 Character sequence or regular expression. 

1139 case : bool, default True 

1140 If True, case sensitive. 

1141 flags : int, default 0 (no flags) 

1142 Flags to pass through to the re module, e.g. re.IGNORECASE. 

1143 na : scalar, optional 

1144 Fill value for missing values. The default depends on dtype of the 

1145 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1146 ``pandas.NA`` is used. 

1147 regex : bool, default True 

1148 If True, assumes the pat is a regular expression. 

1149 

1150 If False, treats the pat as a literal string. 

1151 

1152 Returns 

1153 ------- 

1154 Series or Index of boolean values 

1155 A Series or Index of boolean values indicating whether the 

1156 given pattern is contained within the string of each element 

1157 of the Series or Index. 

1158 

1159 See Also 

1160 -------- 

1161 match : Analogous, but stricter, relying on re.match instead of re.search. 

1162 Series.str.startswith : Test if the start of each string element matches a 

1163 pattern. 

1164 Series.str.endswith : Same as startswith, but tests the end of string. 

1165 

1166 Examples 

1167 -------- 

1168 Returning a Series of booleans using only a literal pattern. 

1169 

1170 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) 

1171 >>> s1.str.contains('og', regex=False) 

1172 0 False 

1173 1 True 

1174 2 False 

1175 3 False 

1176 4 NaN 

1177 dtype: object 

1178 

1179 Returning an Index of booleans using only a literal pattern. 

1180 

1181 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) 

1182 >>> ind.str.contains('23', regex=False) 

1183 Index([False, False, False, True, nan], dtype='object') 

1184 

1185 Specifying case sensitivity using `case`. 

1186 

1187 >>> s1.str.contains('oG', case=True, regex=True) 

1188 0 False 

1189 1 False 

1190 2 False 

1191 3 False 

1192 4 NaN 

1193 dtype: object 

1194 

1195 Specifying `na` to be `False` instead of `NaN` replaces NaN values 

1196 with `False`. If Series or Index does not contain NaN values 

1197 the resultant dtype will be `bool`, otherwise, an `object` dtype. 

1198 

1199 >>> s1.str.contains('og', na=False, regex=True) 

1200 0 False 

1201 1 True 

1202 2 False 

1203 3 False 

1204 4 False 

1205 dtype: bool 

1206 

1207 Returning 'house' or 'dog' when either expression occurs in a string. 

1208 

1209 >>> s1.str.contains('house|dog', regex=True) 

1210 0 False 

1211 1 True 

1212 2 True 

1213 3 False 

1214 4 NaN 

1215 dtype: object 

1216 

1217 Ignoring case sensitivity using `flags` with regex. 

1218 

1219 >>> import re 

1220 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) 

1221 0 False 

1222 1 False 

1223 2 True 

1224 3 False 

1225 4 NaN 

1226 dtype: object 

1227 

1228 Returning any digit using regular expression. 

1229 

1230 >>> s1.str.contains('\\d', regex=True) 

1231 0 False 

1232 1 False 

1233 2 False 

1234 3 True 

1235 4 NaN 

1236 dtype: object 

1237 

1238 Ensure `pat` is a not a literal pattern when `regex` is set to True. 

1239 Note in the following example one might expect only `s2[1]` and `s2[3]` to 

1240 return `True`. However, '.0' as a regex matches any character 

1241 followed by a 0. 

1242 

1243 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) 

1244 >>> s2.str.contains('.0', regex=True) 

1245 0 True 

1246 1 True 

1247 2 False 

1248 3 True 

1249 4 False 

1250 dtype: bool 

1251 """ 

1252 if regex and re.compile(pat).groups: 

1253 warnings.warn( 

1254 "This pattern is interpreted as a regular expression, and has " 

1255 "match groups. To actually get the groups, use str.extract.", 

1256 UserWarning, 

1257 stacklevel=find_stack_level(), 

1258 ) 

1259 

1260 result = self._data.array._str_contains(pat, case, flags, na, regex) 

1261 return self._wrap_result(result, fill_value=na, returns_string=False) 

1262 

1263 @forbid_nonstring_types(["bytes"]) 

1264 def match(self, pat, case=True, flags=0, na=None): 

1265 """ 

1266 Determine if each string starts with a match of a regular expression. 

1267 

1268 Parameters 

1269 ---------- 

1270 pat : str 

1271 Character sequence or regular expression. 

1272 case : bool, default True 

1273 If True, case sensitive. 

1274 flags : int, default 0 (no flags) 

1275 Regex module flags, e.g. re.IGNORECASE. 

1276 na : scalar, optional 

1277 Fill value for missing values. The default depends on dtype of the 

1278 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1279 ``pandas.NA`` is used. 

1280 

1281 Returns 

1282 ------- 

1283 Series/Index/array of boolean values 

1284 

1285 See Also 

1286 -------- 

1287 fullmatch : Stricter matching that requires the entire string to match. 

1288 contains : Analogous, but less strict, relying on re.search instead of 

1289 re.match. 

1290 extract : Extract matched groups. 

1291 """ 

1292 result = self._data.array._str_match(pat, case=case, flags=flags, na=na) 

1293 return self._wrap_result(result, fill_value=na, returns_string=False) 

1294 

1295 @forbid_nonstring_types(["bytes"]) 

1296 def fullmatch(self, pat, case=True, flags=0, na=None): 

1297 """ 

1298 Determine if each string entirely matches a regular expression. 

1299 

1300 .. versionadded:: 1.1.0 

1301 

1302 Parameters 

1303 ---------- 

1304 pat : str 

1305 Character sequence or regular expression. 

1306 case : bool, default True 

1307 If True, case sensitive. 

1308 flags : int, default 0 (no flags) 

1309 Regex module flags, e.g. re.IGNORECASE. 

1310 na : scalar, optional 

1311 Fill value for missing values. The default depends on dtype of the 

1312 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1313 ``pandas.NA`` is used. 

1314 

1315 Returns 

1316 ------- 

1317 Series/Index/array of boolean values 

1318 

1319 See Also 

1320 -------- 

1321 match : Similar, but also returns `True` when only a *prefix* of the string 

1322 matches the regular expression. 

1323 extract : Extract matched groups. 

1324 """ 

1325 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) 

1326 return self._wrap_result(result, fill_value=na, returns_string=False) 

1327 

    @forbid_nonstring_types(["bytes"])
    def replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool | None = None,
        flags: int = 0,
        regex: bool | None = None,
    ):
        r"""
        Replace each occurrence of pattern/regex in the Series/Index.

        Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
        the regex value.

        Parameters
        ----------
        pat : str or compiled regex
            String can be a character sequence or regular expression.
        repl : str or callable
            Replacement string or a callable. The callable is passed the regex
            match object and must return a replacement string to be used.
            See :func:`re.sub`.
        n : int, default -1 (all)
            Number of replacements to make from start.
        case : bool, default None
            Determines if replace is case sensitive:

            - If True, case sensitive (the default if `pat` is a string)
            - Set to False for case insensitive
            - Cannot be set if `pat` is a compiled regex.

        flags : int, default 0 (no flags)
            Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
            regex.
        regex : bool, default True
            Determines if the passed-in pattern is a regular expression:

            - If True, assumes the passed-in pattern is a regular expression.
            - If False, treats the pattern as a literal string
            - Cannot be set to False if `pat` is a compiled regex or `repl` is
              a callable.

            .. versionadded:: 0.23.0

        Returns
        -------
        Series or Index of object
            A copy of the object with all matching occurrences of `pat` replaced by
            `repl`.

        Raises
        ------
        ValueError
            * if `regex` is False and `repl` is a callable or `pat` is a compiled
              regex
            * if `pat` is a compiled regex and `case` or `flags` is set

        Notes
        -----
        When `pat` is a compiled regex, all flags should be included in the
        compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
        regex will raise an error.

        Examples
        --------
        When `pat` is a string and `regex` is True (the default), the given `pat`
        is compiled as a regex. When `repl` is a string, it replaces matching
        regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
        left as is:

        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
        0    bao
        1    baz
        2    NaN
        dtype: object

        When `pat` is a string and `regex` is False, every `pat` is replaced with
        `repl` as with :meth:`str.replace`:

        >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
        0    bao
        1    fuz
        2    NaN
        dtype: object

        When `repl` is a callable, it is called on every `pat` using
        :func:`re.sub`. The callable should expect one positional argument
        (a regex object) and return a string.

        To get the idea:

        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
        0    <re.Match object; span=(0, 1), match='f'>oo
        1    <re.Match object; span=(0, 1), match='f'>uz
        2                                            NaN
        dtype: object

        Reverse every lowercase alphabetic word:

        >>> repl = lambda m: m.group(0)[::-1]
        >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
        >>> ser.str.replace(r'[a-z]+', repl, regex=True)
        0    oof 123
        1    rab zab
        2        NaN
        dtype: object

        Using regex groups (extract second group and swap case):

        >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
        >>> repl = lambda m: m.group('two').swapcase()
        >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
        >>> ser.str.replace(pat, repl, regex=True)
        0    tWO
        1    bAR
        dtype: object

        Using a compiled regex with flags

        >>> import re
        >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
        0    foo
        1    bar
        2    NaN
        dtype: object
        """
        # Deprecation path for the `regex` default: warn only when the string
        # pattern contains regex metacharacters, i.e. when regex=True and
        # regex=False would actually give different results.
        if regex is None:
            if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"):
                # warn only in cases where regex behavior would differ from literal
                msg = (
                    "The default value of regex will change from True to False "
                    "in a future version."
                )
                if len(pat) == 1:
                    msg += (
                        " In addition, single character regular expressions will "
                        "*not* be treated as literal strings when regex=True."
                    )
                warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())

        # Check whether repl is valid (GH 13438, GH 15055)
        if not (isinstance(repl, str) or callable(repl)):
            raise TypeError("repl must be a string or callable")

        is_compiled_re = is_re(pat)
        if regex or regex is None:
            # A pre-compiled pattern already carries its own flags; combining
            # it with `case`/`flags` would be ambiguous.
            if is_compiled_re and (case is not None or flags != 0):
                raise ValueError(
                    "case and flags cannot be set when pat is a compiled regex"
                )

        elif is_compiled_re:
            raise ValueError(
                "Cannot use a compiled regex as replacement pattern with regex=False"
            )
        elif callable(repl):
            raise ValueError("Cannot use a callable replacement when regex=False")

        # The current behavior is to treat single character patterns as literal strings,
        # even when ``regex`` is set to ``True``.
        if isinstance(pat, str) and len(pat) == 1:
            regex = False

        # Resolve remaining defaults only after the warning/validation above,
        # which distinguish an explicit argument from the default None.
        if regex is None:
            regex = True

        if case is None:
            case = True

        result = self._data.array._str_replace(
            pat, repl, n=n, case=case, flags=flags, regex=regex
        )
        return self._wrap_result(result)

1504 

1505 @forbid_nonstring_types(["bytes"]) 

1506 def repeat(self, repeats): 

1507 """ 

1508 Duplicate each string in the Series or Index. 

1509 

1510 Parameters 

1511 ---------- 

1512 repeats : int or sequence of int 

1513 Same value for all (int) or different value per (sequence). 

1514 

1515 Returns 

1516 ------- 

1517 Series or Index of object 

1518 Series or Index of repeated string objects specified by 

1519 input parameter repeats. 

1520 

1521 Examples 

1522 -------- 

1523 >>> s = pd.Series(['a', 'b', 'c']) 

1524 >>> s 

1525 0 a 

1526 1 b 

1527 2 c 

1528 dtype: object 

1529 

1530 Single int repeats string in Series 

1531 

1532 >>> s.str.repeat(repeats=2) 

1533 0 aa 

1534 1 bb 

1535 2 cc 

1536 dtype: object 

1537 

1538 Sequence of int repeats corresponding string in Series 

1539 

1540 >>> s.str.repeat(repeats=[1, 2, 3]) 

1541 0 a 

1542 1 bb 

1543 2 ccc 

1544 dtype: object 

1545 """ 

1546 result = self._data.array._str_repeat(repeats) 

1547 return self._wrap_result(result) 

1548 

1549 @forbid_nonstring_types(["bytes"]) 

1550 def pad(self, width, side="left", fillchar=" "): 

1551 """ 

1552 Pad strings in the Series/Index up to width. 

1553 

1554 Parameters 

1555 ---------- 

1556 width : int 

1557 Minimum width of resulting string; additional characters will be filled 

1558 with character defined in `fillchar`. 

1559 side : {'left', 'right', 'both'}, default 'left' 

1560 Side from which to fill resulting string. 

1561 fillchar : str, default ' ' 

1562 Additional character for filling, default is whitespace. 

1563 

1564 Returns 

1565 ------- 

1566 Series or Index of object 

1567 Returns Series or Index with minimum number of char in object. 

1568 

1569 See Also 

1570 -------- 

1571 Series.str.rjust : Fills the left side of strings with an arbitrary 

1572 character. Equivalent to ``Series.str.pad(side='left')``. 

1573 Series.str.ljust : Fills the right side of strings with an arbitrary 

1574 character. Equivalent to ``Series.str.pad(side='right')``. 

1575 Series.str.center : Fills both sides of strings with an arbitrary 

1576 character. Equivalent to ``Series.str.pad(side='both')``. 

1577 Series.str.zfill : Pad strings in the Series/Index by prepending '0' 

1578 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. 

1579 

1580 Examples 

1581 -------- 

1582 >>> s = pd.Series(["caribou", "tiger"]) 

1583 >>> s 

1584 0 caribou 

1585 1 tiger 

1586 dtype: object 

1587 

1588 >>> s.str.pad(width=10) 

1589 0 caribou 

1590 1 tiger 

1591 dtype: object 

1592 

1593 >>> s.str.pad(width=10, side='right', fillchar='-') 

1594 0 caribou--- 

1595 1 tiger----- 

1596 dtype: object 

1597 

1598 >>> s.str.pad(width=10, side='both', fillchar='-') 

1599 0 -caribou-- 

1600 1 --tiger--- 

1601 dtype: object 

1602 """ 

1603 if not isinstance(fillchar, str): 

1604 msg = f"fillchar must be a character, not {type(fillchar).__name__}" 

1605 raise TypeError(msg) 

1606 

1607 if len(fillchar) != 1: 

1608 raise TypeError("fillchar must be a character, not str") 

1609 

1610 if not is_integer(width): 

1611 msg = f"width must be of integer type, not {type(width).__name__}" 

1612 raise TypeError(msg) 

1613 

1614 result = self._data.array._str_pad(width, side=side, fillchar=fillchar) 

1615 return self._wrap_result(result) 

1616 

    # Shared docstring template for center/ljust/rjust; ``%(side)s`` and
    # ``%(method)s`` are substituted by the @Appender decorators on those methods.
    _shared_docs[
        "str_pad"
    ] = """
    Pad %(side)s side of strings in the Series/Index.

    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    filled : Series/Index of objects.
    """

1636 

    @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
    @forbid_nonstring_types(["bytes"])
    def center(self, width, fillchar=" "):
        # Thin wrapper: pad() performs all width/fillchar validation.
        return self.pad(width, side="both", fillchar=fillchar)

1641 

    @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
    @forbid_nonstring_types(["bytes"])
    def ljust(self, width, fillchar=" "):
        # Thin wrapper: pad() performs all width/fillchar validation.
        return self.pad(width, side="right", fillchar=fillchar)

1646 

    @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
    @forbid_nonstring_types(["bytes"])
    def rjust(self, width, fillchar=" "):
        # Thin wrapper: pad() performs all width/fillchar validation.
        return self.pad(width, side="left", fillchar=fillchar)

1651 

1652 @forbid_nonstring_types(["bytes"]) 

1653 def zfill(self, width): 

1654 """ 

1655 Pad strings in the Series/Index by prepending '0' characters. 

1656 

1657 Strings in the Series/Index are padded with '0' characters on the 

1658 left of the string to reach a total string length `width`. Strings 

1659 in the Series/Index with length greater or equal to `width` are 

1660 unchanged. 

1661 

1662 Parameters 

1663 ---------- 

1664 width : int 

1665 Minimum length of resulting string; strings with length less 

1666 than `width` be prepended with '0' characters. 

1667 

1668 Returns 

1669 ------- 

1670 Series/Index of objects. 

1671 

1672 See Also 

1673 -------- 

1674 Series.str.rjust : Fills the left side of strings with an arbitrary 

1675 character. 

1676 Series.str.ljust : Fills the right side of strings with an arbitrary 

1677 character. 

1678 Series.str.pad : Fills the specified sides of strings with an arbitrary 

1679 character. 

1680 Series.str.center : Fills both sides of strings with an arbitrary 

1681 character. 

1682 

1683 Notes 

1684 ----- 

1685 Differs from :meth:`str.zfill` which has special handling 

1686 for '+'/'-' in the string. 

1687 

1688 Examples 

1689 -------- 

1690 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) 

1691 >>> s 

1692 0 -1 

1693 1 1 

1694 2 1000 

1695 3 10 

1696 4 NaN 

1697 dtype: object 

1698 

1699 Note that ``10`` and ``NaN`` are not strings, therefore they are 

1700 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a 

1701 special character and the zero is added to the right of it 

1702 (:meth:`str.zfill` would have moved it to the left). ``1000`` 

1703 remains unchanged as it is longer than `width`. 

1704 

1705 >>> s.str.zfill(3) 

1706 0 -01 

1707 1 001 

1708 2 1000 

1709 3 NaN 

1710 4 NaN 

1711 dtype: object 

1712 """ 

1713 if not is_integer(width): 

1714 msg = f"width must be of integer type, not {type(width).__name__}" 

1715 raise TypeError(msg) 

1716 f = lambda x: x.zfill(width) 

1717 result = self._data.array._str_map(f) 

1718 return self._wrap_result(result) 

1719 

1720 def slice(self, start=None, stop=None, step=None): 

1721 """ 

1722 Slice substrings from each element in the Series or Index. 

1723 

1724 Parameters 

1725 ---------- 

1726 start : int, optional 

1727 Start position for slice operation. 

1728 stop : int, optional 

1729 Stop position for slice operation. 

1730 step : int, optional 

1731 Step size for slice operation. 

1732 

1733 Returns 

1734 ------- 

1735 Series or Index of object 

1736 Series or Index from sliced substring from original string object. 

1737 

1738 See Also 

1739 -------- 

1740 Series.str.slice_replace : Replace a slice with a string. 

1741 Series.str.get : Return element at position. 

1742 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` 

1743 being the position. 

1744 

1745 Examples 

1746 -------- 

1747 >>> s = pd.Series(["koala", "dog", "chameleon"]) 

1748 >>> s 

1749 0 koala 

1750 1 dog 

1751 2 chameleon 

1752 dtype: object 

1753 

1754 >>> s.str.slice(start=1) 

1755 0 oala 

1756 1 og 

1757 2 hameleon 

1758 dtype: object 

1759 

1760 >>> s.str.slice(start=-1) 

1761 0 a 

1762 1 g 

1763 2 n 

1764 dtype: object 

1765 

1766 >>> s.str.slice(stop=2) 

1767 0 ko 

1768 1 do 

1769 2 ch 

1770 dtype: object 

1771 

1772 >>> s.str.slice(step=2) 

1773 0 kaa 

1774 1 dg 

1775 2 caeen 

1776 dtype: object 

1777 

1778 >>> s.str.slice(start=0, stop=5, step=3) 

1779 0 kl 

1780 1 d 

1781 2 cm 

1782 dtype: object 

1783 

1784 Equivalent behaviour to: 

1785 

1786 >>> s.str[0:5:3] 

1787 0 kl 

1788 1 d 

1789 2 cm 

1790 dtype: object 

1791 """ 

1792 result = self._data.array._str_slice(start, stop, step) 

1793 return self._wrap_result(result) 

1794 

1795 @forbid_nonstring_types(["bytes"]) 

1796 def slice_replace(self, start=None, stop=None, repl=None): 

1797 """ 

1798 Replace a positional slice of a string with another value. 

1799 

1800 Parameters 

1801 ---------- 

1802 start : int, optional 

1803 Left index position to use for the slice. If not specified (None), 

1804 the slice is unbounded on the left, i.e. slice from the start 

1805 of the string. 

1806 stop : int, optional 

1807 Right index position to use for the slice. If not specified (None), 

1808 the slice is unbounded on the right, i.e. slice until the 

1809 end of the string. 

1810 repl : str, optional 

1811 String for replacement. If not specified (None), the sliced region 

1812 is replaced with an empty string. 

1813 

1814 Returns 

1815 ------- 

1816 Series or Index 

1817 Same type as the original object. 

1818 

1819 See Also 

1820 -------- 

1821 Series.str.slice : Just slicing without replacement. 

1822 

1823 Examples 

1824 -------- 

1825 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) 

1826 >>> s 

1827 0 a 

1828 1 ab 

1829 2 abc 

1830 3 abdc 

1831 4 abcde 

1832 dtype: object 

1833 

1834 Specify just `start`, meaning replace `start` until the end of the 

1835 string with `repl`. 

1836 

1837 >>> s.str.slice_replace(1, repl='X') 

1838 0 aX 

1839 1 aX 

1840 2 aX 

1841 3 aX 

1842 4 aX 

1843 dtype: object 

1844 

1845 Specify just `stop`, meaning the start of the string to `stop` is replaced 

1846 with `repl`, and the rest of the string is included. 

1847 

1848 >>> s.str.slice_replace(stop=2, repl='X') 

1849 0 X 

1850 1 X 

1851 2 Xc 

1852 3 Xdc 

1853 4 Xcde 

1854 dtype: object 

1855 

1856 Specify `start` and `stop`, meaning the slice from `start` to `stop` is 

1857 replaced with `repl`. Everything before or after `start` and `stop` is 

1858 included as is. 

1859 

1860 >>> s.str.slice_replace(start=1, stop=3, repl='X') 

1861 0 aX 

1862 1 aX 

1863 2 aX 

1864 3 aXc 

1865 4 aXde 

1866 dtype: object 

1867 """ 

1868 result = self._data.array._str_slice_replace(start, stop, repl) 

1869 return self._wrap_result(result) 

1870 

1871 def decode(self, encoding, errors="strict"): 

1872 """ 

1873 Decode character string in the Series/Index using indicated encoding. 

1874 

1875 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in 

1876 python3. 

1877 

1878 Parameters 

1879 ---------- 

1880 encoding : str 

1881 errors : str, optional 

1882 

1883 Returns 

1884 ------- 

1885 Series or Index 

1886 """ 

1887 # TODO: Add a similar _bytes interface. 

1888 if encoding in _cpython_optimized_decoders: 

1889 # CPython optimized implementation 

1890 f = lambda x: x.decode(encoding, errors) 

1891 else: 

1892 decoder = codecs.getdecoder(encoding) 

1893 f = lambda x: decoder(x, errors)[0] 

1894 arr = self._data.array 

1895 # assert isinstance(arr, (StringArray,)) 

1896 result = arr._str_map(f) 

1897 return self._wrap_result(result) 

1898 

1899 @forbid_nonstring_types(["bytes"]) 

1900 def encode(self, encoding, errors="strict"): 

1901 """ 

1902 Encode character string in the Series/Index using indicated encoding. 

1903 

1904 Equivalent to :meth:`str.encode`. 

1905 

1906 Parameters 

1907 ---------- 

1908 encoding : str 

1909 errors : str, optional 

1910 

1911 Returns 

1912 ------- 

1913 encoded : Series/Index of objects 

1914 """ 

1915 result = self._data.array._str_encode(encoding, errors) 

1916 return self._wrap_result(result, returns_string=False) 

1917 

    # Shared docstring template for strip/lstrip/rstrip; ``%(position)s``,
    # ``%(side)s`` and ``%(method)s`` are substituted by the @Appender
    # decorators on those methods.
    _shared_docs[
        "str_strip"
    ] = r"""
    Remove %(position)s characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Replaces any non-strings in Series with NaNs.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    4           10
    5         True
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    4        NaN
    5        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    4    NaN
    5    NaN
    dtype: object
    """

1993 

1994 @Appender( 

1995 _shared_docs["str_strip"] 

1996 % { 

1997 "side": "left and right sides", 

1998 "method": "strip", 

1999 "position": "leading and trailing", 

2000 } 

2001 ) 

2002 @forbid_nonstring_types(["bytes"]) 

2003 def strip(self, to_strip=None): 

2004 result = self._data.array._str_strip(to_strip) 

2005 return self._wrap_result(result) 

2006 

2007 @Appender( 

2008 _shared_docs["str_strip"] 

2009 % {"side": "left side", "method": "lstrip", "position": "leading"} 

2010 ) 

2011 @forbid_nonstring_types(["bytes"]) 

2012 def lstrip(self, to_strip=None): 

2013 result = self._data.array._str_lstrip(to_strip) 

2014 return self._wrap_result(result) 

2015 

2016 @Appender( 

2017 _shared_docs["str_strip"] 

2018 % {"side": "right side", "method": "rstrip", "position": "trailing"} 

2019 ) 

2020 @forbid_nonstring_types(["bytes"]) 

2021 def rstrip(self, to_strip=None): 

2022 result = self._data.array._str_rstrip(to_strip) 

2023 return self._wrap_result(result) 

2024 

    # Shared docstring template for removeprefix/removesuffix; ``%(side)s`` and
    # ``%(other_side)s`` are substituted by the @Appender decorators on those
    # methods.
    _shared_docs[
        "str_removefix"
    ] = r"""
    Remove a %(side)s from an object series.

    If the %(side)s is not present, the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        Remove the %(side)s of the string.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0      str_foo
    1      str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0          foo
    1          bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0      foo_str
    1      bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0          foo
    1          bar
    2    no_suffix
    dtype: object
    """

2072 

2073 @Appender( 

2074 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"} 

2075 ) 

2076 @forbid_nonstring_types(["bytes"]) 

2077 def removeprefix(self, prefix): 

2078 result = self._data.array._str_removeprefix(prefix) 

2079 return self._wrap_result(result) 

2080 

2081 @Appender( 

2082 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"} 

2083 ) 

2084 @forbid_nonstring_types(["bytes"]) 

2085 def removesuffix(self, suffix): 

2086 result = self._data.array._str_removesuffix(suffix) 

2087 return self._wrap_result(result) 

2088 

2089 @forbid_nonstring_types(["bytes"]) 

2090 def wrap(self, width, **kwargs): 

2091 r""" 

2092 Wrap strings in Series/Index at specified line width. 

2093 

2094 This method has the same keyword parameters and defaults as 

2095 :class:`textwrap.TextWrapper`. 

2096 

2097 Parameters 

2098 ---------- 

2099 width : int 

2100 Maximum line width. 

2101 expand_tabs : bool, optional 

2102 If True, tab characters will be expanded to spaces (default: True). 

2103 replace_whitespace : bool, optional 

2104 If True, each whitespace character (as defined by string.whitespace) 

2105 remaining after tab expansion will be replaced by a single space 

2106 (default: True). 

2107 drop_whitespace : bool, optional 

2108 If True, whitespace that, after wrapping, happens to end up at the 

2109 beginning or end of a line is dropped (default: True). 

2110 break_long_words : bool, optional 

2111 If True, then words longer than width will be broken in order to ensure 

2112 that no lines are longer than width. If it is false, long words will 

2113 not be broken, and some lines may be longer than width (default: True). 

2114 break_on_hyphens : bool, optional 

2115 If True, wrapping will occur preferably on whitespace and right after 

2116 hyphens in compound words, as it is customary in English. If false, 

2117 only whitespaces will be considered as potentially good places for line 

2118 breaks, but you need to set break_long_words to false if you want truly 

2119 insecable words (default: True). 

2120 

2121 Returns 

2122 ------- 

2123 Series or Index 

2124 

2125 Notes 

2126 ----- 

2127 Internally, this method uses a :class:`textwrap.TextWrapper` instance with 

2128 default settings. To achieve behavior matching R's stringr library str_wrap 

2129 function, use the arguments: 

2130 

2131 - expand_tabs = False 

2132 - replace_whitespace = True 

2133 - drop_whitespace = True 

2134 - break_long_words = False 

2135 - break_on_hyphens = False 

2136 

2137 Examples 

2138 -------- 

2139 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) 

2140 >>> s.str.wrap(12) 

2141 0 line to be\nwrapped 

2142 1 another line\nto be\nwrapped 

2143 dtype: object 

2144 """ 

2145 result = self._data.array._str_wrap(width, **kwargs) 

2146 return self._wrap_result(result) 

2147 

2148 @forbid_nonstring_types(["bytes"]) 

2149 def get_dummies(self, sep="|"): 

2150 """ 

2151 Return DataFrame of dummy/indicator variables for Series. 

2152 

2153 Each string in Series is split by sep and returned as a DataFrame 

2154 of dummy/indicator variables. 

2155 

2156 Parameters 

2157 ---------- 

2158 sep : str, default "|" 

2159 String to split on. 

2160 

2161 Returns 

2162 ------- 

2163 DataFrame 

2164 Dummy variables corresponding to values of the Series. 

2165 

2166 See Also 

2167 -------- 

2168 get_dummies : Convert categorical variable into dummy/indicator 

2169 variables. 

2170 

2171 Examples 

2172 -------- 

2173 >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() 

2174 a b c 

2175 0 1 1 0 

2176 1 1 0 0 

2177 2 1 0 1 

2178 

2179 >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() 

2180 a b c 

2181 0 1 1 0 

2182 1 0 0 0 

2183 2 1 0 1 

2184 """ 

2185 # we need to cast to Series of strings as only that has all 

2186 # methods available for making the dummies... 

2187 result, name = self._data.array._str_get_dummies(sep) 

2188 return self._wrap_result( 

2189 result, 

2190 name=name, 

2191 expand=True, 

2192 returns_string=False, 

2193 ) 

2194 

2195 @forbid_nonstring_types(["bytes"]) 

2196 def translate(self, table): 

2197 """ 

2198 Map all characters in the string through the given mapping table. 

2199 

2200 Equivalent to standard :meth:`str.translate`. 

2201 

2202 Parameters 

2203 ---------- 

2204 table : dict 

2205 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or 

2206 None. Unmapped characters are left untouched. 

2207 Characters mapped to None are deleted. :meth:`str.maketrans` is a 

2208 helper function for making translation tables. 

2209 

2210 Returns 

2211 ------- 

2212 Series or Index 

2213 """ 

2214 result = self._data.array._str_translate(table) 

2215 return self._wrap_result(result) 

2216 

2217 @forbid_nonstring_types(["bytes"]) 

2218 def count(self, pat, flags=0): 

2219 r""" 

2220 Count occurrences of pattern in each string of the Series/Index. 

2221 

2222 This function is used to count the number of times a particular regex 

2223 pattern is repeated in each of the string elements of the 

2224 :class:`~pandas.Series`. 

2225 

2226 Parameters 

2227 ---------- 

2228 pat : str 

2229 Valid regular expression. 

2230 flags : int, default 0, meaning no flags 

2231 Flags for the `re` module. For a complete list, `see here 

2232 <https://docs.python.org/3/howto/regex.html#compilation-flags>`_. 

2233 **kwargs 

2234 For compatibility with other string methods. Not used. 

2235 

2236 Returns 

2237 ------- 

2238 Series or Index 

2239 Same type as the calling object containing the integer counts. 

2240 

2241 See Also 

2242 -------- 

2243 re : Standard library module for regular expressions. 

2244 str.count : Standard library version, without regular expression support. 

2245 

2246 Notes 

2247 ----- 

2248 Some characters need to be escaped when passing in `pat`. 

2249 eg. ``'$'`` has a special meaning in regex and must be escaped when 

2250 finding this literal character. 

2251 

2252 Examples 

2253 -------- 

2254 >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) 

2255 >>> s.str.count('a') 

2256 0 0.0 

2257 1 0.0 

2258 2 2.0 

2259 3 2.0 

2260 4 NaN 

2261 5 0.0 

2262 6 1.0 

2263 dtype: float64 

2264 

2265 Escape ``'$'`` to find the literal dollar sign. 

2266 

2267 >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) 

2268 >>> s.str.count('\\$') 

2269 0 1 

2270 1 0 

2271 2 1 

2272 3 2 

2273 4 2 

2274 5 0 

2275 dtype: int64 

2276 

2277 This is also available on Index 

2278 

2279 >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') 

2280 Int64Index([0, 0, 2, 1], dtype='int64') 

2281 """ 

2282 result = self._data.array._str_count(pat, flags) 

2283 return self._wrap_result(result, returns_string=False) 

2284 

2285 @forbid_nonstring_types(["bytes"]) 

2286 def startswith( 

2287 self, pat: str | tuple[str, ...], na: Scalar | None = None 

2288 ) -> Series | Index: 

2289 """ 

2290 Test if the start of each string element matches a pattern. 

2291 

2292 Equivalent to :meth:`str.startswith`. 

2293 

2294 Parameters 

2295 ---------- 

2296 pat : str or tuple[str, ...] 

2297 Character sequence or tuple of strings. Regular expressions are not 

2298 accepted. 

2299 na : object, default NaN 

2300 Object shown if element tested is not a string. The default depends 

2301 on dtype of the array. For object-dtype, ``numpy.nan`` is used. 

2302 For ``StringDtype``, ``pandas.NA`` is used. 

2303 

2304 Returns 

2305 ------- 

2306 Series or Index of bool 

2307 A Series of booleans indicating whether the given pattern matches 

2308 the start of each string element. 

2309 

2310 See Also 

2311 -------- 

2312 str.startswith : Python standard library string method. 

2313 Series.str.endswith : Same as startswith, but tests the end of string. 

2314 Series.str.contains : Tests if string element contains a pattern. 

2315 

2316 Examples 

2317 -------- 

2318 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) 

2319 >>> s 

2320 0 bat 

2321 1 Bear 

2322 2 cat 

2323 3 NaN 

2324 dtype: object 

2325 

2326 >>> s.str.startswith('b') 

2327 0 True 

2328 1 False 

2329 2 False 

2330 3 NaN 

2331 dtype: object 

2332 

2333 >>> s.str.startswith(('b', 'B')) 

2334 0 True 

2335 1 True 

2336 2 False 

2337 3 NaN 

2338 dtype: object 

2339 

2340 Specifying `na` to be `False` instead of `NaN`. 

2341 

2342 >>> s.str.startswith('b', na=False) 

2343 0 True 

2344 1 False 

2345 2 False 

2346 3 False 

2347 dtype: bool 

2348 """ 

2349 if not isinstance(pat, (str, tuple)): 

2350 msg = f"expected a string or tuple, not {type(pat).__name__}" 

2351 raise TypeError(msg) 

2352 result = self._data.array._str_startswith(pat, na=na) 

2353 return self._wrap_result(result, returns_string=False) 

2354 

2355 @forbid_nonstring_types(["bytes"]) 

2356 def endswith( 

2357 self, pat: str | tuple[str, ...], na: Scalar | None = None 

2358 ) -> Series | Index: 

2359 """ 

2360 Test if the end of each string element matches a pattern. 

2361 

2362 Equivalent to :meth:`str.endswith`. 

2363 

2364 Parameters 

2365 ---------- 

2366 pat : str or tuple[str, ...] 

2367 Character sequence or tuple of strings. Regular expressions are not 

2368 accepted. 

2369 na : object, default NaN 

2370 Object shown if element tested is not a string. The default depends 

2371 on dtype of the array. For object-dtype, ``numpy.nan`` is used. 

2372 For ``StringDtype``, ``pandas.NA`` is used. 

2373 

2374 Returns 

2375 ------- 

2376 Series or Index of bool 

2377 A Series of booleans indicating whether the given pattern matches 

2378 the end of each string element. 

2379 

2380 See Also 

2381 -------- 

2382 str.endswith : Python standard library string method. 

2383 Series.str.startswith : Same as endswith, but tests the start of string. 

2384 Series.str.contains : Tests if string element contains a pattern. 

2385 

2386 Examples 

2387 -------- 

2388 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) 

2389 >>> s 

2390 0 bat 

2391 1 bear 

2392 2 caT 

2393 3 NaN 

2394 dtype: object 

2395 

2396 >>> s.str.endswith('t') 

2397 0 True 

2398 1 False 

2399 2 False 

2400 3 NaN 

2401 dtype: object 

2402 

2403 >>> s.str.endswith(('t', 'T')) 

2404 0 True 

2405 1 False 

2406 2 True 

2407 3 NaN 

2408 dtype: object 

2409 

2410 Specifying `na` to be `False` instead of `NaN`. 

2411 

2412 >>> s.str.endswith('t', na=False) 

2413 0 True 

2414 1 False 

2415 2 False 

2416 3 False 

2417 dtype: bool 

2418 """ 

2419 if not isinstance(pat, (str, tuple)): 

2420 msg = f"expected a string or tuple, not {type(pat).__name__}" 

2421 raise TypeError(msg) 

2422 result = self._data.array._str_endswith(pat, na=na) 

2423 return self._wrap_result(result, returns_string=False) 

2424 

2425 @forbid_nonstring_types(["bytes"]) 

2426 def findall(self, pat, flags=0): 

2427 """ 

2428 Find all occurrences of pattern or regular expression in the Series/Index. 

2429 

2430 Equivalent to applying :func:`re.findall` to all the elements in the 

2431 Series/Index. 

2432 

2433 Parameters 

2434 ---------- 

2435 pat : str 

2436 Pattern or regular expression. 

2437 flags : int, default 0 

2438 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which 

2439 means no flags). 

2440 

2441 Returns 

2442 ------- 

2443 Series/Index of lists of strings 

2444 All non-overlapping matches of pattern or regular expression in each 

2445 string of this Series/Index. 

2446 

2447 See Also 

2448 -------- 

2449 count : Count occurrences of pattern or regular expression in each string 

2450 of the Series/Index. 

2451 extractall : For each string in the Series, extract groups from all matches 

2452 of regular expression and return a DataFrame with one row for each 

2453 match and one column for each group. 

2454 re.findall : The equivalent ``re`` function to all non-overlapping matches 

2455 of pattern or regular expression in string, as a list of strings. 

2456 

2457 Examples 

2458 -------- 

2459 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) 

2460 

2461 The search for the pattern 'Monkey' returns one match: 

2462 

2463 >>> s.str.findall('Monkey') 

2464 0 [] 

2465 1 [Monkey] 

2466 2 [] 

2467 dtype: object 

2468 

2469 On the other hand, the search for the pattern 'MONKEY' doesn't return any 

2470 match: 

2471 

2472 >>> s.str.findall('MONKEY') 

2473 0 [] 

2474 1 [] 

2475 2 [] 

2476 dtype: object 

2477 

2478 Flags can be added to the pattern or regular expression. For instance, 

2479 to find the pattern 'MONKEY' ignoring the case: 

2480 

2481 >>> import re 

2482 >>> s.str.findall('MONKEY', flags=re.IGNORECASE) 

2483 0 [] 

2484 1 [Monkey] 

2485 2 [] 

2486 dtype: object 

2487 

2488 When the pattern matches more than one string in the Series, all matches 

2489 are returned: 

2490 

2491 >>> s.str.findall('on') 

2492 0 [on] 

2493 1 [on] 

2494 2 [] 

2495 dtype: object 

2496 

2497 Regular expressions are supported too. For instance, the search for all the 

2498 strings ending with the word 'on' is shown next: 

2499 

2500 >>> s.str.findall('on$') 

2501 0 [on] 

2502 1 [] 

2503 2 [] 

2504 dtype: object 

2505 

2506 If the pattern is found more than once in the same string, then a list of 

2507 multiple strings is returned: 

2508 

2509 >>> s.str.findall('b') 

2510 0 [] 

2511 1 [] 

2512 2 [b, b] 

2513 dtype: object 

2514 """ 

2515 result = self._data.array._str_findall(pat, flags) 

2516 return self._wrap_result(result, returns_string=False) 

2517 

2518 @forbid_nonstring_types(["bytes"]) 

2519 def extract( 

2520 self, pat: str, flags: int = 0, expand: bool = True 

2521 ) -> DataFrame | Series | Index: 

2522 r""" 

2523 Extract capture groups in the regex `pat` as columns in a DataFrame. 

2524 

2525 For each subject string in the Series, extract groups from the 

2526 first match of regular expression `pat`. 

2527 

2528 Parameters 

2529 ---------- 

2530 pat : str 

2531 Regular expression pattern with capturing groups. 

2532 flags : int, default 0 (no flags) 

2533 Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that 

2534 modify regular expression matching for things like case, 

2535 spaces, etc. For more details, see :mod:`re`. 

2536 expand : bool, default True 

2537 If True, return DataFrame with one column per capture group. 

2538 If False, return a Series/Index if there is one capture group 

2539 or DataFrame if there are multiple capture groups. 

2540 

2541 Returns 

2542 ------- 

2543 DataFrame or Series or Index 

2544 A DataFrame with one row for each subject string, and one 

2545 column for each group. Any capture group names in regular 

2546 expression pat will be used for column names; otherwise 

2547 capture group numbers will be used. The dtype of each result 

2548 column is always object, even when no match is found. If 

2549 ``expand=False`` and pat has only one capture group, then 

2550 return a Series (if subject is a Series) or Index (if subject 

2551 is an Index). 

2552 

2553 See Also 

2554 -------- 

2555 extractall : Returns all matches (not just the first match). 

2556 

2557 Examples 

2558 -------- 

2559 A pattern with two groups will return a DataFrame with two columns. 

2560 Non-matches will be NaN. 

2561 

2562 >>> s = pd.Series(['a1', 'b2', 'c3']) 

2563 >>> s.str.extract(r'([ab])(\d)') 

2564 0 1 

2565 0 a 1 

2566 1 b 2 

2567 2 NaN NaN 

2568 

2569 A pattern may contain optional groups. 

2570 

2571 >>> s.str.extract(r'([ab])?(\d)') 

2572 0 1 

2573 0 a 1 

2574 1 b 2 

2575 2 NaN 3 

2576 

2577 Named groups will become column names in the result. 

2578 

2579 >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)') 

2580 letter digit 

2581 0 a 1 

2582 1 b 2 

2583 2 NaN NaN 

2584 

2585 A pattern with one group will return a DataFrame with one column 

2586 if expand=True. 

2587 

2588 >>> s.str.extract(r'[ab](\d)', expand=True) 

2589 0 

2590 0 1 

2591 1 2 

2592 2 NaN 

2593 

2594 A pattern with one group will return a Series if expand=False. 

2595 

2596 >>> s.str.extract(r'[ab](\d)', expand=False) 

2597 0 1 

2598 1 2 

2599 2 NaN 

2600 dtype: object 

2601 """ 

2602 from pandas import DataFrame 

2603 

2604 if not isinstance(expand, bool): 

2605 raise ValueError("expand must be True or False") 

2606 

2607 regex = re.compile(pat, flags=flags) 

2608 if regex.groups == 0: 

2609 raise ValueError("pattern contains no capture groups") 

2610 

2611 if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex): 

2612 raise ValueError("only one regex group is supported with Index") 

2613 

2614 obj = self._data 

2615 result_dtype = _result_dtype(obj) 

2616 

2617 returns_df = regex.groups > 1 or expand 

2618 

2619 if returns_df: 

2620 name = None 

2621 columns = _get_group_names(regex) 

2622 

2623 if obj.array.size == 0: 

2624 result = DataFrame(columns=columns, dtype=result_dtype) 

2625 

2626 else: 

2627 result_list = self._data.array._str_extract( 

2628 pat, flags=flags, expand=returns_df 

2629 ) 

2630 

2631 result_index: Index | None 

2632 if isinstance(obj, ABCSeries): 

2633 result_index = obj.index 

2634 else: 

2635 result_index = None 

2636 

2637 result = DataFrame( 

2638 result_list, columns=columns, index=result_index, dtype=result_dtype 

2639 ) 

2640 

2641 else: 

2642 name = _get_single_group_name(regex) 

2643 result = self._data.array._str_extract(pat, flags=flags, expand=returns_df) 

2644 return self._wrap_result(result, name=name) 

2645 

2646 @forbid_nonstring_types(["bytes"]) 

2647 def extractall(self, pat, flags=0): 

2648 r""" 

2649 Extract capture groups in the regex `pat` as columns in DataFrame. 

2650 

2651 For each subject string in the Series, extract groups from all 

2652 matches of regular expression pat. When each subject string in the 

2653 Series has exactly one match, extractall(pat).xs(0, level='match') 

2654 is the same as extract(pat). 

2655 

2656 Parameters 

2657 ---------- 

2658 pat : str 

2659 Regular expression pattern with capturing groups. 

2660 flags : int, default 0 (no flags) 

2661 A ``re`` module flag, for example ``re.IGNORECASE``. These allow 

2662 to modify regular expression matching for things like case, spaces, 

2663 etc. Multiple flags can be combined with the bitwise OR operator, 

2664 for example ``re.IGNORECASE | re.MULTILINE``. 

2665 

2666 Returns 

2667 ------- 

2668 DataFrame 

2669 A ``DataFrame`` with one row for each match, and one column for each 

2670 group. Its rows have a ``MultiIndex`` with first levels that come from 

2671 the subject ``Series``. The last level is named 'match' and indexes the 

2672 matches in each item of the ``Series``. Any capture group names in 

2673 regular expression pat will be used for column names; otherwise capture 

2674 group numbers will be used. 

2675 

2676 See Also 

2677 -------- 

2678 extract : Returns first match only (not all matches). 

2679 

2680 Examples 

2681 -------- 

2682 A pattern with one group will return a DataFrame with one column. 

2683 Indices with no matches will not appear in the result. 

2684 

2685 >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) 

2686 >>> s.str.extractall(r"[ab](\d)") 

2687 0 

2688 match 

2689 A 0 1 

2690 1 2 

2691 B 0 1 

2692 

2693 Capture group names are used for column names of the result. 

2694 

2695 >>> s.str.extractall(r"[ab](?P<digit>\d)") 

2696 digit 

2697 match 

2698 A 0 1 

2699 1 2 

2700 B 0 1 

2701 

2702 A pattern with two groups will return a DataFrame with two columns. 

2703 

2704 >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") 

2705 letter digit 

2706 match 

2707 A 0 a 1 

2708 1 a 2 

2709 B 0 b 1 

2710 

2711 Optional groups that do not match are NaN in the result. 

2712 

2713 >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)") 

2714 letter digit 

2715 match 

2716 A 0 a 1 

2717 1 a 2 

2718 B 0 b 1 

2719 C 0 NaN 1 

2720 """ 

2721 # TODO: dispatch 

2722 return str_extractall(self._orig, pat, flags) 

2723 

    # Shared docstring template for find/rfind; "%(side)s", "%(method)s"
    # and "%(also)s" are substituted via "%" formatting in the @Appender
    # decorators on those methods.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s
    """

2750 

2751 @Appender( 

2752 _shared_docs["find"] 

2753 % { 

2754 "side": "lowest", 

2755 "method": "find", 

2756 "also": "rfind : Return highest indexes in each strings.", 

2757 } 

2758 ) 

2759 @forbid_nonstring_types(["bytes"]) 

2760 def find(self, sub, start=0, end=None): 

2761 if not isinstance(sub, str): 

2762 msg = f"expected a string object, not {type(sub).__name__}" 

2763 raise TypeError(msg) 

2764 

2765 result = self._data.array._str_find(sub, start, end) 

2766 return self._wrap_result(result, returns_string=False) 

2767 

2768 @Appender( 

2769 _shared_docs["find"] 

2770 % { 

2771 "side": "highest", 

2772 "method": "rfind", 

2773 "also": "find : Return lowest indexes in each strings.", 

2774 } 

2775 ) 

2776 @forbid_nonstring_types(["bytes"]) 

2777 def rfind(self, sub, start=0, end=None): 

2778 if not isinstance(sub, str): 

2779 msg = f"expected a string object, not {type(sub).__name__}" 

2780 raise TypeError(msg) 

2781 

2782 result = self._data.array._str_rfind(sub, start=start, end=end) 

2783 return self._wrap_result(result, returns_string=False) 

2784 

2785 @forbid_nonstring_types(["bytes"]) 

2786 def normalize(self, form): 

2787 """ 

2788 Return the Unicode normal form for the strings in the Series/Index. 

2789 

2790 For more information on the forms, see the 

2791 :func:`unicodedata.normalize`. 

2792 

2793 Parameters 

2794 ---------- 

2795 form : {'NFC', 'NFKC', 'NFD', 'NFKD'} 

2796 Unicode form. 

2797 

2798 Returns 

2799 ------- 

2800 normalized : Series/Index of objects 

2801 """ 

2802 result = self._data.array._str_normalize(form) 

2803 return self._wrap_result(result) 

2804 

    # Shared docstring template for index/rindex; "%(side)s", "%(similar)s",
    # "%(method)s" and "%(also)s" are substituted via "%" formatting in the
    # @Appender decorators on those methods.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s
    """

2833 

2834 @Appender( 

2835 _shared_docs["index"] 

2836 % { 

2837 "side": "lowest", 

2838 "similar": "find", 

2839 "method": "index", 

2840 "also": "rindex : Return highest indexes in each strings.", 

2841 } 

2842 ) 

2843 @forbid_nonstring_types(["bytes"]) 

2844 def index(self, sub, start=0, end=None): 

2845 if not isinstance(sub, str): 

2846 msg = f"expected a string object, not {type(sub).__name__}" 

2847 raise TypeError(msg) 

2848 

2849 result = self._data.array._str_index(sub, start=start, end=end) 

2850 return self._wrap_result(result, returns_string=False) 

2851 

2852 @Appender( 

2853 _shared_docs["index"] 

2854 % { 

2855 "side": "highest", 

2856 "similar": "rfind", 

2857 "method": "rindex", 

2858 "also": "index : Return lowest indexes in each strings.", 

2859 } 

2860 ) 

2861 @forbid_nonstring_types(["bytes"]) 

2862 def rindex(self, sub, start=0, end=None): 

2863 if not isinstance(sub, str): 

2864 msg = f"expected a string object, not {type(sub).__name__}" 

2865 raise TypeError(msg) 

2866 

2867 result = self._data.array._str_rindex(sub, start=start, end=end) 

2868 return self._wrap_result(result, returns_string=False) 

2869 

2870 def len(self): 

2871 """ 

2872 Compute the length of each element in the Series/Index. 

2873 

2874 The element may be a sequence (such as a string, tuple or list) or a collection 

2875 (such as a dictionary). 

2876 

2877 Returns 

2878 ------- 

2879 Series or Index of int 

2880 A Series or Index of integer values indicating the length of each 

2881 element in the Series or Index. 

2882 

2883 See Also 

2884 -------- 

2885 str.len : Python built-in function returning the length of an object. 

2886 Series.size : Returns the length of the Series. 

2887 

2888 Examples 

2889 -------- 

2890 Returns the length (number of characters) in a string. Returns the 

2891 number of entries for dictionaries, lists or tuples. 

2892 

2893 >>> s = pd.Series(['dog', 

2894 ... '', 

2895 ... 5, 

2896 ... {'foo' : 'bar'}, 

2897 ... [2, 3, 5, 7], 

2898 ... ('one', 'two', 'three')]) 

2899 >>> s 

2900 0 dog 

2901 1 

2902 2 5 

2903 3 {'foo': 'bar'} 

2904 4 [2, 3, 5, 7] 

2905 5 (one, two, three) 

2906 dtype: object 

2907 >>> s.str.len() 

2908 0 3.0 

2909 1 0.0 

2910 2 NaN 

2911 3 1.0 

2912 4 4.0 

2913 5 3.0 

2914 dtype: float64 

2915 """ 

2916 result = self._data.array._str_len() 

2917 return self._wrap_result(result, returns_string=False) 

2918 

    # Shared docstring template for the case-conversion methods below
    # (lower/upper/title/capitalize/swapcase/casefold); "%(type)s",
    # "%(version)s" and "%(method)s" are filled from _doc_args via "%"
    # formatting in each method's @Appender decorator.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """
    # Types:
    #   cases:
    #       upper, lower, title, capitalize, swapcase, casefold
    #   boolean:
    #       isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
    # _doc_args holds dict of strings to use in substituting casemethod docs
    _doc_args: dict[str, dict[str, str]] = {}
    _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
    _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
    _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
    _doc_args["capitalize"] = {
        "type": "be capitalized",
        "method": "capitalize",
        "version": "",
    }
    _doc_args["swapcase"] = {
        "type": "be swapcased",
        "method": "swapcase",
        "version": "",
    }
    _doc_args["casefold"] = {
        "type": "be casefolded",
        "method": "casefold",
        "version": "\n    .. versionadded:: 0.25.0\n",
    }

3012 

3013 @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) 

3014 @forbid_nonstring_types(["bytes"]) 

3015 def lower(self): 

3016 result = self._data.array._str_lower() 

3017 return self._wrap_result(result) 

3018 

3019 @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) 

3020 @forbid_nonstring_types(["bytes"]) 

3021 def upper(self): 

3022 result = self._data.array._str_upper() 

3023 return self._wrap_result(result) 

3024 

3025 @Appender(_shared_docs["casemethods"] % _doc_args["title"]) 

3026 @forbid_nonstring_types(["bytes"]) 

3027 def title(self): 

3028 result = self._data.array._str_title() 

3029 return self._wrap_result(result) 

3030 

3031 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) 

3032 @forbid_nonstring_types(["bytes"]) 

3033 def capitalize(self): 

3034 result = self._data.array._str_capitalize() 

3035 return self._wrap_result(result) 

3036 

3037 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) 

3038 @forbid_nonstring_types(["bytes"]) 

3039 def swapcase(self): 

3040 result = self._data.array._str_swapcase() 

3041 return self._wrap_result(result) 

3042 

3043 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) 

3044 @forbid_nonstring_types(["bytes"]) 

3045 def casefold(self): 

3046 result = self._data.array._str_casefold() 

3047 return self._wrap_result(result) 

3048 

    # Shared docstring template for the str.is* predicate methods defined
    # below; "%(type)s" and "%(method)s" are filled per-method from the
    # corresponding _doc_args entry.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    # Per-method substitutions applied to the "ismethods" template above.
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}

    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

    # Each str.is* predicate below is generated by _map_and_wrap (defined
    # earlier in this file), presumably mapping the corresponding
    # ExtensionArray ``_str_<name>`` method over the data — confirm against
    # _map_and_wrap's definition. The docstring for each is the shared
    # "ismethods" template %-formatted with that method's _doc_args entry.
    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )

3229 

3230 

def cat_safe(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If any column contains values that are not strings (or missing).
    """
    try:
        result = cat_core(list_of_columns, sep)
    except TypeError as err:
        # if there are any non-string values (wrong dtype or hidden behind
        # object dtype), np.sum will fail; catch and return with better message
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ["string", "empty"]:
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {dtype}"
                ) from None
        # BUG FIX: the TypeError was not caused by a bad dtype.  Previously
        # control fell through to ``return result`` with ``result`` unbound,
        # so callers saw an UnboundLocalError that masked the real failure.
        # Re-raise the original exception instead.
        raise err
    return result

3265 

3266 

def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # Empty separator: elementwise string addition of the columns alone.
        return np.sum(np.asarray(list_of_columns, dtype=object), axis=0)
    # Interleave the separator between the columns: [col0, sep, col1, sep, ...]
    # (the scalar sep broadcasts against each column), then reduce elementwise.
    interleaved = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)

3292 

3293 

def _result_dtype(arr):
    # workaround #27953
    # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
    # when the list of values is empty.
    from pandas.core.arrays.string_ import StringDtype

    # Preserve a StringDtype; everything else falls back to plain object.
    return arr.dtype if isinstance(arr.dtype, StringDtype) else object

3304 

3305 

def _get_single_group_name(regex: re.Pattern) -> Hashable:
    # Return the name of the first named capture group in *regex*,
    # or None when the pattern has no named groups at all.
    if not regex.groupindex:
        return None
    return next(iter(regex.groupindex))

3311 

3312 

def _get_group_names(regex: re.Pattern) -> list[Hashable]:
    """
    Get named groups from compiled regex.

    Unnamed groups are numbered.

    Parameters
    ----------
    regex : compiled regex

    Returns
    -------
    list of column labels
    """
    # Invert groupindex (name -> 1-based number) into number -> name.
    index_to_name = {number: name for name, number in regex.groupindex.items()}
    # For unnamed groups, fall back to the 0-based group position.
    return [index_to_name.get(pos + 1, pos) for pos in range(regex.groups)]

3329 

3330 

def str_extractall(arr, pat, flags=0):
    """
    Extract all non-overlapping matches of ``pat`` from each string in ``arr``.

    Returns a DataFrame with one row per match, indexed by the original
    index plus an extra ``match`` level, and one column per capture group.
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True)

    columns = _get_group_names(regex)
    rows = []
    keys = []
    has_multi_index = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        # Non-string entries (e.g. NaN) produce no rows at all.
        if not isinstance(subject, str):
            continue

        # Normalize the key to a tuple so the match number can be appended.
        if not has_multi_index:
            subject_key = (subject_key,)

        for match_number, groups in enumerate(regex.findall(subject)):
            # findall returns a bare string for single-group patterns.
            if isinstance(groups, str):
                groups = (groups,)
            # Empty captures become missing values.
            rows.append([np.NaN if group == "" else group for group in groups])
            keys.append(subject_key + (match_number,))

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(keys, names=arr.index.names + ["match"])
    return arr._constructor_expanddim(
        rows, index=index, columns=columns, dtype=_result_dtype(arr)
    )