Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/strings/object_array.py: 15%

304 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3from collections.abc import Callable # noqa: PDF001 

4import re 

5import textwrap 

6from typing import TYPE_CHECKING 

7import unicodedata 

8 

9import numpy as np 

10 

11import pandas._libs.lib as lib 

12import pandas._libs.missing as libmissing 

13import pandas._libs.ops as libops 

14from pandas._typing import ( 

15 NpDtype, 

16 Scalar, 

17) 

18 

19from pandas.core.dtypes.common import is_scalar 

20from pandas.core.dtypes.missing import isna 

21 

22from pandas.core.strings.base import BaseStringArrayMethods 

23 

24if TYPE_CHECKING: 24 ↛ 25line 24 didn't jump to line 25, because the condition on line 24 was never true

25 from pandas import Series 

26 

27 

class ObjectStringArrayMixin(BaseStringArrayMethods):
    """
    String Methods operating on object-dtype ndarrays.

    Most ``_str_*`` methods build a per-element callable and delegate to
    :meth:`_str_map`, which applies it to the non-NA elements of the array.
    """

    _str_na_value = np.nan

    def __len__(self):
        # For typing, _str_map relies on the object being sized.
        raise NotImplementedError

    def _str_map(
        self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
    ):
        """
        Map a callable over valid elements of the array.

        Parameters
        ----------
        f : Callable
            A function to call on each non-NA element.
        na_value : Scalar, optional
            The value to set for NA values. Might also be used for the
            fill value if the callable `f` raises an exception.
            This defaults to ``self._str_na_value`` which is ``np.nan``
            for object-dtype and Categorical and ``pd.NA`` for StringArray.
        dtype : Dtype, optional
            The dtype of the result array.
        convert : bool, default True
            Whether to call `maybe_convert_objects` on the resulting ndarray

        Returns
        -------
        The mapped result; an empty ndarray of ``dtype`` when ``self`` is empty.
        """
        if dtype is None:
            dtype = np.dtype("object")
        if na_value is None:
            na_value = self._str_na_value

        if not len(self):
            return np.array([], dtype=dtype)

        arr = np.asarray(self, dtype=object)
        mask = isna(arr)
        # Skip conversion inside the mapper when every element is NA.
        map_convert = convert and not np.all(mask)
        try:
            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
        except (TypeError, AttributeError) as err:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            p_err = (
                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
                r"(?(3)required )positional arguments?"
            )

            if len(err.args) >= 1 and re.search(p_err, err.args[0]):
                # FIXME: this should be totally avoidable
                raise

            def g(x):
                # This type of fallback behavior can be removed once
                # we remove object-dtype .str accessor.
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return self._str_map(g, na_value=na_value, dtype=dtype)
        if not isinstance(result, np.ndarray):
            return result
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if convert and result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result

    def _str_count(self, pat, flags=0):
        """Count the matches of regex ``pat`` in each element."""
        regex = re.compile(pat, flags=flags)
        f = lambda x: len(regex.findall(x))
        return self._str_map(f, dtype="int64")

    def _str_pad(self, width, side="left", fillchar=" "):
        """
        Pad each element to ``width`` with ``fillchar``.

        ``side`` names where the padding goes: "left" (``rjust``),
        "right" (``ljust``) or "both" (``center``).

        Raises
        ------
        ValueError
            If ``side`` is not one of the three accepted values.
        """
        if side == "left":
            f = lambda x: x.rjust(width, fillchar)
        elif side == "right":
            f = lambda x: x.ljust(width, fillchar)
        elif side == "both":
            f = lambda x: x.center(width, fillchar)
        else:  # pragma: no cover
            raise ValueError("Invalid side")
        return self._str_map(f)

    def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
        """
        Test whether each element contains ``pat``.

        With ``regex=True`` the pattern is compiled (adding ``re.IGNORECASE``
        when ``case`` is false) and searched; otherwise a plain substring
        test is used, upper-casing both sides when ``case`` is false.
        ``na`` is forwarded to ``_str_map`` as the NA fill value.
        """
        if regex:
            if not case:
                flags |= re.IGNORECASE

            pat = re.compile(pat, flags=flags)

            f = lambda x: pat.search(x) is not None
        else:
            if case:
                f = lambda x: pat in x
            else:
                upper_pat = pat.upper()
                f = lambda x: upper_pat in x.upper()
        return self._str_map(f, na, dtype=np.dtype("bool"))

    def _str_startswith(self, pat, na=None):
        """Test whether each element starts with ``pat``."""
        f = lambda x: x.startswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_endswith(self, pat, na=None):
        """Test whether each element ends with ``pat``."""
        f = lambda x: x.endswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        """
        Replace occurrences of ``pat`` in each element with ``repl``.

        The ``re`` module is used when ``regex`` is true, when any flag is
        set (including ``re.IGNORECASE`` added for ``case=False``) or when
        ``repl`` is callable; otherwise ``str.replace`` is used.
        """
        if case is False:
            # add case flag, if provided
            flags |= re.IGNORECASE

        if regex or flags or callable(repl):
            if not isinstance(pat, re.Pattern):
                if regex is False:
                    # Literal pattern that still needs re (flags/callable).
                    pat = re.escape(pat)
                pat = re.compile(pat, flags=flags)

            # re.sub uses count=0 for "replace all"; negative n maps to that.
            n = n if n >= 0 else 0
            f = lambda x: pat.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)

        return self._str_map(f, dtype=str)

    def _str_repeat(self, repeats):
        """
        Repeat each element.

        A scalar ``repeats`` is applied to every element through
        ``_str_map``; otherwise ``repeats`` is paired element-wise with the
        array via ``libops.vec_binop``.
        """
        if is_scalar(repeats):

            def scalar_rep(x):
                # Try bytes first and fall back to str on TypeError.
                try:
                    return bytes.__mul__(x, repeats)
                except TypeError:
                    return str.__mul__(x, repeats)

            return self._str_map(scalar_rep, dtype=str)
        else:
            from pandas.core.arrays.string_ import BaseStringArray

            def rep(x, r):
                if x is libmissing.NA:
                    return x
                try:
                    return bytes.__mul__(x, r)
                except TypeError:
                    return str.__mul__(x, r)

            repeats = np.asarray(repeats, dtype=object)
            result = libops.vec_binop(np.asarray(self), repeats, rep)
            if isinstance(self, BaseStringArray):
                # Not going through map, so we have to do this here.
                result = type(self)._from_sequence(result)
            return result

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        """Test whether regex ``pat`` matches at the start of each element."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.match(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_fullmatch(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | None = None,
    ):
        """Test whether regex ``pat`` matches each element in its entirety."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.fullmatch(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_encode(self, encoding, errors="strict"):
        """Call ``encode(encoding, errors=errors)`` on each element."""
        f = lambda x: x.encode(encoding, errors=errors)
        return self._str_map(f, dtype=object)

    def _str_find(self, sub, start=0, end=None):
        """Per-element ``find`` of ``sub`` within ``[start, end)``."""
        return self._str_find_(sub, start, end, side="left")

    def _str_rfind(self, sub, start=0, end=None):
        """Per-element ``rfind`` of ``sub`` within ``[start, end)``."""
        return self._str_find_(sub, start, end, side="right")

    def _str_find_(self, sub, start, end, side):
        """
        Shared implementation of ``_str_find`` and ``_str_rfind``.

        ``side`` selects the method: "left" -> ``find``, "right" -> ``rfind``.

        Raises
        ------
        ValueError
            If ``side`` is neither "left" nor "right".
        """
        if side == "left":
            method = "find"
        elif side == "right":
            method = "rfind"
        else:  # pragma: no cover
            raise ValueError("Invalid side")

        if end is None:
            f = lambda x: getattr(x, method)(sub, start)
        else:
            f = lambda x: getattr(x, method)(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_findall(self, pat, flags=0):
        """Apply ``re.findall`` with the compiled ``pat`` to each element."""
        regex = re.compile(pat, flags=flags)
        return self._str_map(regex.findall, dtype="object")

    def _str_get(self, i):
        """
        Extract item ``i`` from each element.

        Dicts are queried with ``.get(i)``; other elements are indexed when
        ``i`` is within bounds, else ``self._str_na_value`` is returned.
        """

        def f(x):
            if isinstance(x, dict):
                return x.get(i)
            elif len(x) > i >= -len(x):
                return x[i]
            return self._str_na_value

        return self._str_map(f)

    def _str_index(self, sub, start=0, end=None):
        """Per-element ``index`` of ``sub`` within ``[start, end)``."""
        # ``end`` (including None) is passed straight through, so no
        # branching on ``end`` is required.
        f = lambda x: x.index(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_rindex(self, sub, start=0, end=None):
        """Per-element ``rindex`` of ``sub`` within ``[start, end)``."""
        # ``end`` (including None) is passed straight through, so no
        # branching on ``end`` is required.
        f = lambda x: x.rindex(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_join(self, sep):
        """Join each element (an iterable of strings) with ``sep``."""
        return self._str_map(sep.join)

    def _str_partition(self, sep, expand):
        """Call ``partition(sep)`` on each element; ``expand`` is unused here."""
        result = self._str_map(lambda x: x.partition(sep), dtype="object")
        return result

    def _str_rpartition(self, sep, expand):
        """Call ``rpartition(sep)`` on each element; ``expand`` is unused here."""
        return self._str_map(lambda x: x.rpartition(sep), dtype="object")

    def _str_len(self):
        """Apply ``len`` to each element."""
        return self._str_map(len, dtype="int64")

    def _str_slice(self, start=None, stop=None, step=None):
        """Slice each element with ``slice(start, stop, step)``."""
        obj = slice(start, stop, step)
        return self._str_map(lambda x: x[obj])

    def _str_slice_replace(self, start=None, stop=None, repl=None):
        """
        Replace the ``[start:stop]`` slice of each element with ``repl``.

        ``repl=None`` is treated as the empty string.
        """
        if repl is None:
            repl = ""

        def f(x):
            # When the slice is empty the tail resumes at ``start`` rather
            # than ``stop``.
            if x[start:stop] == "":
                local_stop = start
            else:
                local_stop = stop
            y = ""
            if start is not None:
                y += x[:start]
            y += repl
            if stop is not None:
                y += x[local_stop:]
            return y

        return self._str_map(f)

    def _str_split(
        self,
        pat: str | re.Pattern | None = None,
        n=-1,
        expand=False,
        regex: bool | None = None,
    ):
        """
        Split each element on ``pat``, at most ``n`` times.

        ``pat`` is treated as a regular expression when ``regex`` is True or
        ``pat`` is already compiled, and literally when ``regex`` is False.
        With ``regex=None`` a single-character ``pat`` is literal and a
        longer one is compiled. ``expand`` is not used in this method.
        """
        if pat is None:
            # str.split uses -1 for "no limit".
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            new_pat: str | re.Pattern
            if regex is True or isinstance(pat, re.Pattern):
                new_pat = re.compile(pat)
            elif regex is False:
                new_pat = pat
            # regex is None so link to old behavior #43563
            else:
                if len(pat) == 1:
                    new_pat = pat
                else:
                    new_pat = re.compile(pat)

            if isinstance(new_pat, re.Pattern):
                # Pattern.split uses maxsplit=0 for "no limit".
                if n is None or n == -1:
                    n = 0
                f = lambda x: new_pat.split(x, maxsplit=n)
            else:
                if n is None or n == 0:
                    n = -1
                f = lambda x: x.split(pat, n)
        return self._str_map(f, dtype=object)

    def _str_rsplit(self, pat=None, n=-1):
        """Call ``rsplit(pat, n)`` on each element; ``n`` of None/0 means no limit."""
        if n is None or n == 0:
            n = -1
        f = lambda x: x.rsplit(pat, n)
        return self._str_map(f, dtype="object")

    def _str_translate(self, table):
        """Call ``translate(table)`` on each element."""
        return self._str_map(lambda x: x.translate(table))

    def _str_wrap(self, width, **kwargs):
        """
        Wrap each element with ``textwrap.TextWrapper`` and join with newlines.

        ``kwargs`` are forwarded to ``TextWrapper`` with ``width`` added.
        """
        kwargs["width"] = width
        tw = textwrap.TextWrapper(**kwargs)
        return self._str_map(lambda s: "\n".join(tw.wrap(s)))

    def _str_get_dummies(self, sep="|"):
        """
        Build an indicator matrix of the ``sep``-delimited tags per element.

        Returns
        -------
        tuple
            ``(dummies, tags)`` where ``tags`` is the sorted list of distinct
            non-empty tags and ``dummies`` is an int64 ndarray of shape
            ``(len(self), len(tags))``.
        """
        from pandas import Series

        arr = Series(self).fillna("")
        # Surround every element with separators so each tag can be located
        # as ``sep + tag + sep``.
        try:
            arr = sep + arr + sep
        except (TypeError, NotImplementedError):
            arr = sep + arr.astype(str) + sep

        tags: set[str] = set()
        for ts in Series(arr).str.split(sep):
            tags.update(ts)
        tags2 = sorted(tags - {""})

        dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)

        # Loop-invariant: materialise the values once, not once per tag.
        values = arr.to_numpy()
        for i, t in enumerate(tags2):
            pat = sep + t + sep
            dummies[:, i] = lib.map_infer(values, lambda x: pat in x)
        return dummies, tags2

    def _str_upper(self):
        """Call ``upper()`` on each element."""
        return self._str_map(lambda x: x.upper())

    def _str_isalnum(self):
        """Apply ``str.isalnum`` to each element."""
        return self._str_map(str.isalnum, dtype="bool")

    def _str_isalpha(self):
        """Apply ``str.isalpha`` to each element."""
        return self._str_map(str.isalpha, dtype="bool")

    def _str_isdecimal(self):
        """Apply ``str.isdecimal`` to each element."""
        return self._str_map(str.isdecimal, dtype="bool")

    def _str_isdigit(self):
        """Apply ``str.isdigit`` to each element."""
        return self._str_map(str.isdigit, dtype="bool")

    def _str_islower(self):
        """Apply ``str.islower`` to each element."""
        return self._str_map(str.islower, dtype="bool")

    def _str_isnumeric(self):
        """Apply ``str.isnumeric`` to each element."""
        return self._str_map(str.isnumeric, dtype="bool")

    def _str_isspace(self):
        """Apply ``str.isspace`` to each element."""
        return self._str_map(str.isspace, dtype="bool")

    def _str_istitle(self):
        """Apply ``str.istitle`` to each element."""
        return self._str_map(str.istitle, dtype="bool")

    def _str_isupper(self):
        """Apply ``str.isupper`` to each element."""
        return self._str_map(str.isupper, dtype="bool")

    def _str_capitalize(self):
        """Apply ``str.capitalize`` to each element."""
        return self._str_map(str.capitalize)

    def _str_casefold(self):
        """Apply ``str.casefold`` to each element."""
        return self._str_map(str.casefold)

    def _str_title(self):
        """Apply ``str.title`` to each element."""
        return self._str_map(str.title)

    def _str_swapcase(self):
        """Apply ``str.swapcase`` to each element."""
        return self._str_map(str.swapcase)

    def _str_lower(self):
        """Apply ``str.lower`` to each element."""
        return self._str_map(str.lower)

    def _str_normalize(self, form):
        """Apply ``unicodedata.normalize(form, ...)`` to each element."""
        f = lambda x: unicodedata.normalize(form, x)
        return self._str_map(f)

    def _str_strip(self, to_strip=None):
        """Call ``strip(to_strip)`` on each element."""
        return self._str_map(lambda x: x.strip(to_strip))

    def _str_lstrip(self, to_strip=None):
        """Call ``lstrip(to_strip)`` on each element."""
        return self._str_map(lambda x: x.lstrip(to_strip))

    def _str_rstrip(self, to_strip=None):
        """Call ``rstrip(to_strip)`` on each element."""
        return self._str_map(lambda x: x.rstrip(to_strip))

    def _str_removeprefix(self, prefix: str) -> Series:
        """Drop ``prefix`` from the start of each element that begins with it."""
        # outstanding question on whether to use native methods for users on Python 3.9+
        # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
        # in which case we could do return self._str_map(str.removeprefix)

        def removeprefix(text: str) -> str:
            if text.startswith(prefix):
                return text[len(prefix) :]
            return text

        return self._str_map(removeprefix)

    def _str_removesuffix(self, suffix: str) -> Series:
        """Drop ``suffix`` from the end of each element that ends with it."""
        # this could be used on Python 3.9+
        # f = lambda x: x.removesuffix(suffix)
        # return self._str_map(str.removesuffix)

        def removesuffix(text: str) -> str:
            # Guard on a non-empty suffix: every string ends with "" and
            # ``text[:-0]`` is ``text[:0]``, which would wipe the element.
            if suffix and text.endswith(suffix):
                return text[: -len(suffix)]
            return text

        return self._str_map(removesuffix)

    def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
        """
        Extract the capture groups of the first match of ``pat`` per element.

        With ``expand=False`` only the first group is returned, mapped
        through ``_str_map``. With ``expand=True`` a list holding one row
        (list of group values) per element is returned; non-string elements
        and non-matches yield a row of ``self._str_na_value``.
        """
        regex = re.compile(pat, flags=flags)
        na_value = self._str_na_value

        if not expand:

            def g(x):
                m = regex.search(x)
                return m.groups()[0] if m else na_value

            return self._str_map(g, convert=False)

        # One NA per capture group; this same list object is reused for
        # every row that has no match.
        empty_row = [na_value] * regex.groups

        def f(x):
            if not isinstance(x, str):
                return empty_row
            m = regex.search(x)
            if m:
                return [na_value if item is None else item for item in m.groups()]
            else:
                return empty_row

        return [f(val) for val in np.asarray(self)]