Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/string_arrow.py: 18%

240 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3from collections.abc import Callable # noqa: PDF001 

4import re 

5from typing import Union 

6 

7import numpy as np 

8 

9from pandas._libs import ( 

10 lib, 

11 missing as libmissing, 

12) 

13from pandas._typing import ( 

14 Dtype, 

15 NpDtype, 

16 Scalar, 

17 npt, 

18) 

19from pandas.compat import ( 

20 pa_version_under1p01, 

21 pa_version_under2p0, 

22 pa_version_under3p0, 

23 pa_version_under4p0, 

24) 

25 

26from pandas.core.dtypes.common import ( 

27 is_bool_dtype, 

28 is_dtype_equal, 

29 is_integer_dtype, 

30 is_object_dtype, 

31 is_scalar, 

32 is_string_dtype, 

33 pandas_dtype, 

34) 

35from pandas.core.dtypes.missing import isna 

36 

37from pandas.core.arrays.arrow import ArrowExtensionArray 

38from pandas.core.arrays.boolean import BooleanDtype 

39from pandas.core.arrays.integer import Int64Dtype 

40from pandas.core.arrays.numeric import NumericDtype 

41from pandas.core.arrays.string_ import ( 

42 BaseStringArray, 

43 StringDtype, 

44) 

45from pandas.core.strings.object_array import ObjectStringArrayMixin 

46 

47if not pa_version_under1p01: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true

48 import pyarrow as pa 

49 import pyarrow.compute as pc 

50 

51 from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning 

52 

53ArrowStringScalarOrNAT = Union[str, libmissing.NAType] 

54 

55 

def _chk_pyarrow_available() -> None:
    """
    Guard that fails fast when the pyarrow requirement is not met.

    Raises
    ------
    ImportError
        If ``pa_version_under1p01`` is truthy.
    """
    if not pa_version_under1p01:
        return
    raise ImportError(
        "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
    )

60 

61 

62# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from 

63# ObjectStringArrayMixin because we want to have the object-dtype based methods as 

64# fallback for the ones that pyarrow doesn't yet support 

65 

66 

67class ArrowStringArray(ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin): 

68 """ 

69 Extension array for string data in a ``pyarrow.ChunkedArray``. 

70 

71 .. versionadded:: 1.2.0 

72 

73 .. warning:: 

74 

75 ArrowStringArray is considered experimental. The implementation and 

76 parts of the API may change without warning. 

77 

78 Parameters 

79 ---------- 

80 values : pyarrow.Array or pyarrow.ChunkedArray 

81 The array of data. 

82 

83 Attributes 

84 ---------- 

85 None 

86 

87 Methods 

88 ------- 

89 None 

90 

91 See Also 

92 -------- 

93 array 

94 The recommended function for creating an ArrowStringArray.

95 Series.str 

96 The string methods are available on Series backed by 

97 an ArrowStringArray.

98 

99 Notes 

100 ----- 

101 ArrowStringArray returns a BooleanArray for comparison methods. 

102 

103 Examples 

104 -------- 

105 >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") 

106 <ArrowStringArray> 

107 ['This is', 'some text', <NA>, 'data.'] 

108 Length: 4, dtype: string 

109 """ 

110 

111 # error: Incompatible types in assignment (expression has type "StringDtype", 

112 # base class "ArrowExtensionArray" defined the type as "ArrowDtype") 

113 _dtype: StringDtype # type: ignore[assignment] 

114 

def __init__(self, values) -> None:
    """
    Wrap ``values`` and tag the array with the pyarrow-backed string dtype.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        Forwarded unchanged to the parent constructor.

    Raises
    ------
    ValueError
        If the stored data's arrow type is not a string type.
    """
    super().__init__(values)
    self._dtype = StringDtype(storage="pyarrow")

    # The type check runs on what the parent constructor stored, not on the
    # raw ``values`` argument.
    if pa.types.is_string(self._data.type):
        return
    raise ValueError(
        "ArrowStringArray requires a PyArrow (chunked) array of string type"
    )

123 

@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
    """
    Build an array from a sequence of scalars.

    Parameters
    ----------
    scalars : sequence
        Values to convert; masked arrays take a dedicated path.
    dtype : Dtype, optional
        When given and not the plain ``"string"`` alias, it is resolved with
        ``pandas_dtype`` and asserted to be a pyarrow-storage ``StringDtype``.
    copy : bool, default False
        Forwarded to ``lib.ensure_string_array``.
    """
    # Imported lazily inside the function, as in the original code.
    from pandas.core.arrays.masked import BaseMaskedArray

    _chk_pyarrow_available()

    needs_resolution = bool(dtype) and not (
        isinstance(dtype, str) and dtype == "string"
    )
    if needs_resolution:
        dtype = pandas_dtype(dtype)
        assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"

    if isinstance(scalars, BaseMaskedArray):
        # avoid costly conversion to object dtype in ensure_string_array and
        # numerical issues with Float32Dtype
        missing = scalars._mask
        strings = lib.ensure_string_array(
            scalars._data, copy=copy, convert_na_value=False
        )
        arrow_values = pa.array(strings, mask=missing, type=pa.string())
    else:
        # convert non-na-likes to str
        strings = lib.ensure_string_array(scalars, copy=copy)
        arrow_values = pa.array(strings, type=pa.string(), from_pandas=True)
    return cls(arrow_values)

145 

@classmethod
def _from_sequence_of_strings(
    cls, strings, dtype: Dtype | None = None, copy: bool = False
):
    """
    Build an array from strings by delegating to ``_from_sequence``.

    All three arguments are passed through unchanged.
    """
    built = cls._from_sequence(strings, dtype=dtype, copy=copy)
    return built

151 

@property
def dtype(self) -> StringDtype:  # type: ignore[override]
    """
    The dtype stored on this array, an instance of 'string[pyarrow]'.
    """
    string_dtype = self._dtype
    return string_dtype

158 

def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    NumPy array protocol hook, used by ``np.asarray()``.

    Delegates to ``to_numpy`` with the requested ``dtype``.
    """
    as_ndarray = self.to_numpy(dtype=dtype)
    return as_ndarray

162 

def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value=lib.no_default,
) -> np.ndarray:
    """
    Convert to a NumPy ndarray.

    Parameters
    ----------
    dtype : dtype-like, optional
        Passed to ``np.array`` when converting the stored data.
    copy : bool, default False
        Currently ignored.
    na_value : object, default ``lib.no_default``
        Value written at missing positions. When left at the default, the
        dtype's ``na_value`` is used, except that a floating ``dtype`` returns
        the converted array untouched.
    """
    # TODO: copy argument is ignored

    converted = np.array(self._data, dtype=dtype)
    has_nulls = self._data.null_count > 0
    if not has_nulls:
        return converted

    if na_value is lib.no_default:
        if dtype and np.issubdtype(dtype, np.floating):
            # Floating output is returned as converted, without overwriting
            # the missing positions.
            return converted
        na_value = self._dtype.na_value

    converted[self.isna()] = na_value
    return converted

183 

def insert(self, loc: int, item) -> ArrowStringArray:
    """
    Insert ``item`` at position ``loc`` via the parent implementation.

    Raises
    ------
    TypeError
        If ``item`` is neither a ``str`` nor ``libmissing.NA``.
    """
    if item is libmissing.NA or isinstance(item, str):
        return super().insert(loc, item)
    raise TypeError("Scalar must be NA or str")

188 

189 def _maybe_convert_setitem_value(self, value): 

190 """Maybe convert value to be pyarrow compatible.""" 

191 if is_scalar(value): 

192 if isna(value): 

193 value = None 

194 elif not isinstance(value, str): 

195 raise ValueError("Scalar must be NA or str") 

196 else: 

197 value = np.array(value, dtype=object, copy=True) 

198 value[isna(value)] = None 

199 for v in value: 

200 if not (v is None or isinstance(v, str)): 

201 raise ValueError("Scalar must be NA or str") 

202 return value 

203 

def isin(self, values) -> npt.NDArray[np.bool_]:
    """
    Elementwise membership test of the stored data against ``values``.

    Only entries of ``values`` that pyarrow infers as string or null scalars
    are kept in the lookup set; everything else is dropped before the
    comparison.
    """
    if pa_version_under2p0:
        fallback_performancewarning(version="2")
        return super().isin(values)

    accepted_types = (pa.string(), pa.null())
    value_set = []
    for value in values:
        pa_scalar = pa.scalar(value, from_pandas=True)
        if pa_scalar.type in accepted_types:
            value_set.append(pa_scalar.as_py())

    # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
    # for null values, so we short-circuit to return all False array.
    if not value_set:
        return np.zeros(len(self), dtype=bool)

    # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
    # with unexpected keyword argument in pyarrow 3.0.0+
    kwargs = {"skip_null": True} if pa_version_under3p0 else {}

    membership = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
    # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
    # to False
    return np.array(membership, dtype=np.bool_)

230 

def astype(self, dtype, copy: bool = True):
    """
    Cast to ``dtype``.

    - Same dtype as this array: return a copy when ``copy`` is true,
      otherwise ``self``.
    - ``NumericDtype``: cast the arrow data to the matching numpy-derived
      arrow type and rebuild through ``dtype.__from_arrow__``.
    - Anything else: defer to the parent implementation.
    """
    dtype = pandas_dtype(dtype)

    if is_dtype_equal(dtype, self.dtype):
        return self.copy() if copy else self

    if isinstance(dtype, NumericDtype):
        arrow_type = pa.from_numpy_dtype(dtype.numpy_dtype)
        return dtype.__from_arrow__(self._data.cast(arrow_type))

    return super().astype(dtype, copy=copy)

244 

245 # ------------------------------------------------------------------------ 

246 # String methods interface 

247 

248 # error: Incompatible types in assignment (expression has type "NAType", 

249 # base class "ObjectStringArrayMixin" defined the type as "float") 

250 _str_na_value = libmissing.NA # type: ignore[assignment] 

251 

def _str_map(
    self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
    """
    Map ``f`` over the non-missing elements via ``lib.map_infer_mask``.

    Parameters
    ----------
    f : callable
        Function handed to ``lib.map_infer_mask``.
    na_value : object, optional
        Fill value for missing positions; defaults to the dtype's
        ``na_value``.
    dtype : Dtype, optional
        Requested result dtype; defaults to this array's dtype. It selects
        one of three branches: integer/bool, string (non-object), or object.
    convert : bool, default True
        Accepted but not read anywhere in this method.
    """
    # TODO: de-duplicate with StringArray method. This method is more or less
    # copy and paste.

    from pandas.arrays import (
        BooleanArray,
        IntegerArray,
    )

    if dtype is None:
        dtype = self.dtype
    if na_value is None:
        na_value = self.dtype.na_value

    mask = isna(self)
    arr = np.asarray(self)

    wants_integer = is_integer_dtype(dtype)
    if wants_integer or is_bool_dtype(dtype):
        constructor: type[IntegerArray] | type[BooleanArray]
        constructor = IntegerArray if wants_integer else BooleanArray

        # A missing fill value is swapped for a placeholder of 1; the mask
        # passed to the constructor still marks those slots as missing.
        na_value_is_na = isna(na_value)
        if na_value_is_na:
            na_value = 1
        mapped = lib.map_infer_mask(
            arr,
            f,
            mask.view("uint8"),
            convert=False,
            na_value=na_value,
            # error: Argument 1 to "dtype" has incompatible type
            # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
            # "Type[object]"
            dtype=np.dtype(dtype),  # type: ignore[arg-type]
        )

        # A real (non-NA) fill value means no slot is missing any more.
        if not na_value_is_na:
            mask[:] = False

        return constructor(mapped, mask)

    if is_string_dtype(dtype) and not is_object_dtype(dtype):
        # i.e. StringDtype
        mapped = lib.map_infer_mask(
            arr, f, mask.view("uint8"), convert=False, na_value=na_value
        )
        arrow_strings = pa.array(
            mapped, mask=mask, type=pa.string(), from_pandas=True
        )
        return type(self)(arrow_strings)

    # This is when the result type is object. We reach this when
    # -> We know the result type is truly object (e.g. .encode returns bytes
    #    or .findall returns a list).
    # -> We don't know the result type. E.g. `.get` can return anything.
    return lib.map_infer_mask(arr, f, mask.view("uint8"))

311 

def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
    """
    Test whether each element contains ``pat``.

    The parent implementation is used (after calling
    ``fallback_performancewarning``) when ``flags`` is set, or when a regex
    match is requested and either ``pa_version_under4p0`` is truthy or
    ``case`` is ``False``. Otherwise pyarrow compute kernels do the matching.

    A non-missing ``na`` replaces missing results with ``bool(na)``.
    """
    if flags:
        fallback_performancewarning()
        return super()._str_contains(pat, case, flags, na, regex)

    if regex:
        if pa_version_under4p0 or case is False:
            fallback_performancewarning(version="4")
            return super()._str_contains(pat, case, flags, na, regex)
        matches = pc.match_substring_regex(self._data, pat)
    elif case:
        matches = pc.match_substring(self._data, pat)
    else:
        # Case-insensitive literal search: upper-case both sides first.
        matches = pc.match_substring(pc.utf8_upper(self._data), pat.upper())

    bool_result = BooleanDtype().__from_arrow__(matches)
    if not isna(na):
        bool_result[isna(bool_result)] = bool(na)
    return bool_result

332 

def _str_startswith(self, pat: str, na=None):
    """
    Test whether each element starts with the literal ``pat``.

    Uses the parent implementation when ``pa_version_under4p0`` is truthy;
    otherwise escapes ``pat``, anchors it at the start and defers to
    ``_str_contains`` in regex mode.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_startswith(pat, na)

    anchored = f"^{re.escape(pat)}"
    return self._str_contains(anchored, na=na, regex=True)

340 

def _str_endswith(self, pat: str, na=None):
    """
    Test whether each element ends with the literal ``pat``.

    Uses the parent implementation when ``pa_version_under4p0`` is truthy;
    otherwise escapes ``pat``, anchors it at the end and defers to
    ``_str_contains`` in regex mode.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_endswith(pat, na)

    anchored = f"{re.escape(pat)}$"
    return self._str_contains(anchored, na=na, regex=True)

348 

def _str_replace(
    self,
    pat: str | re.Pattern,
    repl: str | Callable,
    n: int = -1,
    case: bool = True,
    flags: int = 0,
    regex: bool = True,
):
    """
    Replace occurrences of ``pat`` with ``repl``.

    The pyarrow kernels are used only for a plain-string pattern and
    replacement with case-sensitive matching and no flags. Any other
    combination, or a truthy ``pa_version_under4p0``, goes to the parent
    implementation.
    """
    needs_fallback = (
        pa_version_under4p0
        or isinstance(pat, re.Pattern)
        or callable(repl)
        or not case
        or flags
    )
    if needs_fallback:
        fallback_performancewarning(version="4")
        return super()._str_replace(pat, repl, n, case, flags, regex)

    if regex:
        kernel = pc.replace_substring_regex
    else:
        kernel = pc.replace_substring
    replaced = kernel(self._data, pattern=pat, replacement=repl, max_replacements=n)
    return type(self)(replaced)

371 

def _str_match(
    self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
):
    """
    Test whether each element matches ``pat`` from its beginning.

    Uses the parent implementation when ``pa_version_under4p0`` is truthy;
    otherwise prefixes ``pat`` with ``^`` (unless already present) and defers
    to ``_str_contains`` in regex mode.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_match(pat, case, flags, na)

    anchored = pat if pat.startswith("^") else "^" + pat
    return self._str_contains(anchored, case, flags, na, regex=True)

382 

def _str_fullmatch(
    self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
):
    """
    Test whether each element matches ``pat`` in its entirety.

    Uses the parent implementation when ``pa_version_under4p0`` is truthy;
    otherwise makes sure ``pat`` ends with an end-of-string anchor and
    defers to ``_str_match``.

    Parameters
    ----------
    pat : str
        Regular expression.
    case : bool, default True
        Forwarded to ``_str_match``.
    flags : int, default 0
        Forwarded to ``_str_match``.
    na : scalar, optional
        Forwarded to ``_str_match``.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_fullmatch(pat, case, flags, na)

    # A trailing "$" is only an anchor when it is not escaped. A pattern
    # ending in "\$" matches a literal dollar sign, so it still needs the
    # end-of-string anchor appended. (The previous check compared against
    # "//$", which never identified an escaped dollar.)
    if not pat.endswith("$") or pat.endswith("\\$"):
        pat = pat + "$"
    return self._str_match(pat, case, flags, na)

393 

def _str_isalnum(self):
    """
    Apply pyarrow's ``utf8_is_alnum`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_alnum(self._data))

397 

def _str_isalpha(self):
    """
    Apply pyarrow's ``utf8_is_alpha`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_alpha(self._data))

401 

def _str_isdecimal(self):
    """
    Apply pyarrow's ``utf8_is_decimal`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_decimal(self._data))

405 

def _str_isdigit(self):
    """
    Apply pyarrow's ``utf8_is_digit`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_digit(self._data))

409 

def _str_islower(self):
    """
    Apply pyarrow's ``utf8_is_lower`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_lower(self._data))

413 

def _str_isnumeric(self):
    """
    Apply pyarrow's ``utf8_is_numeric`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_numeric(self._data))

417 

def _str_isspace(self):
    """
    Apply pyarrow's ``utf8_is_space`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.

    Uses the parent implementation when ``pa_version_under2p0`` is truthy.
    """
    if not pa_version_under2p0:
        return BooleanDtype().__from_arrow__(pc.utf8_is_space(self._data))

    fallback_performancewarning(version="2")
    return super()._str_isspace()

425 

def _str_istitle(self):
    """
    Apply pyarrow's ``utf8_is_title`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_title(self._data))

429 

def _str_isupper(self):
    """
    Apply pyarrow's ``utf8_is_upper`` kernel to the stored data and convert
    the result through ``BooleanDtype().__from_arrow__``.
    """
    return BooleanDtype().__from_arrow__(pc.utf8_is_upper(self._data))

433 

def _str_len(self):
    """
    Apply pyarrow's ``utf8_length`` kernel to the stored data and convert
    the result through ``Int64Dtype().__from_arrow__``.

    Uses the parent implementation when ``pa_version_under4p0`` is truthy.
    """
    if not pa_version_under4p0:
        return Int64Dtype().__from_arrow__(pc.utf8_length(self._data))

    fallback_performancewarning(version="4")
    return super()._str_len()

441 

def _str_lower(self):
    """
    Return a new array of this type built from pyarrow's ``utf8_lower``
    applied to the stored data.
    """
    lowered = pc.utf8_lower(self._data)
    return type(self)(lowered)

444 

def _str_upper(self):
    """
    Return a new array of this type built from pyarrow's ``utf8_upper``
    applied to the stored data.
    """
    uppered = pc.utf8_upper(self._data)
    return type(self)(uppered)

447 

def _str_strip(self, to_strip=None):
    """
    Trim both ends of every element.

    With ``to_strip=None`` the whitespace-trimming kernel is used; otherwise
    ``to_strip`` is passed as the ``characters`` argument of ``utf8_trim``.
    Uses the parent implementation when ``pa_version_under4p0`` is truthy.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_strip(to_strip)

    trimmed = (
        pc.utf8_trim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_trim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)

458 

def _str_lstrip(self, to_strip=None):
    """
    Trim the left end of every element.

    With ``to_strip=None`` the whitespace-trimming kernel is used; otherwise
    ``to_strip`` is passed as the ``characters`` argument of ``utf8_ltrim``.
    Uses the parent implementation when ``pa_version_under4p0`` is truthy.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_lstrip(to_strip)

    trimmed = (
        pc.utf8_ltrim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_ltrim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)

469 

def _str_rstrip(self, to_strip=None):
    """
    Trim the right end of every element.

    With ``to_strip=None`` the whitespace-trimming kernel is used; otherwise
    ``to_strip`` is passed as the ``characters`` argument of ``utf8_rtrim``.
    Uses the parent implementation when ``pa_version_under4p0`` is truthy.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_rstrip(to_strip)

    trimmed = (
        pc.utf8_rtrim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_rtrim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)