Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/string

1from __future__ import annotations

3from collections.abc import Callable # noqa: PDF001

4import re

5from typing import Union

7import numpy as np

9from pandas._libs import (

10 lib,

11 missing as libmissing,

12)

13from pandas._typing import (

14 Dtype,

15 NpDtype,

16 Scalar,

17 npt,

18)

19from pandas.compat import (

20 pa_version_under1p01,

21 pa_version_under2p0,

22 pa_version_under3p0,

23 pa_version_under4p0,

24)

26from pandas.core.dtypes.common import (

27 is_bool_dtype,

28 is_dtype_equal,

29 is_integer_dtype,

30 is_object_dtype,

31 is_scalar,

32 is_string_dtype,

33 pandas_dtype,

34)

35from pandas.core.dtypes.missing import isna

37from pandas.core.arrays.arrow import ArrowExtensionArray

38from pandas.core.arrays.boolean import BooleanDtype

39from pandas.core.arrays.integer import Int64Dtype

40from pandas.core.arrays.numeric import NumericDtype

41from pandas.core.arrays.string_ import (

42 BaseStringArray,

43 StringDtype,

44)

45from pandas.core.strings.object_array import ObjectStringArrayMixin

47if not pa_version_under1p01: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true

48 import pyarrow as pa

49 import pyarrow.compute as pc

51 from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning

53ArrowStringScalarOrNAT = Union[str, libmissing.NAType]

def _chk_pyarrow_available() -> None:
    """
    Raise ``ImportError`` when the ``pa_version_under1p01`` flag is set.

    Raises
    ------
    ImportError
        If ``pa_version_under1p01`` is truthy.
    """
    if not pa_version_under1p01:
        return
    raise ImportError(
        "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
    )

62# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from

63# ObjectStringArrayMixin because we want to have the object-dtype based methods as

64# fallback for the ones that pyarrow doesn't yet support

67class ArrowStringArray(ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin):

68 """

69 Extension array for string data in a ``pyarrow.ChunkedArray``.

71 .. versionadded:: 1.2.0

73 .. warning::

75 ArrowStringArray is considered experimental. The implementation and

76 parts of the API may change without warning.

78 Parameters

79 ----------

80 values : pyarrow.Array or pyarrow.ChunkedArray

81 The array of data.

83 Attributes

84 ----------

85 None

87 Methods

88 -------

89 None

91 See Also

92 --------

93 array

94 The recommended function for creating an ArrowStringArray.

95 Series.str

96 The string methods are available on Series backed by

97 an ArrowStringArray.

99 Notes

100 -----

101 ArrowStringArray returns a BooleanArray for comparison methods.

102

103 Examples

104 --------

105 >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")

106 <ArrowStringArray>

107 ['This is', 'some text', <NA>, 'data.']

108 Length: 4, dtype: string

109 """

110

111 # error: Incompatible types in assignment (expression has type "StringDtype",

112 # base class "ArrowExtensionArray" defined the type as "ArrowDtype")

113 _dtype: StringDtype # type: ignore[assignment]

114

def __init__(self, values) -> None:
    """
    Initialize from pyarrow data and verify that it holds strings.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        Forwarded unchanged to the parent initializer.

    Raises
    ------
    ValueError
        If ``self._data.type`` is not a pyarrow string type.
    """
    super().__init__(values)
    self._dtype = StringDtype(storage="pyarrow")

    # The parent initializer populates ``self._data``; validate its type.
    stored_type = self._data.type
    if pa.types.is_string(stored_type):
        return
    raise ValueError(
        "ArrowStringArray requires a PyArrow (chunked) array of string type"
    )

123

@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
    """
    Build an instance of ``cls`` from a sequence of scalars.

    Parameters
    ----------
    scalars : sequence or BaseMaskedArray
        Values to convert to strings.
    dtype : Dtype, optional
        When truthy and not the plain string ``"string"``, it must resolve
        (via ``pandas_dtype``) to a ``StringDtype`` with pyarrow storage.
    copy : bool, default False
        Forwarded to ``lib.ensure_string_array``.

    Raises
    ------
    ImportError
        Propagated from ``_chk_pyarrow_available``.
    AssertionError
        If ``dtype`` resolves to anything but a pyarrow-backed StringDtype.
    """
    from pandas.core.arrays.masked import BaseMaskedArray

    _chk_pyarrow_available()

    is_plain_alias = isinstance(dtype, str) and dtype == "string"
    if dtype and not is_plain_alias:
        resolved = pandas_dtype(dtype)
        assert isinstance(resolved, StringDtype) and resolved.storage == "pyarrow"

    if isinstance(scalars, BaseMaskedArray):
        # avoid costly conversion to object dtype in ensure_string_array and
        # numerical issues with Float32Dtype
        strings = lib.ensure_string_array(
            scalars._data, copy=copy, convert_na_value=False
        )
        return cls(pa.array(strings, mask=scalars._mask, type=pa.string()))

    # convert non-na-likes to str
    strings = lib.ensure_string_array(scalars, copy=copy)
    arrow_strings = pa.array(strings, type=pa.string(), from_pandas=True)
    return cls(arrow_strings)

145

@classmethod
def _from_sequence_of_strings(
    cls, strings, dtype: Dtype | None = None, copy: bool = False
):
    """
    Build an instance from strings by delegating to ``_from_sequence``.

    ``strings``, ``dtype`` and ``copy`` are forwarded unchanged.
    """
    forwarded = {"dtype": dtype, "copy": copy}
    return cls._from_sequence(strings, **forwarded)

151

@property
def dtype(self) -> StringDtype:  # type: ignore[override]
    """
    The ``StringDtype`` instance stored on this array as ``_dtype``.
    """
    string_dtype = self._dtype
    return string_dtype

158

def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """Return ``self.to_numpy(dtype=dtype)`` so ``np.asarray()`` works."""
    as_ndarray = self.to_numpy(dtype=dtype)
    return as_ndarray

162

def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value=lib.no_default,
) -> np.ndarray:
    """
    Convert to a NumPy ndarray.

    Parameters
    ----------
    dtype : numpy dtype-like, optional
        Passed to ``np.array`` when converting the underlying data.
    copy : bool, default False
        Currently ignored.
    na_value : object, optional
        Written at null positions. When left at ``lib.no_default`` the
        dtype's ``na_value`` is used, except for floating ``dtype`` where
        the converted array is returned untouched.
    """
    # TODO: copy argument is ignored
    converted = np.array(self._data, dtype=dtype)
    if self._data.null_count <= 0:
        return converted

    if na_value is lib.no_default:
        wants_float = dtype and np.issubdtype(dtype, np.floating)
        if wants_float:
            return converted
        na_value = self._dtype.na_value

    converted[self.isna()] = na_value
    return converted

183

def insert(self, loc: int, item) -> ArrowStringArray:
    """
    Insert ``item`` at position ``loc`` via the parent implementation.

    Raises
    ------
    TypeError
        If ``item`` is neither a ``str`` nor ``libmissing.NA``.
    """
    acceptable = isinstance(item, str) or item is libmissing.NA
    if not acceptable:
        raise TypeError("Scalar must be NA or str")
    return super().insert(loc, item)

188

def _maybe_convert_setitem_value(self, value):
    """
    Maybe convert value to be pyarrow compatible.

    Scalars: NA-likes become ``None`` and strings pass through. Non-scalars
    are copied into an object ndarray whose NA-likes are replaced by
    ``None``.

    Raises
    ------
    ValueError
        If a scalar, or any element of a non-scalar, is neither NA nor str.
    """
    if not is_scalar(value):
        converted = np.array(value, dtype=object, copy=True)
        converted[isna(converted)] = None
        if any(not (v is None or isinstance(v, str)) for v in converted):
            raise ValueError("Scalar must be NA or str")
        return converted

    if isna(value):
        return None
    if isinstance(value, str):
        return value
    raise ValueError("Scalar must be NA or str")

203

def isin(self, values) -> npt.NDArray[np.bool_]:
    """
    Return a boolean ndarray marking elements that occur in ``values``.

    Parameters
    ----------
    values : iterable
        Candidates to look up. Entries whose pyarrow scalar type is neither
        string nor null are dropped before the lookup.

    Returns
    -------
    numpy.ndarray of bool
        All ``False`` when no usable candidate remains.
    """
    if pa_version_under2p0:
        fallback_performancewarning(version="2")
        return super().isin(values)

    value_set = []
    for value in values:
        pa_scalar = pa.scalar(value, from_pandas=True)
        if pa_scalar.type in (pa.string(), pa.null()):
            value_set.append(pa_scalar.as_py())

    # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
    # for null values, so we short-circuit to return all False array.
    if not value_set:
        return np.zeros(len(self), dtype=bool)

    kwargs = {}
    if pa_version_under3p0:
        # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
        # with unexpected keyword argument in pyarrow 3.0.0+
        kwargs["skip_null"] = True

    # Pin the lookup set to string type: without it a set holding only nulls
    # is inferred as a null-typed array, which does not match the string data.
    lookup = pa.array(value_set, type=pa.string())
    result = pc.is_in(self._data, value_set=lookup, **kwargs)
    # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
    # to False
    return np.array(result, dtype=np.bool_)

230

def astype(self, dtype, copy: bool = True):
    """
    Cast to ``dtype``.

    An equal dtype returns ``self`` (or ``self.copy()`` when ``copy`` is
    true). A ``NumericDtype`` target is produced by casting the pyarrow data
    and handing it to ``dtype.__from_arrow__``. Everything else is delegated
    to the parent implementation.
    """
    target = pandas_dtype(dtype)

    if is_dtype_equal(target, self.dtype):
        return self.copy() if copy else self

    if isinstance(target, NumericDtype):
        arrow_type = pa.from_numpy_dtype(target.numpy_dtype)
        return target.__from_arrow__(self._data.cast(arrow_type))

    return super().astype(target, copy=copy)

244

245 # ------------------------------------------------------------------------

246 # String methods interface

247

248 # error: Incompatible types in assignment (expression has type "NAType",

249 # base class "ObjectStringArrayMixin" defined the type as "float")

250 _str_na_value = libmissing.NA # type: ignore[assignment]

251

def _str_map(
    self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
    """
    Apply ``f`` to the non-missing elements via ``lib.map_infer_mask``.

    Parameters
    ----------
    f : callable
        Function applied to each non-missing element.
    na_value : object, optional
        Fill for missing positions; defaults to ``self.dtype.na_value``.
    dtype : Dtype, optional
        Requested result dtype; defaults to ``self.dtype``.
    convert : bool, default True
        Not read by this implementation.

    Returns
    -------
    IntegerArray, BooleanArray, instance of ``type(self)``, or the raw
    result of ``lib.map_infer_mask``, depending on ``dtype``.
    """
    # TODO: de-duplicate with StringArray method. This method is more or less
    # copy and paste.
    from pandas.arrays import (
        BooleanArray,
        IntegerArray,
    )

    if dtype is None:
        dtype = self.dtype
    if na_value is None:
        na_value = self.dtype.na_value

    mask = isna(self)
    arr = np.asarray(self)
    mask_bytes = mask.view("uint8")

    wants_integer = is_integer_dtype(dtype)
    if wants_integer or is_bool_dtype(dtype):
        constructor: type[IntegerArray] | type[BooleanArray]
        constructor = IntegerArray if wants_integer else BooleanArray

        fill_is_missing = isna(na_value)
        if fill_is_missing:
            # Placeholder written at masked slots of the typed result.
            na_value = 1
        mapped = lib.map_infer_mask(
            arr,
            f,
            mask_bytes,
            convert=False,
            na_value=na_value,
            # error: Argument 1 to "dtype" has incompatible type
            # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
            # "Type[object]"
            dtype=np.dtype(dtype),  # type: ignore[arg-type]
        )

        if not fill_is_missing:
            # A concrete fill value was supplied, so nothing stays masked.
            mask[:] = False

        return constructor(mapped, mask)

    if is_string_dtype(dtype) and not is_object_dtype(dtype):
        # i.e. StringDtype
        mapped = lib.map_infer_mask(
            arr, f, mask_bytes, convert=False, na_value=na_value
        )
        arrow_strings = pa.array(
            mapped, mask=mask, type=pa.string(), from_pandas=True
        )
        return type(self)(arrow_strings)

    # This is when the result type is object. We reach this when
    # -> We know the result type is truly object (e.g. .encode returns bytes
    #    or .findall returns a list).
    # -> We don't know the result type. E.g. `.get` can return anything.
    return lib.map_infer_mask(arr, f, mask_bytes)

311

def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
    """
    Test whether ``pat`` occurs in each element.

    Delegates to the parent implementation (after a performance warning)
    when ``flags`` is truthy, or when ``regex`` is set and either
    ``pa_version_under4p0`` is set or ``case is False``. Otherwise uses
    ``pc.match_substring_regex`` / ``pc.match_substring``. Missing results
    are replaced by ``bool(na)`` when ``na`` is not NA.
    """
    if flags:
        fallback_performancewarning()
        return super()._str_contains(pat, case, flags, na, regex)

    if regex and (pa_version_under4p0 or case is False):
        fallback_performancewarning(version="4")
        return super()._str_contains(pat, case, flags, na, regex)

    if regex:
        matched = pc.match_substring_regex(self._data, pat)
    elif case:
        matched = pc.match_substring(self._data, pat)
    else:
        # Case-insensitive literal search: upper-case both sides.
        matched = pc.match_substring(pc.utf8_upper(self._data), pat.upper())

    result = BooleanDtype().__from_arrow__(matched)
    if not isna(na):
        result[isna(result)] = bool(na)
    return result

332

def _str_startswith(self, pat: str, na=None):
    """
    Test whether each element starts with the literal ``pat``.

    Falls back to the parent implementation (with a performance warning)
    when ``pa_version_under4p0`` is set; otherwise searches for the escaped
    pattern anchored with ``^`` through ``_str_contains``.
    """
    if not pa_version_under4p0:
        anchored = "^" + re.escape(pat)
        return self._str_contains(anchored, na=na, regex=True)

    fallback_performancewarning(version="4")
    return super()._str_startswith(pat, na)

340

def _str_endswith(self, pat: str, na=None):
    """
    Test whether each element ends with the literal ``pat``.

    Falls back to the parent implementation (with a performance warning)
    when ``pa_version_under4p0`` is set; otherwise searches for the escaped
    pattern anchored with ``$`` through ``_str_contains``.
    """
    if not pa_version_under4p0:
        anchored = re.escape(pat) + "$"
        return self._str_contains(anchored, na=na, regex=True)

    fallback_performancewarning(version="4")
    return super()._str_endswith(pat, na)

348

def _str_replace(
    self,
    pat: str | re.Pattern,
    repl: str | Callable,
    n: int = -1,
    case: bool = True,
    flags: int = 0,
    regex: bool = True,
):
    """
    Replace occurrences of ``pat`` with ``repl`` in each element.

    The parent implementation is used (with a performance warning) when
    ``pa_version_under4p0`` is set, ``pat`` is a compiled pattern, ``repl``
    is callable, ``case`` is falsy, or ``flags`` is truthy. Otherwise
    ``pc.replace_substring_regex`` (``regex`` truthy) or
    ``pc.replace_substring`` is called with ``max_replacements=n``.
    """
    needs_fallback = (
        pa_version_under4p0
        or isinstance(pat, re.Pattern)
        or callable(repl)
        or not case
        or flags
    )
    if needs_fallback:
        fallback_performancewarning(version="4")
        return super()._str_replace(pat, repl, n, case, flags, regex)

    if regex:
        func = pc.replace_substring_regex
    else:
        func = pc.replace_substring
    replaced = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
    return type(self)(replaced)

371

def _str_match(
    self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
):
    """
    Test whether each element matches ``pat`` at its start.

    Parameters
    ----------
    pat : str
        Regular expression.
    case, flags, na
        Forwarded unchanged to ``_str_contains`` (or to the parent
        implementation on the fallback path).

    Notes
    -----
    Falls back to the parent implementation (with a performance warning)
    when ``pa_version_under4p0`` is set. Otherwise the pattern is wrapped in
    a non-capturing group before ``^`` is prepended, so the anchor applies
    to every alternative: a bare ``"^" + "a|b"`` would leave ``b``
    unanchored and match anywhere in the string.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_match(pat, case, flags, na)

    # Group first so "^" binds to the whole expression, not just the first
    # alternative. Wrapping is also correct when pat already starts with "^".
    pat = "^(?:" + pat + ")"
    return self._str_contains(pat, case, flags, na, regex=True)

382

def _str_fullmatch(
    self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
):
    """
    Test whether each element matches ``pat`` in its entirety.

    Parameters
    ----------
    pat : str
        Regular expression.
    case, flags, na
        Forwarded unchanged to ``_str_match`` (or to the parent
        implementation on the fallback path).

    Notes
    -----
    Falls back to the parent implementation (with a performance warning)
    when ``pa_version_under4p0`` is set. Otherwise the pattern is wrapped as
    ``^(?:pat)$``. Grouping makes both anchors apply to every alternative,
    and always appending ``$`` after the group removes the need to guess
    whether a trailing ``$`` in ``pat`` is an anchor or an escaped literal
    (``\\$``).
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_fullmatch(pat, case, flags, na)

    # Anchor the grouped pattern on both ends; a pattern ending in a literal
    # "\$" therefore still receives a real end-of-string anchor.
    pat = "^(?:" + pat + ")$"
    return self._str_match(pat, case, flags, na)

393

def _str_isalnum(self):
    """Apply ``pc.utf8_is_alnum`` and convert via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_alnum(self._data))

397

def _str_isalpha(self):
    """Apply ``pc.utf8_is_alpha`` and convert via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_alpha(self._data))

401

def _str_isdecimal(self):
    """Apply ``pc.utf8_is_decimal`` and convert via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_decimal(self._data))

405

def _str_isdigit(self):
    """Apply ``pc.utf8_is_digit`` and convert via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_digit(self._data))

409

def _str_islower(self):
    """Apply ``pc.utf8_is_lower`` and convert via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_lower(self._data))

413

def _str_isnumeric(self):
    """Apply ``pc.utf8_is_numeric`` and convert via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_numeric(self._data))

417

def _str_isspace(self):
    """
    Apply ``pc.utf8_is_space`` and convert via ``BooleanDtype``.

    When ``pa_version_under2p0`` is set, a performance warning is issued and
    the parent implementation is used instead.
    """
    if not pa_version_under2p0:
        return BooleanDtype().__from_arrow__(pc.utf8_is_space(self._data))

    fallback_performancewarning(version="2")
    return super()._str_isspace()

425

def _str_istitle(self):
    """Apply ``pc.utf8_is_title`` and convert via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_title(self._data))

429

def _str_isupper(self):
    """Apply ``pc.utf8_is_upper`` and convert via ``BooleanDtype``."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_upper(self._data))

433

def _str_len(self):
    """
    Apply ``pc.utf8_length`` and convert via ``Int64Dtype``.

    When ``pa_version_under4p0`` is set, a performance warning is issued and
    the parent implementation is used instead.
    """
    if not pa_version_under4p0:
        return Int64Dtype().__from_arrow__(pc.utf8_length(self._data))

    fallback_performancewarning(version="4")
    return super()._str_len()

441

def _str_lower(self):
    """Wrap the result of ``pc.utf8_lower`` in a new ``type(self)``."""
    lowered = pc.utf8_lower(self._data)
    return type(self)(lowered)

444

def _str_upper(self):
    """Wrap the result of ``pc.utf8_upper`` in a new ``type(self)``."""
    uppered = pc.utf8_upper(self._data)
    return type(self)(uppered)

447

def _str_strip(self, to_strip=None):
    """
    Trim both ends of each element.

    Uses ``pc.utf8_trim_whitespace`` when ``to_strip`` is None, otherwise
    ``pc.utf8_trim`` with ``characters=to_strip``. When
    ``pa_version_under4p0`` is set, a performance warning is issued and the
    parent implementation is used instead.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_strip(to_strip)

    trimmed = (
        pc.utf8_trim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_trim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)

458

def _str_lstrip(self, to_strip=None):
    """
    Trim the left end of each element.

    Uses ``pc.utf8_ltrim_whitespace`` when ``to_strip`` is None, otherwise
    ``pc.utf8_ltrim`` with ``characters=to_strip``. When
    ``pa_version_under4p0`` is set, a performance warning is issued and the
    parent implementation is used instead.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_lstrip(to_strip)

    trimmed = (
        pc.utf8_ltrim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_ltrim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)

469

def _str_rstrip(self, to_strip=None):
    """
    Trim the right end of each element.

    Uses ``pc.utf8_rtrim_whitespace`` when ``to_strip`` is None, otherwise
    ``pc.utf8_rtrim`` with ``characters=to_strip``. When
    ``pa_version_under4p0`` is set, a performance warning is issued and the
    parent implementation is used instead.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_rstrip(to_strip)

    trimmed = (
        pc.utf8_rtrim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_rtrim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)

Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/string_arrow.py: 18%

240 statements