Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/string_.py: 20%

241 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3from typing import TYPE_CHECKING 

4 

5import numpy as np 

6 

7from pandas._config import get_option 

8 

9from pandas._libs import ( 

10 lib, 

11 missing as libmissing, 

12) 

13from pandas._libs.arrays import NDArrayBacked 

14from pandas._typing import ( 

15 Dtype, 

16 Scalar, 

17 npt, 

18 type_t, 

19) 

20from pandas.compat import pa_version_under1p01 

21from pandas.compat.numpy import function as nv 

22 

23from pandas.core.dtypes.base import ( 

24 ExtensionDtype, 

25 StorageExtensionDtype, 

26 register_extension_dtype, 

27) 

28from pandas.core.dtypes.common import ( 

29 is_array_like, 

30 is_bool_dtype, 

31 is_dtype_equal, 

32 is_integer_dtype, 

33 is_object_dtype, 

34 is_string_dtype, 

35 pandas_dtype, 

36) 

37 

38from pandas.core import ops 

39from pandas.core.array_algos import masked_reductions 

40from pandas.core.arrays import ( 

41 ExtensionArray, 

42 FloatingArray, 

43 IntegerArray, 

44) 

45from pandas.core.arrays.floating import FloatingDtype 

46from pandas.core.arrays.integer import IntegerDtype 

47from pandas.core.arrays.numpy_ import PandasArray 

48from pandas.core.construction import extract_array 

49from pandas.core.indexers import check_array_indexer 

50from pandas.core.missing import isna 

51 

52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true

53 import pyarrow 

54 

55 from pandas import Series 

56 

57 

@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    name = "string"

    #: StringDtype().na_value uses pandas.NA
    @property
    def na_value(self) -> libmissing.NAType:
        """Missing-value marker of this dtype; always ``pandas.NA``."""
        return libmissing.NA

    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        """
        Build the dtype for the requested storage backend.

        Parameters
        ----------
        storage : {"python", "pyarrow"}, optional
            Backend name. ``None`` is replaced by the current value of the
            ``mode.string_storage`` option.

        Raises
        ------
        ValueError
            If ``storage`` is neither ``"python"`` nor ``"pyarrow"``.
        ImportError
            If ``storage`` is ``"pyarrow"`` while ``pa_version_under1p01``
            is true.
        """
        if storage is None:
            storage = get_option("mode.string_storage")
        if storage not in {"python", "pyarrow"}:
            raise ValueError(
                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
            )
        if storage == "pyarrow" and pa_version_under1p01:
            raise ImportError(
                "pyarrow>=1.0.0 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        """Scalar type of the elements; always ``str``."""
        return str

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The type of the name. The storage type will be taken from `string`.
            Valid options and their storage types are

            ========================== ==============================================
            string                     result storage
            ========================== ==============================================
            ``'string'``               pd.options.mode.string_storage, default python
            ``'string[python]'``       python
            ``'string[pyarrow]'``      pyarrow
            ========================== ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If the string is not a valid option.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "string":
            # Bare name: storage is resolved by __init__ from the option.
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
            ``StringArray`` when ``self.storage`` is ``"python"``, otherwise
            ``ArrowStringArray``.
        """
        # Imported inside the method rather than at module level.
        from pandas.core.arrays.string_arrow import ArrowStringArray

        if self.storage == "python":
            return StringArray
        else:
            return ArrowStringArray

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.

        For ``"pyarrow"`` storage the input is wrapped in an
        ``ArrowStringArray``. Otherwise every chunk is converted through
        ``StringArray._from_sequence`` and the pieces are concatenated; an
        input without chunks yields an empty ``StringArray``.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)
        else:

            import pyarrow

            if isinstance(array, pyarrow.Array):
                chunks = [array]
            else:
                # pyarrow.ChunkedArray
                chunks = array.chunks

            results = []
            for arr in chunks:
                # using _from_sequence to ensure None is converted to NA
                str_arr = StringArray._from_sequence(np.array(arr))
                results.append(str_arr)

            if results:
                return StringArray._concat_same_type(results)
            else:
                return StringArray(np.array([], dtype="object"))

209 

210 

class BaseStringArray(ExtensionArray):
    """
    Mixin class for StringArray, ArrowStringArray.

    Adds no behaviour of its own; it only provides a common base class
    for the string array implementations.
    """

217 

218 

class StringArray(BaseStringArray, PandasArray):
    """
    Extension array for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes(``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    array
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <PandasArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy=False) -> None:
        """
        Wrap ``values`` and validate them unless they are already a
        ``StringArray``.

        Raises
        ------
        ValueError
            From ``_validate`` when the data is not an object-dtype
            array of strings / NA-likes.
        """
        values = extract_array(values)

        super().__init__(values, copy=copy)
        if not isinstance(values, type(self)):
            # An existing instance of this class is not validated again.
            self._validate()
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """Validate that we only store NA or strings."""
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if need to convert Na values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            # NOTE(review): ndarray.ravel copies when it cannot return a view;
            # for such input the conversion would not reach self._ndarray --
            # confirm whether non-contiguous >2-D input can get here.
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
        """
        Build a ``StringArray`` from ``scalars``, stringifying the values.

        ``dtype`` may be omitted or the literal ``"string"``; any other
        value must resolve to a python-storage ``StringDtype`` (checked
        with an ``assert``).
        """
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy=False
    ):
        """Delegate to ``_from_sequence`` with the same arguments."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        """Return an array of ``shape`` filled with NA, cast to ``dtype``."""
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.

        ``type`` defaults to ``pyarrow.string()``. Missing entries are
        replaced by ``None`` in a copy of the data before conversion.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        """Return ``(values, None)`` where ``values`` is a copy with NA as ``None``."""
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value):
        """
        Set ``self[key] = value`` after checking that ``value`` holds only
        strings or NA-likes; NA-likes are stored as ``pandas.NA``.

        Raises
        ------
        ValueError
            If a sequence is assigned to a scalar key, or if ``value``
            contains anything other than strings / NA-likes.
        """
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise ValueError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise ValueError("Must provide strings.")

            value[isna(value)] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        """
        Cast to ``dtype``.

        Handled here: the array's own dtype (returns ``self`` or a copy),
        ``IntegerDtype`` / ``FloatingDtype`` (masked arrays that reuse this
        array's NA mask) and NumPy floating dtypes (NA becomes ``np.nan``).
        Everything else is delegated to the parent implementation.
        """
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
            arr = self._ndarray.copy()
            mask = self.isna()
            # Placeholder so the NumPy cast succeeds; hidden by ``mask``.
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            arr = self.copy()
            mask = self.isna()
            # ``arr`` is a StringArray here, so the placeholder must be a str.
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            return super().astype(dtype, copy=copy)
        elif np.issubdtype(dtype, np.floating):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: int | None = 0, **kwargs
    ):
        """
        Dispatch the reduction ``name``; only ``"min"`` and ``"max"`` exist.

        Raises
        ------
        TypeError
            For any other reduction name.
        """
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the minimum, computed by ``masked_reductions.min``."""
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the maximum, computed by ``masked_reductions.max``."""
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Count the values; the counts are cast to ``"Int64"`` and the
        result's index is cast to this array's dtype.
        """
        from pandas import value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        """
        Return ``nbytes`` of the backing ndarray; with ``deep=True`` the
        result of ``lib.memory_usage_of_objects`` is added.
        """
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    def _cmp_method(self, other, op):
        """
        Apply the binary operator ``op`` element-wise, skipping positions
        where either operand is NA.

        Operators named in ``ops.ARITHMETIC_BINOPS`` give a ``StringArray``
        with NA at the skipped positions; all others give a
        ``BooleanArray`` whose mask marks the skipped positions.

        Raises
        ------
        ValueError
            If ``other`` is not a scalar and its length differs from ours.
        """
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    # Arithmetic operators share the comparison implementation above.
    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "PandasArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        """
        Map ``f`` over the non-missing elements via ``lib.map_infer_mask``.

        Parameters
        ----------
        f : callable
            Function passed to ``lib.map_infer_mask``.
        na_value : object, optional
            Value used for missing positions; defaults to
            ``self.dtype.na_value``.
        dtype : Dtype, optional
            Requested result dtype; defaults to python-storage
            ``StringDtype``. Integer / bool dtypes give an ``IntegerArray``
            / ``BooleanArray``, a non-object string dtype gives a
            ``StringArray``, anything else returns the raw mapping result.
        convert : bool, default True
            Not read by this implementation.
        """
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                # Placeholder for the typed result; those slots stay masked.
                na_value = 1
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                # A concrete fill value was supplied, so nothing is missing.
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))