Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/string_.py

1from __future__ import annotations

3from typing import TYPE_CHECKING

5import numpy as np

7from pandas._config import get_option

9from pandas._libs import (

10 lib,

11 missing as libmissing,

12)

13from pandas._libs.arrays import NDArrayBacked

14from pandas._typing import (

15 Dtype,

16 Scalar,

17 npt,

18 type_t,

19)

20from pandas.compat import pa_version_under1p01

21from pandas.compat.numpy import function as nv

23from pandas.core.dtypes.base import (

24 ExtensionDtype,

25 StorageExtensionDtype,

26 register_extension_dtype,

27)

28from pandas.core.dtypes.common import (

29 is_array_like,

30 is_bool_dtype,

31 is_dtype_equal,

32 is_integer_dtype,

33 is_object_dtype,

34 is_string_dtype,

35 pandas_dtype,

36)

38from pandas.core import ops

39from pandas.core.array_algos import masked_reductions

40from pandas.core.arrays import (

41 ExtensionArray,

42 FloatingArray,

43 IntegerArray,

44)

45from pandas.core.arrays.floating import FloatingDtype

46from pandas.core.arrays.integer import IntegerDtype

47from pandas.core.arrays.numpy_ import PandasArray

48from pandas.core.construction import extract_array

49from pandas.core.indexers import check_array_indexer

50from pandas.core.missing import isna

52if TYPE_CHECKING: 52 ↛ 53 (line 52 didn't jump to line 53, because the condition on line 52 was never true)

53 import pyarrow

55 from pandas import Series

@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype describing string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow"}, optional
        Backend holding the values. When omitted, the value of
        ``pd.options.mode.string_storage`` is used.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    name = "string"
    _metadata = ("storage",)

    #: StringDtype().na_value uses pandas.NA
    @property
    def na_value(self) -> libmissing.NAType:
        """Missing-value marker of this dtype (``pandas.NA``)."""
        return libmissing.NA

    def __init__(self, storage=None) -> None:
        # Fall back to the globally configured backend only when the caller
        # did not pick one explicitly.
        storage = get_option("mode.string_storage") if storage is None else storage
        if storage not in {"python", "pyarrow"}:
            raise ValueError(
                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
            )
        if storage == "pyarrow" and pa_version_under1p01:
            raise ImportError(
                "pyarrow>=1.0.0 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        """Scalar type of the elements (``str``)."""
        return str

    @classmethod
    def construct_from_string(cls, string):
        """
        Build a StringDtype from its string alias.

        Parameters
        ----------
        string : str
            The alias to parse. The storage backend is taken from the alias;
            the accepted aliases and the storage they select are

            ==========================  ==============================================
            string                      result storage
            ==========================  ==============================================
            ``'string'``                pd.options.mode.string_storage, default python
            ``'string[python]'``        python
            ``'string[pyarrow]'``       pyarrow
            ==========================  ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If ``string`` is not a ``str`` or is not one of the aliases above.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        # The bare alias defers the backend choice to the constructor default.
        if string == "string":
            return cls()

        explicit_storage = {
            "string[python]": "python",
            "string[pyarrow]": "pyarrow",
        }
        if string in explicit_storage:
            return cls(storage=explicit_storage[string])

        raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array class matching this dtype's storage.

        Returns
        -------
        type
            ``StringArray`` for ``"python"`` storage, ``ArrowStringArray``
            otherwise.
        """
        from pandas.core.arrays.string_arrow import ArrowStringArray

        return StringArray if self.storage == "python" else ArrowStringArray

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Build a string array from a pyarrow Array/ChunkedArray.

        With ``"pyarrow"`` storage the input is wrapped in an
        ``ArrowStringArray``; otherwise every chunk is converted to a
        ``StringArray`` and the pieces are concatenated.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)

        import pyarrow

        # A plain Array is handled as a single chunk; anything else is taken
        # to be a pyarrow.ChunkedArray.
        chunks = [array] if isinstance(array, pyarrow.Array) else array.chunks

        # using _from_sequence to ensure None is converted to NA
        converted = [StringArray._from_sequence(np.array(chunk)) for chunk in chunks]

        if not converted:
            return StringArray(np.array([], dtype="object"))
        return StringArray._concat_same_type(converted)

209

210

class BaseStringArray(ExtensionArray):
    """
    Common base shared by the string array implementations
    (StringArray and ArrowStringArray).

    It adds no behaviour of its own on top of ExtensionArray.
    """

217

218

class StringArray(BaseStringArray, PandasArray):
    """
    Extension array for string data.

    .. versionadded:: 1.0.0

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes(``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    array
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <PandasArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy=False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        # Input that is already an instance of this class is not re-validated.
        if not isinstance(values, type(self)):
            self._validate()
        # Re-initialise the backing store so the dtype is the python-storage
        # StringDtype.
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """
        Validate that we only store NA or strings.

        Nan-like entries in the backing ndarray are converted to ``pd.NA``
        in place.

        Raises
        ------
        ValueError
            If a non-empty backing array holds anything other than strings or
            NA-likes, or if the backing array is not object dtype.
        """
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if need to convert Na values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
        """
        Build a StringArray from a sequence of scalars.

        Non-missing values are converted to ``str`` and nan-likes to
        ``pd.NA``. ``dtype``, when given and not the plain ``"string"`` alias,
        must resolve to a python-storage StringDtype.
        """
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy=False
    ):
        """Build a StringArray from strings; delegates to ``_from_sequence``."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        """Return an array of the given shape filled with ``pd.NA``, cast to ``dtype``."""
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.

        ``type`` defaults to ``pyarrow.string()``. Missing entries are passed
        to pyarrow as ``None``; the conversion works on a copy of the data.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        """Return a copy of the data with missing entries set to ``None``, and ``None`` as NA value."""
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value):
        """
        Set ``self[key] = value`` after validating that ``value`` is string-like.

        Scalar nan-likes are stored as ``pd.NA``; array-like values must
        contain only strings or nan-likes, and their nan-likes are stored as
        ``pd.NA`` as well.

        Raises
        ------
        ValueError
            If a sequence is assigned to a scalar key, if a scalar value is
            neither a string nor NA, or if an array-like value contains
            non-strings.
        """
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise ValueError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise ValueError("Must provide strings.")

            # Normalise nan-likes to pd.NA on a copy: `value` may be the
            # caller's own array (or another StringArray's buffer), which
            # must not be modified as a side effect of the assignment.
            mask = isna(value)
            if mask.any():
                value = value.copy()
                value[mask] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        """Set ``value`` where ``mask`` is True, going through ``__setitem__``."""
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        """
        Cast to ``dtype``.

        Parameters
        ----------
        dtype : str, numpy dtype or ExtensionDtype
            Target dtype; resolved with ``pandas_dtype``.
        copy : bool, default True
            Whether to return a copy when ``dtype`` equals the current dtype,
            and passed on to the parent implementation otherwise.

        Returns
        -------
        ExtensionArray or numpy.ndarray
            An ``IntegerArray``/``FloatingArray`` for the masked integer and
            floating dtypes, a float ndarray with NaN at missing positions
            for numpy floating dtypes, otherwise whatever the parent
            implementation returns.
        """
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
            # Missing slots get a parseable placeholder; the mask handed to
            # IntegerArray marks them as missing again.
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            # Same placeholder approach, written as a string because the
            # assignment goes through StringArray.__setitem__ here.
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            return super().astype(dtype, copy=copy)
        elif np.issubdtype(dtype, np.floating):
            # Plain numpy floats: cast with a placeholder, then restore NaN
            # at the missing positions.
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: int | None = 0, **kwargs
    ):
        """
        Dispatch a named reduction; only ``"min"`` and ``"max"`` are supported.

        Raises
        ------
        TypeError
            For any other reduction name.
        """
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the minimum value, computed with ``masked_reductions.min``."""
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the maximum value, computed with ``masked_reductions.max``."""
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Count occurrences of each value.

        The counts are cast to ``"Int64"`` and the resulting index is cast
        to this array's dtype.
        """
        from pandas import value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        """
        Return the size of the backing ndarray in bytes.

        With ``deep=True`` the result of ``lib.memory_usage_of_objects`` for
        the stored objects is added.
        """
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    def _cmp_method(self, other, op):
        """
        Apply binary operator ``op`` element-wise against ``other``.

        Positions where either operand is missing are not evaluated. For
        operators named in ``ops.ARITHMETIC_BINOPS`` the result is a
        StringArray holding ``pd.NA`` at those positions; for all other
        operators it is a BooleanArray with those positions masked.

        Raises
        ------
        ValueError
            If ``other`` is not a scalar and its length differs from
            ``len(self)``.
        """
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    # Arithmetic operators share the implementation above.
    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "PandasArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        """
        Map ``f`` over the non-missing elements.

        Parameters
        ----------
        f : callable
            Function applied to each non-missing element.
        na_value : scalar, optional
            Value used at missing positions; defaults to ``self.dtype.na_value``.
        dtype : Dtype, optional
            Expected result dtype; defaults to the python-storage StringDtype.
        convert : bool, default True
            Not read by this implementation.

        Returns
        -------
        IntegerArray, BooleanArray, StringArray or numpy.ndarray
            A masked integer/boolean array for integer/bool ``dtype``, a
            StringArray for string ``dtype``, otherwise the raw result of
            ``lib.map_infer_mask``.
        """
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                # Placeholder written at missing positions; those positions
                # stay masked in the returned array.
                na_value = 1
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                # A concrete fill value was supplied, so nothing is missing
                # in the result.
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))

Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/string_.py: 20%

241 statements