Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/dtypes/base.py: 51%

136 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Extend pandas with custom array types. 

3""" 

4from __future__ import annotations 

5 

6from typing import ( 

7 TYPE_CHECKING, 

8 Any, 

9 TypeVar, 

10 cast, 

11 overload, 

12) 

13 

14import numpy as np 

15 

16from pandas._libs import missing as libmissing 

17from pandas._libs.hashtable import object_hash 

18from pandas._typing import ( 

19 DtypeObj, 

20 Shape, 

21 npt, 

22 type_t, 

23) 

24from pandas.errors import AbstractMethodError 

25 

26from pandas.core.dtypes.generic import ( 

27 ABCDataFrame, 

28 ABCIndex, 

29 ABCSeries, 

30) 

31 

32if TYPE_CHECKING: 32 ↛ 33line 32 didn't jump to line 33, because the condition on line 32 was never true

33 from pandas.core.arrays import ExtensionArray 

34 

35 # To parameterize on same ExtensionDtype 

36 ExtensionDtypeT = TypeVar("ExtensionDtypeT", bound="ExtensionDtype") 

37 

38 

class ExtensionDtype:
    """
    A custom data type, to be paired with an ExtensionArray.

    See Also
    --------
    extensions.register_extension_dtype: Register an ExtensionType
        with pandas as class decorator.
    extensions.ExtensionArray: Abstract base class for custom 1-D array types.

    Notes
    -----
    The interface includes the following abstract methods that must
    be implemented by subclasses:

    * type
    * name
    * construct_array_type

    The following attributes and methods influence the behavior of the dtype in
    pandas operations

    * _is_numeric
    * _is_boolean
    * _get_common_dtype

    The `na_value` class attribute can be used to set the default NA value
    for this type. :attr:`numpy.nan` is used by default.

    ExtensionDtypes are required to be hashable. The base class provides
    a default implementation, which relies on the ``_metadata`` class
    attribute. ``_metadata`` should be a tuple containing the strings
    that define your data type. For example, with ``PeriodDtype`` that's
    the ``freq`` attribute.

    **If you have a parametrized dtype you should set the ``_metadata``
    class property**.

    Ideally, the attributes in ``_metadata`` will match the
    parameters to your ``ExtensionDtype.__init__`` (if any). If any of
    the attributes in ``_metadata`` don't implement the standard
    ``__eq__`` or ``__hash__``, the default implementations here will not
    work.

    For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method
    can be implemented: this method receives a pyarrow Array or ChunkedArray
    as only argument and is expected to return the appropriate pandas
    ExtensionArray for this dtype and the passed values::

        class ExtensionDtype:

            def __from_arrow__(
                self, array: Union[pyarrow.Array, pyarrow.ChunkedArray]
            ) -> ExtensionArray:
                ...

    This class does not inherit from 'abc.ABCMeta' for performance reasons.
    Methods and properties required by the interface raise
    ``pandas.errors.AbstractMethodError`` and no ``register`` method is
    provided for registering virtual subclasses.
    """

    # Names of the instance attributes that define this dtype; they drive
    # the default ``__eq__`` and ``__hash__`` below.
    _metadata: tuple[str, ...] = ()

    def __str__(self) -> str:
        """Return the dtype's ``name``."""
        return self.name

    def __eq__(self, other: Any) -> bool:
        """
        Check whether 'other' is equal to self.

        By default, 'other' is considered equal if either

        * it's a string matching 'self.name'.
        * it's an instance of this type and all of the attributes
          in ``self._metadata`` are equal between `self` and `other`.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, str):
            # A string is compared by first turning it into a dtype; a string
            # this class cannot be constructed from is simply "not equal".
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False
        if isinstance(other, type(self)):
            return all(
                getattr(self, attr) == getattr(other, attr) for attr in self._metadata
            )
        return False

    def __hash__(self) -> int:
        """Hash the tuple of attribute values named in ``_metadata``."""
        # for python>=3.10, different nan objects have different hashes
        # we need to avoid that and thus use hash function with old behavior
        return object_hash(tuple(getattr(self, attr) for attr in self._metadata))

    def __ne__(self, other: Any) -> bool:
        """Return the negation of ``__eq__``."""
        return not self.__eq__(other)

    @property
    def na_value(self) -> object:
        """
        Default NA value to use for this type.

        This is used in e.g. ExtensionArray.take. This should be the
        user-facing "boxed" version of the NA value, not the physical NA value
        for storage.  e.g. for JSONArray, this is an empty dictionary.
        """
        return np.nan

    @property
    def type(self) -> type_t[Any]:
        """
        The scalar type for the array, e.g. ``int``

        It's expected ``ExtensionArray[item]`` returns an instance
        of ``ExtensionDtype.type`` for scalar ``item``, assuming
        that value is valid (not NA). NA values do not need to be
        instances of `type`.
        """
        raise AbstractMethodError(self)

    @property
    def kind(self) -> str:
        """
        A character code (one of 'biufcmMOSUV'), default 'O'

        This should match the NumPy dtype used when the array is
        converted to an ndarray, which is probably 'O' for object if
        the extension type cannot be represented as a built-in NumPy
        type.

        See Also
        --------
        numpy.dtype.kind
        """
        return "O"

    @property
    def name(self) -> str:
        """
        A string identifying the data type.

        Will be used for display in, e.g. ``Series.dtype``
        """
        raise AbstractMethodError(self)

    @property
    def names(self) -> list[str] | None:
        """
        Ordered list of field names, or None if there are no fields.

        This is for compatibility with NumPy arrays, and may be removed in the
        future.
        """
        return None

    @classmethod
    def construct_array_type(cls) -> type_t[ExtensionArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        raise AbstractMethodError(cls)

    def empty(self, shape: Shape) -> ExtensionArray:
        """
        Construct an ExtensionArray of this dtype with the given shape.

        Analogous to numpy.empty.

        Parameters
        ----------
        shape : int or tuple[int]

        Returns
        -------
        ExtensionArray
        """
        # The array class does the allocation; an array instance (not the
        # class) is what gets returned.
        cls = self.construct_array_type()
        return cls._empty(shape, dtype=self)

    @classmethod
    def construct_from_string(
        cls: type_t[ExtensionDtypeT], string: str
    ) -> ExtensionDtypeT:
        r"""
        Construct this type from a string.

        This is useful mainly for data types that accept parameters.
        For example, a period dtype accepts a frequency parameter that
        can be set as ``period[H]`` (where H means hourly frequency).

        By default, in the abstract class, just the name of the type is
        expected. But subclasses can overwrite this method to accept
        parameters.

        Parameters
        ----------
        string : str
            The name of the type, for example ``category``.

        Returns
        -------
        ExtensionDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string'.

        Examples
        --------
        For extension dtypes with arguments the following may be an
        adequate implementation.

        >>> @classmethod
        ... def construct_from_string(cls, string):
        ...     pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
        ...     match = pattern.match(string)
        ...     if match:
        ...         return cls(**match.groupdict())
        ...     else:
        ...         raise TypeError(
        ...             f"Cannot construct a '{cls.__name__}' from '{string}'"
        ...         )
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        # error: Non-overlapping equality check (left operand type: "str", right
        #  operand type: "Callable[[ExtensionDtype], str]")  [comparison-overlap]
        assert isinstance(cls.name, str), (cls, type(cls.name))
        if string != cls.name:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
        return cls()

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Check if we match 'dtype'.

        Parameters
        ----------
        dtype : object
            The object to check.

        Returns
        -------
        bool

        Notes
        -----
        The default implementation is True if

        1. ``cls.construct_from_string(dtype)`` is an instance
           of ``cls``.
        2. ``dtype`` is an object and is an instance of ``cls``
        3. ``dtype`` has a ``dtype`` attribute, and any of the above
           conditions is true for ``dtype.dtype``.
        """
        # Unwrap containers: check the object's ``dtype`` attribute if it has one.
        dtype = getattr(dtype, "dtype", dtype)

        if isinstance(dtype, (ABCSeries, ABCIndex, ABCDataFrame, np.dtype)):
            # https://github.com/pandas-dev/pandas/issues/22960
            # avoid passing data to `construct_from_string`. This could
            # cause a FutureWarning from numpy about failing elementwise
            # comparison from, e.g., comparing DataFrame == 'category'.
            return False
        elif dtype is None:
            return False
        elif isinstance(dtype, cls):
            return True
        if isinstance(dtype, str):
            try:
                return cls.construct_from_string(dtype) is not None
            except TypeError:
                return False
        return False

    @property
    def _is_numeric(self) -> bool:
        """
        Whether columns with this dtype should be considered numeric.

        By default ExtensionDtypes are assumed to be non-numeric.
        They'll be excluded from operations that exclude non-numeric
        columns, like (groupby) reductions, plotting, etc.
        """
        return False

    @property
    def _is_boolean(self) -> bool:
        """
        Whether this dtype should be considered boolean.

        By default, ExtensionDtypes are assumed to be non-boolean.
        Setting this to True will affect the behavior of several places,
        e.g.

        * is_bool
        * boolean indexing

        Returns
        -------
        bool
        """
        return False

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        """
        Return the common dtype, if one exists.

        Used in `find_common_type` implementation. This is for example used
        to determine the resulting dtype in a concat operation.

        If no common dtype exists, return None (which gives the other dtypes
        the chance to determine a common dtype). If all dtypes in the list
        return None, then the common dtype will be "object" dtype (this means
        it is never needed to return "object" dtype from this method itself).

        Parameters
        ----------
        dtypes : list of dtypes
            The dtypes for which to determine a common dtype. This is a list
            of np.dtype or ExtensionDtype instances.

        Returns
        -------
        Common dtype (np.dtype or ExtensionDtype) or None
        """
        if len(set(dtypes)) == 1:
            # only itself
            return self
        else:
            return None

    @property
    def _can_hold_na(self) -> bool:
        """
        Can arrays of this dtype hold NA values?
        """
        return True

392 

393 

class StorageExtensionDtype(ExtensionDtype):
    """
    ExtensionDtype that may be backed by more than one implementation.

    The chosen backend is kept on ``storage``. Because ``"storage"`` is
    listed in ``_metadata``, it takes part in the inherited equality and
    hashing.
    """

    name: str
    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        # Stored as given; no validation happens here.
        self.storage = storage

    def __str__(self):
        """Return the bare dtype name, without the storage suffix."""
        return self.name

    def __repr__(self) -> str:
        """Return ``name[storage]``."""
        return f"{self.name}[{self.storage}]"

    @property
    def na_value(self) -> libmissing.NAType:
        """Missing-value marker for this dtype: ``libmissing.NA``."""
        return libmissing.NA

    def __eq__(self, other: Any) -> bool:
        """
        Compare with a string or another dtype.

        A string equal to ``self.name`` is equal whatever the storage is;
        every other operand is handled by the parent implementation.
        """
        is_own_name = isinstance(other, str) and other == self.name
        return True if is_own_name else super().__eq__(other)

    def __hash__(self) -> int:
        # custom __eq__ so have to override __hash__
        return super().__hash__()

421 

422 

def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
    """
    Class decorator that adds an ExtensionDtype class to the pandas registry.

    This enables operations like ``.astype(name)`` for the name
    of the ExtensionDtype.

    Parameters
    ----------
    cls : type
        The ExtensionDtype subclass being decorated.

    Returns
    -------
    type
        ``cls`` itself, so the decorated name still refers to the same class.

    Examples
    --------
    >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype
    >>> @register_extension_dtype
    ... class MyExtensionDtype(ExtensionDtype):
    ...     name = "myextension"
    """
    # Registration is a side effect on the module-level registry; the class
    # is handed back as received.
    _registry.register(cls)
    return cls

444 

445 

class Registry:
    """
    Ordered collection of extension dtype classes used for dtype inference.

    The registry maps the string form of an extension dtype to an extension
    dtype. The string alias can be used in several places, including

    * Series and Index constructors
    * :meth:`pandas.array`
    * :meth:`pandas.Series.astype`

    Any number of extension types may be registered; lookups try them in
    registration order.
    """

    def __init__(self) -> None:
        # Registered classes, oldest first.
        self.dtypes: list[type_t[ExtensionDtype]] = []

    def register(self, dtype: type_t[ExtensionDtype]) -> None:
        """
        Append an extension dtype class to the registry.

        Parameters
        ----------
        dtype : ExtensionDtype class

        Raises
        ------
        ValueError
            If ``dtype`` is not a subclass of ExtensionDtype.
        """
        if issubclass(dtype, ExtensionDtype):
            self.dtypes.append(dtype)
            return
        raise ValueError("can only register pandas extension dtypes")

    @overload
    def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
        ...

    @overload
    def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT:
        ...

    @overload
    def find(self, dtype: str) -> ExtensionDtype | None:
        ...

    @overload
    def find(
        self, dtype: npt.DTypeLike
    ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
        ...

    def find(
        self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike
    ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
        """
        Look up an extension dtype.

        Parameters
        ----------
        dtype : ExtensionDtype class or instance or str or numpy dtype or python type

        Returns
        -------
        return the first matching dtype, otherwise return None
        """
        if isinstance(dtype, str):
            # Strings are offered to each registered class in turn; the first
            # one that accepts the string wins.
            for candidate in self.dtypes:
                try:
                    return candidate.construct_from_string(dtype)
                except TypeError:
                    continue
            return None

        # Non-strings: an ExtensionDtype class or instance is passed straight
        # through, anything else is not a match.
        klass: type_t = dtype if isinstance(dtype, type) else type(dtype)
        if not issubclass(klass, ExtensionDtype):
            return None
        # cast needed here as mypy doesn't know we have figured
        # out it is an ExtensionDtype or type_t[ExtensionDtype]
        return cast("ExtensionDtype | type_t[ExtensionDtype]", dtype)


# Module-level registry instance.
_registry = Registry()

528_registry = Registry()