Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/dtypes/cast.py: 7%
812 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Routines for casting.
3"""
5from __future__ import annotations
7from datetime import (
8 date,
9 datetime,
10 timedelta,
11)
12import functools
13from typing import (
14 TYPE_CHECKING,
15 Any,
16 Sized,
17 TypeVar,
18 cast,
19 overload,
20)
21import warnings
23from dateutil.parser import ParserError
24import numpy as np
26from pandas._libs import lib
27from pandas._libs.tslibs import (
28 NaT,
29 OutOfBoundsDatetime,
30 OutOfBoundsTimedelta,
31 Timedelta,
32 Timestamp,
33 astype_overflowsafe,
34)
35from pandas._libs.tslibs.timedeltas import array_to_timedelta64
36from pandas._typing import (
37 ArrayLike,
38 Dtype,
39 DtypeObj,
40 Scalar,
41)
42from pandas.errors import IntCastingNaNError
43from pandas.util._exceptions import find_stack_level
44from pandas.util._validators import validate_bool_kwarg
46from pandas.core.dtypes.astype import astype_nansafe
47from pandas.core.dtypes.common import (
48 DT64NS_DTYPE,
49 TD64NS_DTYPE,
50 ensure_int8,
51 ensure_int16,
52 ensure_int32,
53 ensure_int64,
54 ensure_object,
55 ensure_str,
56 is_bool,
57 is_bool_dtype,
58 is_complex,
59 is_complex_dtype,
60 is_datetime64_dtype,
61 is_datetime64tz_dtype,
62 is_dtype_equal,
63 is_extension_array_dtype,
64 is_float,
65 is_float_dtype,
66 is_integer,
67 is_integer_dtype,
68 is_numeric_dtype,
69 is_object_dtype,
70 is_scalar,
71 is_string_dtype,
72 is_timedelta64_dtype,
73 is_unsigned_integer_dtype,
74 pandas_dtype,
75)
76from pandas.core.dtypes.dtypes import (
77 CategoricalDtype,
78 DatetimeTZDtype,
79 ExtensionDtype,
80 IntervalDtype,
81 PeriodDtype,
82)
83from pandas.core.dtypes.generic import (
84 ABCExtensionArray,
85 ABCIndex,
86 ABCSeries,
87)
88from pandas.core.dtypes.inference import is_list_like
89from pandas.core.dtypes.missing import (
90 array_equivalent,
91 is_valid_na_for_dtype,
92 isna,
93 na_value_for_dtype,
94 notna,
95)
97if TYPE_CHECKING: 97 ↛ 99line 97 didn't jump to line 99, because the condition on line 97 was never true
99 from pandas import Index
100 from pandas.core.arrays import (
101 Categorical,
102 DatetimeArray,
103 ExtensionArray,
104 IntervalArray,
105 PeriodArray,
106 TimedeltaArray,
107 )
# Cached max values for each signed-integer width; used by
# coerce_indexer_dtype to pick the smallest dtype that can index categories.
_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max
_int64_max = np.iinfo(np.int64).max

# Cached object dtype; compared against repeatedly throughout this module.
_dtype_obj = np.dtype(object)

# TypeVar bound to np.ndarray so functions can declare that they return the
# same ndarray type they were given.
NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray)
def maybe_convert_platform(
    values: list | tuple | range | np.ndarray | ExtensionArray,
) -> ArrayLike:
    """try to do platform conversion, allow ndarray or list here"""
    # Builtin sequences get wrapped in a 1-D object ndarray first; anything
    # else is expected to already be an ndarray or ExtensionArray.
    result: ArrayLike
    if not isinstance(values, (list, tuple, range)):
        result = values
    else:
        result = construct_1d_object_array_from_listlike(values)

    # Object dtype gets a soft-conversion pass so that e.g. a list of ints
    # comes out as an integer ndarray rather than object.
    if result.dtype == _dtype_obj:
        narrowed = cast(np.ndarray, result)
        result = lib.maybe_convert_objects(narrowed)

    return result
def is_nested_object(obj) -> bool:
    """
    Check whether ``obj`` is an object-dtype Series holding at least one
    Series element (i.e. a "nested" object).

    Notes
    -----
    This is not necessarily performant: it may scan every element.
    """
    if not isinstance(obj, ABCSeries):
        return False
    if not is_object_dtype(obj.dtype):
        return False
    return any(isinstance(elem, ABCSeries) for elem in obj._values)
def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar:
    """
    Wrap a datetime-like scalar in a pandas Timestamp/Timedelta, unless the
    target dtype is object, in which case the value is passed through as-is.

    Parameters
    ----------
    value : scalar
    dtype : Dtype, optional

    Returns
    -------
    scalar
    """
    if dtype != _dtype_obj:
        if isinstance(value, (np.datetime64, datetime)):
            value = Timestamp(value)
        elif isinstance(value, (np.timedelta64, timedelta)):
            value = Timedelta(value)

    return value
def maybe_box_native(value: Scalar) -> Scalar:
    """
    Convert a scalar to its python-native equivalent where one exists.

    Parameters
    ----------
    value : scalar or Series

    Returns
    -------
    scalar or Series
    """
    # Order matters: numpy bools are not floats/ints, but numpy ints would
    # also satisfy is_float if checked with plain isinstance, so use the
    # pandas inference helpers.
    if is_float(value):
        return float(value)  # type: ignore[arg-type]
    if is_integer(value):
        return int(value)  # type: ignore[arg-type]
    if is_bool(value):
        return bool(value)
    if isinstance(value, (np.datetime64, np.timedelta64)):
        return maybe_box_datetimelike(value)
    return value
def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar:
    """
    Convert a Timedelta or Timestamp to timedelta64 or datetime64 for setting
    into a numpy array. Failing to unbox would risk dropping nanoseconds.

    Notes
    -----
    Caller is responsible for checking dtype.kind in ["m", "M"]
    """
    if is_valid_na_for_dtype(value, dtype):
        # GH#36541: can't fill array directly with pd.NaT
        # > np.empty(10, dtype="datetime64[64]").fill(pd.NaT)
        # ValueError: cannot convert float NaN to integer
        value = dtype.type("NaT", "ns")
    elif isinstance(value, Timestamp):
        if value.tz is None:
            # tz-naive Timestamp unboxes losslessly to datetime64[ns]
            value = value.to_datetime64()
        elif not isinstance(dtype, DatetimeTZDtype):
            raise TypeError("Cannot unbox tzaware Timestamp to tznaive dtype")
    elif isinstance(value, Timedelta):
        value = value.to_timedelta64()

    # reject e.g. placing a datetime64 into a timedelta64 array
    _disallow_mismatched_datetimelike(value, dtype)
    return value
234def _disallow_mismatched_datetimelike(value, dtype: DtypeObj):
235 """
236 numpy allows np.array(dt64values, dtype="timedelta64[ns]") and
237 vice-versa, but we do not want to allow this, so we need to
238 check explicitly
239 """
240 vdtype = getattr(value, "dtype", None)
241 if vdtype is None:
242 return
243 elif (vdtype.kind == "m" and dtype.kind == "M") or (
244 vdtype.kind == "M" and dtype.kind == "m"
245 ):
246 raise TypeError(f"Cannot cast {repr(value)} to {dtype}")
@overload
def maybe_downcast_to_dtype(result: np.ndarray, dtype: str | np.dtype) -> np.ndarray:
    # overload: an ndarray input always comes back as an ndarray
    ...


@overload
def maybe_downcast_to_dtype(result: ExtensionArray, dtype: str | np.dtype) -> ArrayLike:
    # overload: an ExtensionArray input may come back as EA or ndarray
    ...
def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLike:
    """
    try to cast to the specified dtype (e.g. convert back to bool/int
    or could be an astype of float64->float32
    """
    do_round = False

    if isinstance(dtype, str):
        if dtype == "infer":
            # Map the inferred kind of `result` to a concrete numpy dtype
            # string; anything unrecognized falls back to object.
            inferred_type = lib.infer_dtype(result, skipna=False)
            if inferred_type == "boolean":
                dtype = "bool"
            elif inferred_type == "integer":
                dtype = "int64"
            elif inferred_type == "datetime64":
                dtype = "datetime64[ns]"
            elif inferred_type in ["timedelta", "timedelta64"]:
                dtype = "timedelta64[ns]"

            # try to upcast here
            elif inferred_type == "floating":
                dtype = "int64"
                if issubclass(result.dtype.type, np.number):
                    # floats that round-trip to int need rounding first
                    do_round = True

            else:
                # TODO: complex? what if result is already non-object?
                dtype = "object"

        dtype = np.dtype(dtype)

    if not isinstance(dtype, np.dtype):
        # enforce our signature annotation
        raise TypeError(dtype)  # pragma: no cover

    converted = maybe_downcast_numeric(result, dtype, do_round)
    if converted is not result:
        # the numeric path produced something new; we are done
        return converted

    # a datetimelike
    # GH12821, iNaT is cast to float
    if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
        result = result.astype(dtype)

    elif dtype.kind == "m" and result.dtype == _dtype_obj:
        # test_where_downcast_to_td64
        result = cast(np.ndarray, result)
        result = array_to_timedelta64(result)

    elif dtype == np.dtype("M8[ns]") and result.dtype == _dtype_obj:
        return np.asarray(maybe_cast_to_datetime(result, dtype=dtype))

    return result
@overload
def maybe_downcast_numeric(
    result: np.ndarray, dtype: np.dtype, do_round: bool = False
) -> np.ndarray:
    # overload: ndarray in -> ndarray out
    ...


@overload
def maybe_downcast_numeric(
    result: ExtensionArray, dtype: DtypeObj, do_round: bool = False
) -> ArrayLike:
    # overload: ExtensionArray in -> EA or ndarray out
    ...
def maybe_downcast_numeric(
    result: ArrayLike, dtype: DtypeObj, do_round: bool = False
) -> ArrayLike:
    """
    Subset of maybe_downcast_to_dtype restricted to numeric dtypes.

    Returns ``result`` unchanged whenever the downcast would lose
    information; otherwise returns ``result`` converted to ``dtype``.

    Parameters
    ----------
    result : ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    do_round : bool
        Round floats before comparing (used when inferring int from float).

    Returns
    -------
    ndarray or ExtensionArray
    """
    if not isinstance(dtype, np.dtype) or not isinstance(result.dtype, np.dtype):
        # e.g. SparseDtype has no itemsize attr
        return result

    def trans(x):
        # optional pre-rounding so e.g. 2.0000001 -> 2 comparisons succeed
        if do_round:
            return x.round()
        return x

    if dtype.kind == result.dtype.kind:
        # don't allow upcasts here (except if empty)
        if result.dtype.itemsize <= dtype.itemsize and result.size:
            return result

    if is_bool_dtype(dtype) or is_integer_dtype(dtype):

        if not result.size:
            # if we don't have any elements, just astype it
            return trans(result).astype(dtype)

        # do a test on the first element, if it fails then we are done
        r = result.ravel()
        arr = np.array([r[0]])

        if isna(arr).any():
            # if we have any nulls, then we are done
            return result

        elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)):
            # a comparable, e.g. a Decimal may slip in here
            return result

        if (
            issubclass(result.dtype.type, (np.object_, np.number))
            and notna(result).all()
        ):
            new_result = trans(result).astype(dtype)
            if new_result.dtype.kind == "O" or result.dtype.kind == "O":
                # np.allclose may raise TypeError on object-dtype
                if (new_result == result).all():
                    return new_result
            else:
                # exact equality only (rtol=0) — no loss allowed
                if np.allclose(new_result, result, rtol=0):
                    return new_result

    elif (
        issubclass(dtype.type, np.floating)
        and not is_bool_dtype(result.dtype)
        and not is_string_dtype(result.dtype)
    ):
        new_result = result.astype(dtype)

        # Adjust tolerances based on floating point size
        size_tols = {4: 5e-4, 8: 5e-8, 16: 5e-16}

        atol = size_tols.get(new_result.dtype.itemsize, 0.0)

        # Check downcast float values are still equal within 7 digits when
        # converting from float64 to float32
        if np.allclose(new_result, result, equal_nan=True, rtol=0.0, atol=atol):
            return new_result

    elif dtype.kind == result.dtype.kind == "c":
        new_result = result.astype(dtype)

        if array_equivalent(new_result, result):
            # TODO: use tolerance like we do for float?
            return new_result

    return result
def maybe_cast_pointwise_result(
    result: ArrayLike,
    dtype: DtypeObj,
    numeric_only: bool = False,
    same_dtype: bool = True,
) -> ArrayLike:
    """
    Try casting result of a pointwise operation back to the original dtype if
    appropriate.

    Parameters
    ----------
    result : array-like
        Result to cast.
    dtype : np.dtype or ExtensionDtype
        Dtype of the input from which result was calculated.
    numeric_only : bool, default False
        Whether to cast only numerics or datetimes as well.
    same_dtype : bool, default True
        Specify dtype when calling _from_sequence

    Returns
    -------
    result : array-like
        result maybe casted to the dtype.
    """
    assert not is_scalar(result)

    if isinstance(dtype, ExtensionDtype):
        if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
            # TODO: avoid this special-casing
            # We have to special case categorical so as not to upcast
            # things like counts back to categorical

            cls = dtype.construct_array_type()
            if same_dtype:
                # pin the exact target dtype when reconstructing the EA
                result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
            else:
                result = maybe_cast_to_extension_array(cls, result)

    elif (numeric_only and is_numeric_dtype(dtype)) or not numeric_only:
        result = maybe_downcast_to_dtype(result, dtype)

    return result
def maybe_cast_to_extension_array(
    cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None
) -> ArrayLike:
    """
    Attempt ``cls._from_sequence(obj, dtype=dtype)``; any failure hands the
    input back unchanged.

    Parameters
    ----------
    cls : class, subclass of ExtensionArray
    obj : arraylike
        Values to pass to cls._from_sequence
    dtype : ExtensionDtype, optional

    Returns
    -------
    ExtensionArray or obj
    """
    from pandas.core.arrays.string_ import BaseStringArray

    assert isinstance(cls, type), f"must pass a type: {cls}"
    assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
    assert issubclass(cls, ABCExtensionArray), assertion_msg

    # Everything can be converted to StringArrays, but we may not want to convert
    if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string":
        return obj

    try:
        return cls._from_sequence(obj, dtype=dtype)
    except Exception:
        # We can't predict what downstream EA constructors may raise
        return obj
@overload
def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype:
    # overload: a numpy dtype maps to a numpy dtype
    ...


@overload
def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype:
    # overload as declared upstream; NOTE(review): the implementation can
    # also return object dtype for an EA dtype that cannot hold NA
    ...
def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj:
    """
    Return ``dtype`` if it can already hold NA values, otherwise the closest
    dtype that can.
    """
    if isinstance(dtype, ExtensionDtype):
        if dtype._can_hold_na:
            return dtype
        if isinstance(dtype, IntervalDtype):
            # TODO(GH#45349): don't special-case IntervalDtype, allow
            # overriding instead of returning object below.
            return IntervalDtype(np.float64, closed=dtype.closed)
        return _dtype_obj

    kind = dtype.kind
    if kind == "b":
        # bool has no NA representation -> fall back to object
        return _dtype_obj
    if kind in ("i", "u"):
        # integers are promoted to float64 so NaN fits
        return np.dtype(np.float64)
    return dtype
def maybe_promote(dtype: np.dtype, fill_value=np.nan):
    """
    Find the minimal dtype that can hold both the given dtype and fill_value.

    Parameters
    ----------
    dtype : np.dtype
    fill_value : scalar, default np.nan

    Returns
    -------
    dtype
        Upcasted from dtype argument if necessary.
    fill_value
        Upcasted from fill_value argument if necessary.

    Raises
    ------
    ValueError
        If fill_value is a non-scalar and dtype is not object.
    """
    # TODO(2.0): need to directly use the non-cached version as long as we
    # possibly raise a deprecation warning for datetime dtype
    if dtype.kind == "M":
        return _maybe_promote(dtype, fill_value)
    # for performance, we are using a cached version of the actual implementation
    # of the function in _maybe_promote. However, this doesn't always work (in case
    # of non-hashable arguments), so we fallback to the actual implementation if needed
    try:
        # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type
        # "Type[Any]"; expected "Hashable"  [arg-type]
        # type(fill_value) is part of the cache key so that 1 and True
        # (equal, same hash) do not collide in the cache.
        return _maybe_promote_cached(
            dtype, fill_value, type(fill_value)  # type: ignore[arg-type]
        )
    except TypeError:
        # if fill_value is not hashable (required for caching)
        return _maybe_promote(dtype, fill_value)
@functools.lru_cache(maxsize=128)
def _maybe_promote_cached(dtype, fill_value, fill_value_type):
    # Cached wrapper around _maybe_promote below.
    # fill_value_type is an extra (otherwise unused) argument that is folded
    # into the lru_cache key so that equal-but-distinct values such as
    # 1 and True get separate cache entries.
    return _maybe_promote(dtype, fill_value)
def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
    # The actual implementation of the function, use `maybe_promote` above for
    # a cached version.
    if not is_scalar(fill_value):
        # with object dtype there is nothing to promote, and the user can
        # pass pretty much any weird fill_value they like; for any other
        # dtype a non-scalar fill_value is invalid
        if not is_object_dtype(dtype):
            raise ValueError("fill_value must be a scalar")
        dtype = _dtype_obj
        return dtype, fill_value

    # NA-like fill against a numeric/datetimelike dtype: promote the dtype
    # so it can hold NA, and use that dtype's native NA value.
    kinds = ["i", "u", "f", "c", "m", "M"]
    if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in kinds:
        dtype = ensure_dtype_can_hold_na(dtype)
        fv = na_value_for_dtype(dtype)
        return dtype, fv

    elif isinstance(dtype, CategoricalDtype):
        if fill_value in dtype.categories or isna(fill_value):
            # fill_value is representable in the existing categories
            return dtype, fill_value
        else:
            return object, ensure_object(fill_value)

    elif isna(fill_value):
        # NA-like fill against a dtype not covered above -> object
        dtype = _dtype_obj
        if fill_value is None:
            # but we retain e.g. pd.NA
            fill_value = np.nan
        return dtype, fill_value

    # returns tuple of (dtype, fill_value)
    if issubclass(dtype.type, np.datetime64):
        inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
        if inferred == dtype:
            return dtype, fv

        # TODO(2.0): once this deprecation is enforced, this whole case
        # becomes equivalent to:
        #  dta = DatetimeArray._from_sequence([], dtype="M8[ns]")
        #  try:
        #      fv = dta._validate_setitem_value(fill_value)
        #      return dta.dtype, fv
        #  except (ValueError, TypeError):
        #      return _dtype_obj, fill_value
        if isinstance(fill_value, date) and not isinstance(fill_value, datetime):
            # deprecate casting of date object to match infer_dtype_from_scalar
            #  and DatetimeArray._validate_setitem_value
            try:
                fv = Timestamp(fill_value).to_datetime64()
            except OutOfBoundsDatetime:
                pass
            else:
                warnings.warn(
                    "Using a `date` object for fill_value with `datetime64[ns]` "
                    "dtype is deprecated. In a future version, this will be cast "
                    "to object dtype. Pass `fill_value=Timestamp(date_obj)` instead.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
                return dtype, fv
        elif isinstance(fill_value, str):
            try:
                # explicitly wrap in str to convert np.str_
                fv = Timestamp(str(fill_value))
            except (ValueError, TypeError):
                pass
            else:
                # tz-aware string timestamps do not fit a naive M8[ns] dtype
                if isna(fv) or fv.tz is None:
                    return dtype, fv.asm8

        return np.dtype("object"), fill_value

    elif issubclass(dtype.type, np.timedelta64):
        inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
        if inferred == dtype:
            return dtype, fv

        return np.dtype("object"), fill_value

    elif is_float(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            dtype = np.dtype(np.float64)

        elif dtype.kind == "f":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.float64 and dtype is np.float32
                dtype = mst

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

    elif is_bool(fill_value):
        if not issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

    elif is_integer(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            if not np.can_cast(fill_value, dtype):
                # upcast to prevent overflow
                mst = np.min_scalar_type(fill_value)
                dtype = np.promote_types(dtype, mst)
                if dtype.kind == "f":
                    # Case where we disagree with numpy
                    dtype = np.dtype(np.object_)

    elif is_complex(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, (np.integer, np.floating)):
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.complex128 and dtype is np.complex64
                dtype = mst

    else:
        dtype = np.dtype(np.object_)

    # in case we have a string that looked like a number
    if issubclass(dtype.type, (bytes, str)):
        dtype = np.dtype(np.object_)

    fill_value = _ensure_dtype_type(fill_value, dtype)
    return dtype, fill_value
714def _ensure_dtype_type(value, dtype: np.dtype):
715 """
716 Ensure that the given value is an instance of the given dtype.
718 e.g. if out dtype is np.complex64_, we should have an instance of that
719 as opposed to a python complex object.
721 Parameters
722 ----------
723 value : object
724 dtype : np.dtype
726 Returns
727 -------
728 object
729 """
730 # Start with exceptions in which we do _not_ cast to numpy types
732 if dtype == _dtype_obj:
733 return value
735 # Note: before we get here we have already excluded isna(value)
736 return dtype.type(value)
def infer_dtype_from(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar or array by dispatching to the
    appropriate helper.

    Parameters
    ----------
    val : object
    pandas_dtype : bool, default False
        Whether to infer dtype including pandas extension types; when False,
        values of pandas extension types are inferred as object.
    """
    if is_list_like(val):
        return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
    return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar.

    Parameters
    ----------
    val : object
        The scalar (or 0-d ndarray) to infer from.
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, scalar belongs to pandas extension types is inferred as
        object

    Returns
    -------
    tuple of (dtype, possibly-unboxed value)
    """
    dtype: DtypeObj = _dtype_obj

    # a 1-element ndarray
    if isinstance(val, np.ndarray):
        if val.ndim != 0:
            msg = "invalid ndarray passed to infer_dtype_from_scalar"
            raise ValueError(msg)

        dtype = val.dtype
        val = lib.item_from_zerodim(val)

    elif isinstance(val, str):

        # If we create an empty array using a string to infer
        # the dtype, NumPy will only allocate one character per entry
        # so this is kind of bad. Alternately we could use np.repeat
        # instead of np.empty (but then you still don't want things
        # coming out as np.str_!

        dtype = _dtype_obj

    elif isinstance(val, (np.datetime64, datetime)):
        try:
            val = Timestamp(val)
        except OutOfBoundsDatetime:
            # too wide for datetime64[ns]: keep as object
            return _dtype_obj, val

        # error: Non-overlapping identity check (left operand type: "Timestamp",
        # right operand type: "NaTType")
        if val is NaT or val.tz is None:  # type: ignore[comparison-overlap]
            dtype = np.dtype("M8[ns]")
            val = val.to_datetime64()
        else:
            if pandas_dtype:
                dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
            else:
                # return datetimetz as object
                return _dtype_obj, val

    elif isinstance(val, (np.timedelta64, timedelta)):
        try:
            val = Timedelta(val)
        except (OutOfBoundsTimedelta, OverflowError):
            # too wide for timedelta64[ns]: keep as object
            dtype = _dtype_obj
        else:
            dtype = np.dtype("m8[ns]")
            val = np.timedelta64(val.value, "ns")

    elif is_bool(val):
        dtype = np.dtype(np.bool_)

    elif is_integer(val):
        if isinstance(val, np.integer):
            # preserve the width of a numpy integer scalar
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.int64)

        try:
            np.array(val, dtype=dtype)
        except OverflowError:
            # e.g. a python int wider than int64: let numpy pick
            dtype = np.array(val).dtype

    elif is_float(val):
        if isinstance(val, np.floating):
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.float64)

    elif is_complex(val):
        dtype = np.dtype(np.complex_)

    elif pandas_dtype:
        if lib.is_period(val):
            dtype = PeriodDtype(freq=val.freq)
        elif lib.is_interval(val):
            subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
            dtype = IntervalDtype(subtype=subtype, closed=val.closed)

    return dtype, val
def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
    """
    Build a copy of ``d`` whose datetime-like keys are boxed as Timestamps.

    Parameters
    ----------
    d: dict-like object

    Returns
    -------
    dict
    """
    result = {}
    for key, value in d.items():
        result[maybe_box_datetimelike(key)] = value
    return result
def infer_dtype_from_array(
    arr, pandas_dtype: bool = False
) -> tuple[DtypeObj, ArrayLike]:
    """
    Infer the dtype of an array-like without stringify-coercing mixed data.

    Parameters
    ----------
    arr : array
    pandas_dtype : bool, default False
        Whether to infer dtype including pandas extension types; when False,
        arrays of pandas extension types are inferred as object.

    Returns
    -------
    tuple (numpy-compat/pandas-compat dtype, array)

    Notes
    -----
    With pandas_dtype=False these infer to numpy dtypes exactly, except that
    mixed / object data is NOT coerced by stringifying. With
    pandas_dtype=True, datetime64tz-aware/categorical types keep their
    character.

    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (dtype('O'), [1, '1'])
    """
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    if not is_list_like(arr):
        raise TypeError("'arr' must be list-like")

    if pandas_dtype and is_extension_array_dtype(arr):
        return arr.dtype, arr

    if isinstance(arr, ABCSeries):
        return arr.dtype, np.asarray(arr)

    # classify without coercing so nan-mixed data stays object
    kind = lib.infer_dtype(arr, skipna=False)
    if kind in ("string", "bytes", "mixed", "mixed-integer"):
        return np.dtype(np.object_), arr

    coerced = np.asarray(arr)
    return coerced.dtype, coerced
919def _maybe_infer_dtype_type(element):
920 """
921 Try to infer an object's dtype, for use in arithmetic ops.
923 Uses `element.dtype` if that's available.
924 Objects implementing the iterator protocol are cast to a NumPy array,
925 and from there the array's type is used.
927 Parameters
928 ----------
929 element : object
930 Possibly has a `.dtype` attribute, and possibly the iterator
931 protocol.
933 Returns
934 -------
935 tipo : type
937 Examples
938 --------
939 >>> from collections import namedtuple
940 >>> Foo = namedtuple("Foo", "dtype")
941 >>> _maybe_infer_dtype_type(Foo(np.dtype("i8")))
942 dtype('int64')
943 """
944 tipo = None
945 if hasattr(element, "dtype"):
946 tipo = element.dtype
947 elif is_list_like(element):
948 element = np.asarray(element)
949 tipo = element.dtype
950 return tipo
def maybe_upcast(
    values: NumpyArrayT,
    fill_value: Scalar = np.nan,
    copy: bool = False,
) -> tuple[NumpyArrayT, Scalar]:
    """
    Provide explicit type promotion and coercion.

    Parameters
    ----------
    values : np.ndarray
        The array that we may want to upcast.
    fill_value : what we want to fill with
    copy : bool, default False
        If True always make a copy even if no upcast is required.

    Returns
    -------
    values: np.ndarray
        the original array, possibly upcast
    fill_value:
        the fill value, possibly upcast
    """
    new_dtype, new_fill = maybe_promote(values.dtype, fill_value)
    # astype copies in every case except (dtype unchanged and copy=False)
    promoted = values.astype(new_dtype, copy=copy)

    # error: Incompatible return value type (got "Tuple[ndarray[Any, dtype[Any]],
    # Union[Union[str, int, float, bool] Union[Period, Timestamp, Timedelta, Any]]]",
    # expected "Tuple[NumpyArrayT, Union[Union[str, int, float, bool], Union[Period,
    # Timestamp, Timedelta, Any]]]")
    return promoted, new_fill  # type: ignore[return-value]
def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
    """
    Raise if ``dtype_set`` contains a string-like dtype; used by
    ``DataFrame.select_dtypes()``.
    """
    # error: Argument 1 to <set> has incompatible type "Type[generic]"; expected
    # "Union[dtype[Any], ExtensionDtype, None]"
    # error: Argument 2 to <set> has incompatible type "Type[generic]"; expected
    # "Union[dtype[Any], ExtensionDtype, None]"
    string_types = {
        np.dtype("S").type,  # type: ignore[arg-type]
        np.dtype("<U").type,  # type: ignore[arg-type]
    }
    if dtype_set & string_types:
        raise TypeError("string dtypes are not allowed, use 'object' instead")
def coerce_indexer_dtype(indexer, categories) -> np.ndarray:
    """coerce the indexer input array to the smallest dtype possible"""
    n = len(categories)
    # walk the widths smallest-first and take the first that can index
    # all the categories
    for bound, coerce in (
        (_int8_max, ensure_int8),
        (_int16_max, ensure_int16),
        (_int32_max, ensure_int32),
    ):
        if n < bound:
            return coerce(indexer)
    return ensure_int64(indexer)
def soft_convert_objects(
    values: np.ndarray,
    datetime: bool = True,
    numeric: bool = True,
    timedelta: bool = True,
    period: bool = True,
    copy: bool = True,
) -> ArrayLike:
    """
    Try to coerce datetime, timedelta, and numeric object-dtype columns
    to inferred dtype.

    Parameters
    ----------
    values : np.ndarray[object]
    datetime : bool, default True
    numeric: bool, default True
    timedelta : bool, default True
    period : bool, default True
    copy : bool, default True

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    validate_bool_kwarg(datetime, "datetime")
    validate_bool_kwarg(numeric, "numeric")
    validate_bool_kwarg(timedelta, "timedelta")
    validate_bool_kwarg(copy, "copy")
    # NOTE(review): `period` is not validated here and does not count toward
    # conversion_count below — presumably intentional, but confirm.

    conversion_count = sum((datetime, numeric, timedelta))
    if conversion_count == 0:
        raise ValueError("At least one of datetime, numeric or timedelta must be True.")

    # Soft conversions
    if datetime or timedelta:
        # GH 20380, when datetime is beyond year 2262, hence outside
        # bound of nanosecond-resolution 64-bit integers.
        try:
            converted = lib.maybe_convert_objects(
                values,
                convert_datetime=datetime,
                convert_timedelta=timedelta,
                convert_period=period,
            )
        except (OutOfBoundsDatetime, ValueError):
            return values
        if converted is not values:
            # datetimelike conversion succeeded; done
            return converted

    if numeric and is_object_dtype(values.dtype):
        converted, _ = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

        # If all NaNs, then do not-alter
        values = converted if not isna(converted).all() else values
        values = values.copy() if copy else values

    return values
def convert_dtypes(
    input_array: ArrayLike,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
    convert_floating: bool = True,
) -> DtypeObj:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Parameters
    ----------
    input_array : ExtensionArray or np.ndarray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, default True
        Whether object dtypes should be converted to ``BooleanDtypes()``.
    convert_floating : bool, default True
        Whether, if possible, conversion can be done to floating extension types.
        If `convert_integer` is also True, preference will be given to integer
        dtypes if the floats can be faithfully cast to integers.

    Returns
    -------
    np.dtype, or ExtensionDtype
    """
    inferred_dtype: str | DtypeObj

    # Only ndarrays are candidates; ExtensionArrays keep their dtype, as does
    # everything when all conversion flags are off.
    if (
        convert_string or convert_integer or convert_boolean or convert_floating
    ) and isinstance(input_array, np.ndarray):

        if is_object_dtype(input_array.dtype):
            # classify the object contents (e.g. "string", "boolean", ...)
            inferred_dtype = lib.infer_dtype(input_array)
        else:
            inferred_dtype = input_array.dtype

        if is_string_dtype(inferred_dtype):
            if not convert_string or inferred_dtype == "bytes":
                return input_array.dtype
            else:
                return pandas_dtype("string")

        if convert_integer:
            target_int_dtype = pandas_dtype("Int64")

            if is_integer_dtype(input_array.dtype):
                from pandas.core.arrays.integer import INT_STR_TO_DTYPE

                # map e.g. int32 -> Int32, falling back to Int64
                inferred_dtype = INT_STR_TO_DTYPE.get(
                    input_array.dtype.name, target_int_dtype
                )
            elif is_numeric_dtype(input_array.dtype):
                # TODO: de-dup with maybe_cast_to_integer_array?
                arr = input_array[notna(input_array)]
                if (arr.astype(int) == arr).all():
                    # floats that are all whole numbers -> nullable integer
                    inferred_dtype = target_int_dtype
                else:
                    inferred_dtype = input_array.dtype

        if convert_floating:
            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
                input_array.dtype
            ):
                from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

                inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get(
                    input_array.dtype.name, pandas_dtype("Float64")
                )
                # if we could also convert to integer, check if all floats
                # are actually integers
                if convert_integer:
                    # TODO: de-dup with maybe_cast_to_integer_array?
                    arr = input_array[notna(input_array)]
                    if (arr.astype(int) == arr).all():
                        inferred_dtype = pandas_dtype("Int64")
                    else:
                        inferred_dtype = inferred_float_dtype
                else:
                    inferred_dtype = inferred_float_dtype

        if convert_boolean:
            if is_bool_dtype(input_array.dtype):
                inferred_dtype = pandas_dtype("boolean")
            elif isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
                inferred_dtype = pandas_dtype("boolean")

        if isinstance(inferred_dtype, str):
            # If we couldn't do anything else, then we retain the dtype
            inferred_dtype = input_array.dtype

    else:
        return input_array.dtype

    # error: Incompatible return value type (got "Union[str, Union[dtype[Any],
    # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
    return inferred_dtype  # type: ignore[return-value]
def maybe_infer_to_datetimelike(
    value: np.ndarray,
) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
    """
    Infer a datetimelike array type for an object-dtype ndarray when no
    dtype was passed.

    The input is returned unchanged unless a datetime/timedelta set is
    found; this is pretty strict in that a datetime/timedelta is REQUIRED
    in addition to possible nulls/string likes.

    Parameters
    ----------
    value : np.ndarray[object]

    Returns
    -------
    np.ndarray, DatetimeArray, TimedeltaArray, PeriodArray, or IntervalArray
    """
    if not isinstance(value, np.ndarray) or value.dtype != object:
        # Caller is responsible for passing only ndarray[object]
        raise TypeError(type(value))  # pragma: no cover

    v = np.array(value, copy=False)

    # Remember the original shape; the helpers below work on 1-D data and
    # reshape on the way out.
    shape = v.shape
    if v.ndim != 1:
        v = v.ravel()

    if not len(v):
        # empty -> nothing to infer
        return value

    def try_datetime(v: np.ndarray) -> ArrayLike:
        # Coerce to datetime64, datetime64tz, or in corner cases
        # object[datetimes]; on failure, hand back the object data unchanged.
        from pandas.core.arrays.datetimes import sequence_to_datetimes

        try:
            # GH#19671 we pass require_iso8601 to be relatively strict
            # when parsing strings.
            dta = sequence_to_datetimes(v, require_iso8601=True)
        except (ValueError, TypeError):
            # e.g. <class 'numpy.timedelta64'> is not convertible to datetime
            return v.reshape(shape)
        else:
            # GH#19761 we may have mixed timezones, in which case 'dta' is
            # an ndarray[object]. Only 1 test relies on this behavior,
            # see GH#40111
            return dta.reshape(shape)

    def try_timedelta(v: np.ndarray) -> np.ndarray:
        # Safe coerce to timedelta64; tries a string & object conversion
        # first, returning the data unchanged if that fails.
        try:
            # bc we know v.dtype == object, this is equivalent to
            # `np.asarray(to_timedelta(v))`, but using a lower-level API that
            # does not require a circular import.
            td_values = array_to_timedelta64(v).view("m8[ns]")
        except (ValueError, OverflowError):
            return v.reshape(shape)
        else:
            return td_values.reshape(shape)

    # seen_str tells us whether string parsing contributed to the inference,
    # which feeds the deprecation warning below.
    inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v))
    if inferred_type in ["period", "interval"]:
        # Incompatible return value type (got "Union[ExtensionArray, ndarray]",
        # expected "Union[ndarray, DatetimeArray, TimedeltaArray, PeriodArray,
        # IntervalArray]")
        return lib.maybe_convert_objects(  # type: ignore[return-value]
            v, convert_period=True, convert_interval=True
        )

    if inferred_type == "datetime":
        # error: Incompatible types in assignment (expression has type "ExtensionArray",
        # variable has type "Union[ndarray, List[Any]]")
        value = try_datetime(v)  # type: ignore[assignment]
    elif inferred_type == "timedelta":
        value = try_timedelta(v)
    elif inferred_type == "nat":

        # if all NaT, return as datetime
        if isna(v).all():
            # error: Incompatible types in assignment (expression has type
            # "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
            value = try_datetime(v)  # type: ignore[assignment]
        else:

            # We have at least a NaT and a string
            # try timedelta first to avoid spurious datetime conversions
            # e.g. '00:00:01' is a timedelta but technically is also a datetime
            value = try_timedelta(v)
            if lib.infer_dtype(value, skipna=False) in ["mixed"]:
                # cannot skip missing values, as NaT implies that the string
                # is actually a datetime

                # error: Incompatible types in assignment (expression has type
                # "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
                value = try_datetime(v)  # type: ignore[assignment]

    if value.dtype.kind in ["m", "M"] and seen_str:
        # TODO(2.0): enforcing this deprecation should close GH#40111
        warnings.warn(
            f"Inferring {value.dtype} from data containing strings is deprecated "
            "and will be removed in a future version. To retain the old behavior "
            f"explicitly pass Series(data, dtype={value.dtype})",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    return value
def maybe_cast_to_datetime(
    value: ExtensionArray | np.ndarray | list, dtype: DtypeObj | None
) -> ExtensionArray | np.ndarray:
    """
    Try to cast the array/value to a datetimelike dtype, converting float
    nan to iNaT.

    We allow a list *only* when dtype is not None.

    Raises
    ------
    TypeError
        If ``value`` is not list-like.
    ValueError
        If ``value`` is a list and ``dtype`` is None.
    """
    from pandas.core.arrays.datetimes import sequence_to_datetimes
    from pandas.core.arrays.timedeltas import TimedeltaArray

    if not is_list_like(value):
        raise TypeError("value must be listlike")

    if is_timedelta64_dtype(dtype):
        # TODO: _from_sequence would raise ValueError in cases where
        # _ensure_nanosecond_dtype raises TypeError
        dtype = cast(np.dtype, dtype)
        # normalize e.g. m8[s] -> m8[ns] (or raise for unsupported units)
        dtype = _ensure_nanosecond_dtype(dtype)
        res = TimedeltaArray._from_sequence(value, dtype=dtype)
        return res

    if dtype is not None:
        is_datetime64 = is_datetime64_dtype(dtype)
        is_datetime64tz = is_datetime64tz_dtype(dtype)

        # capture the *original* dtype before the np.array coercion below
        vdtype = getattr(value, "dtype", None)

        if is_datetime64 or is_datetime64tz:
            dtype = _ensure_nanosecond_dtype(dtype)

            value = np.array(value, copy=False)

            # we have an array of datetime or timedeltas & nulls
            if value.size or not is_dtype_equal(value.dtype, dtype):
                _disallow_mismatched_datetimelike(value, dtype)

                try:
                    if is_datetime64:
                        dta = sequence_to_datetimes(value)
                        # GH 25843: Remove tz information since the dtype
                        # didn't specify one

                        if dta.tz is not None:
                            warnings.warn(
                                "Data is timezone-aware. Converting "
                                "timezone-aware data to timezone-naive by "
                                "passing dtype='datetime64[ns]' to "
                                "DataFrame or Series is deprecated and will "
                                "raise in a future version. Use "
                                "`pd.Series(values).dt.tz_localize(None)` "
                                "instead.",
                                FutureWarning,
                                stacklevel=find_stack_level(),
                            )
                            # equiv: dta.view(dtype)
                            # Note: NOT equivalent to dta.astype(dtype)
                            dta = dta.tz_localize(None)

                        value = dta
                    elif is_datetime64tz:
                        dtype = cast(DatetimeTZDtype, dtype)
                        # The string check can be removed once issue #13712
                        # is solved. String data that is passed with a
                        # datetime64tz is assumed to be naive which should
                        # be localized to the timezone.
                        is_dt_string = is_string_dtype(value.dtype)
                        dta = sequence_to_datetimes(value)
                        if dta.tz is not None:
                            value = dta.astype(dtype, copy=False)
                        elif is_dt_string:
                            # Strings here are naive, so directly localize
                            # equiv: dta.astype(dtype)  # though deprecated

                            value = dta.tz_localize(dtype.tz)
                        else:
                            # Numeric values are UTC at this point,
                            # so localize and convert
                            # equiv: Series(dta).astype(dtype)  # though deprecated
                            if getattr(vdtype, "kind", None) == "M":
                                # GH#24559, GH#33401 deprecate behavior inconsistent
                                # with DatetimeArray/DatetimeIndex
                                warnings.warn(
                                    "In a future version, constructing a Series "
                                    "from datetime64[ns] data and a "
                                    "DatetimeTZDtype will interpret the data "
                                    "as wall-times instead of "
                                    "UTC times, matching the behavior of "
                                    "DatetimeIndex. To treat the data as UTC "
                                    "times, use pd.Series(data).dt"
                                    ".tz_localize('UTC').tz_convert(dtype.tz) "
                                    "or pd.Series(data.view('int64'), dtype=dtype)",
                                    FutureWarning,
                                    stacklevel=find_stack_level(),
                                )

                            value = dta.tz_localize("UTC").tz_convert(dtype.tz)
                except OutOfBoundsDatetime:
                    raise
                except ParserError:
                    # Note: this is dateutil's ParserError, not ours.
                    pass

        elif getattr(vdtype, "kind", None) in ["m", "M"]:
            # we are already datetimelike and want to coerce to non-datetimelike;
            # astype_nansafe will raise for anything other than object, then upcast.
            # see test_datetimelike_values_with_object_dtype
            # error: Argument 2 to "astype_nansafe" has incompatible type
            # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
            return astype_nansafe(value, dtype)  # type: ignore[arg-type]

    elif isinstance(value, np.ndarray):
        if value.dtype.kind in ["M", "m"]:
            # catch a datetime/timedelta that is not of ns variety
            # and no coercion specified
            value = sanitize_to_nanoseconds(value)

        elif value.dtype == _dtype_obj:
            # no dtype requested: infer datetimelike content if present
            value = maybe_infer_to_datetimelike(value)

    elif isinstance(value, list):
        # we only get here with dtype=None, which we do not allow
        raise ValueError(
            "maybe_cast_to_datetime allows a list *only* if dtype is not None"
        )

    # at this point we have converted or raised in all cases where we had a list
    return cast(ArrayLike, value)
def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarray:
    """
    Coerce non-nanosecond datetime64/timedelta64 data to nanosecond
    resolution, raising rather than overflowing silently.

    When no conversion is needed the input is returned as-is, or copied
    when ``copy=True``.
    """
    kind = values.dtype.kind
    if kind == "M" and values.dtype != DT64NS_DTYPE:
        return astype_overflowsafe(values, dtype=DT64NS_DTYPE)
    if kind == "m" and values.dtype != TD64NS_DTYPE:
        return astype_overflowsafe(values, dtype=TD64NS_DTYPE)
    if copy:
        return values.copy()
    return values
1439def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
1440 """
1441 Convert dtypes with granularity less than nanosecond to nanosecond
1443 >>> _ensure_nanosecond_dtype(np.dtype("M8[s]"))
1444 dtype('<M8[ns]')
1446 >>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
1447 Traceback (most recent call last):
1448 ...
1449 TypeError: cannot convert timedeltalike to dtype [timedelta64[ps]]
1450 """
1451 msg = (
1452 f"The '{dtype.name}' dtype has no unit. "
1453 f"Please pass in '{dtype.name}[ns]' instead."
1454 )
1456 # unpack e.g. SparseDtype
1457 dtype = getattr(dtype, "subtype", dtype)
1459 if not isinstance(dtype, np.dtype):
1460 # i.e. datetime64tz
1461 pass
1463 elif dtype.kind == "M" and dtype != DT64NS_DTYPE:
1464 # pandas supports dtype whose granularity is less than [ns]
1465 # e.g., [ps], [fs], [as]
1466 if dtype <= np.dtype("M8[ns]"):
1467 if dtype.name == "datetime64":
1468 raise ValueError(msg)
1469 dtype = DT64NS_DTYPE
1470 else:
1471 raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]")
1473 elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
1474 # pandas supports dtype whose granularity is less than [ns]
1475 # e.g., [ps], [fs], [as]
1476 if dtype <= np.dtype("m8[ns]"):
1477 if dtype.name == "timedelta64":
1478 raise ValueError(msg)
1479 dtype = TD64NS_DTYPE
1480 else:
1481 raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]")
1482 return dtype
# TODO: other value-dependent functions to standardize here include
#  dtypes.concat.cast_to_common_type and Index._find_common_type_compat
def find_result_type(left: ArrayLike, right: Any) -> DtypeObj:
    """
    Find the dtype that can hold the result of an operation between these objects.

    Similar to find_common_type, but inspects the objects themselves rather
    than only their dtypes, which matters when ``right`` has no dtype.

    Parameters
    ----------
    left : np.ndarray or ExtensionArray
    right : Any

    Returns
    -------
    np.dtype or ExtensionDtype

    See also
    --------
    find_common_type
    numpy.result_type
    """
    new_dtype: DtypeObj

    left_is_numeric_np = isinstance(left, np.ndarray) and left.dtype.kind in [
        "i",
        "u",
        "c",
    ]
    if left_is_numeric_np and (lib.is_integer(right) or lib.is_float(right)):
        # Let numpy's value-based logic pick the smallest sufficient dtype:
        # e.g. int8 array with right=512 should give int16, whereas
        # infer_dtype_from(512) would say int64 and over-upcast.
        if lib.is_float(right) and right.is_integer() and left.dtype.kind != "f":
            right = int(right)

        new_dtype = np.result_type(left, right)

    elif is_valid_na_for_dtype(right, left.dtype):
        # e.g. IntervalDtype[int] and None/np.nan
        new_dtype = ensure_dtype_can_hold_na(left.dtype)

    else:
        dtype, _ = infer_dtype_from(right, pandas_dtype=True)
        new_dtype = find_common_type([left.dtype, dtype])

    return new_dtype
def common_dtype_categorical_compat(
    objs: list[Index | ArrayLike], dtype: DtypeObj
) -> DtypeObj:
    """
    Widen the result of find_common_type when a Categorical in ``objs``
    contains missing values that an integer dtype cannot represent.

    Parameters
    ----------
    objs : list[np.ndarray | ExtensionArray | Index]
    dtype : np.dtype or ExtensionDtype

    Returns
    -------
    np.dtype or ExtensionDtype
    """
    # GH#38240
    # TODO: more generally, could do `not can_hold_na(dtype)`
    if not (isinstance(dtype, np.dtype) and dtype.kind in ["i", "u"]):
        # only NA-incapable integer results need widening
        return dtype

    for obj in objs:
        # Go through the dtype attribute so that e.g. the plain string
        # "categorical" is not accidentally treated as a Categorical.
        obj_dtype = getattr(obj, "dtype", None)
        if not isinstance(obj_dtype, CategoricalDtype):
            continue

        if isinstance(obj, ABCIndex):
            # This check may already be cached
            has_missing = obj.hasnans
        else:
            # Categorical
            has_missing = cast("Categorical", obj)._hasna

        if has_missing:
            # see test_union_int_categorical_with_nan
            return np.dtype(np.float64)

    return dtype
# mypy overloads: an all-np.dtype input yields an np.dtype, while any
# ExtensionDtype in the input widens the static result type to DtypeObj.
@overload
def find_common_type(types: list[np.dtype]) -> np.dtype:
    ...


@overload
def find_common_type(types: list[ExtensionDtype]) -> DtypeObj:
    ...


@overload
def find_common_type(types: list[DtypeObj]) -> DtypeObj:
    ...
def find_common_type(types):
    """
    Find a common data type among the given dtypes.

    Parameters
    ----------
    types : list of dtypes

    Returns
    -------
    pandas extension or numpy dtype

    See Also
    --------
    numpy.find_common_type
    """
    if not types:
        raise ValueError("no types given")

    first = types[0]

    # Fast path when every dtype is identical; this also sidesteps
    # np.find_common_type([M8[ns], M8[ns]]) collapsing to object.
    if lib.dtypes_all_equal(list(types)):
        return first

    # de-duplicate while preserving order (dict.fromkeys acts as an
    # ordered set)
    types = list(dict.fromkeys(types).keys())

    if any(isinstance(t, ExtensionDtype) for t in types):
        # Let each ExtensionDtype propose a common dtype, taking the first
        # non-None answer; fall back to object.
        proposals = (
            t._get_common_dtype(types) for t in types if isinstance(t, ExtensionDtype)
        )
        return next((res for res in proposals if res is not None), np.dtype("object"))

    # homogeneous datetimelike inputs collapse to the ns unit
    if all(is_datetime64_dtype(t) for t in types):
        return np.dtype("datetime64[ns]")
    if all(is_timedelta64_dtype(t) for t in types):
        return np.dtype("timedelta64[ns]")

    # Unlike numpy (which casts bool with float/int as int), refuse to mix
    # bool with numeric dtypes.
    if any(is_bool_dtype(t) for t in types) and any(
        is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t) for t in types
    ):
        return np.dtype("object")

    return np.find_common_type(types, [])
def construct_2d_arraylike_from_scalar(
    value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool
) -> np.ndarray:
    """
    Build a ``(length, width)`` ndarray of ``dtype`` filled with ``value``,
    raising TypeError when the scalar is incompatible with the dtype.
    """
    shape = (length, width)

    if dtype.kind in ["m", "M"]:
        # unbox e.g. Timestamp/Timedelta into dtype's raw representation
        value = _maybe_unbox_datetimelike_tz_deprecation(value, dtype)
    elif dtype == _dtype_obj and isinstance(value, (np.timedelta64, np.datetime64)):
        # calling np.array below would cast to pytimedelta/pydatetime,
        # so fill an object array directly
        out = np.empty(shape, dtype=object)
        out.fill(value)
        return out

    # Coerce to a 0-d array so the scalar/dtype combination is validated.
    try:
        arr = np.array(value, dtype=dtype, copy=copy)
    except (ValueError, TypeError) as err:
        raise TypeError(
            f"DataFrame constructor called with incompatible data and dtype: {err}"
        ) from err

    if arr.ndim != 0:
        raise ValueError("DataFrame constructor not properly called!")

    return np.full(shape, arr)
def construct_1d_arraylike_from_scalar(
    value: Scalar, length: int, dtype: DtypeObj | None
) -> ArrayLike:
    """
    Create a np.ndarray / pandas type of specified length, filled with
    ``value``.

    Parameters
    ----------
    value : scalar value
    length : int
    dtype : pandas_dtype or np.dtype

    Returns
    -------
    np.ndarray / pandas type of length, filled with value
    """
    if dtype is None:
        try:
            dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
        except OutOfBoundsDatetime:
            # e.g. a datetime outside the ns-representable range
            dtype = _dtype_obj

    if isinstance(dtype, ExtensionDtype):
        cls = dtype.construct_array_type()
        seq = [] if length == 0 else [value]
        return cls._from_sequence(seq, dtype=dtype).repeat(length)

    if length and is_integer_dtype(dtype) and isna(value):
        # integer dtypes cannot hold NaN; upcast
        dtype = np.dtype("float64")
    elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
        # coerce to object so numpy treats the string as a single scalar
        # instead of a fixed-width char array
        dtype = np.dtype("object")
        if not isna(value):
            value = ensure_str(value)
    elif dtype.kind in ["M", "m"]:
        value = _maybe_unbox_datetimelike_tz_deprecation(value, dtype)

    subarr = np.empty(length, dtype=dtype)
    if length:
        # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes
        subarr.fill(value)

    return subarr
def _maybe_unbox_datetimelike_tz_deprecation(value: Scalar, dtype: DtypeObj):
    """
    Wrap _maybe_unbox_datetimelike with a check for a timezone-aware Timestamp
    along with a timezone-naive datetime64 dtype, which is deprecated:
    instead of raising, it warns and localizes the value to naive before
    unboxing.
    """
    # Caller is responsible for checking dtype.kind in ["m", "M"]

    if isinstance(value, datetime):
        # we dont want to box dt64, in particular datetime64("NaT")
        value = maybe_box_datetimelike(value, dtype)

    try:
        value = _maybe_unbox_datetimelike(value, dtype)
    except TypeError:
        # Only the tz-aware-Timestamp-into-naive-dt64 case gets the
        # deprecation path; anything else re-raises unchanged.
        if (
            isinstance(value, Timestamp)
            and value.tzinfo is not None
            and isinstance(dtype, np.dtype)
            and dtype.kind == "M"
        ):
            warnings.warn(
                "Data is timezone-aware. Converting "
                "timezone-aware data to timezone-naive by "
                "passing dtype='datetime64[ns]' to "
                "DataFrame or Series is deprecated and will "
                "raise in a future version. Use "
                "`pd.Series(values).dt.tz_localize(None)` "
                "instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            # drop the tz, then retry the unboxing
            new_value = value.tz_localize(None)
            return _maybe_unbox_datetimelike(new_value, dtype)
        else:
            raise
    return value
def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
    """
    Wrap any sized list-like in a 1-dimensional numpy array of object dtype.

    Parameters
    ----------
    values : any iterable which has a len()

    Raises
    ------
    TypeError
        * If `values` does not have a len()

    Returns
    -------
    1-dimensional numpy array of dtype object
    """
    # Allocate first and assign via slice: np.array(values, dtype=object)
    # would recurse into nested list-likes and add extra dimensions.
    out = np.empty(len(values), dtype="object")
    out[:] = values
    return out
def maybe_cast_to_integer_array(
    arr: list | np.ndarray, dtype: np.dtype, copy: bool = False
) -> np.ndarray:
    """
    Cast data to the given (signed or unsigned) integer dtype, raising when
    the values cannot be represented losslessly.

    Parameters
    ----------
    arr : np.ndarray or list
        The array to cast.
    dtype : np.dtype
        The integer dtype to cast the array to.
    copy: bool, default False
        Whether to make a copy of the array before returning.

    Returns
    -------
    ndarray
        Array of integer or unsigned integer dtype.

    Raises
    ------
    OverflowError : the dtype is incompatible with the data
    ValueError : loss of precision has occurred during casting

    Examples
    --------
    If you try to coerce negative values to unsigned integers, it raises:

    >>> pd.Series([-1], dtype="uint64")
    Traceback (most recent call last):
    ...
    OverflowError: Trying to coerce negative values to unsigned integers

    Also, if you try to coerce float values to integers, it raises:

    >>> maybe_cast_to_integer_array([1, 2, 3.5], dtype=np.dtype("int64"))
    Traceback (most recent call last):
    ...
    ValueError: Trying to coerce float values to integers
    """
    assert is_integer_dtype(dtype)

    try:
        if isinstance(arr, np.ndarray):
            casted = arr.astype(dtype, copy=copy)
        else:
            casted = np.array(arr, dtype=dtype, copy=copy)
    except OverflowError as err:
        raise OverflowError(
            "The elements provided in the data cannot all be "
            f"casted to the dtype {dtype}"
        ) from err

    if np.array_equal(arr, casted):
        # the round-trip was lossless
        return casted

    # Cast to ndarray so the value/dtype checks below can run; we did not
    # do this up front because numpy doesn't handle `uint64` correctly.
    arr = np.asarray(arr)

    if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
        raise OverflowError("Trying to coerce negative values to unsigned integers")

    if is_float_dtype(arr.dtype):
        if not np.isfinite(arr).all():
            raise IntCastingNaNError(
                "Cannot convert non-finite values (NA or inf) to integer"
            )
        raise ValueError("Trying to coerce float values to integers")
    if is_object_dtype(arr.dtype):
        raise ValueError("Trying to coerce float values to integers")

    if casted.dtype < arr.dtype:
        # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows
        warnings.warn(
            f"Values are too large to be losslessly cast to {dtype}. "
            "In a future version this will raise OverflowError. To retain the "
            f"old behavior, use pd.Series(values).astype({dtype})",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return casted

    if arr.dtype.kind in ["m", "M"]:
        # test_constructor_maskedarray_nonfloat
        warnings.warn(
            f"Constructing Series or DataFrame from {arr.dtype} values and "
            f"dtype={dtype} is deprecated and will raise in a future version. "
            "Use values.view(dtype) instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return casted

    # No known cases that get here, but raising explicitly to cover our bases.
    raise ValueError(f"values cannot be losslessly cast to {dtype}")
def can_hold_element(arr: ArrayLike, element: Any) -> bool:
    """
    Can we do an inplace setitem with this element in an array with this dtype?

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray
    element : Any

    Returns
    -------
    bool
    """
    dtype = arr.dtype

    if isinstance(dtype, np.dtype) and dtype.kind not in ["m", "M"]:
        # Plain numpy array: delegate to the dtype-level check.
        try:
            np_can_hold_element(dtype, element)
            return True
        except (TypeError, LossySetitemError):
            return False

    if isinstance(dtype, (PeriodDtype, IntervalDtype, DatetimeTZDtype, np.dtype)):
        # np.dtype here catches datetime64ns and timedelta64ns; we assume
        # in this case that we have DatetimeArray/TimedeltaArray
        arr = cast(
            "PeriodArray | DatetimeArray | TimedeltaArray | IntervalArray", arr
        )
        try:
            arr._validate_setitem_value(element)
            return True
        except (ValueError, TypeError):
            # TODO(2.0): stop catching ValueError for tzaware, see
            # _catch_deprecated_value_error
            return False

    # This is technically incorrect, but maintains the behavior of
    # ExtensionBlock._can_hold_element
    return True
def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
    """
    Raise if we cannot losslessly set this element into an ndarray with this dtype.

    Specifically about places where we disagree with numpy. i.e. there are
    cases where numpy will raise in doing the setitem that we do not check
    for here, e.g. setting str "X" into a numeric ndarray.

    Returns
    -------
    Any
        The element, potentially cast to the dtype.

    Raises
    ------
    LossySetitemError : If we cannot losslessly store this element with this dtype.
    """
    if dtype == _dtype_obj:
        # object arrays can hold anything as-is
        return element

    # dtype of the element if it is a numpy scalar/array-like, else None
    tipo = _maybe_infer_dtype_type(element)

    if dtype.kind in ["i", "u"]:
        if isinstance(element, range):
            # a range only needs its two endpoints checked
            if _dtype_can_hold_range(element, dtype):
                return element
            raise LossySetitemError

        elif is_integer(element) or (is_float(element) and element.is_integer()):
            # e.g. test_setitem_series_int8 if we have a python int 1
            # tipo may be np.int32, despite the fact that it will fit
            # in smaller int dtypes.
            info = np.iinfo(dtype)
            if info.min <= element <= info.max:
                return dtype.type(element)
            raise LossySetitemError

        if tipo is not None:
            if tipo.kind not in ["i", "u"]:
                if isinstance(element, np.ndarray) and element.dtype.kind == "f":
                    # If all can be losslessly cast to integers, then we can hold them
                    with np.errstate(invalid="ignore"):
                        # We check afterwards if cast was losslessly, so no need to show
                        # the warning
                        casted = element.astype(dtype)
                    comp = casted == element
                    if comp.all():
                        # Return the casted values bc they can be passed to
                        # np.putmask, whereas the raw values cannot.
                        # see TestSetitemFloatNDarrayIntoIntegerSeries
                        return casted
                    raise LossySetitemError

                # Anything other than integer we cannot hold
                raise LossySetitemError
            elif (
                dtype.kind == "u"
                and isinstance(element, np.ndarray)
                and element.dtype.kind == "i"
            ):
                # see test_where_uint64
                casted = element.astype(dtype)
                if (casted == element).all():
                    # TODO: faster to check (element >=0).all()? potential
                    # itemsize issues there?
                    return casted
                raise LossySetitemError
            elif dtype.itemsize < tipo.itemsize:
                # a wider integer may not fit
                raise LossySetitemError
            elif not isinstance(tipo, np.dtype):
                # i.e. nullable IntegerDtype; we can put this into an ndarray
                # losslessly iff it has no NAs
                if element._hasna:
                    raise LossySetitemError
                return element

            return element

        raise LossySetitemError

    elif dtype.kind == "f":
        if lib.is_integer(element) or lib.is_float(element):
            casted = dtype.type(element)
            if np.isnan(casted) or casted == element:
                return casted
            # otherwise e.g. overflow see TestCoercionFloat32
            raise LossySetitemError

        if tipo is not None:
            # TODO: itemsize check?
            if tipo.kind not in ["f", "i", "u"]:
                # Anything other than float/integer we cannot hold
                raise LossySetitemError
            elif not isinstance(tipo, np.dtype):
                # i.e. nullable IntegerDtype or FloatingDtype;
                # we can put this into an ndarray losslessly iff it has no NAs
                if element._hasna:
                    raise LossySetitemError
                return element
            elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind:
                if isinstance(element, np.ndarray):
                    # e.g. TestDataFrameIndexingWhere::test_where_alignment
                    casted = element.astype(dtype)
                    # TODO(np>=1.20): we can just use np.array_equal with equal_nan
                    if array_equivalent(casted, element):
                        return casted
                    raise LossySetitemError

            return element

        raise LossySetitemError

    elif dtype.kind == "c":
        if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element):
            if np.isnan(element):
                # see test_where_complex GH#6345
                return dtype.type(element)

            casted = dtype.type(element)
            if casted == element:
                return casted
            # otherwise e.g. overflow see test_32878_complex_itemsize
            raise LossySetitemError

        if tipo is not None:
            if tipo.kind in ["c", "f", "i", "u"]:
                return element
            raise LossySetitemError
        raise LossySetitemError

    elif dtype.kind == "b":
        if tipo is not None:
            if tipo.kind == "b":
                if not isinstance(tipo, np.dtype):
                    # i.e. we have a BooleanArray
                    if element._hasna:
                        # i.e. there are pd.NA elements
                        raise LossySetitemError
                return element
            raise LossySetitemError
        if lib.is_bool(element):
            return element
        raise LossySetitemError

    elif dtype.kind == "S":
        # TODO: test tests.frame.methods.test_replace tests get here,
        # need more targeted tests. xref phofl has a PR about this
        if tipo is not None:
            if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize:
                return element
            raise LossySetitemError
        if isinstance(element, bytes) and len(element) <= dtype.itemsize:
            return element
        raise LossySetitemError

    raise NotImplementedError(dtype)
2089def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool:
2090 """
2091 _maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints),
2092 but in many cases a range can be held by a smaller integer dtype.
2093 Check if this is one of those cases.
2094 """
2095 if not len(rng):
2096 return True
2097 return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype)
class LossySetitemError(Exception):
    """
    Raised when a __setitem__ into an np.ndarray would not be lossless.
    """