# pandas/core/internals/construction.py
# NOTE: reconstructed from a coverage.py HTML report; coverage chrome and
# fused report line numbers have been removed.
"""
Functions for preparing various inputs passed to the DataFrame or Series
constructors before passing them to a BlockManager.
"""
5from __future__ import annotations
7from collections import abc
8from typing import (
9 TYPE_CHECKING,
10 Any,
11 Hashable,
12 Sequence,
13 cast,
14)
15import warnings
17import numpy as np
18import numpy.ma as ma
20from pandas._libs import lib
21from pandas._typing import (
22 ArrayLike,
23 DtypeObj,
24 Manager,
25)
26from pandas.util._exceptions import find_stack_level
28from pandas.core.dtypes.cast import (
29 construct_1d_arraylike_from_scalar,
30 dict_compat,
31 maybe_cast_to_datetime,
32 maybe_convert_platform,
33 maybe_infer_to_datetimelike,
34 maybe_upcast,
35)
36from pandas.core.dtypes.common import (
37 is_1d_only_ea_dtype,
38 is_datetime_or_timedelta_dtype,
39 is_dtype_equal,
40 is_extension_array_dtype,
41 is_integer_dtype,
42 is_list_like,
43 is_named_tuple,
44 is_object_dtype,
45)
46from pandas.core.dtypes.generic import (
47 ABCDataFrame,
48 ABCSeries,
49)
51from pandas.core import (
52 algorithms,
53 common as com,
54)
55from pandas.core.arrays import (
56 Categorical,
57 DatetimeArray,
58 ExtensionArray,
59 TimedeltaArray,
60)
61from pandas.core.construction import (
62 ensure_wrapped_if_datetimelike,
63 extract_array,
64 range_to_ndarray,
65 sanitize_array,
66)
67from pandas.core.indexes.api import (
68 DatetimeIndex,
69 Index,
70 TimedeltaIndex,
71 default_index,
72 ensure_index,
73 get_objs_combined_axis,
74 union_indexes,
75)
76from pandas.core.internals.array_manager import (
77 ArrayManager,
78 SingleArrayManager,
79)
80from pandas.core.internals.blocks import (
81 BlockPlacement,
82 ensure_block_shape,
83 new_block_2d,
84)
85from pandas.core.internals.managers import (
86 BlockManager,
87 SingleBlockManager,
88 create_block_manager_from_blocks,
89 create_block_manager_from_column_arrays,
90)
if TYPE_CHECKING:
    # Imported only for type annotations; avoids a runtime dependency on
    # numpy.ma.mrecords.
    from numpy.ma.mrecords import MaskedRecords
96# ---------------------------------------------------------------------
97# BlockManager Interface
def arrays_to_mgr(
    arrays,
    columns: Index,
    index,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.

    Parameters
    ----------
    arrays : sequence
        One entry per column; homogenized/validated below.
    columns : Index
    index : Index or None
        If None (with verify_integrity=True), inferred via _extract_index.
    dtype : DtypeObj, optional
        Passed through to _homogenize for per-column coercion.
    verify_integrity : bool, default True
        If False, the caller (DataFrame._from_arrays) promises the arrays
        are already 1D ndarray/ExtensionArray; only cheap checks run here.
    typ : {"block", "array"}
        Which Manager implementation to build.
    consolidate : bool, default True
        Forwarded to create_block_manager_from_column_arrays.

    Returns
    -------
    Manager
    """
    if verify_integrity:
        # figure out the index, if necessary
        if index is None:
            index = _extract_index(arrays)
        else:
            index = ensure_index(index)

        # don't force copy because getting jammed in an ndarray anyway
        arrays = _homogenize(arrays, index, dtype)
        # _homogenize ensures
        #  - all(len(x) == len(index) for x in arrays)
        #  - all(x.ndim == 1 for x in arrays)
        #  - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
        #  - all(type(x) is not PandasArray for x in arrays)

    else:
        index = ensure_index(index)
        arrays = [extract_array(x, extract_numpy=True) for x in arrays]

        # Reached via DataFrame._from_arrays; we do validation here
        for arr in arrays:
            if (
                not isinstance(arr, (np.ndarray, ExtensionArray))
                or arr.ndim != 1
                or len(arr) != len(index)
            ):
                raise ValueError(
                    "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
                    "with length matching len(index)"
                )

    columns = ensure_index(columns)
    if len(columns) != len(arrays):
        raise ValueError("len(arrays) must match len(columns)")

    # from BlockManager perspective
    axes = [columns, index]

    if typ == "block":
        return create_block_manager_from_column_arrays(
            arrays, axes, consolidate=consolidate
        )
    elif typ == "array":
        return ArrayManager(arrays, [index, columns])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
def rec_array_to_mgr(
    data: MaskedRecords | np.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Extract from a masked rec array and create the manager.

    Parameters
    ----------
    data : MaskedRecords, np.recarray or structured ndarray
    index : Index or None
        Defaults to a default RangeIndex over ``len(data)``.
    columns : Index or None
        If None, the field names inferred by to_arrays are used.
    dtype : DtypeObj or None
    copy : bool
        If True, the resulting manager is copied before returning.
    typ : {"block", "array"}

    Returns
    -------
    Manager
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = default_index(len(fdata))
    else:
        index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    if isinstance(data, np.ma.MaskedArray):
        # GH#42200 we only get here with MaskedRecords, but check for the
        #  parent class MaskedArray to avoid the need to import MaskedRecords
        data = cast("MaskedRecords", data)
        new_arrays = fill_masked_arrays(data, arr_columns)
    else:
        # error: Incompatible types in assignment (expression has type
        # "List[ExtensionArray]", variable has type "List[ndarray]")
        new_arrays = arrays  # type: ignore[assignment]

    # create the manager

    # error: Argument 1 to "reorder_arrays" has incompatible type "List[ndarray]";
    # expected "List[Union[ExtensionArray, ndarray]]"
    arrays, arr_columns = reorder_arrays(
        new_arrays, arr_columns, columns, len(index)  # type: ignore[arg-type]
    )
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    if copy:
        mgr = mgr.copy()
    return mgr
def fill_masked_arrays(data: MaskedRecords, arr_columns: Index) -> list[np.ndarray]:
    """
    Extract each column of a MaskedRecords, replacing masked entries with
    the column's (possibly upcast) fill value.
    """

    def _fill_column(column):
        values = data[column]
        mask = ma.getmaskarray(values)
        if not mask.any():
            # nothing masked; return the column untouched
            return values
        # upcast so the fill value fits the dtype, then fill in place
        values, fill_value = maybe_upcast(
            values, fill_value=values.fill_value, copy=True
        )
        values[mask] = fill_value
        return values

    return [_fill_column(col) for col in arr_columns]
def mgr_to_mgr(mgr, typ: str, copy: bool = True):
    """
    Convert to specific type of Manager. Does not copy if the type is already
    correct. Does not guarantee a copy otherwise. `copy` keyword only controls
    whether conversion from Block->ArrayManager copies the 1D arrays.

    Parameters
    ----------
    mgr : Manager
        A (Single)BlockManager or (Single)ArrayManager instance.
    typ : {"block", "array"}
        Target manager flavor.
    copy : bool, default True
        Only honored for the Block->Array direction (see above).
    """
    new_mgr: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                new_mgr = arrays_to_mgr(
                    mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
                )
            else:
                new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                # one 1D array per column, extracted from the BlockManager
                arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
                if copy:
                    arrays = [arr.copy() for arr in arrays]
                # ArrayManager axes are [index, columns] (reversed vs Block)
                new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
            else:
                array = mgr.internal_values()
                if copy:
                    array = array.copy()
                new_mgr = SingleArrayManager([array], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return new_mgr
268# ---------------------------------------------------------------------
269# DataFrame Constructor Interface
def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    """
    Build a BlockManager or ArrayManager from ndarray-like input.

    Used in DataFrame.__init__; ``values`` must be a ndarray, list, Series,
    Index, or ExtensionArray.  ``typ`` selects {"block", "array"}.
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype; these can be reshaped to 2D below
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # GH#40110 see similar check inside sanitize_array
        # float -> int casts are allowed to fail silently (rcf False)
        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            raise_cast_failure=rcf,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":

        if issubclass(values.dtype.type, str):
            # fixed-width numpy strings are stored as object arrays
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # BlockManager stores data transposed: rows of ``values`` are columns
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):

        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp)
            block_values = [nb]
    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp)
        block_values = [nb]

    if len(columns) == 0:
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
405def _check_values_indices_shape_match(
406 values: np.ndarray, index: Index, columns: Index
407) -> None:
408 """
409 Check that the shape implied by our axes matches the actual shape of the
410 data.
411 """
412 if values.shape[1] != len(columns) or values.shape[0] != len(index):
413 # Could let this raise in Block constructor, but we get a more
414 # helpful exception message this way.
415 if values.shape[0] == 0:
416 raise ValueError("Empty data passed with indices specified.")
418 passed = values.shape
419 implied = (len(index), len(columns))
420 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__

    Parameters
    ----------
    data : dict
        Mapping of column label -> column values (scalars, list-likes,
        Series, ...).
    index, columns : Index or None
        Explicit axes; inferred from ``data`` when None.
    dtype : DtypeObj, optional
    typ : {"block", "array"}, default "block"
    copy : bool, default True
        Whether column arrays are copied before building the manager.
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        # Align the dict's values to the requested columns; labels absent
        # from ``data`` show up here as NaN placeholders.
        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                #  NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    rhs = [val] * nmissing
                else:
                    # GH#45369: with copy=False the arrays are not copied
                    # below, so give each missing column its own array to
                    # avoid aliasing between columns.
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        keys = list(data.keys())
        columns = Index(keys)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        # for Index instances, use the underlying data directly
        arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            #  only EA arrays
            arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a sequence of row-like objects into per-column arrays.

    Callers have already verified ``treat_as_nested(data)``, so ``data``
    is non-empty and its first element is 1-dimensional list-like.
    """
    first = data[0]

    # Named tuples carry their own field names; use them as the columns.
    if columns is None and is_named_tuple(first):
        columns = ensure_index(first._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(first, ABCSeries):
            index = _get_names_from_index(data)
        elif isinstance(first, Categorical):
            # GH#38845 hit in test_constructor_categorical
            index = default_index(len(first))
        else:
            index = default_index(len(data))

    return arrays, columns, index
def treat_as_nested(data) -> bool:
    """
    Decide whether ``data`` should be unpacked row-by-row via
    nested_data_to_arrays: a non-empty sequence whose first element is a
    1-dimensional list-like, excluding 2D ExtensionArrays.
    """
    if len(data) == 0:
        return False
    first = data[0]
    if not is_list_like(first) or getattr(first, "ndim", 1) != 1:
        return False
    # A 2D ExtensionArray is treated as a single array, not nested rows.
    return not (isinstance(data, ExtensionArray) and data.ndim == 2)
537# ---------------------------------------------------------------------
def _prep_ndarraylike(
    values, copy: bool = True
) -> np.ndarray | DatetimeArray | TimedeltaArray:
    """
    Coerce list-like input for ndarray_to_mgr to a 2D array.

    Tz-naive DatetimeArray and TimedeltaArray inputs are kept as-is
    (retaining non-nano resolution); everything else becomes an ndarray.
    The result always has ``ndim == 2``: a trailing column axis is added
    to 1D input, and ndim > 2 raises ValueError.
    """
    if isinstance(values, TimedeltaArray) or (
        isinstance(values, DatetimeArray) and values.tz is None
    ):
        # By retaining DTA/TDA instead of unpacking, we end up retaining non-nano
        pass

    elif not isinstance(values, (np.ndarray, ABCSeries, Index)):
        if len(values) == 0:
            return np.empty((0, 0), dtype=object)
        elif isinstance(values, range):
            arr = range_to_ndarray(values)
            return arr[..., np.newaxis]

        def convert(v):
            # Coerce one element/row: scalars and DataFrames pass through,
            # list-likes are extracted and platform-converted.
            if not is_list_like(v) or isinstance(v, ABCDataFrame):
                return v

            v = extract_array(v, extract_numpy=True)
            res = maybe_convert_platform(v)
            return res

        # we could have a 1-dim or 2-dim list here
        # this is equiv of np.asarray, but does object conversion
        # and platform dtype preservation
        if is_list_like(values[0]):
            values = np.array([convert(v) for v in values])
        elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
            # GH#21861 see test_constructor_list_of_lists
            values = np.array([convert(v) for v in values])
        else:
            values = convert(values)

    else:

        # drop subclass info
        values = np.array(values, copy=copy)

    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError(f"Must pass 2-d input. shape={values.shape}")

    return values
def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
    """
    Align every column in ``data`` to ``index`` and coerce each to a 1D
    np.ndarray or ExtensionArray of length ``len(index)``.

    Series are reindexed to ``index``; dicts are multi-got along the
    (object-cast) index; everything else goes through sanitize_array plus
    a length check.
    """
    # lazily-built object-dtype version of ``index``, used for dict lookups
    oindex = None
    homogenized = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)

            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                #  val = create_series_with_explicit_dtype(val, index=index)._values
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(
                val, index, dtype=dtype, copy=False, raise_cast_failure=False
            )
            com.require_length_match(val, index)

        homogenized.append(val)

    return homogenized
def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data, raise ValueError on failure.

    Rules applied, in order:
    - empty data   -> empty Index
    - any Series   -> union of their indexes
    - any dicts    -> (unsorted) union of their keys
    - raw arrays   -> must all share one length; yields a default index
    All-scalar data, mixed dicts with raw arrays, mismatched raw-array
    lengths, or >1-dimensional per-column arrays raise ValueError.
    """
    index = None
    if len(data) == 0:
        index = Index([])
    else:
        raw_lengths = []
        indexes: list[list[Hashable] | Index] = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))
            elif isinstance(val, np.ndarray) and val.ndim > 1:
                raise ValueError("Per-column arrays must each be 1-dimensional")

        if not indexes and not raw_lengths:
            raise ValueError("If using all scalar values, you must pass an index")

        elif have_series:
            index = union_indexes(indexes)
        elif have_dicts:
            index = union_indexes(indexes, sort=False)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError("All arrays must be of the same length")

            if have_dicts:
                raise ValueError(
                    "Mixing dicts with non-Series may lead to ambiguous ordering."
                )

            if have_series:
                assert index is not None  # for mypy
                if lengths[0] != len(index):
                    msg = (
                        f"array length {lengths[0]} does not match index "
                        f"length {len(index)}"
                    )
                    raise ValueError(msg)
            else:
                index = default_index(lengths[0])

    # error: Argument 1 to "ensure_index" has incompatible type "Optional[Index]";
    # expected "Union[Union[Union[ExtensionArray, ndarray], Index, Series],
    # Sequence[Any]]"
    return ensure_index(index)  # type: ignore[arg-type]
def reorder_arrays(
    arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
) -> tuple[list[ArrayLike], Index]:
    """
    Cheaply reindex ``arrays`` into the order given by ``columns``.

    Columns requested in ``columns`` but absent from ``arr_columns`` are
    filled with all-NaN object-dtype arrays of the given ``length``.
    """
    if columns is None or columns.equals(arr_columns):
        # nothing to reorder
        return arrays, arr_columns

    def _take(pos: int) -> ArrayLike:
        if pos == -1:
            # by convention a missing column is all-NaN object dtype
            filler = np.empty(length, dtype=object)
            filler.fill(np.nan)
            return filler
        return arrays[pos]

    indexer = arr_columns.get_indexer(columns)
    reordered: list[ArrayLike] = [_take(k) for k in indexer]
    return reordered, columns
721def _get_names_from_index(data) -> Index:
722 has_some_name = any(getattr(s, "name", None) is not None for s in data)
723 if not has_some_name:
724 return default_index(len(data))
726 index: list[Hashable] = list(range(len(data)))
727 count = 0
728 for i, s in enumerate(data):
729 n = getattr(s, "name", None)
730 if n is not None:
731 index[i] = n
732 else:
733 index[i] = f"Unnamed {count}"
734 count += 1
736 return Index(index)
739def _get_axes(
740 N: int, K: int, index: Index | None, columns: Index | None
741) -> tuple[Index, Index]:
742 # helper to create the axes as indexes
743 # return axes or defaults
745 if index is None:
746 index = default_index(N)
747 else:
748 index = ensure_index(index)
750 if columns is None:
751 columns = default_index(K)
752 else:
753 columns = ensure_index(columns)
754 return index, columns
def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    --------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    from dataclasses import asdict

    return [asdict(instance) for instance in data]
786# ---------------------------------------------------------------------
787# Conversion of Inputs to Arrays
def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """
    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data[0], Categorical):
        # GH#38845 deprecate special case
        warnings.warn(
            "The behavior of DataFrame([categorical, ...]) is deprecated and "
            "in a future version will be changed to match the behavior of "
            "DataFrame([any_listlike, ...]). "
            "To retain the old behavior, pass as a dictionary "
            "DataFrame({col: categorical, ..})",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        if columns is None:
            columns = default_index(len(data))
        elif len(columns) > len(data):
            raise ValueError("len(columns) > len(data)")
        elif len(columns) < len(data):
            # doing this here is akin to a pre-emptive reindex
            data = data[: len(columns)]
        return data, columns

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    # dispatch on the type of the first row
    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
879def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
880 # Returned np.ndarray has ndim = 2
881 # Note: we already check len(data) > 0 before getting hre
882 if isinstance(data[0], tuple):
883 content = lib.to_object_array_tuples(data)
884 else:
885 # list of lists
886 content = lib.to_object_array(data)
887 return content
def _list_of_series_to_arrays(
    data: list,
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    """
    Stack a list of Series (or other list-likes) into one 2D array whose
    rows are aligned on shared ``columns``.

    Returns
    -------
    content : np.ndarray (ndim == 2)
    columns : Index
    """
    # returned np.ndarray has ndim == 2

    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    # cache indexers keyed on id(index): rows frequently share one index
    indexer_cache: dict[int, np.ndarray] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_nd(values, indexer))

    content = np.vstack(aligned_values)
    return content, columns
921def _list_of_dict_to_arrays(
922 data: list[dict],
923 columns: Index | None,
924) -> tuple[np.ndarray, Index]:
925 """
926 Convert list of dicts to numpy arrays
928 if `columns` is not passed, column names are inferred from the records
929 - for OrderedDict and dicts, the column names match
930 the key insertion-order from the first record to the last.
931 - For other kinds of dict-likes, the keys are lexically sorted.
933 Parameters
934 ----------
935 data : iterable
936 collection of records (OrderedDict, dict)
937 columns: iterables or None
939 Returns
940 -------
941 content : np.ndarray[object, ndim=2]
942 columns : Index
943 """
944 if columns is None:
945 gen = (list(x.keys()) for x in data)
946 sort = not any(isinstance(d, dict) for d in data)
947 pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
948 columns = ensure_index(pre_cols)
950 # assure that they are of the base dict class and not of derived
951 # classes
952 data = [d if type(d) is dict else dict(d) for d in data]
954 content = lib.dicts_to_array(data, list(columns))
955 return content, columns
def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Split ``content`` into per-column arrays, validate/assign column
    labels, and soft-cast object-dtype columns.
    """
    per_column = list(content.T)

    try:
        columns = _validate_or_indexify_columns(per_column, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if per_column and per_column[0].dtype == np.object_:
        per_column = _convert_object_array(per_column, dtype=dtype)

    return per_column, columns
980def _validate_or_indexify_columns(
981 content: list[np.ndarray], columns: Index | None
982) -> Index:
983 """
984 If columns is None, make numbers as column names; Otherwise, validate that
985 columns have valid length.
987 Parameters
988 ----------
989 content : list of np.ndarrays
990 columns : Index or None
992 Returns
993 -------
994 Index
995 If columns is None, assign positional column index value as columns.
997 Raises
998 ------
999 1. AssertionError when content is not composed of list of lists, and if
1000 length of columns is not equal to length of content.
1001 2. ValueError when content is list of lists, but length of each sub-list
1002 is not equal
1003 3. ValueError when content is list of lists, but length of sub-list is
1004 not equal to length of content
1005 """
1006 if columns is None:
1007 columns = default_index(len(content))
1008 else:
1010 # Add mask for data which is composed of list of lists
1011 is_mi_list = isinstance(columns, list) and all(
1012 isinstance(col, list) for col in columns
1013 )
1015 if not is_mi_list and len(columns) != len(content): # pragma: no cover
1016 # caller's responsibility to check for this...
1017 raise AssertionError(
1018 f"{len(columns)} columns passed, passed data had "
1019 f"{len(content)} columns"
1020 )
1021 elif is_mi_list:
1023 # check if nested list column, length of each sub-list should be equal
1024 if len({len(col) for col in columns}) > 1:
1025 raise ValueError(
1026 "Length of columns passed for MultiIndex columns is different"
1027 )
1029 # if columns is not empty and length of sublist is not equal to content
1030 elif columns and len(columns[0]) != len(content):
1031 raise ValueError(
1032 f"{len(columns[0])} columns passed, passed data had "
1033 f"{len(content)} columns"
1034 )
1035 return columns
1038def _convert_object_array(
1039 content: list[np.ndarray], dtype: DtypeObj | None
1040) -> list[ArrayLike]:
1041 """
1042 Internal function to convert object array.
1044 Parameters
1045 ----------
1046 content: List[np.ndarray]
1047 dtype: np.dtype or ExtensionDtype
1049 Returns
1050 -------
1051 List[ArrayLike]
1052 """
1053 # provide soft conversion of object dtypes
1054 def convert(arr):
1055 if dtype != np.dtype("O"):
1056 arr = lib.maybe_convert_objects(arr)
1057 arr = maybe_cast_to_datetime(arr, dtype)
1058 return arr
1060 arrays = [convert(arr) for arr in content]
1062 return arrays