# pandas/core/internals/construction.py

1""" 

2Functions for preparing various inputs passed to the DataFrame or Series 

3constructors before passing them to a BlockManager. 

4""" 

5from __future__ import annotations 

6 

7from collections import abc 

8from typing import ( 

9 TYPE_CHECKING, 

10 Any, 

11 Hashable, 

12 Sequence, 

13 cast, 

14) 

15import warnings 

16 

17import numpy as np 

18import numpy.ma as ma 

19 

20from pandas._libs import lib 

21from pandas._typing import ( 

22 ArrayLike, 

23 DtypeObj, 

24 Manager, 

25) 

26from pandas.util._exceptions import find_stack_level 

27 

28from pandas.core.dtypes.cast import ( 

29 construct_1d_arraylike_from_scalar, 

30 dict_compat, 

31 maybe_cast_to_datetime, 

32 maybe_convert_platform, 

33 maybe_infer_to_datetimelike, 

34 maybe_upcast, 

35) 

36from pandas.core.dtypes.common import ( 

37 is_1d_only_ea_dtype, 

38 is_datetime_or_timedelta_dtype, 

39 is_dtype_equal, 

40 is_extension_array_dtype, 

41 is_integer_dtype, 

42 is_list_like, 

43 is_named_tuple, 

44 is_object_dtype, 

45) 

46from pandas.core.dtypes.generic import ( 

47 ABCDataFrame, 

48 ABCSeries, 

49) 

50 

51from pandas.core import ( 

52 algorithms, 

53 common as com, 

54) 

55from pandas.core.arrays import ( 

56 Categorical, 

57 DatetimeArray, 

58 ExtensionArray, 

59 TimedeltaArray, 

60) 

61from pandas.core.construction import ( 

62 ensure_wrapped_if_datetimelike, 

63 extract_array, 

64 range_to_ndarray, 

65 sanitize_array, 

66) 

67from pandas.core.indexes.api import ( 

68 DatetimeIndex, 

69 Index, 

70 TimedeltaIndex, 

71 default_index, 

72 ensure_index, 

73 get_objs_combined_axis, 

74 union_indexes, 

75) 

76from pandas.core.internals.array_manager import ( 

77 ArrayManager, 

78 SingleArrayManager, 

79) 

80from pandas.core.internals.blocks import ( 

81 BlockPlacement, 

82 ensure_block_shape, 

83 new_block_2d, 

84) 

85from pandas.core.internals.managers import ( 

86 BlockManager, 

87 SingleBlockManager, 

88 create_block_manager_from_blocks, 

89 create_block_manager_from_column_arrays, 

90) 

91 

92if TYPE_CHECKING: 92 ↛ 93line 92 didn't jump to line 93, because the condition on line 92 was never true

93 from numpy.ma.mrecords import MaskedRecords 

94 

95 

96# --------------------------------------------------------------------- 

97# BlockManager Interface 

98 

99 

100def arrays_to_mgr( 

101 arrays, 

102 columns: Index, 

103 index, 

104 *, 

105 dtype: DtypeObj | None = None, 

106 verify_integrity: bool = True, 

107 typ: str | None = None, 

108 consolidate: bool = True, 

109) -> Manager: 

110 """ 

111 Segregate Series based on type and coerce into matrices. 

112 

113 Needs to handle a lot of exceptional cases. 

114 """ 

115 if verify_integrity: 

116 # figure out the index, if necessary 

117 if index is None: 

118 index = _extract_index(arrays) 

119 else: 

120 index = ensure_index(index) 

121 

122 # don't force copy because getting jammed in an ndarray anyway 

123 arrays = _homogenize(arrays, index, dtype) 

124 # _homogenize ensures 

125 # - all(len(x) == len(index) for x in arrays) 

126 # - all(x.ndim == 1 for x in arrays) 

127 # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays) 

128 # - all(type(x) is not PandasArray for x in arrays) 

129 

130 else: 

131 index = ensure_index(index) 

132 arrays = [extract_array(x, extract_numpy=True) for x in arrays] 

133 

134 # Reached via DataFrame._from_arrays; we do validation here 

135 for arr in arrays: 

136 if ( 

137 not isinstance(arr, (np.ndarray, ExtensionArray)) 

138 or arr.ndim != 1 

139 or len(arr) != len(index) 

140 ): 

141 raise ValueError( 

142 "Arrays must be 1-dimensional np.ndarray or ExtensionArray " 

143 "with length matching len(index)" 

144 ) 

145 

146 columns = ensure_index(columns) 

147 if len(columns) != len(arrays): 

148 raise ValueError("len(arrays) must match len(columns)") 

149 

150 # from BlockManager perspective 

151 axes = [columns, index] 

152 

153 if typ == "block": 

154 return create_block_manager_from_column_arrays( 

155 arrays, axes, consolidate=consolidate 

156 ) 

157 elif typ == "array": 

158 return ArrayManager(arrays, [index, columns]) 

159 else: 

160 raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") 

161 

162 
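

# Hedged usage sketch: arrays_to_mgr is internal, so the helper below is
# purely illustrative (the _example_* name is ours, not pandas'). It relies
# only on the signature defined above.
def _example_arrays_to_mgr() -> None:
    arrs = [np.array([1, 2, 3]), np.array([4.0, 5.0, 6.0])]
    # index=None takes the verify_integrity path: _extract_index infers a
    # default RangeIndex of length 3 from the arrays.
    mgr = arrays_to_mgr(arrs, ensure_index(["a", "b"]), None, typ="block")
    # From the manager's perspective the axes are [columns, index].
    assert mgr.shape == (2, 3)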



def rec_array_to_mgr(
    data: MaskedRecords | np.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = default_index(len(fdata))
    else:
        index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    if isinstance(data, np.ma.MaskedArray):
        # GH#42200 we only get here with MaskedRecords, but check for the
        #  parent class MaskedArray to avoid the need to import MaskedRecords
        data = cast("MaskedRecords", data)
        new_arrays = fill_masked_arrays(data, arr_columns)
    else:
        # error: Incompatible types in assignment (expression has type
        # "List[ExtensionArray]", variable has type "List[ndarray]")
        new_arrays = arrays  # type: ignore[assignment]

    # create the manager

    # error: Argument 1 to "reorder_arrays" has incompatible type "List[ndarray]";
    # expected "List[Union[ExtensionArray, ndarray]]"
    arrays, arr_columns = reorder_arrays(
        new_arrays, arr_columns, columns, len(index)  # type: ignore[arg-type]
    )
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    if copy:
        mgr = mgr.copy()
    return mgr
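

# Hedged sketch of the record-array path (illustrative helper, not pandas').
# A plain structured ndarray takes the non-masked branch above, and its field
# names become the columns via to_arrays/reorder_arrays.
def _example_rec_array_to_mgr() -> None:
    rec = np.array([(1, 2.0), (3, 4.0)], dtype=[("a", "i8"), ("b", "f8")])
    mgr = rec_array_to_mgr(rec, None, None, None, copy=False, typ="block")
    assert list(mgr.axes[0]) == ["a", "b"]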



def fill_masked_arrays(data: MaskedRecords, arr_columns: Index) -> list[np.ndarray]:
    """
    Convert numpy MaskedRecords to ensure mask is softened.
    """
    new_arrays = []

    for col in arr_columns:
        arr = data[col]
        fv = arr.fill_value

        mask = ma.getmaskarray(arr)
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)
    return new_arrays
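

# Hedged sketch (illustrative helper, not pandas'): masked slots are replaced
# with the record array's own fill value, upcasting the dtype if the fill
# value requires it; for an int64 column the numpy default fill is 999999.
def _example_fill_masked_arrays() -> None:
    from numpy.ma import mrecords

    marr = ma.masked_array([1, 2, 3], mask=[False, True, False])
    mrec = mrecords.fromarrays([marr], names=["a"])
    (filled,) = fill_masked_arrays(mrec, ensure_index(["a"]))
    assert filled[1] == marr.fill_value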



def mgr_to_mgr(mgr, typ: str, copy: bool = True):
    """
    Convert to specific type of Manager. Does not copy if the type is already
    correct. Does not guarantee a copy otherwise. `copy` keyword only controls
    whether conversion from Block->ArrayManager copies the 1D arrays.
    """
    new_mgr: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                new_mgr = arrays_to_mgr(
                    mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
                )
            else:
                new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
                if copy:
                    arrays = [arr.copy() for arr in arrays]
                new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
            else:
                array = mgr.internal_values()
                if copy:
                    array = array.copy()
                new_mgr = SingleArrayManager([array], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return new_mgr
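

# Hedged sketch (illustrative helper): round-trip a frame's manager between
# the two layouts. DataFrame._mgr is internal, so this is an assumption about
# the current internals rather than a stable API.
def _example_mgr_to_mgr() -> None:
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    amgr = mgr_to_mgr(df._mgr, "array")
    assert isinstance(amgr, ArrayManager)
    bmgr = mgr_to_mgr(amgr, "block")
    assert isinstance(bmgr, BlockManager)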



# ---------------------------------------------------------------------
# DataFrame Constructor Interface


def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # GH#40110 see similar check inside sanitize_array
        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            raise_cast_failure=rcf,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":

        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp)
            block_values = [nb]
    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp)
        block_values = [nb]

    if len(columns) == 0:
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
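

# Hedged sketch (illustrative helper): the plain-ndarray path. A 2D float
# array becomes a single block; values are stored transposed, so the manager
# reports shape (n_columns, n_rows).
def _example_ndarray_to_mgr() -> None:
    arr = np.arange(6.0).reshape(3, 2)
    mgr = ndarray_to_mgr(arr, None, None, None, copy=False, typ="block")
    assert mgr.shape == (2, 3)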



def _check_values_indices_shape_match(
    values: np.ndarray, index: Index, columns: Index
) -> None:
    """
    Check that the shape implied by our axes matches the actual shape of the
    data.
    """
    if values.shape[1] != len(columns) or values.shape[0] != len(index):
        # Could let this raise in Block constructor, but we get a more
        #  helpful exception message this way.
        if values.shape[0] == 0:
            raise ValueError("Empty data passed with indices specified.")

        passed = values.shape
        implied = (len(index), len(columns))
        raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
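

# Hedged sketch (illustrative helper): the mismatch surfaces through the
# public constructor, e.g. 2x2 values passed with a single column label.
def _example_shape_mismatch() -> None:
    import pandas as pd

    try:
        pd.DataFrame(np.ones((2, 2)), columns=["a"])
    except ValueError as err:
        # "Shape of passed values is (2, 2), indices imply (2, 1)"
        assert "indices imply" in str(err)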



def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                #  NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    rhs = [val] * nmissing
                else:
                    # GH#45369
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        keys = list(data.keys())
        columns = Index(keys)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            #  only EA arrays
            arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
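

# Hedged sketch (illustrative helper): with explicit columns, keys missing
# from the dict become all-NaN object columns (the GH#1783 branch above),
# unless an integer dtype was requested.
def _example_dict_to_mgr() -> None:
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]}, columns=["a", "b"])
    assert df["b"].isna().all()
    assert df["b"].dtype == object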



def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a single sequence of arrays to multiple arrays.
    """
    # By the time we get here we have already checked treat_as_nested(data)

    if is_named_tuple(data[0]) and columns is None:
        columns = ensure_index(data[0]._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(data[0], ABCSeries):
            index = _get_names_from_index(data)
        elif isinstance(data[0], Categorical):
            # GH#38845 hit in test_constructor_categorical
            index = default_index(len(data[0]))
        else:
            index = default_index(len(data))

    return arrays, columns, index


def treat_as_nested(data) -> bool:
    """
    Check if we should use nested_data_to_arrays.
    """
    return (
        len(data) > 0
        and is_list_like(data[0])
        and getattr(data[0], "ndim", 1) == 1
        and not (isinstance(data, ExtensionArray) and data.ndim == 2)
    )
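

# Hedged sketch (illustrative helper): a list of equal-length rows counts as
# nested data; to_arrays splits it into per-column arrays before dtype
# inference, and the default index covers the rows.
def _example_nested_data() -> None:
    rows = [[1, "x"], [2, "y"]]
    assert treat_as_nested(rows)
    arrays, columns, index = nested_data_to_arrays(rows, None, None, None)
    assert len(arrays) == 2
    assert list(index) == [0, 1]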



# ---------------------------------------------------------------------


def _prep_ndarraylike(
    values, copy: bool = True
) -> np.ndarray | DatetimeArray | TimedeltaArray:
    if isinstance(values, TimedeltaArray) or (
        isinstance(values, DatetimeArray) and values.tz is None
    ):
        # By retaining DTA/TDA instead of unpacking, we end up retaining non-nano
        pass

    elif not isinstance(values, (np.ndarray, ABCSeries, Index)):
        if len(values) == 0:
            return np.empty((0, 0), dtype=object)
        elif isinstance(values, range):
            arr = range_to_ndarray(values)
            return arr[..., np.newaxis]

        def convert(v):
            if not is_list_like(v) or isinstance(v, ABCDataFrame):
                return v

            v = extract_array(v, extract_numpy=True)
            res = maybe_convert_platform(v)
            return res

        # we could have a 1-dim or 2-dim list here
        # this is equiv of np.asarray, but does object conversion
        # and platform dtype preservation
        if is_list_like(values[0]):
            values = np.array([convert(v) for v in values])
        elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
            # GH#21861 see test_constructor_list_of_lists
            values = np.array([convert(v) for v in values])
        else:
            values = convert(values)

    else:

        # drop subclass info
        values = np.array(values, copy=copy)

    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError(f"Must pass 2-d input. shape={values.shape}")

    return values
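

# Hedged sketch (illustrative helper): 1D inputs come back as a single
# column, and a range is expanded without an intermediate object array.
def _example_prep_ndarraylike() -> None:
    assert _prep_ndarraylike(range(3)).shape == (3, 1)
    assert _prep_ndarraylike(np.array([1, 2, 3])).shape == (3, 1)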



def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
    oindex = None
    homogenized = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)

            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                #  val = create_series_with_explicit_dtype(val, index=index)._values
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(
                val, index, dtype=dtype, copy=False, raise_cast_failure=False
            )
            com.require_length_match(val, index)

        homogenized.append(val)

    return homogenized


def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data, raise ValueError on failure.
    """
    index = None
    if len(data) == 0:
        index = Index([])
    else:
        raw_lengths = []
        indexes: list[list[Hashable] | Index] = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))
            elif isinstance(val, np.ndarray) and val.ndim > 1:
                raise ValueError("Per-column arrays must each be 1-dimensional")

        if not indexes and not raw_lengths:
            raise ValueError("If using all scalar values, you must pass an index")

        elif have_series:
            index = union_indexes(indexes)
        elif have_dicts:
            index = union_indexes(indexes, sort=False)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError("All arrays must be of the same length")

            if have_dicts:
                raise ValueError(
                    "Mixing dicts with non-Series may lead to ambiguous ordering."
                )

            if have_series:
                assert index is not None  # for mypy
                if lengths[0] != len(index):
                    msg = (
                        f"array length {lengths[0]} does not match index "
                        f"length {len(index)}"
                    )
                    raise ValueError(msg)
            else:
                index = default_index(lengths[0])

    # error: Argument 1 to "ensure_index" has incompatible type "Optional[Index]";
    # expected "Union[Union[Union[ExtensionArray, ndarray], Index, Series],
    # Sequence[Any]]"
    return ensure_index(index)  # type: ignore[arg-type]
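

# Hedged sketch (illustrative helper): the scalar-only error as seen from the
# public constructor, and index inference when a Series is present.
def _example_extract_index() -> None:
    import pandas as pd

    try:
        pd.DataFrame({"a": 1})
    except ValueError as err:
        assert "you must pass an index" in str(err)
    # With a Series present, its index wins via union_indexes.
    ser = pd.Series([1, 2], index=["x", "y"])
    assert list(_extract_index([ser])) == ["x", "y"]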



def reorder_arrays(
    arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
) -> tuple[list[ArrayLike], Index]:
    """
    Pre-emptively (cheaply) reindex arrays with new columns.
    """
    # reorder according to the columns
    if columns is not None:
        if not columns.equals(arr_columns):
            # if they are equal, there is nothing to do
            new_arrays: list[ArrayLike | None]
            new_arrays = [None] * len(columns)
            indexer = arr_columns.get_indexer(columns)
            for i, k in enumerate(indexer):
                if k == -1:
                    # by convention default is all-NaN object dtype
                    arr = np.empty(length, dtype=object)
                    arr.fill(np.nan)
                else:
                    arr = arrays[k]
                new_arrays[i] = arr

            # Incompatible types in assignment (expression has type
            # "List[Union[ExtensionArray, ndarray[Any, Any], None]]", variable
            # has type "List[Union[ExtensionArray, ndarray[Any, Any]]]")
            arrays = new_arrays  # type: ignore[assignment]
            arr_columns = columns

    return arrays, arr_columns


def _get_names_from_index(data) -> Index:
    has_some_name = any(getattr(s, "name", None) is not None for s in data)
    if not has_some_name:
        return default_index(len(data))

    index: list[Hashable] = list(range(len(data)))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, "name", None)
        if n is not None:
            index[i] = n
        else:
            index[i] = f"Unnamed {count}"
            count += 1

    return Index(index)
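

# Hedged sketch (illustrative helper): mixing named and unnamed Series when
# building a frame row-wise yields "Unnamed N" placeholders for the gaps.
def _example_get_names_from_index() -> None:
    import pandas as pd

    data = [pd.Series([1], name="a"), pd.Series([2])]
    assert list(_get_names_from_index(data)) == ["a", "Unnamed 0"]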



def _get_axes(
    N: int, K: int, index: Index | None, columns: Index | None
) -> tuple[Index, Index]:
    # helper to create the axes as indexes
    # return axes or defaults

    if index is None:
        index = default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns


def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    -------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    from dataclasses import asdict

    return list(map(asdict, data))


# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays


def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """
    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data[0], Categorical):
        # GH#38845 deprecate special case
        warnings.warn(
            "The behavior of DataFrame([categorical, ...]) is deprecated and "
            "in a future version will be changed to match the behavior of "
            "DataFrame([any_listlike, ...]). "
            "To retain the old behavior, pass as a dictionary "
            "DataFrame({col: categorical, ..})",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        if columns is None:
            columns = default_index(len(data))
        elif len(columns) > len(data):
            raise ValueError("len(columns) > len(data)")
        elif len(columns) < len(data):
            # doing this here is akin to a pre-emptive reindex
            data = data[: len(columns)]
        return data, columns

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns



def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
    # Returned np.ndarray has ndim = 2
    # Note: we already check len(data) > 0 before getting here
    if isinstance(data[0], tuple):
        content = lib.to_object_array_tuples(data)
    else:
        # list of lists
        content = lib.to_object_array(data)
    return content


def _list_of_series_to_arrays(
    data: list,
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    # returned np.ndarray has ndim == 2

    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    indexer_cache: dict[int, np.ndarray] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_nd(values, indexer))

    content = np.vstack(aligned_values)
    return content, columns


def _list_of_dict_to_arrays(
    data: list[dict],
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    """
    Convert a list of dicts to numpy arrays.

    If `columns` is not passed, column names are inferred from the records:
    - for OrderedDict and dicts, the column names match
      the key insertion-order from the first record to the last.
    - for other kinds of dict-likes, the keys are lexically sorted.

    Parameters
    ----------
    data : iterable
        collection of records (OrderedDict, dict)
    columns : iterable or None

    Returns
    -------
    content : np.ndarray[object, ndim=2]
    columns : Index
    """
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, dict) for d in data)
        pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
        columns = ensure_index(pre_cols)

    # assure that they are of the base dict class and not of derived
    # classes
    data = [d if type(d) is dict else dict(d) for d in data]

    content = lib.dicts_to_array(data, list(columns))
    return content, columns
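

# Hedged sketch (illustrative helper): for plain dicts the inferred columns
# follow first-seen key order rather than being sorted.
def _example_list_of_dict_to_arrays() -> None:
    records = [{"b": 1, "a": 2}, {"a": 3, "c": 4}]
    content, columns = _list_of_dict_to_arrays(records, None)
    assert list(columns) == ["b", "a", "c"]
    assert content.shape == (2, 3)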



def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Ensure we have valid columns, cast object dtypes if possible.
    """
    contents = list(content.T)

    try:
        columns = _validate_or_indexify_columns(contents, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if len(contents) and contents[0].dtype == np.object_:
        contents = _convert_object_array(contents, dtype=dtype)

    return contents, columns


def _validate_or_indexify_columns(
    content: list[np.ndarray], columns: Index | None
) -> Index:
    """
    If columns is None, assign positional integer labels as column names;
    otherwise, validate that columns has a valid length.

    Parameters
    ----------
    content : list of np.ndarrays
    columns : Index or None

    Returns
    -------
    Index
        If columns is None, assign positional column index value as columns.

    Raises
    ------
    1. AssertionError when content is not composed of list of lists, and if
       length of columns is not equal to length of content.
    2. ValueError when content is list of lists, but length of each sub-list
       is not equal.
    3. ValueError when content is list of lists, but length of sub-list is
       not equal to length of content.
    """
    if columns is None:
        columns = default_index(len(content))
    else:

        # Add mask for data which is composed of list of lists
        is_mi_list = isinstance(columns, list) and all(
            isinstance(col, list) for col in columns
        )

        if not is_mi_list and len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError(
                f"{len(columns)} columns passed, passed data had "
                f"{len(content)} columns"
            )
        elif is_mi_list:

            # check if nested list column, length of each sub-list should be equal
            if len({len(col) for col in columns}) > 1:
                raise ValueError(
                    "Length of columns passed for MultiIndex columns is different"
                )

            # if columns is not empty and length of sublist is not equal to content
            elif columns and len(columns[0]) != len(content):
                raise ValueError(
                    f"{len(columns[0])} columns passed, passed data had "
                    f"{len(content)} columns"
                )
    return columns



def _convert_object_array(
    content: list[np.ndarray], dtype: DtypeObj | None
) -> list[ArrayLike]:
    """
    Internal function to convert object array.

    Parameters
    ----------
    content : List[np.ndarray]
    dtype : np.dtype or ExtensionDtype

    Returns
    -------
    List[ArrayLike]
    """
    # provide soft conversion of object dtypes
    def convert(arr):
        if dtype != np.dtype("O"):
            arr = lib.maybe_convert_objects(arr)
            arr = maybe_cast_to_datetime(arr, dtype)
        return arr

    arrays = [convert(arr) for arr in content]

    return arrays
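

# Hedged sketch (illustrative helper): soft conversion turns an object column
# of ints into int64 but leaves genuinely mixed data as object.
def _example_convert_object_array() -> None:
    cols = [np.array([1, 2], dtype=object), np.array(["x", 2], dtype=object)]
    converted = _convert_object_array(cols, dtype=None)
    assert converted[0].dtype == np.int64
    assert converted[1].dtype == object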