# pandas/core/internals/construction.py
# NOTE: reconstructed from a coverage.py HTML report; coverage chrome and
# fused report line numbers have been removed.
"""
Functions for preparing various inputs passed to the DataFrame or Series
constructors before passing them to a BlockManager.
"""
5from __future__ import annotations
7from collections import abc
8from typing import (
9 TYPE_CHECKING,
10 Any,
11 Hashable,
12 Sequence,
13 cast,
14)
15import warnings
17import numpy as np
18import numpy.ma as ma
20from pandas._libs import lib
21from pandas._typing import (
22 ArrayLike,
23 DtypeObj,
24 Manager,
25)
26from pandas.util._exceptions import find_stack_level
28from pandas.core.dtypes.cast import (
29 construct_1d_arraylike_from_scalar,
30 dict_compat,
31 maybe_cast_to_datetime,
32 maybe_convert_platform,
33 maybe_infer_to_datetimelike,
34 maybe_upcast,
35)
36from pandas.core.dtypes.common import (
37 is_1d_only_ea_dtype,
38 is_datetime_or_timedelta_dtype,
39 is_dtype_equal,
40 is_extension_array_dtype,
41 is_integer_dtype,
42 is_list_like,
43 is_named_tuple,
44 is_object_dtype,
45)
46from pandas.core.dtypes.generic import (
47 ABCDataFrame,
48 ABCSeries,
49)
51from pandas.core import (
52 algorithms,
53 common as com,
54)
55from pandas.core.arrays import (
56 Categorical,
57 DatetimeArray,
58 ExtensionArray,
59 TimedeltaArray,
60)
61from pandas.core.construction import (
62 ensure_wrapped_if_datetimelike,
63 extract_array,
64 range_to_ndarray,
65 sanitize_array,
66)
67from pandas.core.indexes.api import (
68 DatetimeIndex,
69 Index,
70 TimedeltaIndex,
71 default_index,
72 ensure_index,
73 get_objs_combined_axis,
74 union_indexes,
75)
76from pandas.core.internals.array_manager import (
77 ArrayManager,
78 SingleArrayManager,
79)
80from pandas.core.internals.blocks import (
81 BlockPlacement,
82 ensure_block_shape,
83 new_block_2d,
84)
85from pandas.core.internals.managers import (
86 BlockManager,
87 SingleBlockManager,
88 create_block_manager_from_blocks,
89 create_block_manager_from_column_arrays,
90)
if TYPE_CHECKING:
    # Imported only for type annotations; avoids a runtime dependency on
    # numpy.ma.mrecords.
    from numpy.ma.mrecords import MaskedRecords
96# ---------------------------------------------------------------------
97# BlockManager Interface
def arrays_to_mgr(
    arrays,
    columns: Index,
    index,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.

    Parameters
    ----------
    arrays : sequence
        One entry per column; homogenized/validated below.
    columns : Index
    index : Index or None
        If None (with verify_integrity=True), inferred via _extract_index.
    dtype : DtypeObj, optional
        Passed through to _homogenize for per-column coercion.
    verify_integrity : bool, default True
        If False, the caller (DataFrame._from_arrays) promises the arrays
        are already 1D ndarray/ExtensionArray; only cheap checks run here.
    typ : {"block", "array"}
        Which Manager implementation to build.
    consolidate : bool, default True
        Forwarded to create_block_manager_from_column_arrays.

    Returns
    -------
    Manager
    """
    if verify_integrity:
        # figure out the index, if necessary
        if index is None:
            index = _extract_index(arrays)
        else:
            index = ensure_index(index)

        # don't force copy because getting jammed in an ndarray anyway
        arrays = _homogenize(arrays, index, dtype)
        # _homogenize ensures
        #  - all(len(x) == len(index) for x in arrays)
        #  - all(x.ndim == 1 for x in arrays)
        #  - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
        #  - all(type(x) is not PandasArray for x in arrays)

    else:
        index = ensure_index(index)
        arrays = [extract_array(x, extract_numpy=True) for x in arrays]

        # Reached via DataFrame._from_arrays; we do validation here
        for arr in arrays:
            if (
                not isinstance(arr, (np.ndarray, ExtensionArray))
                or arr.ndim != 1
                or len(arr) != len(index)
            ):
                raise ValueError(
                    "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
                    "with length matching len(index)"
                )

    columns = ensure_index(columns)
    if len(columns) != len(arrays):
        raise ValueError("len(arrays) must match len(columns)")

    # from BlockManager perspective
    axes = [columns, index]

    if typ == "block":
        return create_block_manager_from_column_arrays(
            arrays, axes, consolidate=consolidate
        )
    elif typ == "array":
        return ArrayManager(arrays, [index, columns])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
def rec_array_to_mgr(
    data: MaskedRecords | np.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Extract from a masked rec array and create the manager.

    Parameters
    ----------
    data : MaskedRecords, np.recarray or structured ndarray
    index : Index or None
        Defaults to a default RangeIndex over ``len(data)``.
    columns : Index or None
        If None, the field names inferred by to_arrays are used.
    dtype : DtypeObj or None
    copy : bool
        If True, the resulting manager is copied before returning.
    typ : {"block", "array"}

    Returns
    -------
    Manager
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = default_index(len(fdata))
    else:
        index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    if isinstance(data, np.ma.MaskedArray):
        # GH#42200 we only get here with MaskedRecords, but check for the
        #  parent class MaskedArray to avoid the need to import MaskedRecords
        data = cast("MaskedRecords", data)
        new_arrays = fill_masked_arrays(data, arr_columns)
    else:
        # error: Incompatible types in assignment (expression has type
        # "List[ExtensionArray]", variable has type "List[ndarray]")
        new_arrays = arrays  # type: ignore[assignment]

    # create the manager

    # error: Argument 1 to "reorder_arrays" has incompatible type "List[ndarray]";
    # expected "List[Union[ExtensionArray, ndarray]]"
    arrays, arr_columns = reorder_arrays(
        new_arrays, arr_columns, columns, len(index)  # type: ignore[arg-type]
    )
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    if copy:
        mgr = mgr.copy()
    return mgr
def fill_masked_arrays(data: MaskedRecords, arr_columns: Index) -> list[np.ndarray]:
    """
    Extract each column of a MaskedRecords, replacing masked entries with
    the column's (possibly upcast) fill value.
    """

    def _fill_column(column):
        values = data[column]
        mask = ma.getmaskarray(values)
        if not mask.any():
            # nothing masked; return the column untouched
            return values
        # upcast so the fill value fits the dtype, then fill in place
        values, fill_value = maybe_upcast(
            values, fill_value=values.fill_value, copy=True
        )
        values[mask] = fill_value
        return values

    return [_fill_column(col) for col in arr_columns]
def mgr_to_mgr(mgr, typ: str, copy: bool = True):
    """
    Convert to specific type of Manager. Does not copy if the type is already
    correct. Does not guarantee a copy otherwise. `copy` keyword only controls
    whether conversion from Block->ArrayManager copies the 1D arrays.

    Parameters
    ----------
    mgr : Manager
        A (Single)BlockManager or (Single)ArrayManager instance.
    typ : {"block", "array"}
        Target manager flavor.
    copy : bool, default True
        Only honored for the Block->Array direction (see above).
    """
    new_mgr: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                new_mgr = arrays_to_mgr(
                    mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
                )
            else:
                new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                # one 1D array per column, extracted from the BlockManager
                arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
                if copy:
                    arrays = [arr.copy() for arr in arrays]
                # ArrayManager axes are [index, columns] (reversed vs Block)
                new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
            else:
                array = mgr.internal_values()
                if copy:
                    array = array.copy()
                new_mgr = SingleArrayManager([array], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return new_mgr
268# ---------------------------------------------------------------------
269# DataFrame Constructor Interface
def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    """
    Build a BlockManager or ArrayManager from ndarray-like input.

    Used in DataFrame.__init__; ``values`` must be a ndarray, list, Series,
    Index, or ExtensionArray.  ``typ`` selects {"block", "array"}.
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype; these can be reshaped to 2D below
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # GH#40110 see similar check inside sanitize_array
        # float -> int casts are allowed to fail silently (rcf False)
        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            raise_cast_failure=rcf,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":

        if issubclass(values.dtype.type, str):
            # fixed-width numpy strings are stored as object arrays
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # BlockManager stores data transposed: rows of ``values`` are columns
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):

        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp)
            block_values = [nb]
    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp)
        block_values = [nb]

    if len(columns) == 0:
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
405def _check_values_indices_shape_match(
406 values: np.ndarray, index: Index, columns: Index
407) -> None:
408 """
409 Check that the shape implied by our axes matches the actual shape of the
410 data.
411 """
412 if values.shape[1] != len(columns) or values.shape[0] != len(index):
413 # Could let this raise in Block constructor, but we get a more
414 # helpful exception message this way.
415 if values.shape[0] == 0:
416 raise ValueError("Empty data passed with indices specified.")
418 passed = values.shape
419 implied = (len(index), len(columns))
420 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__

    Parameters
    ----------
    data : dict
        Mapping of column label -> column values (scalars, list-likes,
        Series, ...).
    index, columns : Index or None
        Explicit axes; inferred from ``data`` when None.
    dtype : DtypeObj, optional
    typ : {"block", "array"}, default "block"
    copy : bool, default True
        Whether column arrays are copied before building the manager.
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        # Align the dict's values to the requested columns; labels absent
        # from ``data`` show up here as NaN placeholders.
        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                #  NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    rhs = [val] * nmissing
                else:
                    # GH#45369: with copy=False the arrays are not copied
                    # below, so give each missing column its own array to
                    # avoid aliasing between columns.
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        keys = list(data.keys())
        columns = Index(keys)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        # for Index instances, use the underlying data directly
        arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            #  only EA arrays
            arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a sequence of row-like objects into per-column arrays.

    Callers have already verified ``treat_as_nested(data)``, so ``data``
    is non-empty and its first element is 1-dimensional list-like.
    """
    first = data[0]

    # Named tuples carry their own field names; use them as the columns.
    if columns is None and is_named_tuple(first):
        columns = ensure_index(first._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(first, ABCSeries):
            index = _get_names_from_index(data)
        elif isinstance(first, Categorical):
            # GH#38845 hit in test_constructor_categorical
            index = default_index(len(first))
        else:
            index = default_index(len(data))

    return arrays, columns, index
def treat_as_nested(data) -> bool:
    """
    Decide whether ``data`` should be unpacked row-by-row via
    nested_data_to_arrays: a non-empty sequence whose first element is a
    1-dimensional list-like, excluding 2D ExtensionArrays.
    """
    if len(data) == 0:
        return False
    first = data[0]
    if not is_list_like(first) or getattr(first, "ndim", 1) != 1:
        return False
    # A 2D ExtensionArray is treated as a single array, not nested rows.
    return not (isinstance(data, ExtensionArray) and data.ndim == 2)
537# ---------------------------------------------------------------------
def _prep_ndarraylike(
    values, copy: bool = True
) -> np.ndarray | DatetimeArray | TimedeltaArray:
    """
    Coerce list-like input for ndarray_to_mgr to a 2D array.

    Tz-naive DatetimeArray and TimedeltaArray inputs are kept as-is
    (retaining non-nano resolution); everything else becomes an ndarray.
    The result always has ``ndim == 2``: a trailing column axis is added
    to 1D input, and ndim > 2 raises ValueError.
    """
    if isinstance(values, TimedeltaArray) or (
        isinstance(values, DatetimeArray) and values.tz is None
    ):
        # By retaining DTA/TDA instead of unpacking, we end up retaining non-nano
        pass

    elif not isinstance(values, (np.ndarray, ABCSeries, Index)):
        if len(values) == 0:
            return np.empty((0, 0), dtype=object)
        elif isinstance(values, range):
            arr = range_to_ndarray(values)
            return arr[..., np.newaxis]

        def convert(v):
            # Coerce one element/row: scalars and DataFrames pass through,
            # list-likes are extracted and platform-converted.
            if not is_list_like(v) or isinstance(v, ABCDataFrame):
                return v

            v = extract_array(v, extract_numpy=True)
            res = maybe_convert_platform(v)
            return res

        # we could have a 1-dim or 2-dim list here
        # this is equiv of np.asarray, but does object conversion
        # and platform dtype preservation
        if is_list_like(values[0]):
            values = np.array([convert(v) for v in values])
        elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
            # GH#21861 see test_constructor_list_of_lists
            values = np.array([convert(v) for v in values])
        else:
            values = convert(values)

    else:

        # drop subclass info
        values = np.array(values, copy=copy)

    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError(f"Must pass 2-d input. shape={values.shape}")

    return values
def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
    """
    Align every column in ``data`` to ``index`` and coerce each to a 1D
    np.ndarray or ExtensionArray of length ``len(index)``.

    Series are reindexed to ``index``; dicts are multi-got along the
    (object-cast) index; everything else goes through sanitize_array plus
    a length check.
    """
    # lazily-built object-dtype version of ``index``, used for dict lookups
    oindex = None
    homogenized = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)

            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                #  val = create_series_with_explicit_dtype(val, index=index)._values
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(
                val, index, dtype=dtype, copy=False, raise_cast_failure=False
            )
            com.require_length_match(val, index)

        homogenized.append(val)

    return homogenized
def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data, raise ValueError on failure.

    Rules applied, in order:
    - empty data   -> empty Index
    - any Series   -> union of their indexes
    - any dicts    -> (unsorted) union of their keys
    - raw arrays   -> must all share one length; yields a default index
    All-scalar data, mixed dicts with raw arrays, mismatched raw-array
    lengths, or >1-dimensional per-column arrays raise ValueError.
    """
    index = None
    if len(data) == 0:
        index = Index([])
    else:
        raw_lengths = []
        indexes: list[list[Hashable] | Index] = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))
            elif isinstance(val, np.ndarray) and val.ndim > 1:
                raise ValueError("Per-column arrays must each be 1-dimensional")

        if not indexes and not raw_lengths:
            raise ValueError("If using all scalar values, you must pass an index")

        elif have_series:
            index = union_indexes(indexes)
        elif have_dicts:
            index = union_indexes(indexes, sort=False)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError("All arrays must be of the same length")

            if have_dicts:
                raise ValueError(
                    "Mixing dicts with non-Series may lead to ambiguous ordering."
                )

            if have_series:
                assert index is not None  # for mypy
                if lengths[0] != len(index):
                    msg = (
                        f"array length {lengths[0]} does not match index "
                        f"length {len(index)}"
                    )
                    raise ValueError(msg)
            else:
                index = default_index(lengths[0])

    # error: Argument 1 to "ensure_index" has incompatible type "Optional[Index]";
    # expected "Union[Union[Union[ExtensionArray, ndarray], Index, Series],
    # Sequence[Any]]"
    return ensure_index(index)  # type: ignore[arg-type]
def reorder_arrays(
    arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
) -> tuple[list[ArrayLike], Index]:
    """
    Cheaply reindex ``arrays`` into the order given by ``columns``.

    Columns requested in ``columns`` but absent from ``arr_columns`` are
    filled with all-NaN object-dtype arrays of the given ``length``.
    """
    if columns is None or columns.equals(arr_columns):
        # nothing to reorder
        return arrays, arr_columns

    def _take(pos: int) -> ArrayLike:
        if pos == -1:
            # by convention a missing column is all-NaN object dtype
            filler = np.empty(length, dtype=object)
            filler.fill(np.nan)
            return filler
        return arrays[pos]

    indexer = arr_columns.get_indexer(columns)
    reordered: list[ArrayLike] = [_take(k) for k in indexer]
    return reordered, columns
721def _get_names_from_index(data) -> Index:
722 has_some_name = any(getattr(s, "name", None) is not None for s in data)
723 if not has_some_name:
724 return default_index(len(data))
726 index: list[Hashable] = list(range(len(data)))
727 count = 0
728 for i, s in enumerate(data):
729 n = getattr(s, "name", None)
730 if n is not None:
731 index[i] = n
732 else:
733 index[i] = f"Unnamed {count}"
734 count += 1
736 return Index(index)
739def _get_axes(
740 N: int, K: int, index: Index | None, columns: Index | None
741) -> tuple[Index, Index]:
742 # helper to create the axes as indexes
743 # return axes or defaults
745 if index is None:
746 index = default_index(N)
747 else:
748 index = ensure_index(index)
750 if columns is None:
751 columns = default_index(K)
752 else:
753 columns = ensure_index(columns)
754 return index, columns
def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    --------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    from dataclasses import asdict

    return [asdict(instance) for instance in data]
786# ---------------------------------------------------------------------
787# Conversion of Inputs to Arrays
def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """
    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data[0], Categorical):
        # GH#38845 deprecate special case
        warnings.warn(
            "The behavior of DataFrame([categorical, ...]) is deprecated and "
            "in a future version will be changed to match the behavior of "
            "DataFrame([any_listlike, ...]). "
            "To retain the old behavior, pass as a dictionary "
            "DataFrame({col: categorical, ..})",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        if columns is None:
            columns = default_index(len(data))
        elif len(columns) > len(data):
            raise ValueError("len(columns) > len(data)")
        elif len(columns) < len(data):
            # doing this here is akin to a pre-emptive reindex
            data = data[: len(columns)]
        return data, columns

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    # dispatch on the type of the first row
    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
879def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
880 # Returned np.ndarray has ndim = 2
881 # Note: we already check len(data) > 0 before getting hre
882 if isinstance(data[0], tuple):
883 content = lib.to_object_array_tuples(data)
884 else:
885 # list of lists
886 content = lib.to_object_array(data)
887 return content
def _list_of_series_to_arrays(
    data: list,
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    """
    Stack a list of Series (or other list-likes) into one 2D array whose
    rows are aligned on shared ``columns``.

    Returns
    -------
    content : np.ndarray (ndim == 2)
    columns : Index
    """
    # returned np.ndarray has ndim == 2

    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    # cache indexers keyed on id(index): rows frequently share one index
    indexer_cache: dict[int, np.ndarray] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_nd(values, indexer))

    content = np.vstack(aligned_values)
    return content, columns
921def _list_of_dict_to_arrays(
922 data: list[dict],
923 columns: Index | None,
924) -> tuple[np.ndarray, Index]:
925 """
926 Convert list of dicts to numpy arrays
928 if `columns` is not passed, column names are inferred from the records
929 - for OrderedDict and dicts, the column names match
930 the key insertion-order from the first record to the last.
931 - For other kinds of dict-likes, the keys are lexically sorted.
933 Parameters
934 ----------
935 data : iterable
936 collection of records (OrderedDict, dict)
937 columns: iterables or None
939 Returns
940 -------
941 content : np.ndarray[object, ndim=2]
942 columns : Index
943 """
944 if columns is None:
945 gen = (list(x.keys()) for x in data)
946 sort = not any(isinstance(d, dict) for d in data)
947 pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
948 columns = ensure_index(pre_cols)
950 # assure that they are of the base dict class and not of derived
951 # classes
952 data = [d if type(d) is dict else dict(d) for d in data]
954 content = lib.dicts_to_array(data, list(columns))
955 return content, columns
def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Split ``content`` into per-column arrays, validate/assign column
    labels, and soft-cast object-dtype columns.
    """
    per_column = list(content.T)

    try:
        columns = _validate_or_indexify_columns(per_column, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if per_column and per_column[0].dtype == np.object_:
        per_column = _convert_object_array(per_column, dtype=dtype)

    return per_column, columns
980def _validate_or_indexify_columns(
981 content: list[np.ndarray], columns: Index | None
982) -> Index:
983 """
984 If columns is None, make numbers as column names; Otherwise, validate that
985 columns have valid length.
987 Parameters
988 ----------
989 content : list of np.ndarrays
990 columns : Index or None
992 Returns
993 -------
994 Index
995 If columns is None, assign positional column index value as columns.
997 Raises
998 ------
999 1. AssertionError when content is not composed of list of lists, and if
1000 length of columns is not equal to length of content.
1001 2. ValueError when content is list of lists, but length of each sub-list
1002 is not equal
1003 3. ValueError when content is list of lists, but length of sub-list is
1004 not equal to length of content
1005 """
1006 if columns is None:
1007 columns = default_index(len(content))
1008 else:
1010 # Add mask for data which is composed of list of lists
1011 is_mi_list = isinstance(columns, list) and all(
1012 isinstance(col, list) for col in columns
1013 )
1015 if not is_mi_list and len(columns) != len(content): # pragma: no cover
1016 # caller's responsibility to check for this...
1017 raise AssertionError(
1018 f"{len(columns)} columns passed, passed data had "
1019 f"{len(content)} columns"
1020 )
1021 elif is_mi_list:
1023 # check if nested list column, length of each sub-list should be equal
1024 if len({len(col) for col in columns}) > 1:
1025 raise ValueError(
1026 "Length of columns passed for MultiIndex columns is different"
1027 )
1029 # if columns is not empty and length of sublist is not equal to content
1030 elif columns and len(columns[0]) != len(content):
1031 raise ValueError(
1032 f"{len(columns[0])} columns passed, passed data had "
1033 f"{len(content)} columns"
1034 )
1035 return columns
1038def _convert_object_array(
1039 content: list[np.ndarray], dtype: DtypeObj | None
1040) -> list[ArrayLike]:
1041 """
1042 Internal function to convert object array.
1044 Parameters
1045 ----------
1046 content: List[np.ndarray]
1047 dtype: np.dtype or ExtensionDtype
1049 Returns
1050 -------
1051 List[ArrayLike]
1052 """
1053 # provide soft conversion of object dtypes
1054 def convert(arr):
1055 if dtype != np.dtype("O"):
1056 arr = lib.maybe_convert_objects(arr)
1057 arr = maybe_cast_to_datetime(arr, dtype)
1058 return arr
1060 arrays = [convert(arr) for arr in content]
1062 return arrays