from __future__ import annotations

import itertools
from typing import (
    Any,
    Callable,
    Hashable,
    Literal,
    Sequence,
    TypeVar,
    cast,
)
import warnings
import weakref

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    algos as libalgos,
    internals as libinternals,
    lib,
)
from pandas._libs.internals import BlockPlacement
from pandas._typing import (
    ArrayLike,
    DtypeObj,
    Shape,
    npt,
    type_t,
)
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_dtype_equal,
    is_list_like,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.missing import (
    array_equals,
    isna,
)

import pandas.core.algorithms as algos
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.sparse import SparseDtype
import pandas.core.common as com
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
    Float64Index,
    Index,
    ensure_index,
)
from pandas.core.internals.base import (
    DataManager,
    SingleDataManager,
    interleaved_dtype,
)
from pandas.core.internals.blocks import (
    Block,
    DatetimeTZBlock,
    NumpyBlock,
    ensure_block_shape,
    extend_blocks,
    get_block_type,
    new_block,
    new_block_2d,
)
from pandas.core.internals.ops import (
    blockwise_all,
    operate_blockwise,
)

T = TypeVar("T", bound="BaseBlockManager")


class BaseBlockManager(DataManager):
    """
    Core internal data structure to implement DataFrame, Series, etc.

    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
    lightweight blocked set of labeled data to be manipulated by the DataFrame
    public API class

    Attributes
    ----------
    shape
    ndim
    axes
    values
    items

    Methods
    -------
    set_axis(axis, new_labels)
    copy(deep=True)

    get_dtypes

    apply(func, axes, block_filter_fn)

    get_bool_data
    get_numeric_data

    get_slice(slice_like, axis)
    get(label)
    iget(loc)

    take(indexer, axis)
    reindex_axis(new_labels, axis)
    reindex_indexer(new_labels, indexer, axis)

    delete(label)
    insert(loc, label, value)
    set(label, value)

    Parameters
    ----------
    blocks: Sequence of Block
    axes: Sequence of Index
    verify_integrity: bool, default True

    Notes
    -----
    This is *not* a public API class
    """

    __slots__ = ()

    _blknos: npt.NDArray[np.intp]
    _blklocs: npt.NDArray[np.intp]
    blocks: tuple[Block, ...]
    axes: list[Index]
    refs: list[weakref.ref | None] | None
    parent: object

    @property
    def ndim(self) -> int:
        raise NotImplementedError

    _known_consolidated: bool
    _is_consolidated: bool

    def __init__(self, blocks, axes, refs=None, verify_integrity: bool = True) -> None:
        raise NotImplementedError

    @classmethod
    def from_blocks(
        cls: type_t[T],
        blocks: list[Block],
        axes: list[Index],
        refs: list[weakref.ref | None] | None = None,
        parent: object = None,
    ) -> T:
        raise NotImplementedError

    @property
    def blknos(self) -> npt.NDArray[np.intp]:
        """
        Suppose we want to find the array corresponding to our i'th column.

        blknos[i] identifies the block from self.blocks that contains this column.

        blklocs[i] identifies the column of interest within
        self.blocks[self.blknos[i]]
        """
        if self._blknos is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blknos

    @property
    def blklocs(self) -> npt.NDArray[np.intp]:
        """
        See blknos.__doc__
        """
        if self._blklocs is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blklocs
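
    # A minimal sketch of the blknos/blklocs lookup described above
    # (illustration only; `df` is a hypothetical mixed-dtype DataFrame):
    #
    #   mgr = df._mgr                      # the BlockManager
    #   i = 1                              # column position of interest
    #   blk = mgr.blocks[mgr.blknos[i]]    # the Block holding column i
    #   col = blk.values[mgr.blklocs[i]]   # that column's row within blk.values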

    def make_empty(self: T, axes=None) -> T:
        """return an empty BlockManager with the items axis of len 0"""
        if axes is None:
            axes = [Index([])] + self.axes[1:]

        # preserve dtype if possible
        if self.ndim == 1:
            assert isinstance(self, SingleBlockManager)  # for mypy
            blk = self.blocks[0]
            arr = blk.values[:0]
            bp = BlockPlacement(slice(0, 0))
            nb = blk.make_block_same_class(arr, placement=bp)
            blocks = [nb]
        else:
            blocks = []
        return type(self).from_blocks(blocks, axes)

    def __nonzero__(self) -> bool:
        return True

    # Python3 compat
    __bool__ = __nonzero__

    def _normalize_axis(self, axis: int) -> int:
        # switch axis to follow BlockManager logic
        if self.ndim == 2:
            axis = 1 if axis == 0 else 0
        return axis

    def set_axis(self, axis: int, new_labels: Index) -> None:
        # Caller is responsible for ensuring we have an Index object.
        self._validate_set_axis(axis, new_labels)
        self.axes[axis] = new_labels

    @property
    def is_single_block(self) -> bool:
        # Assumes we are 2D; overridden by SingleBlockManager
        return len(self.blocks) == 1

    @property
    def items(self) -> Index:
        return self.axes[0]

    def _has_no_reference(self, i: int) -> bool:
        """
        Check for column `i` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the column has no references.
        """
        blkno = self.blknos[i]
        return self._has_no_reference_block(blkno)

    def _has_no_reference_block(self, blkno: int) -> bool:
        """
        Check for block `i` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the block has no references.
        """
        # TODO(CoW) include `or self.refs[blkno]() is None` ?
        return (
            self.refs is None or self.refs[blkno] is None
        ) and weakref.getweakrefcount(self.blocks[blkno]) == 0

    def _clear_reference_block(self, blkno: int) -> None:
        """
        Clear any reference for column `i`.
        """
        if self.refs is not None:
            self.refs[blkno] = None
            if com.all_none(*self.refs):
                self.parent = None

    def get_dtypes(self):
        dtypes = np.array([blk.dtype for blk in self.blocks])
        return dtypes.take(self.blknos)

    @property
    def arrays(self) -> list[ArrayLike]:
        """
        Quick access to the backing arrays of the Blocks.

        Only for compatibility with ArrayManager for testing convenience.
        Not to be used in actual code, and return value is not the same as the
        ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).

        Warning! The returned arrays don't handle Copy-on-Write, so this should
        be used with caution (only in read-mode).
        """
        return [blk.values for blk in self.blocks]

    def __repr__(self) -> str:
        output = type(self).__name__
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += f"\nItems: {ax}"
            else:
                output += f"\nAxis {i}: {ax}"

        for block in self.blocks:
            output += f"\n{block}"
        return output

    def apply(
        self: T,
        f,
        align_keys: list[str] | None = None,
        ignore_failures: bool = False,
        **kwargs,
    ) -> T:
        """
        Iterate over the blocks, collect and create a new BlockManager.

        Parameters
        ----------
        f : str or callable
            Name of the Block method to apply.
        align_keys: List[str] or None, default None
        ignore_failures: bool, default False
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        BlockManager
        """
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_blocks: list[Block] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        for b in self.blocks:

            if aligned_args:

                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        #  obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
                        else:
                            kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
                    else:
                        # otherwise we have an ndarray
                        kwargs[k] = obj[b.mgr_locs.indexer]

            try:
                if callable(f):
                    applied = b.apply(f, **kwargs)
                else:
                    applied = getattr(b, f)(**kwargs)
            except (TypeError, NotImplementedError):
                if not ignore_failures:
                    raise
                continue
            result_blocks = extend_blocks(applied, result_blocks)

        if ignore_failures:
            return self._combine(result_blocks)

        out = type(self).from_blocks(result_blocks, self.axes)
        return out
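
    # Sketch of the two dispatch forms accepted by ``apply`` (illustration
    # only): a string names a Block method, a callable is applied to each
    # block's values.
    #
    #   new_mgr = mgr.apply("astype", dtype="float64", copy=True)
    #   new_mgr = mgr.apply(lambda values: values.round(2))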

    def where(self: T, other, cond, align: bool) -> T:
        if align:
            align_keys = ["other", "cond"]
        else:
            align_keys = ["cond"]
            other = extract_array(other, extract_numpy=True)

        return self.apply(
            "where",
            align_keys=align_keys,
            other=other,
            cond=cond,
        )

    def setitem(self: T, indexer, value) -> T:
        """
        Set values with indexer.

        For SingleBlockManager, this backs s[indexer] = value
        """
        if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
            raise ValueError(f"Cannot set values with ndim > {self.ndim}")

        if _using_copy_on_write() and not self._has_no_reference(0):
            # if being referenced -> perform Copy-on-Write and clear the reference
            # this method is only called if there is a single block -> hardcoded 0
            self = self.copy()

        return self.apply("setitem", indexer=indexer, value=value)

    def putmask(self, mask, new, align: bool = True):
        if (
            _using_copy_on_write()
            and self.refs is not None
            and not all(ref is None for ref in self.refs)
        ):
            # some reference -> copy full dataframe
            # TODO(CoW) this could be optimized to only copy the blocks that would
            #  get modified
            self = self.copy()

        if align:
            align_keys = ["new", "mask"]
        else:
            align_keys = ["mask"]
            new = extract_array(new, extract_numpy=True)

        return self.apply(
            "putmask",
            align_keys=align_keys,
            mask=mask,
            new=new,
        )

    def diff(self: T, n: int, axis: int) -> T:
        axis = self._normalize_axis(axis)
        return self.apply("diff", n=n, axis=axis)

    def interpolate(self: T, **kwargs) -> T:
        return self.apply("interpolate", **kwargs)

    def shift(self: T, periods: int, axis: int, fill_value) -> T:
        axis = self._normalize_axis(axis)
        if fill_value is lib.no_default:
            fill_value = None

        return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)

    def fillna(self: T, value, limit, inplace: bool, downcast) -> T:

        if limit is not None:
            # Do this validation even if we go through one of the no-op paths
            limit = libalgos.validate_limit(None, limit=limit)
        if inplace:
            # TODO(CoW) can be optimized to only copy those blocks that have refs
            if _using_copy_on_write() and any(
                not self._has_no_reference_block(i) for i in range(len(self.blocks))
            ):
                self = self.copy()

        return self.apply(
            "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
        )

    def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
        return self.apply("astype", dtype=dtype, copy=copy, errors=errors)

    def convert(
        self: T,
        copy: bool = True,
        datetime: bool = True,
        numeric: bool = True,
        timedelta: bool = True,
    ) -> T:
        return self.apply(
            "convert",
            copy=copy,
            datetime=datetime,
            numeric=numeric,
            timedelta=timedelta,
        )

    def replace(self: T, to_replace, value, inplace: bool) -> T:
        inplace = validate_bool_kwarg(inplace, "inplace")
        # NDFrame.replace ensures the not-is_list_likes here
        assert not is_list_like(to_replace)
        assert not is_list_like(value)
        return self.apply(
            "replace", to_replace=to_replace, value=value, inplace=inplace
        )

    def replace_regex(self, **kwargs):
        return self.apply("_replace_regex", **kwargs)

    def replace_list(
        self: T,
        src_list: list[Any],
        dest_list: list[Any],
        inplace: bool = False,
        regex: bool = False,
    ) -> T:
        """do a list replace"""
        inplace = validate_bool_kwarg(inplace, "inplace")

        bm = self.apply(
            "replace_list",
            src_list=src_list,
            dest_list=dest_list,
            inplace=inplace,
            regex=regex,
        )
        bm._consolidate_inplace()
        return bm

    def to_native_types(self: T, **kwargs) -> T:
        """
        Convert values to native types (strings / python objects) that are used
        in formatting (repr / csv).
        """
        return self.apply("to_native_types", **kwargs)

    @property
    def is_numeric_mixed_type(self) -> bool:
        return all(block.is_numeric for block in self.blocks)

    @property
    def any_extension_types(self) -> bool:
        """Whether any of the blocks in this manager are extension blocks"""
        return any(block.is_extension for block in self.blocks)

    @property
    def is_view(self) -> bool:
        """return a boolean if we are a single block and are a view"""
        if len(self.blocks) == 1:
            return self.blocks[0].is_view

        # It is technically possible to figure out which blocks are views
        # e.g. [ b.values.base is not None for b in self.blocks ]
        # but then we have the case of possibly some blocks being a view
        # and some blocks not. setting in theory is possible on the non-view
        # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
        # complicated

        return False

    def _get_data_subset(self: T, predicate: Callable) -> T:
        blocks = [blk for blk in self.blocks if predicate(blk.values)]
        return self._combine(blocks, copy=False)

    def get_bool_data(self: T, copy: bool = False) -> T:
        """
        Select blocks that are bool-dtype and columns from object-dtype blocks
        that are all-bool.

        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """

        new_blocks = []

        for blk in self.blocks:
            if blk.dtype == bool:
                new_blocks.append(blk)

            elif blk.is_object:
                nbs = blk._split()
                for nb in nbs:
                    if nb.is_bool:
                        new_blocks.append(nb)

        return self._combine(new_blocks, copy)

    def get_numeric_data(self: T, copy: bool = False) -> T:
        """
        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """
        numeric_blocks = [blk for blk in self.blocks if blk.is_numeric]
        if len(numeric_blocks) == len(self.blocks):
            # Avoid somewhat expensive _combine
            if copy:
                return self.copy(deep=True)
            return self
        return self._combine(numeric_blocks, copy)

    def _combine(
        self: T, blocks: list[Block], copy: bool = True, index: Index | None = None
    ) -> T:
        """return a new manager with the blocks"""
        if len(blocks) == 0:
            if self.ndim == 2:
                # retain our own Index dtype
                if index is not None:
                    axes = [self.items[:0], index]
                else:
                    axes = [self.items[:0]] + self.axes[1:]
                return self.make_empty(axes)
            return self.make_empty()

        # FIXME: optimization potential
        indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
        inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])

        new_blocks: list[Block] = []
        # TODO(CoW) we could optimize here if we know that the passed blocks
        # are fully "owned" (eg created from an operation, not coming from
        # an existing manager)
        new_refs: list[weakref.ref | None] | None = None if copy else []
        for b in blocks:
            nb = b.copy(deep=copy)
            nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer])
            new_blocks.append(nb)
            if not copy:
                # None has no attribute "append"
                new_refs.append(weakref.ref(b))  # type: ignore[union-attr]

        axes = list(self.axes)
        if index is not None:
            axes[-1] = index
        axes[0] = self.items.take(indexer)

        return type(self).from_blocks(
            new_blocks, axes, new_refs, parent=None if copy else self
        )

    @property
    def nblocks(self) -> int:
        return len(self.blocks)

    def copy(self: T, deep=True) -> T:
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : bool, string or None, default True
            If False or None, return a shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        BlockManager
        """
        if deep is None:
            if _using_copy_on_write():
                # use shallow copy
                deep = False
            else:
                # preserve deep copy for BlockManager with copy=None
                deep = True

        # this preserves the notion of view copying of axes
        if deep:
            # hit in e.g. tests.io.json.test_pandas

            def copy_func(ax):
                return ax.copy(deep=True) if deep == "all" else ax.view()

            new_axes = [copy_func(ax) for ax in self.axes]
        else:
            new_axes = list(self.axes)

        res = self.apply("copy", deep=deep)
        new_refs: list[weakref.ref | None] | None
        if deep:
            new_refs = None
            parent = None
        else:
            new_refs = [weakref.ref(blk) for blk in self.blocks]
            parent = self

        res.axes = new_axes
        res.refs = new_refs
        res.parent = parent

        if self.ndim > 1:
            # Avoid needing to re-compute these
            blknos = self._blknos
            if blknos is not None:
                res._blknos = blknos.copy()
                res._blklocs = self._blklocs.copy()

        if deep:
            res._consolidate_inplace()
        return res
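
    # Sketch of the three copy modes (illustration only):
    #
    #   mgr.copy()            # deep: block data copied, axes viewed
    #   mgr.copy(deep="all")  # deep copy of data *and* the index objects
    #   mgr.copy(deep=False)  # shallow: same backing arrays, with weakrefs
    #                         #  recorded so Copy-on-Write can trigger later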

    def consolidate(self: T) -> T:
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        bm = type(self)(self.blocks, self.axes, self.refs, verify_integrity=False)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm

    def reindex_indexer(
        self: T,
        new_axis: Index,
        indexer: npt.NDArray[np.intp] | None,
        axis: int,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> T:
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray[intp] or None
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool or None, default True
            If None, regard as False to get shallow copy.
        only_slice : bool, default False
            Whether to take views, not copies, along columns.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.
719 """
720 if copy is None:
721 if _using_copy_on_write():
722 # use shallow copy
723 copy = False
724 else:
725 # preserve deep copy for BlockManager with copy=None
726 copy = True
728 if indexer is None:
729 if new_axis is self.axes[axis] and not copy:
730 return self
732 result = self.copy(deep=copy)
733 result.axes = list(self.axes)
734 result.axes[axis] = new_axis
735 return result
737 # some axes don't allow reindexing with dups
738 if not allow_dups:
739 self.axes[axis]._validate_can_reindex(indexer)
741 if axis >= self.ndim:
742 raise IndexError("Requested axis not found in manager")
744 if axis == 0:
745 new_blocks, new_refs = self._slice_take_blocks_ax0(
746 indexer,
747 fill_value=fill_value,
748 only_slice=only_slice,
749 use_na_proxy=use_na_proxy,
750 )
751 parent = None if com.all_none(*new_refs) else self
752 else:
753 new_blocks = [
754 blk.take_nd(
755 indexer,
756 axis=1,
757 fill_value=(
758 fill_value if fill_value is not None else blk.fill_value
759 ),
760 )
761 for blk in self.blocks
762 ]
763 new_refs = None
764 parent = None
766 new_axes = list(self.axes)
767 new_axes[axis] = new_axis
769 new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs, parent=parent)
770 if axis == 1:
771 # We can avoid the need to rebuild these
772 new_mgr._blknos = self.blknos.copy()
773 new_mgr._blklocs = self.blklocs.copy()
774 return new_mgr
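
    # Sketch: reindexing along the items axis (axis=0 in manager coordinates)
    # with a positional indexer, where -1 introduces an all-NA column
    # (illustration only; `mgr` is a hypothetical 2D manager):
    #
    #   new_cols = Index(["a", "b", "new"])
    #   mgr2 = mgr.reindex_indexer(new_cols, np.array([0, 1, -1]), axis=0)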

    def _slice_take_blocks_ax0(
        self,
        slice_or_indexer: slice | np.ndarray,
        fill_value=lib.no_default,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> tuple[list[Block], list[weakref.ref | None]]:
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : slice or np.ndarray[int64]
        fill_value : scalar, default lib.no_default
        only_slice : bool, default False
            If True, we always return views on existing arrays, never copies.
            This is used when called from ops.blockwise.operate_blockwise.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.

        Returns
        -------
        new_blocks : list of Block
        """
        allow_fill = fill_value is not lib.no_default

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill
        )

        if self.is_single_block:
            blk = self.blocks[0]

            if sl_type == "slice":
                # GH#32959 EABlock would fail since we can't make 0-width
                # TODO(EA2D): special casing unnecessary with 2D EAs
                if sllen == 0:
                    return [], []
                bp = BlockPlacement(slice(0, sllen))
                return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)], [
                    weakref.ref(blk)
                ]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_value is None:
                    fill_value = blk.fill_value

                if not allow_fill and only_slice:
                    # GH#33597 slice instead of take, so we get
                    # views instead of copies
                    blocks = [
                        blk.getitem_block_columns(
                            slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
                        )
                        for i, ml in enumerate(slobj)
                    ]
                    # We have
                    #  all(np.shares_memory(nb.values, blk.values) for nb in blocks)
                    return blocks, [weakref.ref(blk)] * len(blocks)
                else:
                    bp = BlockPlacement(slice(0, sllen))
                    return [
                        blk.take_nd(
                            slobj,
                            axis=0,
                            new_mgr_locs=bp,
                            fill_value=fill_value,
                        )
                    ], [None]

        if sl_type == "slice":
            blknos = self.blknos[slobj]
            blklocs = self.blklocs[slobj]
        else:
            blknos = algos.take_nd(
                self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
            )
            blklocs = algos.take_nd(
                self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
            )

        # When filling blknos, make sure blknos is updated before appending to
        #  blocks list, that way new blkno is exactly len(blocks).
        blocks = []
        refs: list[weakref.ref | None] = []
        group = not only_slice
        for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
            if blkno == -1:
                # If we've got here, fill_value was not lib.no_default

                blocks.append(
                    self._make_na_block(
                        placement=mgr_locs,
                        fill_value=fill_value,
                        use_na_proxy=use_na_proxy,
                    )
                )
                refs.append(None)
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate and not blk._validate_ndim:
                    # i.e. we don't go through here for DatetimeTZBlock
                    # A non-consolidatable block is easy, because there's
                    # only one item and each mgr loc is a copy of that single
                    # item.
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=False)
                        newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1))
                        blocks.append(newblk)
                        refs.append(weakref.ref(blk))

                else:
                    # GH#32779 to avoid the performance penalty of copying,
                    # we may try to only slice
                    taker = blklocs[mgr_locs.indexer]
                    max_len = max(len(mgr_locs), taker.max() + 1)
                    if only_slice or _using_copy_on_write():
                        taker = lib.maybe_indices_to_slice(taker, max_len)

                    if isinstance(taker, slice):
                        nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                        refs.append(weakref.ref(blk))
                    elif only_slice:
                        # GH#33597 slice instead of take, so we get
                        # views instead of copies
                        for i, ml in zip(taker, mgr_locs):
                            slc = slice(i, i + 1)
                            bp = BlockPlacement(ml)
                            nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
                            # We have np.shares_memory(nb.values, blk.values)
                            blocks.append(nb)
                            refs.append(weakref.ref(blk))
                    else:
                        nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                        refs.append(None)

        return blocks, refs

    def _make_na_block(
        self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
    ) -> Block:
        # Note: we only get here with self.ndim == 2

        if use_na_proxy:
            assert fill_value is None
            shape = (len(placement), self.shape[1])
            vals = np.empty(shape, dtype=np.void)
            nb = NumpyBlock(vals, placement, ndim=2)
            return nb

        if fill_value is None:
            fill_value = np.nan
        block_shape = list(self.shape)
        block_shape[0] = len(placement)

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        # error: Argument "dtype" to "empty" has incompatible type "Union[dtype,
        # ExtensionDtype]"; expected "Union[dtype, None, type, _SupportsDtype, str,
        # Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], _DtypeDict,
        # Tuple[Any, Any]]"
        block_values = np.empty(block_shape, dtype=dtype)  # type: ignore[arg-type]
        block_values.fill(fill_value)
        return new_block_2d(block_values, placement=placement)

    def take(
        self: T,
        indexer,
        axis: int = 1,
        verify: bool = True,
        convert_indices: bool = True,
    ) -> T:
        """
        Take items along any axis.

        indexer : np.ndarray or slice
        axis : int, default 1
        verify : bool, default True
            Check that all entries are between 0 and len(self) - 1, inclusive.
            Pass verify=False if this check has been done by the caller.
        convert_indices : bool, default True
            Whether to attempt to convert indices to positive values.

        Returns
        -------
        BlockManager
        """
        # We have 6 tests that get here with a slice
        indexer = (
            np.arange(indexer.start, indexer.stop, indexer.step, dtype=np.intp)
            if isinstance(indexer, slice)
            else np.asanyarray(indexer, dtype=np.intp)
        )

        n = self.shape[axis]
        if convert_indices:
            indexer = maybe_convert_indices(indexer, n, verify=verify)

        new_labels = self.axes[axis].take(indexer)
        return self.reindex_indexer(
            new_axis=new_labels,
            indexer=indexer,
            axis=axis,
            allow_dups=True,
            copy=None,
        )
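
    # Sketch: ``take`` resolves the new labels itself and then defers to
    # ``reindex_indexer`` (illustration only; axis=1 is the row axis in
    # manager coordinates, and -1 is converted to a positive position):
    #
    #   mgr2 = mgr.take(np.array([2, 0, -1]), axis=1)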


class BlockManager(libinternals.BlockManager, BaseBlockManager):
    """
    BaseBlockManager that holds 2D blocks.
    """

    ndim = 2

    # ----------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        blocks: Sequence[Block],
        axes: Sequence[Index],
        refs: list[weakref.ref | None] | None = None,
        parent: object = None,
        verify_integrity: bool = True,
    ) -> None:

        if verify_integrity:
            # Assertion disabled for performance
            # assert all(isinstance(x, Index) for x in axes)

            for block in blocks:
                if self.ndim != block.ndim:
                    raise AssertionError(
                        f"Number of Block dimensions ({block.ndim}) must equal "
                        f"number of axes ({self.ndim})"
                    )
                if isinstance(block, DatetimeTZBlock) and block.values.ndim == 1:
                    # TODO(2.0): remove once fastparquet no longer needs this
                    warnings.warn(
                        "In a future version, the BlockManager constructor "
                        "will assume that a DatetimeTZBlock with block.ndim==2 "
                        "has block.values.ndim == 2.",
                        DeprecationWarning,
                        stacklevel=find_stack_level(),
                    )

                    # error: Incompatible types in assignment (expression has type
                    # "Union[ExtensionArray, ndarray]", variable has type
                    # "DatetimeArray")
                    block.values = ensure_block_shape(  # type: ignore[assignment]
                        block.values, self.ndim
                    )
                    try:
                        block._cache.clear()
                    except AttributeError:
                        # _cache not initialized
                        pass

            self._verify_integrity()

    def _verify_integrity(self) -> None:
        mgr_shape = self.shape
        tot_items = sum(len(x.mgr_locs) for x in self.blocks)
        for block in self.blocks:
            if block.shape[1:] != mgr_shape[1:]:
                raise construction_error(tot_items, block.shape[1:], self.axes)
        if len(self.items) != tot_items:
            raise AssertionError(
                "Number of manager items must equal union of "
                f"block items\n# manager items: {len(self.items)}, # "
                f"tot_items: {tot_items}"
            )
        if self.refs is not None:
            if len(self.refs) != len(self.blocks):
                raise AssertionError(
                    "Number of passed refs must equal the number of blocks: "
                    f"{len(self.refs)} refs vs {len(self.blocks)} blocks."
                    "\nIf you see this error, please report a bug at "
                    "https://github.com/pandas-dev/pandas/issues"
                )

    @classmethod
    def from_blocks(
        cls,
        blocks: list[Block],
        axes: list[Index],
        refs: list[weakref.ref | None] | None = None,
        parent: object = None,
    ) -> BlockManager:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        parent = parent if _using_copy_on_write() else None
        return cls(blocks, axes, refs, parent, verify_integrity=False)

    # ----------------------------------------------------------------
    # Indexing

    def fast_xs(self, loc: int) -> SingleBlockManager:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        if len(self.blocks) == 1:
            result = self.blocks[0].iget((slice(None), loc))
            block = new_block(result, placement=slice(0, len(result)), ndim=1)
            # in the case of a single block, the new block is a view
            ref = weakref.ref(self.blocks[0])
            return SingleBlockManager(block, self.axes[0], [ref], parent=self)

        dtype = interleaved_dtype([blk.dtype for blk in self.blocks])

        n = len(self)

        # GH#46406
        immutable_ea = isinstance(dtype, SparseDtype)

        if isinstance(dtype, ExtensionDtype) and not immutable_ea:
            cls = dtype.construct_array_type()
            result = cls._empty((n,), dtype=dtype)
        else:
            # error: Argument "dtype" to "empty" has incompatible type
            # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected
            # "None"
            result = np.empty(
                n, dtype=object if immutable_ea else dtype  # type: ignore[arg-type]
            )
            result = ensure_wrapped_if_datetimelike(result)

        for blk in self.blocks:
            # Such assignment may incorrectly coerce NaT to None
            # result[blk.mgr_locs] = blk._slice((slice(None), loc))
            for i, rl in enumerate(blk.mgr_locs):
                result[rl] = blk.iget((i, loc))

        if immutable_ea:
            dtype = cast(ExtensionDtype, dtype)
            result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)

        block = new_block(result, placement=slice(0, len(result)), ndim=1)
        return SingleBlockManager(block, self.axes[0])
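
    # Sketch of what ``fast_xs`` backs at the user level (illustration only):
    #
    #   row = df.iloc[3]   # the row Series is built by fast_xs(3), which
    #                      #  interleaves all blocks into one 1D array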

    def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
        """
        Return the data as a SingleBlockManager.
        """
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])

        # shortcut for select a single-dim from a 2-dim BM
        bp = BlockPlacement(slice(0, len(values)))
        nb = type(block)(values, placement=bp, ndim=1)
        ref = weakref.ref(block) if track_ref else None
        parent = self if track_ref else None
        return SingleBlockManager(nb, self.axes[1], [ref], parent)

    def iget_values(self, i: int) -> ArrayLike:
        """
        Return the data for column i as the values (ndarray or ExtensionArray).

        Warning! The returned array is a view but doesn't handle Copy-on-Write,
        so this should be used with caution.
        """
        # TODO(CoW) making the arrays read-only might make this safer to use?
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])
        return values

    @property
    def column_arrays(self) -> list[np.ndarray]:
        """
        Used in the JSON C code to access column arrays.
        This optimizes compared to using `iget_values` by converting each
        block's values to an ndarray only once up front.

        Warning! This doesn't handle Copy-on-Write, so should be used with
        caution (current use case of consuming this in the JSON code is fine).
        """
        # This is an optimized equivalent to
        #  result = [self.iget_values(i) for i in range(len(self.items))]
        result: list[np.ndarray | None] = [None] * len(self.items)

        for blk in self.blocks:
            mgr_locs = blk._mgr_locs
            values = blk.values_for_json()
            if values.ndim == 1:
                # TODO(EA2D): special casing not needed with 2D EAs
                result[mgr_locs[0]] = values

            else:
                for i, loc in enumerate(mgr_locs):
                    result[loc] = values[i]

        # error: Incompatible return value type (got "List[None]",
        # expected "List[ndarray[Any, Any]]")
        return result  # type: ignore[return-value]

    def iset(
        self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
    ):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        """

        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        #  can prob also fix the various if tests for sparse/categorical
        if self._blklocs is None and self.ndim > 1:
            self._rebuild_blknos_and_blklocs()

        # Note: we exclude DTA/TDA here
        value_is_extension_type = is_1d_only_ea_dtype(value.dtype)
        if not value_is_extension_type:
            if value.ndim == 2:
                value = value.T
            else:
                value = ensure_block_shape(value, ndim=2)

            if value.shape[1:] != self.shape[1:]:
                raise AssertionError(
                    "Shape of new values must be compatible with manager shape"
                )

        if lib.is_integer(loc):
            # We have 6 tests where loc is _not_ an int.
            # In this case, get_blkno_placements will yield only one tuple,
            #  containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))

            # Check if we can use _iset_single fastpath
            loc = cast(int, loc)
            blkno = self.blknos[loc]
            blk = self.blocks[blkno]
            if len(blk._mgr_locs) == 1:  # TODO: fastest way to check this?
                return self._iset_single(
                    loc,
                    value,
                    inplace=inplace,
                    blkno=blkno,
                    blk=blk,
                )

            # error: Incompatible types in assignment (expression has type
            # "List[Union[int, slice, ndarray]]", variable has type "Union[int,
            # slice, ndarray]")
            loc = [loc]  # type: ignore[assignment]

        # categorical/sparse/datetimetz
        if value_is_extension_type:

            def value_getitem(placement):
                return value

        else:

            def value_getitem(placement):
                return value[placement.indexer]

        # Accessing public blknos ensures the public versions are initialized
        blknos = self.blknos[loc]
        blklocs = self.blklocs[loc].copy()

        unfit_mgr_locs = []
        unfit_val_locs = []
        removed_blknos = []
        for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True):
            blk = self.blocks[blkno_l]
            blk_locs = blklocs[val_locs.indexer]
            if inplace and blk.should_store(value):
                # Updating inplace -> check if we need to do Copy-on-Write
                if _using_copy_on_write() and not self._has_no_reference_block(blkno_l):
                    blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
                    self._clear_reference_block(blkno_l)
                else:
                    blk.set_inplace(blk_locs, value_getitem(val_locs))
            else:
                unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                unfit_val_locs.append(val_locs)

                # If all block items are unfit, schedule the block for removal.
                if len(val_locs) == len(blk.mgr_locs):
                    removed_blknos.append(blkno_l)
                else:
                    nb = blk.delete(blk_locs)
                    blocks_tup = (
                        self.blocks[:blkno_l] + (nb,) + self.blocks[blkno_l + 1 :]
                    )
                    self.blocks = blocks_tup
                    self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
                    # blk.delete gives a copy, so we can remove a possible reference
                    self._clear_reference_block(blkno_l)

        if len(removed_blknos):
            # Remove blocks & update blknos and refs accordingly
            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
            is_deleted[removed_blknos] = True

            new_blknos = np.empty(self.nblocks, dtype=np.intp)
            new_blknos.fill(-1)
            new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
            self._blknos = new_blknos[self._blknos]
            self.blocks = tuple(
                blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
            )
            if self.refs is not None:
                self.refs = [
                    ref
                    for i, ref in enumerate(self.refs)
                    if i not in set(removed_blknos)
                ]

        if unfit_val_locs:
            unfit_idxr = np.concatenate(unfit_mgr_locs)
            unfit_count = len(unfit_idxr)

            new_blocks: list[Block] = []
            if value_is_extension_type:
                # This code (ab-)uses the fact that EA blocks contain only
                # one item.
                # TODO(EA2D): special casing unnecessary with 2D EAs
                new_blocks.extend(
                    new_block_2d(
                        values=value,
                        placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
                    )
                    for mgr_loc in unfit_idxr
                )

                self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks)
                self._blklocs[unfit_idxr] = 0

            else:
                # unfit_val_locs contains BlockPlacement objects
                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])

                new_blocks.append(
                    new_block_2d(
                        values=value_getitem(unfit_val_items),
                        placement=BlockPlacement(unfit_idxr),
                    )
                )

                self._blknos[unfit_idxr] = len(self.blocks)
                self._blklocs[unfit_idxr] = np.arange(unfit_count)

            self.blocks += tuple(new_blocks)
            # TODO(CoW) is this always correct to assume that the new_blocks
            # are not referencing anything else?
            if self.refs is not None:
                self.refs = list(self.refs) + [None] * len(new_blocks)

        # Newly created block's dtype may already be present.
        self._known_consolidated = False

    def _iset_single(
        self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
    ) -> None:
        """
        Fastpath for iset when we are only setting a single position and
        the Block currently in that position is itself single-column.

        In this case we can swap out the entire Block and blklocs and blknos
        are unaffected.
        """
        # Caller is responsible for verifying value.shape

        if inplace and blk.should_store(value):
            copy = False
            if _using_copy_on_write() and not self._has_no_reference_block(blkno):
                # perform Copy-on-Write and clear the reference
                copy = True
                self._clear_reference_block(blkno)
            iloc = self.blklocs[loc]
            blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
            return

        nb = new_block_2d(value, placement=blk._mgr_locs)
        old_blocks = self.blocks
        new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
        self.blocks = new_blocks
        self._clear_reference_block(blkno)
        return

    def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value) -> None:
        """
        Set values ("setitem") into a single column (not setting the full column).

        This is a method on the BlockManager level, to avoid creating an
        intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
        """
        if _using_copy_on_write() and not self._has_no_reference(loc):
            # otherwise perform Copy-on-Write and clear the reference
            blkno = self.blknos[loc]
            blocks = list(self.blocks)
            blocks[blkno] = blocks[blkno].copy()
            self.blocks = tuple(blocks)
            self._clear_reference_block(blkno)

        # this manager is only created temporarily to mutate the values in place
        # so don't track references, otherwise the `setitem` would perform CoW again
        col_mgr = self.iget(loc, track_ref=False)
        new_mgr = col_mgr.setitem((idx,), value)
        self.iset(loc, new_mgr._block.values, inplace=True)

    def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : np.ndarray or ExtensionArray
        """
        # insert to the axis; this could possibly raise a TypeError
        new_axis = self.items.insert(loc, item)

        if value.ndim == 2:
            value = value.T
            if len(value) > 1:
                raise ValueError(
                    f"Expected a 1D array, got an array with shape {value.T.shape}"
                )
        else:
            value = ensure_block_shape(value, ndim=self.ndim)

        bp = BlockPlacement(slice(loc, loc + 1))
        block = new_block_2d(values=value, placement=bp)

        if not len(self.blocks):
            # Fastpath
            self._blklocs = np.array([0], dtype=np.intp)
            self._blknos = np.array([0], dtype=np.intp)
        else:
            self._insert_update_mgr_locs(loc)
            self._insert_update_blklocs_and_blknos(loc)

        self.axes[0] = new_axis
        self.blocks += (block,)
        # TODO(CoW) do we always "own" the passed `value`?
        if self.refs is not None:
            self.refs += [None]

        self._known_consolidated = False

        if sum(not block.is_extension for block in self.blocks) > 100:
            warnings.warn(
                "DataFrame is highly fragmented. This is usually the result "
                "of calling `frame.insert` many times, which has poor performance. "
                "Consider joining all columns at once using pd.concat(axis=1) "
                "instead. To get a de-fragmented frame, use `newframe = frame.copy()`",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
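
    # The warning above fires once repeated single-column inserts have left
    # more than 100 consolidatable blocks. A sketch of the pattern it
    # recommends (illustration only; `df` and `dict_of_columns` are
    # hypothetical):
    #
    #   # slow: one new block per insert
    #   for name, col in dict_of_columns.items():
    #       df[name] = col
    #   # faster: build once, concatenate once
    #   df = pd.concat([df, pd.DataFrame(dict_of_columns)], axis=1)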

    def _insert_update_mgr_locs(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we increment
        all of the mgr_locs of blocks above that by one.
        """
        for blkno, count in _fast_count_smallints(self.blknos[loc:]):
            # .620 this way, .326 of which is in increment_above
            blk = self.blocks[blkno]
            blk._mgr_locs = blk._mgr_locs.increment_above(loc)

    def _insert_update_blklocs_and_blknos(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we update our
        _blklocs and _blknos.
        """

        # Accessing public blklocs ensures the public versions are initialized
        if loc == self.blklocs.shape[0]:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs, 0)
            self._blknos = np.append(self._blknos, len(self.blocks))
        elif loc == 0:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs[::-1], 0)[::-1]
            self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1]
        else:
            new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
                self.blklocs, self.blknos, loc, len(self.blocks)
            )
            self._blklocs = new_blklocs
            self._blknos = new_blknos

    def idelete(self, indexer) -> BlockManager:
        """
        Delete selected locations, returning a new BlockManager.
        """
        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        taker = (~is_deleted).nonzero()[0]

        nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True)
        new_columns = self.items[~is_deleted]
        axes = [new_columns, self.axes[1]]
        # TODO this might not be needed (can a delete ever be done in chained manner?)
        parent = None if com.all_none(*new_refs) else self
        return type(self)(tuple(nbs), axes, new_refs, parent, verify_integrity=False)

    # ----------------------------------------------------------------
    # Block-wise Operation

    def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
        """
        Apply grouped reduction function blockwise, returning a new BlockManager.

        Parameters
        ----------
        func : grouped reduction function
        ignore_failures : bool, default False
            Whether to drop blocks where func raises TypeError.

        Returns
        -------
        BlockManager
        """
        result_blocks: list[Block] = []
        dropped_any = False

        for blk in self.blocks:
            if blk.is_object:
                # split on object-dtype blocks bc some columns may raise
                #  while others do not.
                for sb in blk._split():
                    try:
                        applied = sb.apply(func)
                    except (TypeError, NotImplementedError):
                        if not ignore_failures:
                            raise
                        dropped_any = True
                        continue
                    result_blocks = extend_blocks(applied, result_blocks)
            else:
                try:
                    applied = blk.apply(func)
                except (TypeError, NotImplementedError):
                    if not ignore_failures:
                        raise
                    dropped_any = True
                    continue
                result_blocks = extend_blocks(applied, result_blocks)

        if len(result_blocks) == 0:
            index = Index([None])  # placeholder
        else:
            index = Index(range(result_blocks[0].values.shape[-1]))

        if dropped_any:
            # faster to skip _combine if we haven't dropped any blocks
            return self._combine(result_blocks, copy=False, index=index)

        return type(self).from_blocks(result_blocks, [self.axes[0], index])

    def reduce(
        self: T, func: Callable, ignore_failures: bool = False
    ) -> tuple[T, np.ndarray]:
        """
        Apply reduction function blockwise, returning a single-row BlockManager.

        Parameters
        ----------
        func : reduction function
        ignore_failures : bool, default False
            Whether to drop blocks where func raises TypeError.

        Returns
        -------
        BlockManager
        np.ndarray
            Indexer of mgr_locs that are retained.
        """
        # If 2D, we assume that we're operating column-wise
        assert self.ndim == 2

        res_blocks: list[Block] = []
        for blk in self.blocks:
            nbs = blk.reduce(func, ignore_failures)
            res_blocks.extend(nbs)

        index = Index([None])  # placeholder
        if ignore_failures:
            if res_blocks:
                indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks])
                new_mgr = self._combine(res_blocks, copy=False, index=index)
            else:
                indexer = []
                new_mgr = type(self).from_blocks([], [self.items[:0], index])
        else:
            indexer = np.arange(self.shape[0])
            new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
        return new_mgr, indexer

    def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
        """
        Apply array_op blockwise with another (aligned) BlockManager.
        """
        return operate_blockwise(self, other, array_op)

    def _equal_values(self: BlockManager, other: BlockManager) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        return blockwise_all(self, other, array_equals)

    def quantile(
        self: T,
        *,
        qs: Float64Index,
        axis: int = 0,
        interpolation="linear",
    ) -> T:
        """
        Iterate over blocks applying quantile reduction.
        This routine is intended for reduction type operations and
        will do inference on the generated blocks.

        Parameters
        ----------
        axis: reduction axis, default 0
        interpolation : type of interpolation, default 'linear'
        qs : list of the quantiles to be computed

        Returns
        -------
        BlockManager
        """
        # Series dispatches to DataFrame for quantile, which allows us to
        #  simplify some of the code here and in the blocks
        assert self.ndim >= 2
        assert is_list_like(qs)  # caller is responsible for this
        assert axis == 1  # only ever called this way

        new_axes = list(self.axes)
        new_axes[1] = Float64Index(qs)

        blocks = [
            blk.quantile(axis=axis, qs=qs, interpolation=interpolation)
            for blk in self.blocks
        ]

        return type(self)(blocks, new_axes)

    # ----------------------------------------------------------------

    def unstack(self, unstacker, fill_value) -> BlockManager:
        """
        Return a BlockManager with all blocks unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : BlockManager
        """
        new_columns = unstacker.get_new_columns(self.items)
        new_index = unstacker.new_index

        allow_fill = not unstacker.mask_all
        if allow_fill:
            # calculating the full mask once and passing it to Block._unstack is
            #  faster than recalculating it in each repeated call
            new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
            needs_masking = new_mask2D.any(axis=0)
        else:
            needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool)

        new_blocks: list[Block] = []
        columns_mask: list[np.ndarray] = []

        if len(self.items) == 0:
            factor = 1
        else:
            fac = len(new_columns) / len(self.items)
            assert fac == int(fac)
            factor = int(fac)

        for blk in self.blocks:
            mgr_locs = blk.mgr_locs
            new_placement = mgr_locs.tile_for_unstack(factor)

            blocks, mask = blk._unstack(
                unstacker,
                fill_value,
                new_placement=new_placement,
                needs_masking=needs_masking,
            )

            new_blocks.extend(blocks)
            columns_mask.extend(mask)

            # Block._unstack should ensure this holds,
            assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks)
            # In turn this ensures that in the BlockManager call below
            #  we have len(new_columns) == sum(x.shape[0] for x in new_blocks)
            #  which suffices to allow us to pass verify_integrity=False

        new_columns = new_columns[columns_mask]

        bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
        return bm

    def to_dict(self, copy: bool = True):
        """
        Return a dict of str(dtype) -> BlockManager

        Parameters
        ----------
        copy : bool, default True

        Returns
        -------
        values : a dict of dtype -> BlockManager
        """

        bd: dict[str, list[Block]] = {}
        for b in self.blocks:
            bd.setdefault(str(b.dtype), []).append(b)

        # TODO(EA2D): the combine will be unnecessary with 2D EAs
        return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()}
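
    # Sketch of the grouping performed by ``to_dict`` (illustration only):
    # a manager holding int64 and object blocks yields something like
    #
    #   {"int64": <BlockManager ...>, "object": <BlockManager ...>}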

    def as_array(
        self,
        dtype: np.dtype | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the blockmanager data into a numpy array.

        Parameters
        ----------
        dtype : np.dtype or None, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        # TODO(CoW) handle case where resulting array is a view
        if len(self.blocks) == 0:
            arr = np.empty(self.shape, dtype=float)
            return arr.transpose()

        # We want to copy when na_value is provided to avoid
        # mutating the original object
        copy = copy or na_value is not lib.no_default

        if self.is_single_block:
            blk = self.blocks[0]
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                ).reshape(blk.shape)
            else:
                arr = np.asarray(blk.get_values())
                if dtype:
                    arr = arr.astype(dtype, copy=False)
        else:
            arr = self._interleave(dtype=dtype, na_value=na_value)
            # The underlying data was copied within _interleave
            copy = False

        if copy:
            arr = arr.copy()

        if na_value is not lib.no_default:
            arr[isna(arr)] = na_value

        return arr.transpose()
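
    # Sketch of what ``as_array`` backs at the user level (illustration only):
    #
    #   arr = df.to_numpy()            # interleaves mixed dtypes into one array
    #   arr = df.to_numpy(na_value=0)  # providing na_value forces a copy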

    def _interleave(
        self,
        dtype: np.dtype | None = None,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        if not dtype:
            # Incompatible types in assignment (expression has type
            # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has
            # type "Optional[dtype[Any]]")
            dtype = interleaved_dtype(  # type: ignore[assignment]
                [blk.dtype for blk in self.blocks]
            )

        # TODO: https://github.com/pandas-dev/pandas/issues/22791
        # Give EAs some input on what happens here. Sparse needs this.
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype
            dtype = cast(np.dtype, dtype)
        elif isinstance(dtype, ExtensionDtype):
            dtype = np.dtype("object")
        elif is_dtype_equal(dtype, str):
            dtype = np.dtype("object")

        result = np.empty(self.shape, dtype=dtype)

        itemmask = np.zeros(self.shape[0])

        if dtype == np.dtype("object") and na_value is lib.no_default:
            # much more performant than using to_numpy below
            for blk in self.blocks:
                rl = blk.mgr_locs
                arr = blk.get_values(dtype)
                result[rl.indexer] = arr
                itemmask[rl.indexer] = 1
            return result

        for blk in self.blocks:
            rl = blk.mgr_locs
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                )
            else:
                arr = blk.get_values(dtype)
            result[rl.indexer] = arr
            itemmask[rl.indexer] = 1

        if not itemmask.all():
            raise AssertionError("Some items were not contained in blocks")

        return result

    # ----------------------------------------------------------------
    # Consolidation

    def is_consolidated(self) -> bool:
        """
        Return True if no two consolidatable blocks share a dtype.
        """
        if not self._known_consolidated:
            self._consolidate_check()
        return self._is_consolidated

    def _consolidate_check(self) -> None:
        if len(self.blocks) == 1:
            # fastpath
            self._is_consolidated = True
            self._known_consolidated = True
            return
        dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
        self._is_consolidated = len(dtypes) == len(set(dtypes))
        self._known_consolidated = True

    def _consolidate_inplace(self) -> None:
        # In general, _consolidate_inplace should only be called via
        #  DataFrame._consolidate_inplace, otherwise we will fail to invalidate
        #  the DataFrame's _item_cache. The exception is for newly-created
        #  BlockManager objects not yet attached to a DataFrame.
        if not self.is_consolidated():
            if self.refs is None:
                self.blocks = _consolidate(self.blocks)
            else:
                self.blocks, self.refs = _consolidate_with_refs(self.blocks, self.refs)
            self._is_consolidated = True
            self._known_consolidated = True
            self._rebuild_blknos_and_blklocs()


class SingleBlockManager(BaseBlockManager, SingleDataManager):
    """manage a single block with a single axis"""
1879 @property
1880 def ndim(self) -> Literal[1]:
1881 return 1
1883 _is_consolidated = True
1884 _known_consolidated = True
1885 __slots__ = ()
1886 is_single_block = True
1888 def __init__(
1889 self,
1890 block: Block,
1891 axis: Index,
1892 refs: list[weakref.ref | None] | None = None,
1893 parent: object = None,
1894 verify_integrity: bool = False,
1895 fastpath=lib.no_default,
1896 ) -> None:
1897 # Assertions disabled for performance
1898 # assert isinstance(block, Block), type(block)
1899 # assert isinstance(axis, Index), type(axis)
1901 if fastpath is not lib.no_default:
1902 warnings.warn(
1903 "The `fastpath` keyword is deprecated and will be removed "
1904 "in a future version.",
1905 FutureWarning,
1906 stacklevel=find_stack_level(),
1907 )
1909 self.axes = [axis]
1910 self.blocks = (block,)
1911 self.refs = refs
1912 self.parent = parent if _using_copy_on_write() else None
1914 @classmethod
1915 def from_blocks(
1916 cls,
1917 blocks: list[Block],
1918 axes: list[Index],
1919 refs: list[weakref.ref | None] | None = None,
1920 parent: object = None,
1921 ) -> SingleBlockManager:
1922 """
1923 Constructor for BlockManager and SingleBlockManager with same signature.
1924 """
1925 assert len(blocks) == 1
1926 assert len(axes) == 1
1927 if refs is not None:
1928 assert len(refs) == 1
1929 return cls(blocks[0], axes[0], refs, parent, verify_integrity=False)
1931 @classmethod
1932 def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
1933 """
1934 Constructor for the case where we have an array that is not yet a Block.
1935 """
1936 block = new_block(array, placement=slice(0, len(index)), ndim=1)
1937 return cls(block, index)
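# A small usage sketch (hypothetical values): from_array wraps a bare array in
# a Block spanning the whole index and hands it to the manager constructor.
#
#     >>> import numpy as np, pandas as pd
#     >>> mgr = SingleBlockManager.from_array(np.array([1.0, 2.0]), pd.Index([0, 1]))
#     >>> mgr.dtype, mgr.shape
#     (dtype('float64'), (2,))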
1939 def to_2d_mgr(self, columns: Index) -> BlockManager:
1940 """
1941 Manager analogue of Series.to_frame
1942 """
1943 blk = self.blocks[0]
1944 arr = ensure_block_shape(blk.values, ndim=2)
1945 bp = BlockPlacement(0)
1946 new_blk = type(blk)(arr, placement=bp, ndim=2)
1947 axes = [columns, self.axes[0]]
1948 refs: list[weakref.ref | None] = [weakref.ref(blk)]
1949 parent = self if _using_copy_on_write() else None
1950 return BlockManager(
1951 [new_blk], axes=axes, refs=refs, parent=parent, verify_integrity=False
1952 )
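# Sketch of the Series.to_frame analogy (hedged; ._mgr is internal API): the
# 1D values become a (1, n) block and the axes become [columns, index].
#
#     >>> import pandas as pd
#     >>> s = pd.Series([1, 2, 3], name="x")
#     >>> s._mgr.to_2d_mgr(pd.Index(["x"])).shape  # (n_columns, n_rows)
#     (1, 3)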
1954 def _has_no_reference(self, i: int = 0) -> bool:
1955 """
1956 Check whether the single column has any references, i.e. whether it
1957 references another array or is itself being referenced; the `i` argument
1958 exists for compatibility with BlockManager and is ignored here.
1958 Returns True if the column has no references.
1959 """
1960 return (self.refs is None or self.refs[0] is None) and weakref.getweakrefcount(
1961 self.blocks[0]
1962 ) == 0
1964 def __getstate__(self):
1965 block_values = [b.values for b in self.blocks]
1966 block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
1967 axes_array = list(self.axes)
1969 extra_state = {
1970 "0.14.1": {
1971 "axes": axes_array,
1972 "blocks": [
1973 {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
1974 for b in self.blocks
1975 ],
1976 }
1977 }
1979 # First three elements of the state are to maintain forward
1980 # compatibility with 0.13.1.
1981 return axes_array, block_values, block_items, extra_state
1983 def __setstate__(self, state):
1984 def unpickle_block(values, mgr_locs, ndim: int) -> Block:
1985 # TODO(EA2D): ndim would be unnecessary with 2D EAs
1986 # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
1987 values = extract_array(values, extract_numpy=True)
1988 return new_block(values, placement=mgr_locs, ndim=ndim)
1990 if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
1991 state = state[3]["0.14.1"]
1992 self.axes = [ensure_index(ax) for ax in state["axes"]]
1993 ndim = len(self.axes)
1994 self.blocks = tuple(
1995 unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
1996 for b in state["blocks"]
1997 )
1998 else:
1999 raise NotImplementedError("pre-0.14.1 pickles are no longer supported")
2001 self._post_setstate()
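# The "0.14.1" dict above is what travels through pickle for modern objects; a
# simple round-trip (sketch) exercises __getstate__/__setstate__:
#
#     >>> import pickle, pandas as pd
#     >>> s = pd.Series([1, 2, 3])
#     >>> pickle.loads(pickle.dumps(s)).equals(s)
#     True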
2003 def _post_setstate(self):
2004 pass
2006 @cache_readonly
2007 def _block(self) -> Block:
2008 return self.blocks[0]
2010 @property
2011 def _blknos(self):
2012 """compat with BlockManager"""
2013 return None
2015 @property
2016 def _blklocs(self):
2017 """compat with BlockManager"""
2018 return None
2020 def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockManager:
2021 # similar to get_slice, but not restricted to slice indexer
2022 blk = self._block
2023 array = blk._slice(indexer)
2024 if array.ndim > 1:
2025 # This will be caught by Series._get_values
2026 raise ValueError("dimension-expanding indexing not allowed")
2028 bp = BlockPlacement(slice(0, len(array)))
2029 block = type(blk)(array, placement=bp, ndim=1)
2031 new_idx = self.index[indexer]
2032 # TODO(CoW) in theory only need to track reference if new_array is a view
2033 ref = weakref.ref(blk)
2034 return type(self)(block, new_idx, [ref], parent=self)
2036 def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
2037 # Assertion disabled for performance
2038 # assert isinstance(slobj, slice), type(slobj)
2039 if axis >= self.ndim:
2040 raise IndexError("Requested axis not found in manager")
2042 blk = self._block
2043 array = blk._slice(slobj)
2044 bp = BlockPlacement(slice(0, len(array)))
2045 block = type(blk)(array, placement=bp, ndim=1)
2046 new_index = self.index._getitem_slice(slobj)
2047 # TODO this method is only used in groupby SeriesSplitter at the moment,
2048 # so passing refs / parent is not yet covered by the tests
2049 return type(self)(block, new_index, [weakref.ref(blk)], parent=self)
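# Usage sketch (hedged; goes through the internal manager directly): slicing
# returns a new manager whose block views the same underlying array.
#
#     >>> import pandas as pd
#     >>> s = pd.Series([10, 20, 30, 40])
#     >>> sub = s._mgr.get_slice(slice(1, 3))
#     >>> sub.shape, sub.index.tolist()
#     ((2,), [1, 2])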
2051 @property
2052 def index(self) -> Index:
2053 return self.axes[0]
2055 @property
2056 def dtype(self) -> DtypeObj:
2057 return self._block.dtype
2059 def get_dtypes(self) -> np.ndarray:
2060 return np.array([self._block.dtype])
2062 def external_values(self):
2063 """The array that Series.values returns"""
2064 return self._block.external_values()
2066 def internal_values(self):
2067 """The array that Series._values returns"""
2068 return self._block.values
2070 def array_values(self):
2071 """The array that Series.array returns"""
2072 return self._block.array_values
2074 def get_numeric_data(self, copy: bool = False):
2075 if self._block.is_numeric:
2076 return self.copy(deep=copy)
2077 return self.make_empty()
2079 @property
2080 def _can_hold_na(self) -> bool:
2081 return self._block._can_hold_na
2083 def setitem_inplace(self, indexer, value) -> None:
2084 """
2085 Set values with indexer.
2087 For Single[Block/Array]Manager, this backs s[indexer] = value
2089 This is an inplace version of `setitem()`, mutating the manager/values
2090 in place, not returning a new Manager (and Block), and thus never changing
2091 the dtype.
2092 """
2093 if _using_copy_on_write() and not self._has_no_reference(0):
2094 self.blocks = (self._block.copy(),)
2095 self.refs = None
2096 self.parent = None
2097 self._cache.clear()
2099 super().setitem_inplace(indexer, value)
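# Copy-on-write in action, as a sketch (option name taken from
# _using_copy_on_write below; this shows the intended CoW contract, and the
# option is experimental, so behavior may vary by version):
#
#     >>> import pandas as pd
#     >>> pd.set_option("mode.copy_on_write", True)
#     >>> df = pd.DataFrame({"a": [1, 2, 3]})
#     >>> s = df["a"]                 # shares the block with df
#     >>> s.iloc[0] = 99              # block copied here before mutation
#     >>> df.loc[0, "a"]              # parent unchanged
#     1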
2101 def idelete(self, indexer) -> SingleBlockManager:
2102 """
2103 Delete single location from SingleBlockManager.
2105 Ensures that self.blocks doesn't become empty.
2106 """
2107 nb = self._block.delete(indexer)
2108 self.blocks = (nb,)
2109 self.axes[0] = self.axes[0].delete(indexer)
2110 self._cache.clear()
2111 # clear reference since delete always results in a new array
2112 self.refs = None
2113 self.parent = None
2114 return self
2116 def fast_xs(self, loc):
2117 """
2118 fast path for getting a cross-section; not supported for a
2119 SingleBlockManager, use series._values[loc] instead
2120 """
2121 raise NotImplementedError("Use series._values[loc] instead")
2123 def set_values(self, values: ArrayLike):
2124 """
2125 Set the values of the single block in place.
2127 Use at your own risk! This does not check if the passed values are
2128 valid for the current Block/SingleBlockManager (length, dtype, etc).
2129 """
2130 # TODO(CoW) do we need to handle copy on write here? Currently this is
2131 # only used for FrameColumnApply.series_generator (what if apply is
2132 # mutating inplace?)
2133 self.blocks[0].values = values
2134 self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values)))
2136 def _equal_values(self: T, other: T) -> bool:
2137 """
2138 Used in .equals defined in base class. Only check the column values
2139 assuming shape and indexes have already been checked.
2140 """
2141 # For SingleBlockManager (i.e. Series)
2142 if other.ndim != 1:
2143 return False
2144 left = self.blocks[0].values
2145 right = other.blocks[0].values
2146 return array_equals(left, right)
2149# --------------------------------------------------------------------
2150# Constructor Helpers
2153def create_block_manager_from_blocks(
2154 blocks: list[Block],
2155 axes: list[Index],
2156 consolidate: bool = True,
2157 verify_integrity: bool = True,
2158) -> BlockManager:
2159 # If verify_integrity=False, then caller is responsible for checking
2160 # all(x.shape[-1] == len(axes[1]) for x in blocks)
2161 # sum(x.shape[0] for x in blocks) == len(axes[0])
2162 # set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0])))
2163 # all(blk.ndim == 2 for blk in blocks)
2164 # This allows us to safely pass verify_integrity=False
2166 try:
2167 mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity)
2169 except ValueError as err:
2170 arrays = [blk.values for blk in blocks]
2171 tot_items = sum(arr.shape[0] for arr in arrays)
2172 raise construction_error(tot_items, arrays[0].shape[1:], axes, err)
2174 if consolidate:
2175 mgr._consolidate_inplace()
2176 return mgr
2179def create_block_manager_from_column_arrays(
2180 arrays: list[ArrayLike],
2181 axes: list[Index],
2182 consolidate: bool = True,
2183) -> BlockManager:
2184 # Assertions disabled for performance (caller is responsible for verifying)
2185 # assert isinstance(axes, list)
2186 # assert all(isinstance(x, Index) for x in axes)
2187 # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
2188 # assert all(type(x) is not PandasArray for x in arrays)
2189 # assert all(x.ndim == 1 for x in arrays)
2190 # assert all(len(x) == len(axes[1]) for x in arrays)
2191 # assert len(arrays) == len(axes[0])
2192 # These last three are sufficient to allow us to safely pass
2193 # verify_integrity=False below.
2195 try:
2196 blocks = _form_blocks(arrays, consolidate)
2197 mgr = BlockManager(blocks, axes, verify_integrity=False)
2198 except ValueError as e:
2199 raise construction_error(len(arrays), arrays[0].shape, axes, e)
2200 if consolidate:
2201 mgr._consolidate_inplace()
2202 return mgr
2205def construction_error(
2206 tot_items: int,
2207 block_shape: Shape,
2208 axes: list[Index],
2209 e: ValueError | None = None,
2210):
2211 """raise a helpful message about our construction"""
2212 passed = tuple(map(int, [tot_items] + list(block_shape)))
2213 # Correcting the user-facing error message during DataFrame construction
2214 if len(passed) <= 2:
2215 passed = passed[::-1]
2217 implied = tuple(len(ax) for ax in axes)
2219 # Correcting the user-facing error message during DataFrame construction
2219 if len(implied) <= 2:
2220 implied = implied[::-1]
2222 # We return the exception object instead of raising it so that we
2223 # can raise it in the caller; mypy plays better with that
2224 if passed == implied and e is not None:
2225 return e
2226 if block_shape[0] == 0:
2227 return ValueError("Empty data passed with indices specified.")
2228 return ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
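# The "passed vs implied" message is user-visible through the DataFrame
# constructor; a sketch of the mismatch case:
#
#     >>> import numpy as np, pandas as pd
#     >>> pd.DataFrame(np.ones((3, 2)), columns=["a", "b", "c"])
#     Traceback (most recent call last):
#     ...
#     ValueError: Shape of passed values is (3, 2), indices imply (3, 3)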
2231# -----------------------------------------------------------------------
2234def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
2235 # compat for numpy<1.21, in which comparing a np.dtype with an ExtensionDtype
2236 # raises instead of returning False. Once earlier numpy versions are dropped,
2237 # this can be simplified to `return tup[1].dtype`
2238 dtype = tup[1].dtype
2240 if is_1d_only_ea_dtype(dtype):
2241 # We know these won't be consolidated, so don't need to group these.
2242 # This avoids expensive comparisons of CategoricalDtype objects
2243 sep = id(dtype)
2244 else:
2245 sep = 0
2247 return sep, isinstance(dtype, np.dtype), dtype
2250def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
2251 tuples = list(enumerate(arrays))
2253 if not consolidate:
2254 nbs = _tuples_to_blocks_no_consolidate(tuples)
2255 return nbs
2257 # group by dtype
2258 grouper = itertools.groupby(tuples, _grouping_func)
2260 nbs = []
2261 for (_, _, dtype), tup_block in grouper:
2262 block_type = get_block_type(dtype)
2264 if isinstance(dtype, np.dtype):
2265 is_dtlike = dtype.kind in ["m", "M"]
2267 if issubclass(dtype.type, (str, bytes)):
2268 dtype = np.dtype(object)
2270 values, placement = _stack_arrays(list(tup_block), dtype)
2271 if is_dtlike:
2272 values = ensure_wrapped_if_datetimelike(values)
2273 blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
2274 nbs.append(blk)
2276 elif is_1d_only_ea_dtype(dtype):
2277 dtype_blocks = [
2278 block_type(x[1], placement=BlockPlacement(x[0]), ndim=2)
2279 for x in tup_block
2280 ]
2281 nbs.extend(dtype_blocks)
2283 else:
2284 dtype_blocks = [
2285 block_type(
2286 ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2
2287 )
2288 for x in tup_block
2289 ]
2290 nbs.extend(dtype_blocks)
2291 return nbs
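# Net effect of the grouping above, sketched through the public constructor
# (._mgr is internal API): same-dtype numpy columns are stacked into one
# block, while other dtypes get their own blocks.
#
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [1.5, 2.5]})
#     >>> df._mgr.nblocks             # one int64 block (a, b) + one float64 (c)
#     2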
2294def _tuples_to_blocks_no_consolidate(tuples) -> list[Block]:
2295 # tuples produced within _form_blocks are of the form (placement, array)
2296 return [
2297 new_block_2d(ensure_block_shape(x[1], ndim=2), placement=BlockPlacement(x[0]))
2298 for x in tuples
2299 ]
2302def _stack_arrays(tuples, dtype: np.dtype):
2304 placement, arrays = zip(*tuples)
2306 first = arrays[0]
2307 shape = (len(arrays),) + first.shape
2309 stacked = np.empty(shape, dtype=dtype)
2310 for i, arr in enumerate(arrays):
2311 stacked[i] = arr
2313 return stacked, placement
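# _stack_arrays in miniature (standalone numpy sketch): preallocate the 2D
# result and copy each 1D column array into its row.
#
#     >>> import numpy as np
#     >>> arrs = [np.array([1, 2, 3]), np.array([4, 5, 6])]
#     >>> stacked = np.empty((len(arrs),) + arrs[0].shape, dtype=np.int64)
#     >>> for i, arr in enumerate(arrs):
#     ...     stacked[i] = arr
#     >>> stacked
#     array([[1, 2, 3],
#            [4, 5, 6]])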
2316def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]:
2317 """
2318 Merge blocks having the same dtype, excluding non-consolidatable blocks.
2319 """
2320 # sort by _can_consolidate, dtype
2321 gkey = lambda x: x._consolidate_key
2322 grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)
2324 new_blocks: list[Block] = []
2325 for (_can_consolidate, dtype), group_blocks in grouper:
2326 merged_blocks, _ = _merge_blocks(
2327 list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
2328 )
2329 new_blocks = extend_blocks(merged_blocks, new_blocks)
2330 return tuple(new_blocks)
2333def _consolidate_with_refs(
2334 blocks: tuple[Block, ...], refs
2335) -> tuple[tuple[Block, ...], list[weakref.ref | None]]:
2336 """
2337 Merge blocks having the same dtype, excluding non-consolidatable blocks,
2338 while carrying their refs along.
2339 """
2340 gkey = lambda x: x[0]._consolidate_key
2341 grouper = itertools.groupby(sorted(zip(blocks, refs), key=gkey), gkey)
2343 new_blocks: list[Block] = []
2344 new_refs: list[weakref.ref | None] = []
2345 for (_can_consolidate, dtype), group_blocks_refs in grouper:
2346 group_blocks, group_refs = list(zip(*list(group_blocks_refs)))
2347 merged_blocks, consolidated = _merge_blocks(
2348 list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
2349 )
2350 new_blocks = extend_blocks(merged_blocks, new_blocks)
2351 if consolidated:
2352 new_refs.extend([None])
2353 else:
2354 new_refs.extend(group_refs)
2355 return tuple(new_blocks), new_refs
2358def _merge_blocks(
2359 blocks: list[Block], dtype: DtypeObj, can_consolidate: bool
2360) -> tuple[list[Block], bool]:
2362 if len(blocks) == 1:
2363 return blocks, False
2365 if can_consolidate:
2367 # TODO: optimization potential in case all mgrs contain slices and
2368 # combination of those slices is a slice, too.
2369 new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
2371 new_values: ArrayLike
2373 if isinstance(blocks[0].dtype, np.dtype):
2374 # error: List comprehension has incompatible type List[Union[ndarray,
2375 # ExtensionArray]]; expected List[Union[complex, generic,
2376 # Sequence[Union[int, float, complex, str, bytes, generic]],
2377 # Sequence[Sequence[Any]], SupportsArray]]
2378 new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc]
2379 else:
2380 bvals = [blk.values for blk in blocks]
2381 bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
2382 new_values = bvals2[0]._concat_same_type(bvals2, axis=0)
2384 argsort = np.argsort(new_mgr_locs)
2385 new_values = new_values[argsort]
2386 new_mgr_locs = new_mgr_locs[argsort]
2388 bp = BlockPlacement(new_mgr_locs)
2389 return [new_block_2d(new_values, placement=bp)], True
2391 # can't consolidate --> no merge
2392 return blocks, False
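# The argsort step above, isolated (standalone numpy sketch): after vstacking,
# rows are reordered so that row i holds the values for column location i.
#
#     >>> import numpy as np
#     >>> new_mgr_locs = np.array([2, 0, 1])
#     >>> new_values = np.array([[20.0], [0.0], [10.0]])
#     >>> order = np.argsort(new_mgr_locs)
#     >>> new_mgr_locs[order]
#     array([0, 1, 2])
#     >>> new_values[order].ravel()
#     array([ 0., 10., 20.])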
2395def _fast_count_smallints(arr: npt.NDArray[np.intp]):
2396 """Faster version of set(arr) for sequences of small numbers."""
2397 counts = np.bincount(arr)
2398 nz = counts.nonzero()[0]
2399 # Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
2400 # in one benchmark by a factor of 11
2401 return zip(nz, counts[nz])
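# The bincount trick, spelled out (standalone sketch):
#
#     >>> import numpy as np
#     >>> arr = np.array([0, 2, 2, 5])
#     >>> counts = np.bincount(arr)   # array([1, 0, 2, 0, 0, 1])
#     >>> nz = counts.nonzero()[0]
#     >>> list(zip(nz, counts[nz]))
#     [(0, 1), (2, 2), (5, 1)]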
2404def _preprocess_slice_or_indexer(
2405 slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
2406):
2407 if isinstance(slice_or_indexer, slice):
2408 return (
2409 "slice",
2410 slice_or_indexer,
2411 libinternals.slice_len(slice_or_indexer, length),
2412 )
2413 else:
2414 if (
2415 not isinstance(slice_or_indexer, np.ndarray)
2416 or slice_or_indexer.dtype.kind != "i"
2417 ):
2418 dtype = getattr(slice_or_indexer, "dtype", None)
2419 raise TypeError(type(slice_or_indexer), dtype)
2421 indexer = ensure_platform_int(slice_or_indexer)
2422 if not allow_fill:
2423 indexer = maybe_convert_indices(indexer, length)
2424 return "fancy", indexer, len(indexer)
2427def _using_copy_on_write():
2428 return get_option("mode.copy_on_write")