from __future__ import annotations

import itertools
from typing import (
    Any,
    Callable,
    Hashable,
    Literal,
    Sequence,
    TypeVar,
    cast,
)
import warnings
import weakref

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    algos as libalgos,
    internals as libinternals,
    lib,
)
from pandas._libs.internals import BlockPlacement
from pandas._typing import (
    ArrayLike,
    DtypeObj,
    Shape,
    npt,
    type_t,
)
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_dtype_equal,
    is_list_like,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.missing import (
    array_equals,
    isna,
)

import pandas.core.algorithms as algos
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.sparse import SparseDtype
import pandas.core.common as com
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
    Float64Index,
    Index,
    ensure_index,
)
from pandas.core.internals.base import (
    DataManager,
    SingleDataManager,
    interleaved_dtype,
)
from pandas.core.internals.blocks import (
    Block,
    DatetimeTZBlock,
    NumpyBlock,
    ensure_block_shape,
    extend_blocks,
    get_block_type,
    new_block,
    new_block_2d,
)
from pandas.core.internals.ops import (
    blockwise_all,
    operate_blockwise,
)

T = TypeVar("T", bound="BaseBlockManager")


class BaseBlockManager(DataManager):
    """
    Core internal data structure to implement DataFrame, Series, etc.

    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
    lightweight blocked set of labeled data to be manipulated by the DataFrame
    public API class

    Attributes
    ----------
    shape
    ndim
    axes
    values
    items

    Methods
    -------
    set_axis(axis, new_labels)
    copy(deep=True)

    get_dtypes

    apply(func, axes, block_filter_fn)

    get_bool_data
    get_numeric_data

    get_slice(slice_like, axis)
    get(label)
    iget(loc)

    take(indexer, axis)
    reindex_axis(new_labels, axis)
    reindex_indexer(new_labels, indexer, axis)

    delete(label)
    insert(loc, label, value)
    set(label, value)

    Parameters
    ----------
    blocks: Sequence of Block
    axes: Sequence of Index
    verify_integrity: bool, default True

    Notes
    -----
    This is *not* a public API class
    """

    __slots__ = ()

    _blknos: npt.NDArray[np.intp]
    _blklocs: npt.NDArray[np.intp]
    blocks: tuple[Block, ...]
    axes: list[Index]
    refs: list[weakref.ref | None] | None
    parent: object

    @property
    def ndim(self) -> int:
        raise NotImplementedError

    _known_consolidated: bool
    _is_consolidated: bool

    def __init__(self, blocks, axes, refs=None, verify_integrity: bool = True) -> None:
        raise NotImplementedError

    @classmethod
    def from_blocks(
        cls: type_t[T],
        blocks: list[Block],
        axes: list[Index],
        refs: list[weakref.ref | None] | None = None,
        parent: object = None,
    ) -> T:
        raise NotImplementedError

    @property
    def blknos(self) -> npt.NDArray[np.intp]:
        """
        Suppose we want to find the array corresponding to our i'th column.

        blknos[i] identifies the block from self.blocks that contains this column.

        blklocs[i] identifies the column of interest within
        self.blocks[self.blknos[i]]
        """
        if self._blknos is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blknos

    @property
    def blklocs(self) -> npt.NDArray[np.intp]:
        """
        See blknos.__doc__
        """
        if self._blklocs is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blklocs
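
    # A minimal sketch of the blknos/blklocs lookup described above
    # (illustration only; `df` is a hypothetical mixed-dtype DataFrame):
    #
    #   mgr = df._mgr                      # the BlockManager
    #   i = 1                              # column position of interest
    #   blk = mgr.blocks[mgr.blknos[i]]    # the Block holding column i
    #   col = blk.values[mgr.blklocs[i]]   # that column's row within blk.values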

    def make_empty(self: T, axes=None) -> T:
        """return an empty BlockManager with the items axis of len 0"""
        if axes is None:
            axes = [Index([])] + self.axes[1:]

        # preserve dtype if possible
        if self.ndim == 1:
            assert isinstance(self, SingleBlockManager)  # for mypy
            blk = self.blocks[0]
            arr = blk.values[:0]
            bp = BlockPlacement(slice(0, 0))
            nb = blk.make_block_same_class(arr, placement=bp)
            blocks = [nb]
        else:
            blocks = []
        return type(self).from_blocks(blocks, axes)

    def __nonzero__(self) -> bool:
        return True

    # Python3 compat
    __bool__ = __nonzero__

    def _normalize_axis(self, axis: int) -> int:
        # switch axis to follow BlockManager logic
        if self.ndim == 2:
            axis = 1 if axis == 0 else 0
        return axis

    def set_axis(self, axis: int, new_labels: Index) -> None:
        # Caller is responsible for ensuring we have an Index object.
        self._validate_set_axis(axis, new_labels)
        self.axes[axis] = new_labels

    @property
    def is_single_block(self) -> bool:
        # Assumes we are 2D; overridden by SingleBlockManager
        return len(self.blocks) == 1

    @property
    def items(self) -> Index:
        return self.axes[0]

    def _has_no_reference(self, i: int) -> bool:
        """
        Check for column `i` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the column has no references.
        """
        blkno = self.blknos[i]
        return self._has_no_reference_block(blkno)

    def _has_no_reference_block(self, blkno: int) -> bool:
        """
        Check for block `i` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the block has no references.
        """
        # TODO(CoW) include `or self.refs[blkno]() is None` ?
        return (
            self.refs is None or self.refs[blkno] is None
        ) and weakref.getweakrefcount(self.blocks[blkno]) == 0

    def _clear_reference_block(self, blkno: int) -> None:
        """
        Clear any reference for column `i`.
        """
        if self.refs is not None:
            self.refs[blkno] = None
            if com.all_none(*self.refs):
                self.parent = None

    def get_dtypes(self):
        dtypes = np.array([blk.dtype for blk in self.blocks])
        return dtypes.take(self.blknos)

    @property
    def arrays(self) -> list[ArrayLike]:
        """
        Quick access to the backing arrays of the Blocks.

        Only for compatibility with ArrayManager for testing convenience.
        Not to be used in actual code, and return value is not the same as the
        ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).

        Warning! The returned arrays don't handle Copy-on-Write, so this should
        be used with caution (only in read-mode).
        """
        return [blk.values for blk in self.blocks]

    def __repr__(self) -> str:
        output = type(self).__name__
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += f"\nItems: {ax}"
            else:
                output += f"\nAxis {i}: {ax}"

        for block in self.blocks:
            output += f"\n{block}"
        return output

    def apply(
        self: T,
        f,
        align_keys: list[str] | None = None,
        ignore_failures: bool = False,
        **kwargs,
    ) -> T:
        """
        Iterate over the blocks, collect and create a new BlockManager.

        Parameters
        ----------
        f : str or callable
            Name of the Block method to apply.
        align_keys: List[str] or None, default None
        ignore_failures: bool, default False
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        BlockManager
        """
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_blocks: list[Block] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        for b in self.blocks:

            if aligned_args:

                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        #  obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
                        else:
                            kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
                    else:
                        # otherwise we have an ndarray
                        kwargs[k] = obj[b.mgr_locs.indexer]

            try:
                if callable(f):
                    applied = b.apply(f, **kwargs)
                else:
                    applied = getattr(b, f)(**kwargs)
            except (TypeError, NotImplementedError):
                if not ignore_failures:
                    raise
                continue
            result_blocks = extend_blocks(applied, result_blocks)

        if ignore_failures:
            return self._combine(result_blocks)

        out = type(self).from_blocks(result_blocks, self.axes)
        return out
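
    # Sketch of the two dispatch forms accepted by ``apply`` (illustration
    # only): a string names a Block method, a callable is applied to each
    # block's values.
    #
    #   new_mgr = mgr.apply("astype", dtype="float64", copy=True)
    #   new_mgr = mgr.apply(lambda values: values.round(2))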

    def where(self: T, other, cond, align: bool) -> T:
        if align:
            align_keys = ["other", "cond"]
        else:
            align_keys = ["cond"]
            other = extract_array(other, extract_numpy=True)

        return self.apply(
            "where",
            align_keys=align_keys,
            other=other,
            cond=cond,
        )

    def setitem(self: T, indexer, value) -> T:
        """
        Set values with indexer.

        For SingleBlockManager, this backs s[indexer] = value
        """
        if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
            raise ValueError(f"Cannot set values with ndim > {self.ndim}")

        if _using_copy_on_write() and not self._has_no_reference(0):
            # if being referenced -> perform Copy-on-Write and clear the reference
            # this method is only called if there is a single block -> hardcoded 0
            self = self.copy()

        return self.apply("setitem", indexer=indexer, value=value)

    def putmask(self, mask, new, align: bool = True):
        if (
            _using_copy_on_write()
            and self.refs is not None
            and not all(ref is None for ref in self.refs)
        ):
            # some reference -> copy full dataframe
            # TODO(CoW) this could be optimized to only copy the blocks that would
            #  get modified
            self = self.copy()

        if align:
            align_keys = ["new", "mask"]
        else:
            align_keys = ["mask"]
            new = extract_array(new, extract_numpy=True)

        return self.apply(
            "putmask",
            align_keys=align_keys,
            mask=mask,
            new=new,
        )

    def diff(self: T, n: int, axis: int) -> T:
        axis = self._normalize_axis(axis)
        return self.apply("diff", n=n, axis=axis)

    def interpolate(self: T, **kwargs) -> T:
        return self.apply("interpolate", **kwargs)

    def shift(self: T, periods: int, axis: int, fill_value) -> T:
        axis = self._normalize_axis(axis)
        if fill_value is lib.no_default:
            fill_value = None

        return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)

    def fillna(self: T, value, limit, inplace: bool, downcast) -> T:

        if limit is not None:
            # Do this validation even if we go through one of the no-op paths
            limit = libalgos.validate_limit(None, limit=limit)
        if inplace:
            # TODO(CoW) can be optimized to only copy those blocks that have refs
            if _using_copy_on_write() and any(
                not self._has_no_reference_block(i) for i in range(len(self.blocks))
            ):
                self = self.copy()

        return self.apply(
            "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
        )

    def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
        return self.apply("astype", dtype=dtype, copy=copy, errors=errors)

    def convert(
        self: T,
        copy: bool = True,
        datetime: bool = True,
        numeric: bool = True,
        timedelta: bool = True,
    ) -> T:
        return self.apply(
            "convert",
            copy=copy,
            datetime=datetime,
            numeric=numeric,
            timedelta=timedelta,
        )

    def replace(self: T, to_replace, value, inplace: bool) -> T:
        inplace = validate_bool_kwarg(inplace, "inplace")
        # NDFrame.replace ensures the not-is_list_likes here
        assert not is_list_like(to_replace)
        assert not is_list_like(value)
        return self.apply(
            "replace", to_replace=to_replace, value=value, inplace=inplace
        )

    def replace_regex(self, **kwargs):
        return self.apply("_replace_regex", **kwargs)

    def replace_list(
        self: T,
        src_list: list[Any],
        dest_list: list[Any],
        inplace: bool = False,
        regex: bool = False,
    ) -> T:
        """do a list replace"""
        inplace = validate_bool_kwarg(inplace, "inplace")

        bm = self.apply(
            "replace_list",
            src_list=src_list,
            dest_list=dest_list,
            inplace=inplace,
            regex=regex,
        )
        bm._consolidate_inplace()
        return bm

    def to_native_types(self: T, **kwargs) -> T:
        """
        Convert values to native types (strings / python objects) that are used
        in formatting (repr / csv).
        """
        return self.apply("to_native_types", **kwargs)

    @property
    def is_numeric_mixed_type(self) -> bool:
        return all(block.is_numeric for block in self.blocks)

    @property
    def any_extension_types(self) -> bool:
        """Whether any of the blocks in this manager are extension blocks"""
        return any(block.is_extension for block in self.blocks)

    @property
    def is_view(self) -> bool:
        """return a boolean if we are a single block and are a view"""
        if len(self.blocks) == 1:
            return self.blocks[0].is_view

        # It is technically possible to figure out which blocks are views
        # e.g. [ b.values.base is not None for b in self.blocks ]
        # but then we have the case of possibly some blocks being a view
        # and some blocks not. setting in theory is possible on the non-view
        # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
        # complicated

        return False

    def _get_data_subset(self: T, predicate: Callable) -> T:
        blocks = [blk for blk in self.blocks if predicate(blk.values)]
        return self._combine(blocks, copy=False)

    def get_bool_data(self: T, copy: bool = False) -> T:
        """
        Select blocks that are bool-dtype and columns from object-dtype blocks
        that are all-bool.

        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """

        new_blocks = []

        for blk in self.blocks:
            if blk.dtype == bool:
                new_blocks.append(blk)

            elif blk.is_object:
                nbs = blk._split()
                for nb in nbs:
                    if nb.is_bool:
                        new_blocks.append(nb)

        return self._combine(new_blocks, copy)

    def get_numeric_data(self: T, copy: bool = False) -> T:
        """
        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """
        numeric_blocks = [blk for blk in self.blocks if blk.is_numeric]
        if len(numeric_blocks) == len(self.blocks):
            # Avoid somewhat expensive _combine
            if copy:
                return self.copy(deep=True)
            return self
        return self._combine(numeric_blocks, copy)

    def _combine(
        self: T, blocks: list[Block], copy: bool = True, index: Index | None = None
    ) -> T:
        """return a new manager with the blocks"""
        if len(blocks) == 0:
            if self.ndim == 2:
                # retain our own Index dtype
                if index is not None:
                    axes = [self.items[:0], index]
                else:
                    axes = [self.items[:0]] + self.axes[1:]
                return self.make_empty(axes)
            return self.make_empty()

        # FIXME: optimization potential
        indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
        inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])

        new_blocks: list[Block] = []
        # TODO(CoW) we could optimize here if we know that the passed blocks
        # are fully "owned" (eg created from an operation, not coming from
        # an existing manager)
        new_refs: list[weakref.ref | None] | None = None if copy else []
        for b in blocks:
            nb = b.copy(deep=copy)
            nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer])
            new_blocks.append(nb)
            if not copy:
                # None has no attribute "append"
                new_refs.append(weakref.ref(b))  # type: ignore[union-attr]

        axes = list(self.axes)
        if index is not None:
            axes[-1] = index
        axes[0] = self.items.take(indexer)

        return type(self).from_blocks(
            new_blocks, axes, new_refs, parent=None if copy else self
        )

    @property
    def nblocks(self) -> int:
        return len(self.blocks)

    def copy(self: T, deep=True) -> T:
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : bool, string or None, default True
            If False or None, return a shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        BlockManager
        """
        if deep is None:
            if _using_copy_on_write():
                # use shallow copy
                deep = False
            else:
                # preserve deep copy for BlockManager with copy=None
                deep = True

        # this preserves the notion of view copying of axes
        if deep:
            # hit in e.g. tests.io.json.test_pandas

            def copy_func(ax):
                return ax.copy(deep=True) if deep == "all" else ax.view()

            new_axes = [copy_func(ax) for ax in self.axes]
        else:
            new_axes = list(self.axes)

        res = self.apply("copy", deep=deep)
        new_refs: list[weakref.ref | None] | None
        if deep:
            new_refs = None
            parent = None
        else:
            new_refs = [weakref.ref(blk) for blk in self.blocks]
            parent = self

        res.axes = new_axes
        res.refs = new_refs
        res.parent = parent

        if self.ndim > 1:
            # Avoid needing to re-compute these
            blknos = self._blknos
            if blknos is not None:
                res._blknos = blknos.copy()
                res._blklocs = self._blklocs.copy()

        if deep:
            res._consolidate_inplace()
        return res
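
    # Sketch of the three copy modes (illustration only):
    #
    #   mgr.copy()            # deep: block data copied, axes viewed
    #   mgr.copy(deep="all")  # deep copy of data *and* the index objects
    #   mgr.copy(deep=False)  # shallow: same backing arrays, with weakrefs
    #                         #  recorded so Copy-on-Write can trigger later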

    def consolidate(self: T) -> T:
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        bm = type(self)(self.blocks, self.axes, self.refs, verify_integrity=False)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm

    def reindex_indexer(
        self: T,
        new_axis: Index,
        indexer: npt.NDArray[np.intp] | None,
        axis: int,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> T:
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray[intp] or None
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool or None, default True
            If None, regard as False to get shallow copy.
        only_slice : bool, default False
            Whether to take views, not copies, along columns.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.
719 """
720 if copy is None:
721 if _using_copy_on_write():
722 # use shallow copy
723 copy = False
724 else:
725 # preserve deep copy for BlockManager with copy=None
726 copy = True
728 if indexer is None:
729 if new_axis is self.axes[axis] and not copy:
730 return self
732 result = self.copy(deep=copy)
733 result.axes = list(self.axes)
734 result.axes[axis] = new_axis
735 return result
737 # some axes don't allow reindexing with dups
738 if not allow_dups:
739 self.axes[axis]._validate_can_reindex(indexer)
741 if axis >= self.ndim:
742 raise IndexError("Requested axis not found in manager")
744 if axis == 0:
745 new_blocks, new_refs = self._slice_take_blocks_ax0(
746 indexer,
747 fill_value=fill_value,
748 only_slice=only_slice,
749 use_na_proxy=use_na_proxy,
750 )
751 parent = None if com.all_none(*new_refs) else self
752 else:
753 new_blocks = [
754 blk.take_nd(
755 indexer,
756 axis=1,
757 fill_value=(
758 fill_value if fill_value is not None else blk.fill_value
759 ),
760 )
761 for blk in self.blocks
762 ]
763 new_refs = None
764 parent = None
766 new_axes = list(self.axes)
767 new_axes[axis] = new_axis
769 new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs, parent=parent)
770 if axis == 1:
771 # We can avoid the need to rebuild these
772 new_mgr._blknos = self.blknos.copy()
773 new_mgr._blklocs = self.blklocs.copy()
774 return new_mgr
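
    # Sketch: reindexing along the items axis (axis=0 in manager coordinates)
    # with a positional indexer, where -1 introduces an all-NA column
    # (illustration only; `mgr` is a hypothetical 2D manager):
    #
    #   new_cols = Index(["a", "b", "new"])
    #   mgr2 = mgr.reindex_indexer(new_cols, np.array([0, 1, -1]), axis=0)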

    def _slice_take_blocks_ax0(
        self,
        slice_or_indexer: slice | np.ndarray,
        fill_value=lib.no_default,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> tuple[list[Block], list[weakref.ref | None]]:
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : slice or np.ndarray[int64]
        fill_value : scalar, default lib.no_default
        only_slice : bool, default False
            If True, we always return views on existing arrays, never copies.
            This is used when called from ops.blockwise.operate_blockwise.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.

        Returns
        -------
        new_blocks : list of Block
        """
        allow_fill = fill_value is not lib.no_default

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill
        )

        if self.is_single_block:
            blk = self.blocks[0]

            if sl_type == "slice":
                # GH#32959 EABlock would fail since we can't make 0-width
                # TODO(EA2D): special casing unnecessary with 2D EAs
                if sllen == 0:
                    return [], []
                bp = BlockPlacement(slice(0, sllen))
                return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)], [
                    weakref.ref(blk)
                ]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_value is None:
                    fill_value = blk.fill_value

                if not allow_fill and only_slice:
                    # GH#33597 slice instead of take, so we get
                    # views instead of copies
                    blocks = [
                        blk.getitem_block_columns(
                            slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
                        )
                        for i, ml in enumerate(slobj)
                    ]
                    # We have
                    #  all(np.shares_memory(nb.values, blk.values) for nb in blocks)
                    return blocks, [weakref.ref(blk)] * len(blocks)
                else:
                    bp = BlockPlacement(slice(0, sllen))
                    return [
                        blk.take_nd(
                            slobj,
                            axis=0,
                            new_mgr_locs=bp,
                            fill_value=fill_value,
                        )
                    ], [None]

        if sl_type == "slice":
            blknos = self.blknos[slobj]
            blklocs = self.blklocs[slobj]
        else:
            blknos = algos.take_nd(
                self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
            )
            blklocs = algos.take_nd(
                self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
            )

        # When filling blknos, make sure blknos is updated before appending to
        #  blocks list, that way new blkno is exactly len(blocks).
        blocks = []
        refs: list[weakref.ref | None] = []
        group = not only_slice
        for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
            if blkno == -1:
                # If we've got here, fill_value was not lib.no_default

                blocks.append(
                    self._make_na_block(
                        placement=mgr_locs,
                        fill_value=fill_value,
                        use_na_proxy=use_na_proxy,
                    )
                )
                refs.append(None)
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate and not blk._validate_ndim:
                    # i.e. we don't go through here for DatetimeTZBlock
                    # A non-consolidatable block is easy, because there's
                    # only one item and each mgr loc is a copy of that single
                    # item.
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=False)
                        newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1))
                        blocks.append(newblk)
                        refs.append(weakref.ref(blk))

                else:
                    # GH#32779 to avoid the performance penalty of copying,
                    # we may try to only slice
                    taker = blklocs[mgr_locs.indexer]
                    max_len = max(len(mgr_locs), taker.max() + 1)
                    if only_slice or _using_copy_on_write():
                        taker = lib.maybe_indices_to_slice(taker, max_len)

                    if isinstance(taker, slice):
                        nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                        refs.append(weakref.ref(blk))
                    elif only_slice:
                        # GH#33597 slice instead of take, so we get
                        # views instead of copies
                        for i, ml in zip(taker, mgr_locs):
                            slc = slice(i, i + 1)
                            bp = BlockPlacement(ml)
                            nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
                            # We have np.shares_memory(nb.values, blk.values)
                            blocks.append(nb)
                            refs.append(weakref.ref(blk))
                    else:
                        nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                        refs.append(None)

        return blocks, refs

    def _make_na_block(
        self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
    ) -> Block:
        # Note: we only get here with self.ndim == 2

        if use_na_proxy:
            assert fill_value is None
            shape = (len(placement), self.shape[1])
            vals = np.empty(shape, dtype=np.void)
            nb = NumpyBlock(vals, placement, ndim=2)
            return nb

        if fill_value is None:
            fill_value = np.nan
        block_shape = list(self.shape)
        block_shape[0] = len(placement)

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        # error: Argument "dtype" to "empty" has incompatible type "Union[dtype,
        # ExtensionDtype]"; expected "Union[dtype, None, type, _SupportsDtype, str,
        # Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], _DtypeDict,
        # Tuple[Any, Any]]"
        block_values = np.empty(block_shape, dtype=dtype)  # type: ignore[arg-type]
        block_values.fill(fill_value)
        return new_block_2d(block_values, placement=placement)

    def take(
        self: T,
        indexer,
        axis: int = 1,
        verify: bool = True,
        convert_indices: bool = True,
    ) -> T:
        """
        Take items along any axis.

        indexer : np.ndarray or slice
        axis : int, default 1
        verify : bool, default True
            Check that all entries are between 0 and len(self) - 1, inclusive.
            Pass verify=False if this check has been done by the caller.
        convert_indices : bool, default True
            Whether to attempt to convert indices to positive values.

        Returns
        -------
        BlockManager
        """
        # We have 6 tests that get here with a slice
        indexer = (
            np.arange(indexer.start, indexer.stop, indexer.step, dtype=np.intp)
            if isinstance(indexer, slice)
            else np.asanyarray(indexer, dtype=np.intp)
        )

        n = self.shape[axis]
        if convert_indices:
            indexer = maybe_convert_indices(indexer, n, verify=verify)

        new_labels = self.axes[axis].take(indexer)
        return self.reindex_indexer(
            new_axis=new_labels,
            indexer=indexer,
            axis=axis,
            allow_dups=True,
            copy=None,
        )
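
    # Sketch: ``take`` resolves the new labels itself and then defers to
    # ``reindex_indexer`` (illustration only; axis=1 is the row axis in
    # manager coordinates, and -1 is converted to a positive position):
    #
    #   mgr2 = mgr.take(np.array([2, 0, -1]), axis=1)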


class BlockManager(libinternals.BlockManager, BaseBlockManager):
    """
    BaseBlockManager that holds 2D blocks.
    """

    ndim = 2

    # ----------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        blocks: Sequence[Block],
        axes: Sequence[Index],
        refs: list[weakref.ref | None] | None = None,
        parent: object = None,
        verify_integrity: bool = True,
    ) -> None:

        if verify_integrity:
            # Assertion disabled for performance
            # assert all(isinstance(x, Index) for x in axes)

            for block in blocks:
                if self.ndim != block.ndim:
                    raise AssertionError(
                        f"Number of Block dimensions ({block.ndim}) must equal "
                        f"number of axes ({self.ndim})"
                    )
                if isinstance(block, DatetimeTZBlock) and block.values.ndim == 1:
                    # TODO(2.0): remove once fastparquet no longer needs this
                    warnings.warn(
                        "In a future version, the BlockManager constructor "
                        "will assume that a DatetimeTZBlock with block.ndim==2 "
                        "has block.values.ndim == 2.",
                        DeprecationWarning,
                        stacklevel=find_stack_level(),
                    )

                    # error: Incompatible types in assignment (expression has type
                    # "Union[ExtensionArray, ndarray]", variable has type
                    # "DatetimeArray")
                    block.values = ensure_block_shape(  # type: ignore[assignment]
                        block.values, self.ndim
                    )
                    try:
                        block._cache.clear()
                    except AttributeError:
                        # _cache not initialized
                        pass

            self._verify_integrity()

    def _verify_integrity(self) -> None:
        mgr_shape = self.shape
        tot_items = sum(len(x.mgr_locs) for x in self.blocks)
        for block in self.blocks:
            if block.shape[1:] != mgr_shape[1:]:
                raise construction_error(tot_items, block.shape[1:], self.axes)
        if len(self.items) != tot_items:
            raise AssertionError(
                "Number of manager items must equal union of "
                f"block items\n# manager items: {len(self.items)}, # "
                f"tot_items: {tot_items}"
            )
        if self.refs is not None:
            if len(self.refs) != len(self.blocks):
                raise AssertionError(
                    "Number of passed refs must equal the number of blocks: "
                    f"{len(self.refs)} refs vs {len(self.blocks)} blocks."
                    "\nIf you see this error, please report a bug at "
                    "https://github.com/pandas-dev/pandas/issues"
                )

    @classmethod
    def from_blocks(
        cls,
        blocks: list[Block],
        axes: list[Index],
        refs: list[weakref.ref | None] | None = None,
        parent: object = None,
    ) -> BlockManager:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        parent = parent if _using_copy_on_write() else None
        return cls(blocks, axes, refs, parent, verify_integrity=False)

    # ----------------------------------------------------------------
    # Indexing

    def fast_xs(self, loc: int) -> SingleBlockManager:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        if len(self.blocks) == 1:
            result = self.blocks[0].iget((slice(None), loc))
            block = new_block(result, placement=slice(0, len(result)), ndim=1)
            # in the case of a single block, the new block is a view
            ref = weakref.ref(self.blocks[0])
            return SingleBlockManager(block, self.axes[0], [ref], parent=self)

        dtype = interleaved_dtype([blk.dtype for blk in self.blocks])

        n = len(self)

        # GH#46406
        immutable_ea = isinstance(dtype, SparseDtype)

        if isinstance(dtype, ExtensionDtype) and not immutable_ea:
            cls = dtype.construct_array_type()
            result = cls._empty((n,), dtype=dtype)
        else:
            # error: Argument "dtype" to "empty" has incompatible type
            # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected
            # "None"
            result = np.empty(
                n, dtype=object if immutable_ea else dtype  # type: ignore[arg-type]
            )
            result = ensure_wrapped_if_datetimelike(result)

        for blk in self.blocks:
            # Such assignment may incorrectly coerce NaT to None
            # result[blk.mgr_locs] = blk._slice((slice(None), loc))
            for i, rl in enumerate(blk.mgr_locs):
                result[rl] = blk.iget((i, loc))

        if immutable_ea:
            dtype = cast(ExtensionDtype, dtype)
            result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)

        block = new_block(result, placement=slice(0, len(result)), ndim=1)
        return SingleBlockManager(block, self.axes[0])
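
    # Sketch of what ``fast_xs`` backs at the user level (illustration only):
    #
    #   row = df.iloc[3]   # the row Series is built by fast_xs(3), which
    #                      #  interleaves all blocks into one 1D array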

    def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
        """
        Return the data as a SingleBlockManager.
        """
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])

        # shortcut for select a single-dim from a 2-dim BM
        bp = BlockPlacement(slice(0, len(values)))
        nb = type(block)(values, placement=bp, ndim=1)
        ref = weakref.ref(block) if track_ref else None
        parent = self if track_ref else None
        return SingleBlockManager(nb, self.axes[1], [ref], parent)

    def iget_values(self, i: int) -> ArrayLike:
        """
        Return the data for column i as the values (ndarray or ExtensionArray).

        Warning! The returned array is a view but doesn't handle Copy-on-Write,
        so this should be used with caution.
        """
        # TODO(CoW) making the arrays read-only might make this safer to use?
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])
        return values

    @property
    def column_arrays(self) -> list[np.ndarray]:
        """
        Used in the JSON C code to access column arrays.
        This optimizes compared to using `iget_values` by converting each
        block's values to an ndarray only once up front.

        Warning! This doesn't handle Copy-on-Write, so should be used with
        caution (current use case of consuming this in the JSON code is fine).
        """
        # This is an optimized equivalent to
        #  result = [self.iget_values(i) for i in range(len(self.items))]
        result: list[np.ndarray | None] = [None] * len(self.items)

        for blk in self.blocks:
            mgr_locs = blk._mgr_locs
            values = blk.values_for_json()
            if values.ndim == 1:
                # TODO(EA2D): special casing not needed with 2D EAs
                result[mgr_locs[0]] = values

            else:
                for i, loc in enumerate(mgr_locs):
                    result[loc] = values[i]

        # error: Incompatible return value type (got "List[None]",
        # expected "List[ndarray[Any, Any]]")
        return result  # type: ignore[return-value]

    def iset(
        self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
    ):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        """

        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        #  can prob also fix the various if tests for sparse/categorical
        if self._blklocs is None and self.ndim > 1:
            self._rebuild_blknos_and_blklocs()

        # Note: we exclude DTA/TDA here
        value_is_extension_type = is_1d_only_ea_dtype(value.dtype)
        if not value_is_extension_type:
            if value.ndim == 2:
                value = value.T
            else:
                value = ensure_block_shape(value, ndim=2)

            if value.shape[1:] != self.shape[1:]:
                raise AssertionError(
                    "Shape of new values must be compatible with manager shape"
                )

        if lib.is_integer(loc):
            # We have 6 tests where loc is _not_ an int.
            # In this case, get_blkno_placements will yield only one tuple,
            #  containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))

            # Check if we can use _iset_single fastpath
            loc = cast(int, loc)
            blkno = self.blknos[loc]
            blk = self.blocks[blkno]
            if len(blk._mgr_locs) == 1:  # TODO: fastest way to check this?
                return self._iset_single(
                    loc,
                    value,
                    inplace=inplace,
                    blkno=blkno,
                    blk=blk,
                )

            # error: Incompatible types in assignment (expression has type
            # "List[Union[int, slice, ndarray]]", variable has type "Union[int,
            # slice, ndarray]")
            loc = [loc]  # type: ignore[assignment]

        # categorical/sparse/datetimetz
        if value_is_extension_type:

            def value_getitem(placement):
                return value

        else:

            def value_getitem(placement):
                return value[placement.indexer]

        # Accessing public blknos ensures the public versions are initialized
        blknos = self.blknos[loc]
        blklocs = self.blklocs[loc].copy()

        unfit_mgr_locs = []
        unfit_val_locs = []
        removed_blknos = []
        for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True):
            blk = self.blocks[blkno_l]
            blk_locs = blklocs[val_locs.indexer]
            if inplace and blk.should_store(value):
                # Updating inplace -> check if we need to do Copy-on-Write
                if _using_copy_on_write() and not self._has_no_reference_block(blkno_l):
                    blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
                    self._clear_reference_block(blkno_l)
                else:
                    blk.set_inplace(blk_locs, value_getitem(val_locs))
            else:
                unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                unfit_val_locs.append(val_locs)

                # If all block items are unfit, schedule the block for removal.
                if len(val_locs) == len(blk.mgr_locs):
                    removed_blknos.append(blkno_l)
                else:
                    nb = blk.delete(blk_locs)
                    blocks_tup = (
                        self.blocks[:blkno_l] + (nb,) + self.blocks[blkno_l + 1 :]
                    )
                    self.blocks = blocks_tup
                    self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
                    # blk.delete gives a copy, so we can remove a possible reference
                    self._clear_reference_block(blkno_l)

        if len(removed_blknos):
            # Remove blocks & update blknos and refs accordingly
            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
            is_deleted[removed_blknos] = True

            new_blknos = np.empty(self.nblocks, dtype=np.intp)
            new_blknos.fill(-1)
            new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
            self._blknos = new_blknos[self._blknos]
            self.blocks = tuple(
                blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
            )
            if self.refs is not None:
                self.refs = [
                    ref
                    for i, ref in enumerate(self.refs)
                    if i not in set(removed_blknos)
                ]

        if unfit_val_locs:
            unfit_idxr = np.concatenate(unfit_mgr_locs)
            unfit_count = len(unfit_idxr)

            new_blocks: list[Block] = []
            if value_is_extension_type:
                # This code (ab-)uses the fact that EA blocks contain only
                # one item.
                # TODO(EA2D): special casing unnecessary with 2D EAs
                new_blocks.extend(
                    new_block_2d(
                        values=value,
                        placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
                    )
                    for mgr_loc in unfit_idxr
                )

                self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks)
                self._blklocs[unfit_idxr] = 0

            else:
                # unfit_val_locs contains BlockPlacement objects
                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])

                new_blocks.append(
                    new_block_2d(
                        values=value_getitem(unfit_val_items),
                        placement=BlockPlacement(unfit_idxr),
                    )
                )

                self._blknos[unfit_idxr] = len(self.blocks)
                self._blklocs[unfit_idxr] = np.arange(unfit_count)

            self.blocks += tuple(new_blocks)
            # TODO(CoW) is this always correct to assume that the new_blocks
            # are not referencing anything else?
            if self.refs is not None:
                self.refs = list(self.refs) + [None] * len(new_blocks)

        # Newly created block's dtype may already be present.
        self._known_consolidated = False

    def _iset_single(
        self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
    ) -> None:
        """
        Fastpath for iset when we are only setting a single position and
        the Block currently in that position is itself single-column.

        In this case we can swap out the entire Block and blklocs and blknos
        are unaffected.
        """
        # Caller is responsible for verifying value.shape

        if inplace and blk.should_store(value):
            copy = False
            if _using_copy_on_write() and not self._has_no_reference_block(blkno):
                # perform Copy-on-Write and clear the reference
                copy = True
                self._clear_reference_block(blkno)
            iloc = self.blklocs[loc]
            blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
            return

        nb = new_block_2d(value, placement=blk._mgr_locs)
        old_blocks = self.blocks
        new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
        self.blocks = new_blocks
        self._clear_reference_block(blkno)
        return

    def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value) -> None:
        """
        Set values ("setitem") into a single column (not setting the full column).

        This is a method on the BlockManager level, to avoid creating an
        intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
        """
        if _using_copy_on_write() and not self._has_no_reference(loc):
            # otherwise perform Copy-on-Write and clear the reference
            blkno = self.blknos[loc]
            blocks = list(self.blocks)
            blocks[blkno] = blocks[blkno].copy()
            self.blocks = tuple(blocks)
            self._clear_reference_block(blkno)

        # this manager is only created temporarily to mutate the values in place
        # so don't track references, otherwise the `setitem` would perform CoW again
        col_mgr = self.iget(loc, track_ref=False)
        new_mgr = col_mgr.setitem((idx,), value)
        self.iset(loc, new_mgr._block.values, inplace=True)

    def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : np.ndarray or ExtensionArray
        """
        # insert to the axis; this could possibly raise a TypeError
        new_axis = self.items.insert(loc, item)

        if value.ndim == 2:
            value = value.T
            if len(value) > 1:
                raise ValueError(
                    f"Expected a 1D array, got an array with shape {value.T.shape}"
                )
        else:
            value = ensure_block_shape(value, ndim=self.ndim)

        bp = BlockPlacement(slice(loc, loc + 1))
        block = new_block_2d(values=value, placement=bp)

        if not len(self.blocks):
            # Fastpath
            self._blklocs = np.array([0], dtype=np.intp)
            self._blknos = np.array([0], dtype=np.intp)
        else:
            self._insert_update_mgr_locs(loc)
            self._insert_update_blklocs_and_blknos(loc)

        self.axes[0] = new_axis
        self.blocks += (block,)
        # TODO(CoW) do we always "own" the passed `value`?
        if self.refs is not None:
            self.refs += [None]

        self._known_consolidated = False

        if sum(not block.is_extension for block in self.blocks) > 100:
            warnings.warn(
                "DataFrame is highly fragmented. This is usually the result "
                "of calling `frame.insert` many times, which has poor performance. "
                "Consider joining all columns at once using pd.concat(axis=1) "
                "instead. To get a de-fragmented frame, use `newframe = frame.copy()`",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
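
    # The warning above fires once repeated single-column inserts have left
    # more than 100 consolidatable blocks. A sketch of the pattern it
    # recommends (illustration only; `df` and `dict_of_columns` are
    # hypothetical):
    #
    #   # slow: one new block per insert
    #   for name, col in dict_of_columns.items():
    #       df[name] = col
    #   # faster: build once, concatenate once
    #   df = pd.concat([df, pd.DataFrame(dict_of_columns)], axis=1)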

    def _insert_update_mgr_locs(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we increment
        all of the mgr_locs of blocks above that by one.
        """
        for blkno, count in _fast_count_smallints(self.blknos[loc:]):
            # .620 this way, .326 of which is in increment_above
            blk = self.blocks[blkno]
            blk._mgr_locs = blk._mgr_locs.increment_above(loc)

    def _insert_update_blklocs_and_blknos(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we update our
        _blklocs and _blknos.
        """

        # Accessing public blklocs ensures the public versions are initialized
        if loc == self.blklocs.shape[0]:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs, 0)
            self._blknos = np.append(self._blknos, len(self.blocks))
        elif loc == 0:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs[::-1], 0)[::-1]
            self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1]
        else:
            new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
                self.blklocs, self.blknos, loc, len(self.blocks)
            )
            self._blklocs = new_blklocs
            self._blknos = new_blknos

    def idelete(self, indexer) -> BlockManager:
        """
        Delete selected locations, returning a new BlockManager.
        """
        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        taker = (~is_deleted).nonzero()[0]

        nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True)
        new_columns = self.items[~is_deleted]
        axes = [new_columns, self.axes[1]]
        # TODO this might not be needed (can a delete ever be done in chained manner?)
        parent = None if com.all_none(*new_refs) else self
        return type(self)(tuple(nbs), axes, new_refs, parent, verify_integrity=False)

    # ----------------------------------------------------------------
    # Block-wise Operation

    def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
        """
        Apply grouped reduction function blockwise, returning a new BlockManager.

        Parameters
        ----------
        func : grouped reduction function
        ignore_failures : bool, default False
            Whether to drop blocks where func raises TypeError.

        Returns
        -------
        BlockManager
        """
        result_blocks: list[Block] = []
        dropped_any = False

        for blk in self.blocks:
            if blk.is_object:
                # split on object-dtype blocks bc some columns may raise
                #  while others do not.
                for sb in blk._split():
                    try:
                        applied = sb.apply(func)
                    except (TypeError, NotImplementedError):
                        if not ignore_failures:
                            raise
                        dropped_any = True
                        continue
                    result_blocks = extend_blocks(applied, result_blocks)
            else:
                try:
                    applied = blk.apply(func)
                except (TypeError, NotImplementedError):
                    if not ignore_failures:
                        raise
                    dropped_any = True
                    continue
                result_blocks = extend_blocks(applied, result_blocks)

        if len(result_blocks) == 0:
            index = Index([None])  # placeholder
        else:
            index = Index(range(result_blocks[0].values.shape[-1]))

        if dropped_any:
            # faster to skip _combine if we haven't dropped any blocks
            return self._combine(result_blocks, copy=False, index=index)

        return type(self).from_blocks(result_blocks, [self.axes[0], index])

    def reduce(
        self: T, func: Callable, ignore_failures: bool = False
    ) -> tuple[T, np.ndarray]:
        """
        Apply reduction function blockwise, returning a single-row BlockManager.

        Parameters
        ----------
        func : reduction function
        ignore_failures : bool, default False
            Whether to drop blocks where func raises TypeError.

        Returns
        -------
        BlockManager
        np.ndarray
            Indexer of mgr_locs that are retained.
        """
        # If 2D, we assume that we're operating column-wise
        assert self.ndim == 2

        res_blocks: list[Block] = []
        for blk in self.blocks:
            nbs = blk.reduce(func, ignore_failures)
            res_blocks.extend(nbs)

        index = Index([None])  # placeholder
        if ignore_failures:
            if res_blocks:
                indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks])
                new_mgr = self._combine(res_blocks, copy=False, index=index)
            else:
                indexer = []
                new_mgr = type(self).from_blocks([], [self.items[:0], index])
        else:
            indexer = np.arange(self.shape[0])
            new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
        return new_mgr, indexer

    def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
        """
        Apply array_op blockwise with another (aligned) BlockManager.
        """
        return operate_blockwise(self, other, array_op)

    def _equal_values(self: BlockManager, other: BlockManager) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        return blockwise_all(self, other, array_equals)

    def quantile(
        self: T,
        *,
        qs: Float64Index,
        axis: int = 0,
        interpolation="linear",
    ) -> T:
        """
        Iterate over blocks applying quantile reduction.
        This routine is intended for reduction type operations and
        will do inference on the generated blocks.

        Parameters
        ----------
        axis: reduction axis, default 0
        interpolation : type of interpolation, default 'linear'
        qs : list of the quantiles to be computed

        Returns
        -------
        BlockManager
        """
        # Series dispatches to DataFrame for quantile, which allows us to
        #  simplify some of the code here and in the blocks
        assert self.ndim >= 2
        assert is_list_like(qs)  # caller is responsible for this
        assert axis == 1  # only ever called this way

        new_axes = list(self.axes)
        new_axes[1] = Float64Index(qs)

        blocks = [
            blk.quantile(axis=axis, qs=qs, interpolation=interpolation)
            for blk in self.blocks
        ]

        return type(self)(blocks, new_axes)

    # ----------------------------------------------------------------

    def unstack(self, unstacker, fill_value) -> BlockManager:
        """
        Return a BlockManager with all blocks unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : BlockManager
        """
        new_columns = unstacker.get_new_columns(self.items)
        new_index = unstacker.new_index

        allow_fill = not unstacker.mask_all
        if allow_fill:
            # calculating the full mask once and passing it to Block._unstack is
            #  faster than recalculating it in each repeated call
            new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
            needs_masking = new_mask2D.any(axis=0)
        else:
            needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool)

        new_blocks: list[Block] = []
        columns_mask: list[np.ndarray] = []

        if len(self.items) == 0:
            factor = 1
        else:
            fac = len(new_columns) / len(self.items)
            assert fac == int(fac)
            factor = int(fac)

        for blk in self.blocks:
            mgr_locs = blk.mgr_locs
            new_placement = mgr_locs.tile_for_unstack(factor)

            blocks, mask = blk._unstack(
                unstacker,
                fill_value,
                new_placement=new_placement,
                needs_masking=needs_masking,
            )

            new_blocks.extend(blocks)
            columns_mask.extend(mask)

            # Block._unstack should ensure this holds,
            assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks)
            # In turn this ensures that in the BlockManager call below
            #  we have len(new_columns) == sum(x.shape[0] for x in new_blocks)
            #  which suffices to allow us to pass verify_integrity=False

        new_columns = new_columns[columns_mask]

        bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
        return bm

    def to_dict(self, copy: bool = True):
        """
        Return a dict of str(dtype) -> BlockManager

        Parameters
        ----------
        copy : bool, default True

        Returns
        -------
        values : a dict of dtype -> BlockManager
        """

        bd: dict[str, list[Block]] = {}
        for b in self.blocks:
            bd.setdefault(str(b.dtype), []).append(b)

        # TODO(EA2D): the combine will be unnecessary with 2D EAs
        return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()}
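
    # Sketch of the grouping performed by ``to_dict`` (illustration only):
    # a manager holding int64 and object blocks yields something like
    #
    #   {"int64": <BlockManager ...>, "object": <BlockManager ...>}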

    def as_array(
        self,
        dtype: np.dtype | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the blockmanager data into a numpy array.

        Parameters
        ----------
        dtype : np.dtype or None, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        # TODO(CoW) handle case where resulting array is a view
        if len(self.blocks) == 0:
            arr = np.empty(self.shape, dtype=float)
            return arr.transpose()

        # We want to copy when na_value is provided to avoid
        # mutating the original object
        copy = copy or na_value is not lib.no_default

        if self.is_single_block:
            blk = self.blocks[0]
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                ).reshape(blk.shape)
            else:
                arr = np.asarray(blk.get_values())
                if dtype:
                    arr = arr.astype(dtype, copy=False)
        else:
            arr = self._interleave(dtype=dtype, na_value=na_value)
            # The underlying data was copied within _interleave
            copy = False

        if copy:
            arr = arr.copy()

        if na_value is not lib.no_default:
            arr[isna(arr)] = na_value

        return arr.transpose()
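
    # Sketch of what ``as_array`` backs at the user level (illustration only):
    #
    #   arr = df.to_numpy()            # interleaves mixed dtypes into one array
    #   arr = df.to_numpy(na_value=0)  # providing na_value forces a copy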

    def _interleave(
        self,
        dtype: np.dtype | None = None,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        if not dtype:
            # Incompatible types in assignment (expression has type
            # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has
            # type "Optional[dtype[Any]]")
            dtype = interleaved_dtype(  # type: ignore[assignment]
                [blk.dtype for blk in self.blocks]
            )

        # TODO: https://github.com/pandas-dev/pandas/issues/22791
        # Give EAs some input on what happens here. Sparse needs this.
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype
            dtype = cast(np.dtype, dtype)
        elif isinstance(dtype, ExtensionDtype):
            dtype = np.dtype("object")
        elif is_dtype_equal(dtype, str):
            dtype = np.dtype("object")

        result = np.empty(self.shape, dtype=dtype)

        itemmask = np.zeros(self.shape[0])

        if dtype == np.dtype("object") and na_value is lib.no_default:
            # much more performant than using to_numpy below
            for blk in self.blocks:
                rl = blk.mgr_locs
                arr = blk.get_values(dtype)
                result[rl.indexer] = arr
                itemmask[rl.indexer] = 1
            return result

        for blk in self.blocks:
            rl = blk.mgr_locs
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                )
            else:
                arr = blk.get_values(dtype)
            result[rl.indexer] = arr
            itemmask[rl.indexer] = 1

        if not itemmask.all():
            raise AssertionError("Some items were not contained in blocks")

        return result

    # ----------------------------------------------------------------
    # Consolidation

    def is_consolidated(self) -> bool:
        """
        Return True if no two consolidatable blocks share a dtype.
        """
        if not self._known_consolidated:
            self._consolidate_check()
        return self._is_consolidated

    def _consolidate_check(self) -> None:
        if len(self.blocks) == 1:
            # fastpath
            self._is_consolidated = True
            self._known_consolidated = True
            return
        dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
        self._is_consolidated = len(dtypes) == len(set(dtypes))
        self._known_consolidated = True

    def _consolidate_inplace(self) -> None:
        # In general, _consolidate_inplace should only be called via
        #  DataFrame._consolidate_inplace, otherwise we will fail to invalidate
        #  the DataFrame's _item_cache. The exception is for newly-created
        #  BlockManager objects not yet attached to a DataFrame.
        if not self.is_consolidated():
            if self.refs is None:
                self.blocks = _consolidate(self.blocks)
            else:
                self.blocks, self.refs = _consolidate_with_refs(self.blocks, self.refs)
            self._is_consolidated = True
            self._known_consolidated = True
            self._rebuild_blknos_and_blklocs()


class SingleBlockManager(BaseBlockManager, SingleDataManager):
    """manage a single block with a single axis"""
1879 @property
1880 def ndim(self) -> Literal[1]:
1881 return 1
1883 _is_consolidated = True
1884 _known_consolidated = True
1885 __slots__ = ()
1886 is_single_block = True
1888 def __init__(
1889 self,
1890 block: Block,
1891 axis: Index,
1892 refs: list[weakref.ref | None] | None = None,
1893 parent: object = None,
1894 verify_integrity: bool = False,
1895 fastpath=lib.no_default,
1896 ) -> None:
1897 # Assertions disabled for performance
1898 # assert isinstance(block, Block), type(block)
1899 # assert isinstance(axis, Index), type(axis)
1901 if fastpath is not lib.no_default:
1902 warnings.warn(
1903 "The `fastpath` keyword is deprecated and will be removed "
1904 "in a future version.",
1905 FutureWarning,
1906 stacklevel=find_stack_level(),
1907 )
1909 self.axes = [axis]
1910 self.blocks = (block,)
1911 self.refs = refs
1912 self.parent = parent if _using_copy_on_write() else None
1914 @classmethod
1915 def from_blocks(
1916 cls,
1917 blocks: list[Block],
1918 axes: list[Index],
1919 refs: list[weakref.ref | None] | None = None,
1920 parent: object = None,
1921 ) -> SingleBlockManager:
1922 """
1923 Constructor for BlockManager and SingleBlockManager with same signature.
1924 """
1925 assert len(blocks) == 1
1926 assert len(axes) == 1
1927 if refs is not None:
1928 assert len(refs) == 1
1929 return cls(blocks[0], axes[0], refs, parent, verify_integrity=False)
1931 @classmethod
1932 def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
1933 """
1934 Constructor for the case where we have an array that is not yet a Block.
1935 """
1936 block = new_block(array, placement=slice(0, len(index)), ndim=1)
1937 return cls(block, index)
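# A small usage sketch (hypothetical values): from_array wraps a bare array in
# a Block spanning the whole index and hands it to the manager constructor.
#
#     >>> import numpy as np, pandas as pd
#     >>> mgr = SingleBlockManager.from_array(np.array([1.0, 2.0]), pd.Index([0, 1]))
#     >>> mgr.dtype, mgr.shape
#     (dtype('float64'), (2,))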
1939 def to_2d_mgr(self, columns: Index) -> BlockManager:
1940 """
1941 Manager analogue of Series.to_frame
1942 """
1943 blk = self.blocks[0]
1944 arr = ensure_block_shape(blk.values, ndim=2)
1945 bp = BlockPlacement(0)
1946 new_blk = type(blk)(arr, placement=bp, ndim=2)
1947 axes = [columns, self.axes[0]]
1948 refs: list[weakref.ref | None] = [weakref.ref(blk)]
1949 parent = self if _using_copy_on_write() else None
1950 return BlockManager(
1951 [new_blk], axes=axes, refs=refs, parent=parent, verify_integrity=False
1952 )
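# Sketch of the Series.to_frame analogy (hedged; ._mgr is internal API): the
# 1D values become a (1, n) block and the axes become [columns, index].
#
#     >>> import pandas as pd
#     >>> s = pd.Series([1, 2, 3], name="x")
#     >>> s._mgr.to_2d_mgr(pd.Index(["x"])).shape  # (n_columns, n_rows)
#     (1, 3)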
1954 def _has_no_reference(self, i: int = 0) -> bool:
1955 """
1956 Check whether the single column has any references, i.e. whether it
1957 references another array or is itself being referenced; the `i` argument
1958 exists for compatibility with BlockManager and is ignored here.
1958 Returns True if the column has no references.
1959 """
1960 return (self.refs is None or self.refs[0] is None) and weakref.getweakrefcount(
1961 self.blocks[0]
1962 ) == 0
1964 def __getstate__(self):
1965 block_values = [b.values for b in self.blocks]
1966 block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
1967 axes_array = list(self.axes)
1969 extra_state = {
1970 "0.14.1": {
1971 "axes": axes_array,
1972 "blocks": [
1973 {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
1974 for b in self.blocks
1975 ],
1976 }
1977 }
1979 # First three elements of the state are to maintain forward
1980 # compatibility with 0.13.1.
1981 return axes_array, block_values, block_items, extra_state
1983 def __setstate__(self, state):
1984 def unpickle_block(values, mgr_locs, ndim: int) -> Block:
1985 # TODO(EA2D): ndim would be unnecessary with 2D EAs
1986 # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
1987 values = extract_array(values, extract_numpy=True)
1988 return new_block(values, placement=mgr_locs, ndim=ndim)
1990 if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
1991 state = state[3]["0.14.1"]
1992 self.axes = [ensure_index(ax) for ax in state["axes"]]
1993 ndim = len(self.axes)
1994 self.blocks = tuple(
1995 unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
1996 for b in state["blocks"]
1997 )
1998 else:
1999 raise NotImplementedError("pre-0.14.1 pickles are no longer supported")
2001 self._post_setstate()
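# The "0.14.1" dict above is what travels through pickle for modern objects; a
# simple round-trip (sketch) exercises __getstate__/__setstate__:
#
#     >>> import pickle, pandas as pd
#     >>> s = pd.Series([1, 2, 3])
#     >>> pickle.loads(pickle.dumps(s)).equals(s)
#     True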
2003 def _post_setstate(self):
2004 pass
2006 @cache_readonly
2007 def _block(self) -> Block:
2008 return self.blocks[0]
2010 @property
2011 def _blknos(self):
2012 """compat with BlockManager"""
2013 return None
2015 @property
2016 def _blklocs(self):
2017 """compat with BlockManager"""
2018 return None
2020 def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockManager:
2021 # similar to get_slice, but not restricted to slice indexer
2022 blk = self._block
2023 array = blk._slice(indexer)
2024 if array.ndim > 1:
2025 # This will be caught by Series._get_values
2026 raise ValueError("dimension-expanding indexing not allowed")
2028 bp = BlockPlacement(slice(0, len(array)))
2029 block = type(blk)(array, placement=bp, ndim=1)
2031 new_idx = self.index[indexer]
2032 # TODO(CoW) in theory only need to track reference if new_array is a view
2033 ref = weakref.ref(blk)
2034 return type(self)(block, new_idx, [ref], parent=self)
2036 def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
2037 # Assertion disabled for performance
2038 # assert isinstance(slobj, slice), type(slobj)
2039 if axis >= self.ndim:
2040 raise IndexError("Requested axis not found in manager")
2042 blk = self._block
2043 array = blk._slice(slobj)
2044 bp = BlockPlacement(slice(0, len(array)))
2045 block = type(blk)(array, placement=bp, ndim=1)
2046 new_index = self.index._getitem_slice(slobj)
2047 # TODO this method is only used in groupby SeriesSplitter at the moment,
2048 # so passing refs / parent is not yet covered by the tests
2049 return type(self)(block, new_index, [weakref.ref(blk)], parent=self)
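# Usage sketch (hedged; goes through the internal manager directly): slicing
# returns a new manager whose block views the same underlying array.
#
#     >>> import pandas as pd
#     >>> s = pd.Series([10, 20, 30, 40])
#     >>> sub = s._mgr.get_slice(slice(1, 3))
#     >>> sub.shape, sub.index.tolist()
#     ((2,), [1, 2])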
2051 @property
2052 def index(self) -> Index:
2053 return self.axes[0]
2055 @property
2056 def dtype(self) -> DtypeObj:
2057 return self._block.dtype
2059 def get_dtypes(self) -> np.ndarray:
2060 return np.array([self._block.dtype])
2062 def external_values(self):
2063 """The array that Series.values returns"""
2064 return self._block.external_values()
2066 def internal_values(self):
2067 """The array that Series._values returns"""
2068 return self._block.values
2070 def array_values(self):
2071 """The array that Series.array returns"""
2072 return self._block.array_values
2074 def get_numeric_data(self, copy: bool = False):
2075 if self._block.is_numeric:
2076 return self.copy(deep=copy)
2077 return self.make_empty()
2079 @property
2080 def _can_hold_na(self) -> bool:
2081 return self._block._can_hold_na
2083 def setitem_inplace(self, indexer, value) -> None:
2084 """
2085 Set values with indexer.
2087 For Single[Block/Array]Manager, this backs s[indexer] = value
2089 This is an inplace version of `setitem()`, mutating the manager/values
2090 in place, not returning a new Manager (and Block), and thus never changing
2091 the dtype.
2092 """
2093 if _using_copy_on_write() and not self._has_no_reference(0):
2094 self.blocks = (self._block.copy(),)
2095 self.refs = None
2096 self.parent = None
2097 self._cache.clear()
2099 super().setitem_inplace(indexer, value)
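# Copy-on-write in action, as a sketch (option name taken from
# _using_copy_on_write below; this shows the intended CoW contract, and the
# option is experimental, so behavior may vary by version):
#
#     >>> import pandas as pd
#     >>> pd.set_option("mode.copy_on_write", True)
#     >>> df = pd.DataFrame({"a": [1, 2, 3]})
#     >>> s = df["a"]                 # shares the block with df
#     >>> s.iloc[0] = 99              # block copied here before mutation
#     >>> df.loc[0, "a"]              # parent unchanged
#     1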
2101 def idelete(self, indexer) -> SingleBlockManager:
2102 """
2103 Delete single location from SingleBlockManager.
2105 Ensures that self.blocks doesn't become empty.
2106 """
2107 nb = self._block.delete(indexer)
2108 self.blocks = (nb,)
2109 self.axes[0] = self.axes[0].delete(indexer)
2110 self._cache.clear()
2111 # clear reference since delete always results in a new array
2112 self.refs = None
2113 self.parent = None
2114 return self
2116 def fast_xs(self, loc):
2117 """
2118 fast path for getting a cross-section; not supported for a
2119 SingleBlockManager, use series._values[loc] instead
2120 """
2121 raise NotImplementedError("Use series._values[loc] instead")
2123 def set_values(self, values: ArrayLike):
2124 """
2125 Set the values of the single block in place.
2127 Use at your own risk! This does not check if the passed values are
2128 valid for the current Block/SingleBlockManager (length, dtype, etc).
2129 """
2130 # TODO(CoW) do we need to handle copy on write here? Currently this is
2131 # only used for FrameColumnApply.series_generator (what if apply is
2132 # mutating inplace?)
2133 self.blocks[0].values = values
2134 self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values)))
2136 def _equal_values(self: T, other: T) -> bool:
2137 """
2138 Used in .equals defined in base class. Only check the column values
2139 assuming shape and indexes have already been checked.
2140 """
2141 # For SingleBlockManager (i.e. Series)
2142 if other.ndim != 1:
2143 return False
2144 left = self.blocks[0].values
2145 right = other.blocks[0].values
2146 return array_equals(left, right)
2149# --------------------------------------------------------------------
2150# Constructor Helpers
2153def create_block_manager_from_blocks(
2154 blocks: list[Block],
2155 axes: list[Index],
2156 consolidate: bool = True,
2157 verify_integrity: bool = True,
2158) -> BlockManager:
2159 # If verify_integrity=False, then caller is responsible for checking
2160 # all(x.shape[-1] == len(axes[1]) for x in blocks)
2161 # sum(x.shape[0] for x in blocks) == len(axes[0])
2162 # set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0])))
2163 # all(blk.ndim == 2 for blk in blocks)
2164 # This allows us to safely pass verify_integrity=False
2166 try:
2167 mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity)
2169 except ValueError as err:
2170 arrays = [blk.values for blk in blocks]
2171 tot_items = sum(arr.shape[0] for arr in arrays)
2172 raise construction_error(tot_items, arrays[0].shape[1:], axes, err)
2174 if consolidate:
2175 mgr._consolidate_inplace()
2176 return mgr
2179def create_block_manager_from_column_arrays(
2180 arrays: list[ArrayLike],
2181 axes: list[Index],
2182 consolidate: bool = True,
2183) -> BlockManager:
2184 # Assertions disabled for performance (caller is responsible for verifying)
2185 # assert isinstance(axes, list)
2186 # assert all(isinstance(x, Index) for x in axes)
2187 # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
2188 # assert all(type(x) is not PandasArray for x in arrays)
2189 # assert all(x.ndim == 1 for x in arrays)
2190 # assert all(len(x) == len(axes[1]) for x in arrays)
2191 # assert len(arrays) == len(axes[0])
2192 # These last three are sufficient to allow us to safely pass
2193 # verify_integrity=False below.
2195 try:
2196 blocks = _form_blocks(arrays, consolidate)
2197 mgr = BlockManager(blocks, axes, verify_integrity=False)
2198 except ValueError as e:
2199 raise construction_error(len(arrays), arrays[0].shape, axes, e)
2200 if consolidate:
2201 mgr._consolidate_inplace()
2202 return mgr
2205def construction_error(
2206 tot_items: int,
2207 block_shape: Shape,
2208 axes: list[Index],
2209 e: ValueError | None = None,
2210):
2211 """raise a helpful message about our construction"""
2212 passed = tuple(map(int, [tot_items] + list(block_shape)))
2213 # Correcting the user-facing error message during DataFrame construction
2214 if len(passed) <= 2:
2215 passed = passed[::-1]
2217 implied = tuple(len(ax) for ax in axes)
2219 # Correcting the user-facing error message during DataFrame construction
2219 if len(implied) <= 2:
2220 implied = implied[::-1]
2222 # We return the exception object instead of raising it so that we
2223 # can raise it in the caller; mypy plays better with that
2224 if passed == implied and e is not None:
2225 return e
2226 if block_shape[0] == 0:
2227 return ValueError("Empty data passed with indices specified.")
2228 return ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
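# The "passed vs implied" message is user-visible through the DataFrame
# constructor; a sketch of the mismatch case:
#
#     >>> import numpy as np, pandas as pd
#     >>> pd.DataFrame(np.ones((3, 2)), columns=["a", "b", "c"])
#     Traceback (most recent call last):
#     ...
#     ValueError: Shape of passed values is (3, 2), indices imply (3, 3)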
2231# -----------------------------------------------------------------------
2234def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
2235 # compat for numpy<1.21, in which comparing a np.dtype with an ExtensionDtype
2236 # raises instead of returning False. Once earlier numpy versions are dropped,
2237 # this can be simplified to `return tup[1].dtype`
2238 dtype = tup[1].dtype
2240 if is_1d_only_ea_dtype(dtype):
2241 # We know these won't be consolidated, so don't need to group these.
2242 # This avoids expensive comparisons of CategoricalDtype objects
2243 sep = id(dtype)
2244 else:
2245 sep = 0
2247 return sep, isinstance(dtype, np.dtype), dtype
2250def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
2251 tuples = list(enumerate(arrays))
2253 if not consolidate:
2254 nbs = _tuples_to_blocks_no_consolidate(tuples)
2255 return nbs
2257 # group by dtype
2258 grouper = itertools.groupby(tuples, _grouping_func)
2260 nbs = []
2261 for (_, _, dtype), tup_block in grouper:
2262 block_type = get_block_type(dtype)
2264 if isinstance(dtype, np.dtype):
2265 is_dtlike = dtype.kind in ["m", "M"]
2267 if issubclass(dtype.type, (str, bytes)):
2268 dtype = np.dtype(object)
2270 values, placement = _stack_arrays(list(tup_block), dtype)
2271 if is_dtlike:
2272 values = ensure_wrapped_if_datetimelike(values)
2273 blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
2274 nbs.append(blk)
2276 elif is_1d_only_ea_dtype(dtype):
2277 dtype_blocks = [
2278 block_type(x[1], placement=BlockPlacement(x[0]), ndim=2)
2279 for x in tup_block
2280 ]
2281 nbs.extend(dtype_blocks)
2283 else:
2284 dtype_blocks = [
2285 block_type(
2286 ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2
2287 )
2288 for x in tup_block
2289 ]
2290 nbs.extend(dtype_blocks)
2291 return nbs
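# Net effect of the grouping above, sketched through the public constructor
# (._mgr is internal API): same-dtype numpy columns are stacked into one
# block, while other dtypes get their own blocks.
#
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [1.5, 2.5]})
#     >>> df._mgr.nblocks             # one int64 block (a, b) + one float64 (c)
#     2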
2294def _tuples_to_blocks_no_consolidate(tuples) -> list[Block]:
2295 # tuples produced within _form_blocks are of the form (placement, array)
2296 return [
2297 new_block_2d(ensure_block_shape(x[1], ndim=2), placement=BlockPlacement(x[0]))
2298 for x in tuples
2299 ]
2302def _stack_arrays(tuples, dtype: np.dtype):
2304 placement, arrays = zip(*tuples)
2306 first = arrays[0]
2307 shape = (len(arrays),) + first.shape
2309 stacked = np.empty(shape, dtype=dtype)
2310 for i, arr in enumerate(arrays):
2311 stacked[i] = arr
2313 return stacked, placement
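# _stack_arrays in miniature (standalone numpy sketch): preallocate the 2D
# result and copy each 1D column array into its row.
#
#     >>> import numpy as np
#     >>> arrs = [np.array([1, 2, 3]), np.array([4, 5, 6])]
#     >>> stacked = np.empty((len(arrs),) + arrs[0].shape, dtype=np.int64)
#     >>> for i, arr in enumerate(arrs):
#     ...     stacked[i] = arr
#     >>> stacked
#     array([[1, 2, 3],
#            [4, 5, 6]])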
2316def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]:
2317 """
2318 Merge blocks having the same dtype, excluding non-consolidatable blocks.
2319 """
2320 # sort by _can_consolidate, dtype
2321 gkey = lambda x: x._consolidate_key
2322 grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)
2324 new_blocks: list[Block] = []
2325 for (_can_consolidate, dtype), group_blocks in grouper:
2326 merged_blocks, _ = _merge_blocks(
2327 list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
2328 )
2329 new_blocks = extend_blocks(merged_blocks, new_blocks)
2330 return tuple(new_blocks)
2333def _consolidate_with_refs(
2334 blocks: tuple[Block, ...], refs
2335) -> tuple[tuple[Block, ...], list[weakref.ref | None]]:
2336 """
2337 Merge blocks having the same dtype, excluding non-consolidatable blocks,
2338 while carrying their refs along.
2339 """
2340 gkey = lambda x: x[0]._consolidate_key
2341 grouper = itertools.groupby(sorted(zip(blocks, refs), key=gkey), gkey)
2343 new_blocks: list[Block] = []
2344 new_refs: list[weakref.ref | None] = []
2345 for (_can_consolidate, dtype), group_blocks_refs in grouper:
2346 group_blocks, group_refs = list(zip(*list(group_blocks_refs)))
2347 merged_blocks, consolidated = _merge_blocks(
2348 list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
2349 )
2350 new_blocks = extend_blocks(merged_blocks, new_blocks)
2351 if consolidated:
2352 new_refs.extend([None])
2353 else:
2354 new_refs.extend(group_refs)
2355 return tuple(new_blocks), new_refs
2358def _merge_blocks(
2359 blocks: list[Block], dtype: DtypeObj, can_consolidate: bool
2360) -> tuple[list[Block], bool]:
2362 if len(blocks) == 1:
2363 return blocks, False
2365 if can_consolidate:
2367 # TODO: optimization potential in case all mgrs contain slices and
2368 # combination of those slices is a slice, too.
2369 new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
2371 new_values: ArrayLike
2373 if isinstance(blocks[0].dtype, np.dtype):
2374 # error: List comprehension has incompatible type List[Union[ndarray,
2375 # ExtensionArray]]; expected List[Union[complex, generic,
2376 # Sequence[Union[int, float, complex, str, bytes, generic]],
2377 # Sequence[Sequence[Any]], SupportsArray]]
2378 new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc]
2379 else:
2380 bvals = [blk.values for blk in blocks]
2381 bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
2382 new_values = bvals2[0]._concat_same_type(bvals2, axis=0)
2384 argsort = np.argsort(new_mgr_locs)
2385 new_values = new_values[argsort]
2386 new_mgr_locs = new_mgr_locs[argsort]
2388 bp = BlockPlacement(new_mgr_locs)
2389 return [new_block_2d(new_values, placement=bp)], True
2391 # can't consolidate --> no merge
2392 return blocks, False
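# The argsort step above, isolated (standalone numpy sketch): after vstacking,
# rows are reordered so that row i holds the values for column location i.
#
#     >>> import numpy as np
#     >>> new_mgr_locs = np.array([2, 0, 1])
#     >>> new_values = np.array([[20.0], [0.0], [10.0]])
#     >>> order = np.argsort(new_mgr_locs)
#     >>> new_mgr_locs[order]
#     array([0, 1, 2])
#     >>> new_values[order].ravel()
#     array([ 0., 10., 20.])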
2395def _fast_count_smallints(arr: npt.NDArray[np.intp]):
2396 """Faster version of set(arr) for sequences of small numbers."""
2397 counts = np.bincount(arr)
2398 nz = counts.nonzero()[0]
2399 # Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
2400 # in one benchmark by a factor of 11
2401 return zip(nz, counts[nz])
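# The bincount trick, spelled out (standalone sketch):
#
#     >>> import numpy as np
#     >>> arr = np.array([0, 2, 2, 5])
#     >>> counts = np.bincount(arr)   # array([1, 0, 2, 0, 0, 1])
#     >>> nz = counts.nonzero()[0]
#     >>> list(zip(nz, counts[nz]))
#     [(0, 1), (2, 2), (5, 1)]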
2404def _preprocess_slice_or_indexer(
2405 slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
2406):
2407 if isinstance(slice_or_indexer, slice):
2408 return (
2409 "slice",
2410 slice_or_indexer,
2411 libinternals.slice_len(slice_or_indexer, length),
2412 )
2413 else:
2414 if (
2415 not isinstance(slice_or_indexer, np.ndarray)
2416 or slice_or_indexer.dtype.kind != "i"
2417 ):
2418 dtype = getattr(slice_or_indexer, "dtype", None)
2419 raise TypeError(type(slice_or_indexer), dtype)
2421 indexer = ensure_platform_int(slice_or_indexer)
2422 if not allow_fill:
2423 indexer = maybe_convert_indices(indexer, length)
2424 return "fancy", indexer, len(indexer)
2427def _using_copy_on_write():
2428 return get_option("mode.copy_on_write")