1"""
2High level interface to PyTables for reading and writing pandas data structures
3to disk
4"""
5from __future__ import annotations
7from contextlib import suppress
8import copy
9from datetime import (
10 date,
11 tzinfo,
12)
13import itertools
14import os
15import re
16from textwrap import dedent
17from typing import (
18 TYPE_CHECKING,
19 Any,
20 Callable,
21 Final,
22 Hashable,
23 Iterator,
24 Literal,
25 Sequence,
26 cast,
27 overload,
28)
29import warnings
31import numpy as np
33from pandas._config import (
34 config,
35 get_option,
36)
38from pandas._libs import (
39 lib,
40 writers as libwriters,
41)
42from pandas._libs.tslibs import timezones
43from pandas._typing import (
44 AnyArrayLike,
45 ArrayLike,
46 DtypeArg,
47 FilePath,
48 Shape,
49 npt,
50)
51from pandas.compat._optional import import_optional_dependency
52from pandas.compat.pickle_compat import patch_pickle
53from pandas.errors import (
54 AttributeConflictWarning,
55 ClosedFileError,
56 IncompatibilityWarning,
57 PerformanceWarning,
58 PossibleDataLossError,
59)
60from pandas.util._decorators import cache_readonly
61from pandas.util._exceptions import find_stack_level
63from pandas.core.dtypes.common import (
64 ensure_object,
65 is_bool_dtype,
66 is_categorical_dtype,
67 is_complex_dtype,
68 is_datetime64_dtype,
69 is_datetime64tz_dtype,
70 is_extension_array_dtype,
71 is_list_like,
72 is_string_dtype,
73 is_timedelta64_dtype,
74 needs_i8_conversion,
75)
76from pandas.core.dtypes.missing import array_equivalent
78from pandas import (
79 DataFrame,
80 DatetimeIndex,
81 Index,
82 MultiIndex,
83 PeriodIndex,
84 Series,
85 TimedeltaIndex,
86 concat,
87 isna,
88)
89from pandas.core.api import Int64Index
90from pandas.core.arrays import (
91 Categorical,
92 DatetimeArray,
93 PeriodArray,
94)
95import pandas.core.common as com
96from pandas.core.computation.pytables import (
97 PyTablesExpr,
98 maybe_expression,
99)
100from pandas.core.construction import extract_array
101from pandas.core.indexes.api import ensure_index
102from pandas.core.internals import (
103 ArrayManager,
104 BlockManager,
105)
107from pandas.io.common import stringify_path
108from pandas.io.formats.printing import (
109 adjoin,
110 pprint_thing,
111)

if TYPE_CHECKING:
    from tables import (
        Col,
        File,
        Node,
    )

    from pandas.core.internals import Block

# versioning attribute
_version = "0.15.2"

# encoding
_default_encoding = "UTF-8"


def _ensure_decoded(s):
    """if we have bytes, decode them to unicode"""
    if isinstance(s, np.bytes_):
        s = s.decode("UTF-8")
    return s


def _ensure_encoding(encoding):
    # set the encoding if we need
    if encoding is None:
        encoding = _default_encoding

    return encoding


def _ensure_str(name):
    """
    Ensure that an index / column name is a str (python 3); otherwise they
    may be np.string dtype. Non-string dtypes are passed through unchanged.

    https://github.com/pandas-dev/pandas/issues/13492
    """
    if isinstance(name, str):
        name = str(name)
    return name

Term = PyTablesExpr


def _ensure_term(where, scope_level: int):
    """
    Ensure that the where is a Term or a list of Term.

    This makes sure that we are capturing the scope of variables that are
    passed; create the terms here with a frame_level=2 (we are 2 levels down)
    """
    # only consider list/tuple here as an ndarray is automatically a coordinate
    # list
    level = scope_level + 1
    if isinstance(where, (list, tuple)):
        where = [
            Term(term, scope_level=level + 1) if maybe_expression(term) else term
            for term in where
            if term is not None
        ]
    elif maybe_expression(where):
        where = Term(where, scope_level=level)
    return where if where is None or len(where) else None

incompatibility_doc: Final = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""

attribute_conflict_doc: Final = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""

performance_doc: Final = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""

# formats
_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}

# axes map
_AXES_MAP = {DataFrame: [0]}

# register our configuration options
dropna_doc: Final = """
: boolean
    drop ALL nan rows when appending to a table
"""
format_doc: Final = """
: format
    default format for writing; if None, then
    put will default to 'fixed' and append will default to 'table'
"""

with config.config_prefix("io.hdf"):
    config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
    config.register_option(
        "default_format",
        None,
        format_doc,
        validator=config.is_one_of_factory(["fixed", "table", None]),
    )

# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False


def _tables():
    global _table_mod
    global _table_file_open_policy_is_strict
    if _table_mod is None:
        import tables

        _table_mod = tables

        # set the file open policy
        # return the file open policy; this changes as of pytables 3.1
        # depending on the HDF5 version
        with suppress(AttributeError):
            _table_file_open_policy_is_strict = (
                tables.file._FILE_OPEN_POLICY == "strict"
            )

    return _table_mod

# interface to/from ###


def to_hdf(
    path_or_buf: FilePath | HDFStore,
    key: str,
    value: DataFrame | Series,
    mode: str = "a",
    complevel: int | None = None,
    complib: str | None = None,
    append: bool = False,
    format: str | None = None,
    index: bool = True,
    min_itemsize: int | dict[str, int] | None = None,
    nan_rep=None,
    dropna: bool | None = None,
    data_columns: Literal[True] | list[str] | None = None,
    errors: str = "strict",
    encoding: str = "UTF-8",
) -> None:
    """store this object, close it if we opened it"""
    if append:
        f = lambda store: store.append(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            dropna=dropna,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
        )
    else:
        # NB: dropna is also forwarded to `put`
        f = lambda store: store.put(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
            dropna=dropna,
        )

    path_or_buf = stringify_path(path_or_buf)
    if isinstance(path_or_buf, str):
        with HDFStore(
            path_or_buf, mode=mode, complevel=complevel, complib=complib
        ) as store:
            f(store)
    else:
        f(path_or_buf)
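

# A minimal usage sketch for the dispatch above (hypothetical file name;
# in practice `to_hdf` is normally reached via DataFrame.to_hdf / Series.to_hdf):
#
#   df = DataFrame({"a": [1, 2, 3]})
#   to_hdf("store.h5", "df", df, mode="w")                      # append=False -> store.put
#   to_hdf("store.h5", "df", df, append=True, format="table")   # append=True  -> store.append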


def read_hdf(
    path_or_buf: FilePath | HDFStore,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where: str | list | None = None,
    start: int | None = None,
    stop: int | None = None,
    columns: list[str] | None = None,
    iterator: bool = False,
    chunksize: int | None = None,
    **kwargs,
):
    """
    Read from the store, close it if we opened it.

    Retrieve pandas object stored in file, optionally based on where
    criteria.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path_or_buf : str, path object, pandas.HDFStore
        Any valid string path is acceptable. Only supports the local file system;
        remote URLs and file-like objects are not supported.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.

    key : object, optional
        The group identifier in the store. Can be omitted if the HDF file
        contains a single pandas object.
    mode : {'r', 'r+', 'a'}, default 'r'
        Mode to use when opening the file. Ignored if path_or_buf is a
        :class:`pandas.HDFStore`. Default is 'r'.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    where : list, optional
        A list of Term (or convertible) objects.
    start : int, optional
        Row number to start selection.
    stop : int, optional
        Row number to stop selection.
    columns : list, optional
        A list of column names to return.
    iterator : bool, optional
        Return an iterator object.
    chunksize : int, optional
        Number of rows to include in an iteration when using an iterator.
    **kwargs
        Additional keyword arguments passed to HDFStore.

    Returns
    -------
    item : object
        The selected object. Return type depends on the object stored.

    See Also
    --------
    DataFrame.to_hdf : Write a HDF file from a DataFrame.
    HDFStore : Low-level access to HDF files.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])  # doctest: +SKIP
    >>> df.to_hdf('./store.h5', 'data')  # doctest: +SKIP
    >>> reread = pd.read_hdf('./store.h5')  # doctest: +SKIP
    """
    if mode not in ["r", "r+", "a"]:
        raise ValueError(
            f"mode {mode} is not allowed while performing a read. "
            f"Allowed modes are r, r+ and a."
        )
    # grab the scope
    if where is not None:
        where = _ensure_term(where, scope_level=1)

    if isinstance(path_or_buf, HDFStore):
        if not path_or_buf.is_open:
            raise OSError("The HDFStore must be open for reading.")

        store = path_or_buf
        auto_close = False
    else:
        path_or_buf = stringify_path(path_or_buf)
        if not isinstance(path_or_buf, str):
            raise NotImplementedError(
                "Support for generic buffers has not been implemented."
            )
        try:
            exists = os.path.exists(path_or_buf)
        # if filepath is too long
        except (TypeError, ValueError):
            exists = False

        if not exists:
            raise FileNotFoundError(f"File {path_or_buf} does not exist")

        store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
        # can't auto open/close if we are using an iterator
        # so delegate to the iterator
        auto_close = True

    try:
        if key is None:
            groups = store.groups()
            if len(groups) == 0:
                raise ValueError(
                    "Dataset(s) incompatible with Pandas data types, "
                    "not table, or no datasets found in HDF5 file."
                )
            candidate_only_group = groups[0]

            # For the HDF file to have only one dataset, all other groups
            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError(
                        "key must be provided when HDF5 "
                        "file contains multiple datasets."
                    )
            key = candidate_only_group._v_pathname
        return store.select(
            key,
            where=where,
            start=start,
            stop=stop,
            columns=columns,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )
    except (ValueError, TypeError, KeyError):
        if not isinstance(path_or_buf, HDFStore):
            # if there is an error, close the store if we opened it.
            with suppress(AttributeError):
                store.close()

        raise


def _is_metadata_of(group: Node, parent_group: Node) -> bool:
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == "meta":
            return True
        current = current._v_parent
    return False


class HDFStore:
    """
    Dict-like IO interface for storing pandas objects in PyTables.

    Either Fixed or Table format.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path : str
        File path to HDF5 file.
    mode : {'a', 'w', 'r', 'r+'}, default 'a'

        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 or None disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available raises
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum.
    **kwargs
        These parameters will be passed to the PyTables open_file method.

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar   # write to HDF5
    >>> bar = store['foo']   # retrieve
    >>> store.close()

    **Create or load HDF5 file in-memory**

    When passing the `driver` option to the PyTables open_file method through
    **kwargs, the HDF5 file is loaded or created in-memory and will only be
    written when closed:

    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
    >>> store['foo'] = bar
    >>> store.close()   # only now, data is written to disk
    """

    _handle: File | None
    _mode: str
    _complevel: int
    _fletcher32: bool

    def __init__(
        self,
        path,
        mode: str = "a",
        complevel: int | None = None,
        complib=None,
        fletcher32: bool = False,
        **kwargs,
    ) -> None:

        if "format" in kwargs:
            raise ValueError("format is not a defined argument for HDFStore")

        tables = import_optional_dependency("tables")

        if complib is not None and complib not in tables.filters.all_complibs:
            raise ValueError(
                f"complib only supports {tables.filters.all_complibs} compression."
            )

        if complib is None and complevel is not None:
            complib = tables.filters.default_complib

        self._path = stringify_path(path)
        if mode is None:
            mode = "a"
        self._mode = mode
        self._handle = None
        self._complevel = complevel if complevel else 0
        self._complib = complib
        self._fletcher32 = fletcher32
        self._filters = None
        self.open(mode=mode, **kwargs)

    def __fspath__(self) -> str:
        return self._path

    @property
    def root(self):
        """return the root node"""
        self._check_if_open()
        assert self._handle is not None  # for mypy
        return self._handle.root

    @property
    def filename(self) -> str:
        return self._path

    def __getitem__(self, key: str):
        return self.get(key)

    def __setitem__(self, key: str, value) -> None:
        self.put(key, value)

    def __delitem__(self, key: str) -> None:
        return self.remove(key)

    def __getattr__(self, name: str):
        """allow attribute access to get stores"""
        try:
            return self.get(name)
        except (KeyError, ClosedFileError):
            pass
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )

    def __contains__(self, key: str) -> bool:
        """
        check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
        """
        node = self.get_node(key)
        if node is not None:
            name = node._v_pathname
            if name == key or name[1:] == key:
                return True
        return False

    def __len__(self) -> int:
        return len(self.groups())

    def __repr__(self) -> str:
        pstr = pprint_thing(self._path)
        return f"{type(self)}\nFile path: {pstr}\n"

    def __enter__(self) -> HDFStore:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def keys(self, include: str = "pandas") -> list[str]:
        """
        Return a list of keys corresponding to objects stored in HDFStore.

        Parameters
        ----------
        include : str, default 'pandas'
            When include equals 'pandas' return pandas objects.
            When include equals 'native' return native HDF5 Table objects.

            .. versionadded:: 1.1.0

        Returns
        -------
        list
            List of ABSOLUTE path-names (e.g. have the leading '/').

        Raises
        ------
        raises ValueError if include has an illegal value
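
        Examples
        --------
        A minimal sketch (hypothetical store contents), following the
        doctest style used elsewhere in this module:

        >>> store = pd.HDFStore('store.h5')  # doctest: +SKIP
        >>> store.put('data', pd.DataFrame([[1, 2]]))  # doctest: +SKIP
        >>> store.keys()  # doctest: +SKIP
        ['/data']
        >>> store.close()  # doctest: +SKIP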
659 """
660 if include == "pandas":
661 return [n._v_pathname for n in self.groups()]
663 elif include == "native":
664 assert self._handle is not None # mypy
665 return [
666 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
667 ]
668 raise ValueError(
669 f"`include` should be either 'pandas' or 'native' but is '{include}'"
670 )

    def __iter__(self) -> Iterator[str]:
        return iter(self.keys())

    def items(self) -> Iterator[tuple[str, list]]:
        """
        iterate on key->group
        """
        for g in self.groups():
            yield g._v_pathname, g

    def iteritems(self):
        """
        iterate on key->group
        """
        warnings.warn(
            "iteritems is deprecated and will be removed in a future version. "
            "Use .items instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        yield from self.items()

    def open(self, mode: str = "a", **kwargs) -> None:
        """
        Open the file in the specified mode

        Parameters
        ----------
        mode : {'a', 'w', 'r', 'r+'}, default 'a'
            See HDFStore docstring or tables.open_file for info about modes
        **kwargs
            These parameters will be passed to the PyTables open_file method.
        """
        tables = _tables()

        if self._mode != mode:
            # if we are changing a write mode to read, ok
            if self._mode in ["a", "w"] and mode in ["r", "r+"]:
                pass
            elif mode in ["w"]:
                # this would truncate, raise here
                if self.is_open:
                    raise PossibleDataLossError(
                        f"Re-opening the file [{self._path}] with mode [{self._mode}] "
                        "will delete the current file!"
                    )

            self._mode = mode

        # close and reopen the handle
        if self.is_open:
            self.close()

        if self._complevel and self._complevel > 0:
            self._filters = _tables().Filters(
                self._complevel, self._complib, fletcher32=self._fletcher32
            )

        if _table_file_open_policy_is_strict and self.is_open:
            msg = (
                "Cannot open HDF5 file, which is already opened, "
                "even in read-only mode."
            )
            raise ValueError(msg)

        self._handle = tables.open_file(self._path, self._mode, **kwargs)

    def close(self) -> None:
        """
        Close the PyTables file handle
        """
        if self._handle is not None:
            self._handle.close()
        self._handle = None

    @property
    def is_open(self) -> bool:
        """
        return a boolean indicating whether the file is open
        """
        if self._handle is None:
            return False
        return bool(self._handle.isopen)

    def flush(self, fsync: bool = False) -> None:
        """
        Force all buffered modifications to be written to disk.

        Parameters
        ----------
        fsync : bool (default False)
            call ``os.fsync()`` on the file handle to force writing to disk.

        Notes
        -----
        Without ``fsync=True``, flushing may not guarantee that the OS writes
        to disk. With fsync, the operation will block until the OS claims the
        file has been written; however, other caching layers may still
        interfere.
        """
        if self._handle is not None:
            self._handle.flush()
            if fsync:
                with suppress(OSError):
                    os.fsync(self._handle.fileno())

    def get(self, key: str):
        """
        Retrieve pandas object stored in file.

        Parameters
        ----------
        key : str

        Returns
        -------
        object
            Same type as object stored in file.
        """
        with patch_pickle():
            # GH#31167 Without this patch, pickle doesn't know how to unpickle
            # old DateOffset objects now that they are cdef classes.
            group = self.get_node(key)
            if group is None:
                raise KeyError(f"No object named {key} in the file")
            return self._read_group(group)

    def select(
        self,
        key: str,
        where=None,
        start=None,
        stop=None,
        columns=None,
        iterator=False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas object stored in file, optionally based on where criteria.

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
            Object being retrieved from file.
        where : list or None
            List of Term (or convertible) objects, optional.
        start : int or None
            Row number to start selection.
        stop : int, default None
            Row number to stop selection.
        columns : list or None
            A list of columns that if not None, will limit the return columns.
        iterator : bool or False
            Returns an iterator.
        chunksize : int or None
            Number of rows to include in iteration, return an iterator.
        auto_close : bool or False
            Should automatically close the store when finished.

        Returns
        -------
        object
            Retrieved object from file.
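
        Examples
        --------
        A minimal sketch (hypothetical key and column names); ``where``
        queries require the 'table' format and a data column or indexable:

        >>> df = pd.DataFrame({'A': [1, 2, 3]})  # doctest: +SKIP
        >>> store.put('df', df, format='table', data_columns=['A'])  # doctest: +SKIP
        >>> store.select('df', where='A > 1')  # doctest: +SKIP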
844 """
845 group = self.get_node(key)
846 if group is None:
847 raise KeyError(f"No object named {key} in the file")
849 # create the storer and axes
850 where = _ensure_term(where, scope_level=1)
851 s = self._create_storer(group)
852 s.infer_axes()
854 # function to call on iteration
855 def func(_start, _stop, _where):
856 return s.read(start=_start, stop=_stop, where=_where, columns=columns)
858 # create the iterator
859 it = TableIterator(
860 self,
861 s,
862 func,
863 where=where,
864 nrows=s.nrows,
865 start=start,
866 stop=stop,
867 iterator=iterator,
868 chunksize=chunksize,
869 auto_close=auto_close,
870 )
872 return it.get_result()

    def select_as_coordinates(
        self,
        key: str,
        where=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return the selection as an Index

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        """
        where = _ensure_term(where, scope_level=1)
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_coordinates with a table")
        return tbl.read_coordinates(where=where, start=start, stop=stop)

    def select_column(
        self,
        key: str,
        column: str,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return a single column from the table. This is generally only useful to
        select an indexable

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
        column : str
            The column of interest.
        start : int or None, default None
        stop : int or None, default None

        Raises
        ------
        raises KeyError if the column is not found (or key is not a valid
            store)
        raises ValueError if the column can not be extracted individually (it
            is part of a data block)
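
        Examples
        --------
        A minimal sketch (hypothetical key); the column must be an
        indexable or a data column:

        >>> store.append('df', pd.DataFrame({'A': [1, 2]}), data_columns=['A'])  # doctest: +SKIP
        >>> store.select_column('df', 'A')  # doctest: +SKIP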
940 """
941 tbl = self.get_storer(key)
942 if not isinstance(tbl, Table):
943 raise TypeError("can only read_column with a table")
944 return tbl.read_column(column=column, start=start, stop=stop)

    def select_as_multiple(
        self,
        keys,
        where=None,
        selector=None,
        columns=None,
        start=None,
        stop=None,
        iterator=False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas objects from multiple tables.

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        keys : a list of the tables
        selector : the table to apply the where criteria (defaults to keys[0]
            if not supplied)
        columns : the columns I want back
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        iterator : bool, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator
        auto_close : bool, default False
            Should automatically close the store when finished.

        Raises
        ------
        raises KeyError if keys or selector is not found or keys is empty
        raises TypeError if keys is not a list or tuple
        raises ValueError if the tables are not ALL THE SAME DIMENSIONS
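
        Examples
        --------
        A minimal sketch (hypothetical keys); all tables must have the
        same number of rows:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})  # doctest: +SKIP
        >>> store.append('df1', df[['A', 'B']], data_columns=['A'])  # doctest: +SKIP
        >>> store.append('df2', df[['C']])  # doctest: +SKIP
        >>> store.select_as_multiple(['df1', 'df2'], where='A > 1',
        ...                          selector='df1')  # doctest: +SKIP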
987 """
988 # default to single select
989 where = _ensure_term(where, scope_level=1)
990 if isinstance(keys, (list, tuple)) and len(keys) == 1:
991 keys = keys[0]
992 if isinstance(keys, str):
993 return self.select(
994 key=keys,
995 where=where,
996 columns=columns,
997 start=start,
998 stop=stop,
999 iterator=iterator,
1000 chunksize=chunksize,
1001 auto_close=auto_close,
1002 )
1004 if not isinstance(keys, (list, tuple)):
1005 raise TypeError("keys must be a list/tuple")
1007 if not len(keys):
1008 raise ValueError("keys must have a non-zero length")
1010 if selector is None:
1011 selector = keys[0]
1013 # collect the tables
1014 tbls = [self.get_storer(k) for k in keys]
1015 s = self.get_storer(selector)
1017 # validate rows
1018 nrows = None
1019 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
1020 if t is None:
1021 raise KeyError(f"Invalid table [{k}]")
1022 if not t.is_table:
1023 raise TypeError(
1024 f"object [{t.pathname}] is not a table, and cannot be used in all "
1025 "select as multiple"
1026 )
1028 if nrows is None:
1029 nrows = t.nrows
1030 elif t.nrows != nrows:
1031 raise ValueError("all tables must have exactly the same nrows!")
1033 # The isinstance checks here are redundant with the check above,
1034 # but necessary for mypy; see GH#29757
1035 _tbls = [x for x in tbls if isinstance(x, Table)]
1037 # axis is the concentration axes
1038 axis = list({t.non_index_axes[0][0] for t in _tbls})[0]
1040 def func(_start, _stop, _where):
1042 # retrieve the objs, _where is always passed as a set of
1043 # coordinates here
1044 objs = [
1045 t.read(where=_where, columns=columns, start=_start, stop=_stop)
1046 for t in tbls
1047 ]
1049 # concat and return
1050 return concat(objs, axis=axis, verify_integrity=False)._consolidate()
1052 # create the iterator
1053 it = TableIterator(
1054 self,
1055 s,
1056 func,
1057 where=where,
1058 nrows=nrows,
1059 start=start,
1060 stop=stop,
1061 iterator=iterator,
1062 chunksize=chunksize,
1063 auto_close=auto_close,
1064 )
1066 return it.get_result(coordinates=True)

    def put(
        self,
        key: str,
        value: DataFrame | Series,
        format=None,
        index=True,
        append=False,
        complib=None,
        complevel: int | None = None,
        min_itemsize: int | dict[str, int] | None = None,
        nan_rep=None,
        data_columns: Literal[True] | list[str] | None = None,
        encoding=None,
        errors: str = "strict",
        track_times: bool = True,
        dropna: bool = False,
    ) -> None:
        """
        Store object in HDFStore.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'fixed(f)|table(t)', default is 'fixed'
            Format to use when storing object in HDFStore. Value can be one of:

            ``'fixed'``
                Fixed format. Fast writing/reading. Not-appendable, nor searchable.
            ``'table'``
                Table format. Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching / selecting
                subsets of the data.
        index : bool, default True
            Write DataFrame index as a column.
        append : bool, default False
            This will force Table format, append the input data to the existing.
        data_columns : list of columns or True, default None
            List of columns to create as data columns, or True to use all columns.
            See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        encoding : str, default None
            Provide an encoding for strings.
        track_times : bool, default True
            Parameter is propagated to 'create_table' method of 'PyTables'.
            If set to False it is possible to have the same h5 files (same hashes)
            independent of creation time.
        dropna : bool, default False, optional
            Remove missing values.

            .. versionadded:: 1.1.0
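
        Examples
        --------
        A minimal sketch (hypothetical key names):

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])  # doctest: +SKIP
        >>> store.put('data', df, format='table')  # doctest: +SKIP
        >>> store['data2'] = df  # __setitem__ delegates to put  # doctest: +SKIP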
1119 """
1120 if format is None:
1121 format = get_option("io.hdf.default_format") or "fixed"
1122 format = self._validate_format(format)
1123 self._write_to_group(
1124 key,
1125 value,
1126 format=format,
1127 index=index,
1128 append=append,
1129 complib=complib,
1130 complevel=complevel,
1131 min_itemsize=min_itemsize,
1132 nan_rep=nan_rep,
1133 data_columns=data_columns,
1134 encoding=encoding,
1135 errors=errors,
1136 track_times=track_times,
1137 dropna=dropna,
1138 )

    def remove(self, key: str, where=None, start=None, stop=None) -> None:
        """
        Remove pandas object partially by specifying the where condition

        Parameters
        ----------
        key : str
            Node to remove or delete rows from
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection

        Returns
        -------
        number of rows removed (or None if not a Table)

        Raises
        ------
        raises KeyError if key is not a valid store

        """
        where = _ensure_term(where, scope_level=1)
        try:
            s = self.get_storer(key)
        except KeyError:
            # the key is not a valid store, re-raising KeyError
            raise
        except AssertionError:
            # surface any assertion errors for e.g. debugging
            raise
        except Exception as err:
            # In tests we get here with ClosedFileError, TypeError, and
            # _table_mod.NoSuchNodeError.  TODO: Catch only these?

            if where is not None:
                raise ValueError(
                    "trying to remove a node with a non-None where clause!"
                ) from err

            # we are actually trying to remove a node (with children)
            node = self.get_node(key)
            if node is not None:
                node._f_remove(recursive=True)
                return None

        # remove the node
        if com.all_none(where, start, stop):
            s.group._f_remove(recursive=True)

        # delete from the table
        else:
            if not s.is_table:
                raise ValueError(
                    "can only remove with where on objects written as tables"
                )
            return s.delete(where=where, start=start, stop=stop)

    def append(
        self,
        key: str,
        value: DataFrame | Series,
        format=None,
        axes=None,
        index=True,
        append=True,
        complib=None,
        complevel: int | None = None,
        columns=None,
        min_itemsize: int | dict[str, int] | None = None,
        nan_rep=None,
        chunksize=None,
        expectedrows=None,
        dropna: bool | None = None,
        data_columns: Literal[True] | list[str] | None = None,
        encoding=None,
        errors: str = "strict",
    ) -> None:
        """
        Append to Table in file.

        Node must already exist and be Table format.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'table' is the default
            Format to use when storing object in HDFStore. Value can be one of:

            ``'table'``
                Table format. Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching / selecting
                subsets of the data.
        index : bool, default True
            Write DataFrame index as a column.
        append : bool, default True
            Append the input data to the existing.
        data_columns : list of columns, or True, default None
            List of columns to create as indexed data columns for on-disk
            queries, or True to use all columns. By default only the axes
            of the object are indexed. See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        min_itemsize : dict of columns that specify minimum str sizes
        nan_rep : str to use as str nan representation
        chunksize : size to chunk the writing
        expectedrows : expected TOTAL row size of this table
        encoding : default None, provide an encoding for str
        dropna : bool, default False, optional
            Do not write an ALL nan row to the store, settable
            by the option 'io.hdf.dropna_table'.

        Notes
        -----
        Does *not* check if data being appended overlaps with existing
        data in the table, so be careful
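
        Examples
        --------
        A minimal sketch (hypothetical key); repeated appends grow the
        same table:

        >>> store.append('df', pd.DataFrame({'A': [1, 2]}))  # doctest: +SKIP
        >>> store.append('df', pd.DataFrame({'A': [3, 4]}))  # doctest: +SKIP
        >>> len(store.select('df'))  # doctest: +SKIP
        4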
1255 """
1256 if columns is not None:
1257 raise TypeError(
1258 "columns is not a supported keyword in append, try data_columns"
1259 )
1261 if dropna is None:
1262 dropna = get_option("io.hdf.dropna_table")
1263 if format is None:
1264 format = get_option("io.hdf.default_format") or "table"
1265 format = self._validate_format(format)
1266 self._write_to_group(
1267 key,
1268 value,
1269 format=format,
1270 axes=axes,
1271 index=index,
1272 append=append,
1273 complib=complib,
1274 complevel=complevel,
1275 min_itemsize=min_itemsize,
1276 nan_rep=nan_rep,
1277 chunksize=chunksize,
1278 expectedrows=expectedrows,
1279 dropna=dropna,
1280 data_columns=data_columns,
1281 encoding=encoding,
1282 errors=errors,
1283 )

    def append_to_multiple(
        self,
        d: dict,
        value,
        selector,
        data_columns=None,
        axes=None,
        dropna=False,
        **kwargs,
    ) -> None:
        """
        Append to multiple tables

        Parameters
        ----------
        d : a dict of table_name to table_columns, None is acceptable as the
            values of one node (this will get all the remaining columns)
        value : a pandas object
        selector : a string that designates the indexable table; all of its
            columns will be designated as data_columns, unless data_columns is
            passed, in which case these are used
        data_columns : list of columns to create as data columns, or True to
            use all columns
        dropna : if evaluates to True, drop rows from all tables if any single
            row in each table has all NaN. Default False.

        Notes
        -----
        axes parameter is currently not accepted
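
        Examples
        --------
        A minimal sketch (hypothetical keys): split columns 'A'/'B' into
        one table and the remainder into another, selecting on the first:

        >>> df = pd.DataFrame({'A': [1], 'B': [2], 'C': [3]})  # doctest: +SKIP
        >>> store.append_to_multiple(
        ...     {'t1': ['A', 'B'], 't2': None}, df, selector='t1'
        ... )  # doctest: +SKIP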
1315 """
1316 if axes is not None:
1317 raise TypeError(
1318 "axes is currently not accepted as a parameter to append_to_multiple; "
1319 "you can create the tables independently instead"
1320 )
1322 if not isinstance(d, dict):
1323 raise ValueError(
1324 "append_to_multiple must have a dictionary specified as the "
1325 "way to split the value"
1326 )
1328 if selector not in d:
1329 raise ValueError(
1330 "append_to_multiple requires a selector that is in passed dict"
1331 )
1333 # figure out the splitting axis (the non_index_axis)
1334 axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
1336 # figure out how to split the value
1337 remain_key = None
1338 remain_values: list = []
1339 for k, v in d.items():
1340 if v is None:
1341 if remain_key is not None:
1342 raise ValueError(
1343 "append_to_multiple can only have one value in d that is None"
1344 )
1345 remain_key = k
1346 else:
1347 remain_values.extend(v)
1348 if remain_key is not None:
1349 ordered = value.axes[axis]
1350 ordd = ordered.difference(Index(remain_values))
1351 ordd = sorted(ordered.get_indexer(ordd))
1352 d[remain_key] = ordered.take(ordd)
1354 # data_columns
1355 if data_columns is None:
1356 data_columns = d[selector]
1358 # ensure rows are synchronized across the tables
1359 if dropna:
1360 idxs = (value[cols].dropna(how="all").index for cols in d.values())
1361 valid_index = next(idxs)
1362 for index in idxs:
1363 valid_index = valid_index.intersection(index)
1364 value = value.loc[valid_index]
1366 min_itemsize = kwargs.pop("min_itemsize", None)
1368 # append
1369 for k, v in d.items():
1370 dc = data_columns if k == selector else None
1372 # compute the val
1373 val = value.reindex(v, axis=axis)
1375 filtered = (
1376 {key: value for (key, value) in min_itemsize.items() if key in v}
1377 if min_itemsize is not None
1378 else None
1379 )
1380 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)

    def create_table_index(
        self,
        key: str,
        columns=None,
        optlevel: int | None = None,
        kind: str | None = None,
    ) -> None:
        """
        Create a pytables index on the table.

        Parameters
        ----------
        key : str
        columns : None, bool, or listlike[str]
            Indicate which columns to create an index on.

            * False : Do not create any indexes.
            * True : Create indexes on all columns.
            * None : Create indexes on all columns.
            * listlike : Create indexes on the given columns.

        optlevel : int or None, default None
            Optimization level, if None, pytables defaults to 6.
        kind : str or None, default None
            Kind of index, if None, pytables defaults to "medium".

        Raises
        ------
        TypeError: raises if the node is not a table
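
        Examples
        --------
        A minimal sketch (hypothetical key); only 'table' format stores
        can be indexed:

        >>> store.append('df', df, data_columns=['A'], index=False)  # doctest: +SKIP
        >>> store.create_table_index('df', columns=['A'], optlevel=9,
        ...                          kind='full')  # doctest: +SKIP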
1411 """
1412 # version requirements
1413 _tables()
1414 s = self.get_storer(key)
1415 if s is None:
1416 return
1418 if not isinstance(s, Table):
1419 raise TypeError("cannot create table index on a Fixed format store")
1420 s.create_index(columns=columns, optlevel=optlevel, kind=kind)

    def groups(self) -> list:
        """
        Return a list of all the top-level nodes.

        Each node returned is not a pandas storage object.

        Returns
        -------
        list
            List of objects.
        """
        _tables()
        self._check_if_open()
        assert self._handle is not None  # for mypy
        assert _table_mod is not None  # for mypy
        return [
            g
            for g in self._handle.walk_groups()
            if (
                not isinstance(g, _table_mod.link.Link)
                and (
                    getattr(g._v_attrs, "pandas_type", None)
                    or getattr(g, "table", None)
                    or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
                )
            )
        ]

    def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
        """
        Walk the pytables group hierarchy for pandas objects.

        This generator will yield the group path, subgroups and pandas object
        names for each group.

        Any non-pandas PyTables objects that are not a group will be ignored.

        The `where` group itself is listed first (preorder), then each of its
        child groups (following an alphanumerical order) is also traversed,
        following the same procedure.

        Parameters
        ----------
        where : str, default "/"
            Group where to start walking.

        Yields
        ------
        path : str
            Full path to a group (without trailing '/').
        groups : list
            Names (strings) of the groups contained in `path`.
        leaves : list
            Names (strings) of the pandas objects contained in `path`.
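
        Examples
        --------
        A minimal sketch (hypothetical hierarchy):

        >>> store.put('foo/bar', pd.DataFrame([1]))  # doctest: +SKIP
        >>> for path, subgroups, leaves in store.walk():  # doctest: +SKIP
        ...     print(path, subgroups, leaves)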
1476 """
1477 _tables()
1478 self._check_if_open()
1479 assert self._handle is not None # for mypy
1480 assert _table_mod is not None # for mypy
1482 for g in self._handle.walk_groups(where):
1483 if getattr(g._v_attrs, "pandas_type", None) is not None:
1484 continue
1486 groups = []
1487 leaves = []
1488 for child in g._v_children.values():
1489 pandas_type = getattr(child._v_attrs, "pandas_type", None)
1490 if pandas_type is None:
1491 if isinstance(child, _table_mod.group.Group):
1492 groups.append(child._v_name)
1493 else:
1494 leaves.append(child._v_name)
1496 yield (g._v_pathname.rstrip("/"), groups, leaves)

    def get_node(self, key: str) -> Node | None:
        """return the node with the key or None if it does not exist"""
        self._check_if_open()
        if not key.startswith("/"):
            key = "/" + key

        assert self._handle is not None
        assert _table_mod is not None  # for mypy
        try:
            node = self._handle.get_node(self.root, key)
        except _table_mod.exceptions.NoSuchNodeError:
            return None

        assert isinstance(node, _table_mod.Node), type(node)
        return node

    def get_storer(self, key: str) -> GenericFixed | Table:
        """return the storer object for a key, raise if not in the file"""
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")

        s = self._create_storer(group)
        s.infer_axes()
        return s

    def copy(
        self,
        file,
        mode="w",
        propindexes: bool = True,
        keys=None,
        complib=None,
        complevel: int | None = None,
        fletcher32: bool = False,
        overwrite=True,
    ) -> HDFStore:
        """
        Copy the existing store to a new file, updating in place.

        Parameters
        ----------
        propindexes : bool, default True
            Restore indexes in copied file.
        keys : list, optional
            List of keys to include in the copy (defaults to all).
        overwrite : bool, default True
            Whether to overwrite (remove and replace) existing nodes in the new store.
        mode, complib, complevel, fletcher32 same as in HDFStore.__init__

        Returns
        -------
        open file handle of the new store
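
        Examples
        --------
        A minimal sketch (hypothetical file names); remember to close the
        returned store:

        >>> new_store = store.copy('backup.h5', complevel=9, complib='blosc')  # doctest: +SKIP
        >>> new_store.close()  # doctest: +SKIP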
1551 """
1552 new_store = HDFStore(
1553 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1554 )
1555 if keys is None:
1556 keys = list(self.keys())
1557 if not isinstance(keys, (tuple, list)):
1558 keys = [keys]
1559 for k in keys:
1560 s = self.get_storer(k)
1561 if s is not None:
1563 if k in new_store:
1564 if overwrite:
1565 new_store.remove(k)
1567 data = self.select(k)
1568 if isinstance(s, Table):
1570 index: bool | list[str] = False
1571 if propindexes:
1572 index = [a.name for a in s.axes if a.is_indexed]
1573 new_store.append(
1574 k,
1575 data,
1576 index=index,
1577 data_columns=getattr(s, "data_columns", None),
1578 encoding=s.encoding,
1579 )
1580 else:
1581 new_store.put(k, data, encoding=s.encoding)
1583 return new_store

    def info(self) -> str:
        """
        Print detailed information on the store.

        Returns
        -------
        str
        """
        path = pprint_thing(self._path)
        output = f"{type(self)}\nFile path: {path}\n"

        if self.is_open:
            lkeys = sorted(self.keys())
            if len(lkeys):
                keys = []
                values = []

                for k in lkeys:
                    try:
                        s = self.get_storer(k)
                        if s is not None:
                            keys.append(pprint_thing(s.pathname or k))
                            values.append(pprint_thing(s or "invalid_HDFStore node"))
                    except AssertionError:
                        # surface any assertion errors for e.g. debugging
                        raise
                    except Exception as detail:
                        keys.append(k)
                        dstr = pprint_thing(detail)
                        values.append(f"[invalid_HDFStore node: {dstr}]")

                output += adjoin(12, keys, values)
            else:
                output += "Empty"
        else:
            output += "File is CLOSED"

        return output

    # ------------------------------------------------------------------------
    # private methods

    def _check_if_open(self):
        if not self.is_open:
            raise ClosedFileError(f"{self._path} file is not open!")

    def _validate_format(self, format: str) -> str:
        """validate / deprecate formats"""
        # validate
        try:
            format = _FORMAT_MAP[format.lower()]
        except KeyError as err:
            raise TypeError(f"invalid HDFStore format specified [{format}]") from err

        return format

    def _create_storer(
        self,
        group,
        format=None,
        value: DataFrame | Series | None = None,
        encoding: str = "UTF-8",
        errors: str = "strict",
    ) -> GenericFixed | Table:
        """return a suitable class to operate"""
        cls: type[GenericFixed] | type[Table]

        if value is not None and not isinstance(value, (Series, DataFrame)):
            raise TypeError("value must be None, Series, or DataFrame")

        def error(t):
            # return instead of raising so mypy can tell where we are raising
            return TypeError(
                f"cannot properly create the storer for: [{t}] [group->"
                f"{group},value->{type(value)},format->{format}"
            )

        pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
        tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))

        # infer the pt from the passed value
        if pt is None:
            if value is None:
                _tables()
                assert _table_mod is not None  # for mypy
                if getattr(group, "table", None) or isinstance(
                    group, _table_mod.table.Table
                ):
                    pt = "frame_table"
                    tt = "generic_table"
                else:
                    raise TypeError(
                        "cannot create a storer if the object does not exist "
                        "and no value is passed"
                    )
            else:
                if isinstance(value, Series):
                    pt = "series"
                else:
                    pt = "frame"

                # we are actually a table
                if format == "table":
                    pt += "_table"

        # a storer node
        if "table" not in pt:
            _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
            try:
                cls = _STORER_MAP[pt]
            except KeyError as err:
                raise error("_STORER_MAP") from err
            return cls(self, group, encoding=encoding, errors=errors)

        # existing node (and must be a table)
        if tt is None:
            # if we are a writer, determine the tt
            if value is not None:
                if pt == "series_table":
                    index = getattr(value, "index", None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = "appendable_series"
                        elif index.nlevels > 1:
                            tt = "appendable_multiseries"
                elif pt == "frame_table":
                    index = getattr(value, "index", None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = "appendable_frame"
                        elif index.nlevels > 1:
                            tt = "appendable_multiframe"

        _TABLE_MAP = {
            "generic_table": GenericTable,
            "appendable_series": AppendableSeriesTable,
            "appendable_multiseries": AppendableMultiSeriesTable,
            "appendable_frame": AppendableFrameTable,
            "appendable_multiframe": AppendableMultiFrameTable,
            "worm": WORMTable,
        }
        try:
            cls = _TABLE_MAP[tt]
        except KeyError as err:
            raise error("_TABLE_MAP") from err

        return cls(self, group, encoding=encoding, errors=errors)

    def _write_to_group(
        self,
        key: str,
        value: DataFrame | Series,
        format,
        axes=None,
        index=True,
        append=False,
        complib=None,
        complevel: int | None = None,
        fletcher32=None,
        min_itemsize: int | dict[str, int] | None = None,
        chunksize=None,
        expectedrows=None,
        dropna=False,
        nan_rep=None,
        data_columns=None,
        encoding=None,
        errors: str = "strict",
        track_times: bool = True,
    ) -> None:
        # we don't want to store a table node at all if our object is 0-len
        # as there are not dtypes
        if getattr(value, "empty", None) and (format == "table" or append):
            return

        group = self._identify_group(key, append)

        s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
        if append:
            # raise if we are trying to append to a Fixed format,
            # or a table that exists (and we are putting)
            if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
                raise ValueError("Can only append to Tables")
            if not s.is_exists:
                s.set_object_info()
        else:
            s.set_object_info()

        if not s.is_table and complib:
            raise ValueError("Compression not supported on Fixed format stores")

        # write the object
        s.write(
            obj=value,
            axes=axes,
            append=append,
            complib=complib,
            complevel=complevel,
            fletcher32=fletcher32,
            min_itemsize=min_itemsize,
            chunksize=chunksize,
            expectedrows=expectedrows,
            dropna=dropna,
            nan_rep=nan_rep,
            data_columns=data_columns,
            track_times=track_times,
        )

        if isinstance(s, Table) and index:
            s.create_index(columns=index)

    def _read_group(self, group: Node):
        s = self._create_storer(group)
        s.infer_axes()
        return s.read()

    def _identify_group(self, key: str, append: bool) -> Node:
        """Identify HDF5 group based on key, delete/create group if needed."""
        group = self.get_node(key)

        # we make this assertion for mypy; the get_node call will already
        # have raised if this is incorrect
        assert self._handle is not None

        # remove the node if we are not appending
        if group is not None and not append:
            self._handle.remove_node(group, recursive=True)
            group = None

        if group is None:
            group = self._create_nodes_and_group(key)

        return group

    def _create_nodes_and_group(self, key: str) -> Node:
        """Create nodes from key and return group name."""
        # assertion for mypy
        assert self._handle is not None

        paths = key.split("/")
        # recursively create the groups
        path = "/"
        for p in paths:
            if not len(p):
                continue
            new_path = path
            if not path.endswith("/"):
                new_path += "/"
            new_path += p
            group = self.get_node(new_path)
            if group is None:
                group = self._handle.create_group(path, p)
            path = new_path
        return group


class TableIterator:
    """
    Define the iteration interface on a table

    Parameters
    ----------
    store : HDFStore
    s : the referred storer
    func : the function to execute the query
    where : the where of the query
    nrows : the rows to iterate on
    start : the passed start value (default is None)
    stop : the passed stop value (default is None)
    iterator : bool, default False
        Whether to use the default iterator.
    chunksize : the passed chunking value (default is 100000)
    auto_close : bool, default False
        Whether to automatically close the store at the end of iteration.
    """

    chunksize: int | None
    store: HDFStore
    s: GenericFixed | Table

    def __init__(
        self,
        store: HDFStore,
        s: GenericFixed | Table,
        func,
        where,
        nrows,
        start=None,
        stop=None,
        iterator: bool = False,
        chunksize: int | None = None,
        auto_close: bool = False,
    ) -> None:
        self.store = store
        self.s = s
        self.func = func
        self.where = where

        # set start/stop if they are not set if we are a table
        if self.s.is_table:
            if nrows is None:
                nrows = 0
            if start is None:
                start = 0
            if stop is None:
                stop = nrows
            stop = min(nrows, stop)

        self.nrows = nrows
        self.start = start
        self.stop = stop

        self.coordinates = None
        if iterator or chunksize is not None:
            if chunksize is None:
                chunksize = 100000
            self.chunksize = int(chunksize)
        else:
            self.chunksize = None

        self.auto_close = auto_close

    def __iter__(self):
        # iterate
        current = self.start
        if self.coordinates is None:
            raise ValueError("Cannot iterate until get_result is called.")
        while current < self.stop:
            stop = min(current + self.chunksize, self.stop)
            value = self.func(None, None, self.coordinates[current:stop])
            current = stop
            if value is None or not len(value):
                continue

            yield value

        self.close()

    def close(self) -> None:
        if self.auto_close:
            self.store.close()

    def get_result(self, coordinates: bool = False):
        # return the actual iterator
        if self.chunksize is not None:
            if not isinstance(self.s, Table):
                raise TypeError("can only use an iterator or chunksize on a table")

            self.coordinates = self.s.read_coordinates(where=self.where)

            return self

        # if specified, read via coordinates (necessary for multiple selections)
        if coordinates:
            if not isinstance(self.s, Table):
                raise TypeError("can only read_coordinates on a table")
            where = self.s.read_coordinates(
                where=self.where, start=self.start, stop=self.stop
            )
        else:
            where = self.where

        # directly return the result
        results = self.func(self.start, self.stop, where)
        self.close()
        return results


class IndexCol:
    """
    an index column description class

    Parameters
    ----------
    axis : axis which I reference
    values : the ndarray like converted values
    kind : a string description of this type
    typ : the pytables type
    pos : the position in the pytables

    """

    is_an_indexable: bool = True
    is_data_indexable: bool = True
    _info_fields = ["freq", "tz", "index_name"]

    name: str
    cname: str

    def __init__(
        self,
        name: str,
        values=None,
        kind=None,
        typ=None,
        cname: str | None = None,
        axis=None,
        pos=None,
        freq=None,
        tz=None,
        index_name=None,
        ordered=None,
        table=None,
        meta=None,
        metadata=None,
    ) -> None:

        if not isinstance(name, str):
            raise ValueError("`name` must be a str.")

        self.values = values
        self.kind = kind
        self.typ = typ
        self.name = name
        self.cname = cname or name
        self.axis = axis
        self.pos = pos
        self.freq = freq
        self.tz = tz
        self.index_name = index_name
        self.ordered = ordered
        self.table = table
        self.meta = meta
        self.metadata = metadata

        if pos is not None:
            self.set_pos(pos)

        # These are ensured as long as the passed arguments match the
        # constructor annotations.
        assert isinstance(self.name, str)
        assert isinstance(self.cname, str)

    @property
    def itemsize(self) -> int:
        # Assumes self.typ has already been initialized
        return self.typ.itemsize

    @property
    def kind_attr(self) -> str:
        return f"{self.name}_kind"

    def set_pos(self, pos: int) -> None:
        """set the position of this column in the Table"""
        self.pos = pos
        if pos is not None and self.typ is not None:
            self.typ._v_pos = pos

    def __repr__(self) -> str:
        temp = tuple(
            map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
        )
        return ",".join(
            [
                f"{key}->{value}"
                for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
            ]
        )

    def __eq__(self, other: Any) -> bool:
        """compare 2 col items"""
        return all(
            getattr(self, a, None) == getattr(other, a, None)
            for a in ["name", "cname", "axis", "pos"]
        )

    def __ne__(self, other) -> bool:
        return not self.__eq__(other)

    @property
    def is_indexed(self) -> bool:
        """return whether I am an indexed column"""
        if not hasattr(self.table, "cols"):
            # e.g. if infer hasn't been called yet, self.table will be None.
            return False
        return getattr(self.table.cols, self.cname).is_indexed
2061 def convert(
2062 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2063 ) -> tuple[np.ndarray, np.ndarray] | tuple[DatetimeIndex, DatetimeIndex]:
2064 """
2065 Convert the data from this selection to the appropriate pandas type.
2066 """
2067 assert isinstance(values, np.ndarray), type(values)
2069 # values is a recarray
2070 if values.dtype.fields is not None:
2071 values = values[self.cname]
2073 val_kind = _ensure_decoded(self.kind)
2074 values = _maybe_convert(values, val_kind, encoding, errors)
2076 kwargs = {}
2077 kwargs["name"] = _ensure_decoded(self.index_name)
2079 if self.freq is not None:
2080 kwargs["freq"] = _ensure_decoded(self.freq)
2082 factory: type[Index] | type[DatetimeIndex] = Index
2083 if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype):
2084 factory = DatetimeIndex
2085 elif values.dtype == "i8" and "freq" in kwargs:
2086 # PeriodIndex data is stored as i8
2087 # error: Incompatible types in assignment (expression has type
2088 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
2089 # "Union[Type[Index], Type[DatetimeIndex]]")
2090 factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment]
2091 ordinal=x, **kwds
2092 )
2094 # making an Index instance could throw a number of different errors
2095 try:
2096 new_pd_index = factory(values, **kwargs)
2097 except ValueError:
2098 # if the output freq is different than what we recorded,
2099 # it should be None (see also 'doc example part 2')
2100 if "freq" in kwargs:
2101 kwargs["freq"] = None
2102 new_pd_index = factory(values, **kwargs)
2103 final_pd_index = _set_tz(new_pd_index, self.tz)
2104 return final_pd_index, final_pd_index
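To make the i8 branch above concrete: period data round-trips as int64 ordinals, and a mismatched freq falls back to None on retry. A hedged sketch (ordinal values are illustrative; the ordinal keyword is valid in this pandas line):

import numpy as np
import pandas as pd

ordinals = np.array([600, 601, 602], dtype="i8")   # months since 1970-01
pi = pd.PeriodIndex(ordinal=ordinals, freq="M")    # -> 2020-01..2020-03
assert pi.asi8.tolist() == ordinals.tolist()       # lossless round trip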
2106 def take_data(self):
2107 """return the values"""
2108 return self.values
2110 @property
2111 def attrs(self):
2112 return self.table._v_attrs
2114 @property
2115 def description(self):
2116 return self.table.description
2118 @property
2119 def col(self):
2120 """return my current col description"""
2121 return getattr(self.description, self.cname, None)
2123 @property
2124 def cvalues(self):
2125 """return my cython values"""
2126 return self.values
2128 def __iter__(self):
2129 return iter(self.values)
2131 def maybe_set_size(self, min_itemsize=None) -> None:
2132 """
2133 maybe set a string col itemsize:
2134 min_itemsize can be an integer or a dict with this column's name
2135 with an integer size
2136 """
2137 if _ensure_decoded(self.kind) == "string":
2138 if isinstance(min_itemsize, dict):
2139 min_itemsize = min_itemsize.get(self.name)
2141 if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2142 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
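The public knob behind maybe_set_size is the min_itemsize argument to append/put. A minimal sketch (file name illustrative), preallocating room so later, longer strings still fit:

import pandas as pd

df = pd.DataFrame({"s": ["ab", "cd"]})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df, min_itemsize={"s": 30})   # widen the string column
    store.append("df", pd.DataFrame({"s": ["a-much-longer-string"]}))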
2144 def validate_names(self) -> None:
2145 pass
2147 def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
2148 self.table = handler.table
2149 self.validate_col()
2150 self.validate_attr(append)
2151 self.validate_metadata(handler)
2152 self.write_metadata(handler)
2153 self.set_attr()
2155 def validate_col(self, itemsize=None):
2156 """validate this column: return the compared against itemsize"""
2157 # validate this column for string truncation (or reset to the max size)
2158 if _ensure_decoded(self.kind) == "string":
2159 c = self.col
2160 if c is not None:
2161 if itemsize is None:
2162 itemsize = self.itemsize
2163 if c.itemsize < itemsize:
2164 raise ValueError(
2165 f"Trying to store a string with len [{itemsize}] in "
2166 f"[{self.cname}] column but\nthis column has a limit of "
2167 f"[{c.itemsize}]!\nConsider using min_itemsize to "
2168 "preset the sizes on these columns"
2169 )
2170 return c.itemsize
2172 return None
2174 def validate_attr(self, append: bool) -> None:
2175 # check for backwards incompatibility
2176 if append:
2177 existing_kind = getattr(self.attrs, self.kind_attr, None)
2178 if existing_kind is not None and existing_kind != self.kind:
2179 raise TypeError(
2180 f"incompatible kind in col [{existing_kind} - {self.kind}]"
2181 )
2183 def update_info(self, info) -> None:
2184 """
2185 set/update the info for this indexable with the key/value
2186 if there is a conflict raise/warn as needed
2187 """
2188 for key in self._info_fields:
2190 value = getattr(self, key, None)
2191 idx = info.setdefault(self.name, {})
2193 existing_value = idx.get(key)
2194 if key in idx and value is not None and existing_value != value:
2195 # frequency/name just warn
2196 if key in ["freq", "index_name"]:
2197 ws = attribute_conflict_doc % (key, existing_value, value)
2198 warnings.warn(
2199 ws, AttributeConflictWarning, stacklevel=find_stack_level()
2200 )
2202 # reset
2203 idx[key] = None
2204 setattr(self, key, None)
2206 else:
2207 raise ValueError(
2208 f"invalid info for [{self.name}] for [{key}], "
2209 f"existing_value [{existing_value}] conflicts with "
2210 f"new value [{value}]"
2211 )
2212 else:
2213 if value is not None or existing_value is not None:
2214 idx[key] = value
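The freq/index_name branch above is what users see as an AttributeConflictWarning: appending a frame whose index name differs resets the stored name rather than failing. A hedged sketch (file name illustrative):

import pandas as pd

df1 = pd.DataFrame({"A": [1]}, index=pd.Index([0], name="x"))
df2 = pd.DataFrame({"A": [2]}, index=pd.Index([1], name="y"))
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df1)
    store.append("df", df2)   # warns: conflicting index_name, resets it to None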
2216 def set_info(self, info) -> None:
2217 """set my state from the passed info"""
2218 idx = info.get(self.name)
2219 if idx is not None:
2220 self.__dict__.update(idx)
2222 def set_attr(self) -> None:
2223 """set the kind for this column"""
2224 setattr(self.attrs, self.kind_attr, self.kind)
2226 def validate_metadata(self, handler: AppendableTable) -> None:
2227 """validate that kind=category does not change the categories"""
2228 if self.meta == "category":
2229 new_metadata = self.metadata
2230 cur_metadata = handler.read_metadata(self.cname)
2231 if (
2232 new_metadata is not None
2233 and cur_metadata is not None
2234 and not array_equivalent(new_metadata, cur_metadata)
2235 ):
2236 raise ValueError(
2237 "cannot append a categorical with "
2238 "different categories to the existing"
2239 )
2241 def write_metadata(self, handler: AppendableTable) -> None:
2242 """set the meta data"""
2243 if self.metadata is not None:
2244 handler.write_metadata(self.cname, self.metadata)
2247class GenericIndexCol(IndexCol):
2248 """an index which is not represented in the data of the table"""
2250 @property
2251 def is_indexed(self) -> bool:
2252 return False
2254 # error: Return type "Tuple[Int64Index, Int64Index]" of "convert"
2255 # incompatible with return type "Union[Tuple[ndarray[Any, Any],
2256 # ndarray[Any, Any]], Tuple[DatetimeIndex, DatetimeIndex]]" in
2257 # supertype "IndexCol"
2258 def convert( # type: ignore[override]
2259 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2260 ) -> tuple[Int64Index, Int64Index]:
2261 """
2262 Convert the data from this selection to the appropriate pandas type.
2264 Parameters
2265 ----------
2266 values : np.ndarray
2267 nan_rep : str
2268 encoding : str
2269 errors : str
2270 """
2271 assert isinstance(values, np.ndarray), type(values)
2273 index = Int64Index(np.arange(len(values)))
2274 return index, index
2276 def set_attr(self) -> None:
2277 pass
2280class DataCol(IndexCol):
2281 """
2282 a data holding column, by definition this is not indexable
2284 Parameters
2285 ----------
2286 data : the actual data
2287 cname : the column name in the table to hold the data (typically
2288 values)
2289 meta : a string description of the metadata
2290 metadata : the actual metadata
2291 """
2293 is_an_indexable = False
2294 is_data_indexable = False
2295 _info_fields = ["tz", "ordered"]
2297 def __init__(
2298 self,
2299 name: str,
2300 values=None,
2301 kind=None,
2302 typ=None,
2303 cname=None,
2304 pos=None,
2305 tz=None,
2306 ordered=None,
2307 table=None,
2308 meta=None,
2309 metadata=None,
2310 dtype: DtypeArg | None = None,
2311 data=None,
2312 ) -> None:
2313 super().__init__(
2314 name=name,
2315 values=values,
2316 kind=kind,
2317 typ=typ,
2318 pos=pos,
2319 cname=cname,
2320 tz=tz,
2321 ordered=ordered,
2322 table=table,
2323 meta=meta,
2324 metadata=metadata,
2325 )
2326 self.dtype = dtype
2327 self.data = data
2329 @property
2330 def dtype_attr(self) -> str:
2331 return f"{self.name}_dtype"
2333 @property
2334 def meta_attr(self) -> str:
2335 return f"{self.name}_meta"
2337 def __repr__(self) -> str:
2338 temp = tuple(
2339 map(
2340 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2341 )
2342 )
2343 return ",".join(
2344 [
2345 f"{key}->{value}"
2346 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2347 ]
2348 )
2350 def __eq__(self, other: Any) -> bool:
2351 """compare 2 col items"""
2352 return all(
2353 getattr(self, a, None) == getattr(other, a, None)
2354 for a in ["name", "cname", "dtype", "pos"]
2355 )
2357 def set_data(self, data: ArrayLike) -> None:
2358 assert data is not None
2359 assert self.dtype is None
2361 data, dtype_name = _get_data_and_dtype_name(data)
2363 self.data = data
2364 self.dtype = dtype_name
2365 self.kind = _dtype_to_kind(dtype_name)
2367 def take_data(self):
2368 """return the data"""
2369 return self.data
2371 @classmethod
2372 def _get_atom(cls, values: ArrayLike) -> Col:
2373 """
2374 Get an appropriately typed and shaped pytables.Col object for values.
2375 """
2376 dtype = values.dtype
2377 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
2378 # attribute "itemsize"
2379 itemsize = dtype.itemsize # type: ignore[union-attr]
2381 shape = values.shape
2382 if values.ndim == 1:
2383 # EA, use block shape pretending it is 2D
2384 # TODO(EA2D): not necessary with 2D EAs
2385 shape = (1, values.size)
2387 if isinstance(values, Categorical):
2388 codes = values.codes
2389 atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2390 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
2391 atom = cls.get_atom_datetime64(shape)
2392 elif is_timedelta64_dtype(dtype):
2393 atom = cls.get_atom_timedelta64(shape)
2394 elif is_complex_dtype(dtype):
2395 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2396 elif is_string_dtype(dtype):
2397 atom = cls.get_atom_string(shape, itemsize)
2398 else:
2399 atom = cls.get_atom_data(shape, kind=dtype.name)
2401 return atom
2403 @classmethod
2404 def get_atom_string(cls, shape, itemsize):
2405 return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2407 @classmethod
2408 def get_atom_coltype(cls, kind: str) -> type[Col]:
2409 """return the PyTables column class for this column"""
2410 if kind.startswith("uint"):
2411 k4 = kind[4:]
2412 col_name = f"UInt{k4}Col"
2413 elif kind.startswith("period"):
2414 # we store as integer
2415 col_name = "Int64Col"
2416 else:
2417 kcap = kind.capitalize()
2418 col_name = f"{kcap}Col"
2420 return getattr(_tables(), col_name)
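For illustration, the naming rule above can be mirrored as a free function (a hedged sketch; coltype_for is a hypothetical helper, not part of this module):

import tables

def coltype_for(kind: str):
    # mirrors get_atom_coltype: "float64" -> Float64Col, "uint8" -> UInt8Col
    if kind.startswith("uint"):
        name = f"UInt{kind[4:]}Col"
    elif kind.startswith("period"):
        name = "Int64Col"          # periods are persisted as int64 ordinals
    else:
        name = f"{kind.capitalize()}Col"
    return getattr(tables, name)

assert coltype_for("float64") is tables.Float64Col
assert coltype_for("uint8") is tables.UInt8Col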
2422 @classmethod
2423 def get_atom_data(cls, shape, kind: str) -> Col:
2424 return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2426 @classmethod
2427 def get_atom_datetime64(cls, shape):
2428 return _tables().Int64Col(shape=shape[0])
2430 @classmethod
2431 def get_atom_timedelta64(cls, shape):
2432 return _tables().Int64Col(shape=shape[0])
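Both datetime64 and timedelta64 go through Int64Col because an i8 view is a lossless reinterpretation of the same bytes. A quick numpy demonstration:

import numpy as np

stamps = np.array(["2023-01-01", "2023-01-02"], dtype="M8[ns]")
as_i8 = stamps.view("i8")                        # what lands in the Int64Col
assert (as_i8.view("M8[ns]") == stamps).all()    # view back on read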
2434 @property
2435 def shape(self):
2436 return getattr(self.data, "shape", None)
2438 @property
2439 def cvalues(self):
2440 """return my cython values"""
2441 return self.data
2443 def validate_attr(self, append) -> None:
2444 """validate that we have the same order as the existing & same dtype"""
2445 if append:
2446 existing_fields = getattr(self.attrs, self.kind_attr, None)
2447 if existing_fields is not None and existing_fields != list(self.values):
2448 raise ValueError("appended items do not match existing items in table!")
2450 existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2451 if existing_dtype is not None and existing_dtype != self.dtype:
2452 raise ValueError(
2453 "appended items' dtype does not match existing items' dtype in table!"
2454 )
2456 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2457 """
2458 Convert the data from this selection to the appropriate pandas type.
2460 Parameters
2461 ----------
2462 values : np.ndarray
2463 nan_rep : str
2464 encoding : str
2465 errors : str
2467 Returns
2468 -------
2469 index : listlike to become an Index
2470 data : ndarraylike to become a column
2471 """
2472 assert isinstance(values, np.ndarray), type(values)
2474 # values is a recarray
2475 if values.dtype.fields is not None:
2476 values = values[self.cname]
2478 assert self.typ is not None
2479 if self.dtype is None:
2480 # Note: in tests we never have timedelta64 or datetime64,
2481 # so the _get_data_and_dtype_name may be unnecessary
2482 converted, dtype_name = _get_data_and_dtype_name(values)
2483 kind = _dtype_to_kind(dtype_name)
2484 else:
2485 converted = values
2486 dtype_name = self.dtype
2487 kind = self.kind
2489 assert isinstance(converted, np.ndarray) # for mypy
2491 # use the meta if needed
2492 meta = _ensure_decoded(self.meta)
2493 metadata = self.metadata
2494 ordered = self.ordered
2495 tz = self.tz
2497 assert dtype_name is not None
2498 # convert to the correct dtype
2499 dtype = _ensure_decoded(dtype_name)
2501 # reverse converts
2502 if dtype == "datetime64":
2503 # recreate with tz if indicated
2504 converted = _set_tz(converted, tz, coerce=True)
2506 elif dtype == "timedelta64":
2507 converted = np.asarray(converted, dtype="m8[ns]")
2508 elif dtype == "date":
2509 try:
2510 converted = np.asarray(
2511 [date.fromordinal(v) for v in converted], dtype=object
2512 )
2513 except ValueError:
2514 converted = np.asarray(
2515 [date.fromtimestamp(v) for v in converted], dtype=object
2516 )
2518 elif meta == "category":
2519 # we have a categorical
2520 categories = metadata
2521 codes = converted.ravel()
2523 # if we have stored a NaN in the categories
2524 # then strip it; in theory we could have BOTH
2525 # -1s in the codes and nulls :<
2526 if categories is None:
2527 # Handle case of NaN-only categorical columns in which case
2528 # the categories are an empty array; when this is stored,
2529 # pytables cannot write a zero-len array, so on readback
2530 # the categories would be None and `read_hdf()` would fail.
2531 categories = Index([], dtype=np.float64)
2532 else:
2533 mask = isna(categories)
2534 if mask.any():
2535 categories = categories[~mask]
2536 codes[codes != -1] -= mask.astype(int).cumsum()._values
2538 converted = Categorical.from_codes(
2539 codes, categories=categories, ordered=ordered
2540 )
2542 else:
2544 try:
2545 converted = converted.astype(dtype, copy=False)
2546 except TypeError:
2547 converted = converted.astype("O", copy=False)
2549 # convert nans / decode
2550 if _ensure_decoded(kind) == "string":
2551 converted = _unconvert_string_array(
2552 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2553 )
2555 return self.values, converted
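The category branch above reassembles a Categorical from stored codes plus the metadata array. The core call, in isolation (values illustrative):

import numpy as np
import pandas as pd

codes = np.array([0, 1, -1, 0], dtype="i8")   # -1 encodes a missing value
cat = pd.Categorical.from_codes(codes, categories=pd.Index(["a", "b"]),
                                ordered=False)
# -> ['a', 'b', NaN, 'a']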
2557 def set_attr(self) -> None:
2558 """set the data for this column"""
2559 setattr(self.attrs, self.kind_attr, self.values)
2560 setattr(self.attrs, self.meta_attr, self.meta)
2561 assert self.dtype is not None
2562 setattr(self.attrs, self.dtype_attr, self.dtype)
2565class DataIndexableCol(DataCol):
2566 """represent a data column that can be indexed"""
2568 is_data_indexable = True
2570 def validate_names(self) -> None:
2571 if not Index(self.values).is_object():
2572 # TODO: should the message here be more specifically non-str?
2573 raise ValueError("cannot have non-object label DataIndexableCol")
2575 @classmethod
2576 def get_atom_string(cls, shape, itemsize):
2577 return _tables().StringCol(itemsize=itemsize)
2579 @classmethod
2580 def get_atom_data(cls, shape, kind: str) -> Col:
2581 return cls.get_atom_coltype(kind=kind)()
2583 @classmethod
2584 def get_atom_datetime64(cls, shape):
2585 return _tables().Int64Col()
2587 @classmethod
2588 def get_atom_timedelta64(cls, shape):
2589 return _tables().Int64Col()
2592class GenericDataIndexableCol(DataIndexableCol):
2593 """represent a generic pytables data column"""
2595 pass
2598class Fixed:
2599 """
2600 represent an object in my store
2601 facilitate read/write of various types of objects
2602 this is an abstract base class
2604 Parameters
2605 ----------
2606 parent : HDFStore
2607 group : Node
2608 The group node where the table resides.
2609 """
2611 pandas_kind: str
2612 format_type: str = "fixed" # GH#30962 needed by dask
2613 obj_type: type[DataFrame | Series]
2614 ndim: int
2615 encoding: str
2616 parent: HDFStore
2617 group: Node
2618 errors: str
2619 is_table: bool = False
2621 def __init__(
2622 self,
2623 parent: HDFStore,
2624 group: Node,
2625 encoding: str = "UTF-8",
2626 errors: str = "strict",
2627 ) -> None:
2628 assert isinstance(parent, HDFStore), type(parent)
2629 assert _table_mod is not None # needed for mypy
2630 assert isinstance(group, _table_mod.Node), type(group)
2631 self.parent = parent
2632 self.group = group
2633 self.encoding = _ensure_encoding(encoding)
2634 self.errors = errors
2636 @property
2637 def is_old_version(self) -> bool:
2638 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2640 @property
2641 def version(self) -> tuple[int, int, int]:
2642 """compute and set our version"""
2643 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2644 try:
2645 version = tuple(int(x) for x in version.split("."))
2646 if len(version) == 2:
2647 version = version + (0,)
2648 except AttributeError:
2649 version = (0, 0, 0)
2650 return version
2652 @property
2653 def pandas_type(self):
2654 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2656 def __repr__(self) -> str:
2657 """return a pretty representation of myself"""
2658 self.infer_axes()
2659 s = self.shape
2660 if s is not None:
2661 if isinstance(s, (list, tuple)):
2662 jshape = ",".join([pprint_thing(x) for x in s])
2663 s = f"[{jshape}]"
2664 return f"{self.pandas_type:12.12} (shape->{s})"
2665 return self.pandas_type
2667 def set_object_info(self) -> None:
2668 """set my pandas type & version"""
2669 self.attrs.pandas_type = str(self.pandas_kind)
2670 self.attrs.pandas_version = str(_version)
2672 def copy(self) -> Fixed:
2673 new_self = copy.copy(self)
2674 return new_self
2676 @property
2677 def shape(self):
2678 return self.nrows
2680 @property
2681 def pathname(self):
2682 return self.group._v_pathname
2684 @property
2685 def _handle(self):
2686 return self.parent._handle
2688 @property
2689 def _filters(self):
2690 return self.parent._filters
2692 @property
2693 def _complevel(self) -> int:
2694 return self.parent._complevel
2696 @property
2697 def _fletcher32(self) -> bool:
2698 return self.parent._fletcher32
2700 @property
2701 def attrs(self):
2702 return self.group._v_attrs
2704 def set_attrs(self) -> None:
2705 """set our object attributes"""
2706 pass
2708 def get_attrs(self) -> None:
2709 """get our object attributes"""
2710 pass
2712 @property
2713 def storable(self):
2714 """return my storable"""
2715 return self.group
2717 @property
2718 def is_exists(self) -> bool:
2719 return False
2721 @property
2722 def nrows(self):
2723 return getattr(self.storable, "nrows", None)
2725 def validate(self, other) -> Literal[True] | None:
2726 """validate against an existing storable"""
2727 if other is None:
2728 return None
2729 return True
2731 def validate_version(self, where=None) -> None:
2732 """are we trying to operate on an old version?"""
2733 pass
2735 def infer_axes(self) -> bool:
2736 """
2737 infer the axes of my storer
2738 return a boolean indicating if we have a valid storer or not
2739 """
2740 s = self.storable
2741 if s is None:
2742 return False
2743 self.get_attrs()
2744 return True
2746 def read(
2747 self,
2748 where=None,
2749 columns=None,
2750 start: int | None = None,
2751 stop: int | None = None,
2752 ):
2753 raise NotImplementedError(
2754 "cannot read on an abstract storer: subclasses should implement"
2755 )
2757 def write(self, **kwargs):
2758 raise NotImplementedError(
2759 "cannot write on an abstract storer: subclasses should implement"
2760 )
2762 def delete(
2763 self, where=None, start: int | None = None, stop: int | None = None
2764 ) -> None:
2765 """
2766 support fully deleting the node in its entirety (only) - where
2767 specification must be None
2768 """
2769 if com.all_none(where, start, stop):
2770 self._handle.remove_node(self.group, recursive=True)
2771 return None
2773 raise TypeError("cannot delete on an abstract storer")
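At the HDFStore level this is why remove() on a fixed-format node only works without a where clause. A minimal sketch (file name illustrative):

import pandas as pd

with pd.HDFStore("demo.h5", mode="w") as store:
    store.put("s", pd.Series([1, 2, 3]))     # fixed format by default
    store.remove("s")                        # whole-node delete: fine
    # store.remove("s", where="index > 0") would raise on a fixed-format node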
2776class GenericFixed(Fixed):
2777 """a generified fixed version"""
2779 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2780 _reverse_index_map = {v: k for k, v in _index_type_map.items()}
2781 attributes: list[str] = []
2783 # indexer helpers
2784 def _class_to_alias(self, cls) -> str:
2785 return self._index_type_map.get(cls, "")
2787 def _alias_to_class(self, alias):
2788 if isinstance(alias, type): # pragma: no cover
2789 # compat: for a short period of time master stored types
2790 return alias
2791 return self._reverse_index_map.get(alias, Index)
2793 def _get_index_factory(self, attrs):
2794 index_class = self._alias_to_class(
2795 _ensure_decoded(getattr(attrs, "index_class", ""))
2796 )
2798 factory: Callable
2800 if index_class == DatetimeIndex:
2802 def f(values, freq=None, tz=None):
2803 # data are already in UTC, localize and convert if tz present
2804 dta = DatetimeArray._simple_new(values.values, freq=freq)
2805 result = DatetimeIndex._simple_new(dta, name=None)
2806 if tz is not None:
2807 result = result.tz_localize("UTC").tz_convert(tz)
2808 return result
2810 factory = f
2811 elif index_class == PeriodIndex:
2813 def f(values, freq=None, tz=None):
2814 parr = PeriodArray._simple_new(values, freq=freq)
2815 return PeriodIndex._simple_new(parr, name=None)
2817 factory = f
2818 else:
2819 factory = index_class
2821 kwargs = {}
2822 if "freq" in attrs:
2823 kwargs["freq"] = attrs["freq"]
2824 if index_class is Index:
2825 # DTI/PI would be gotten by _alias_to_class
2826 factory = TimedeltaIndex
2828 if "tz" in attrs:
2829 if isinstance(attrs["tz"], bytes):
2830 # created by python2
2831 kwargs["tz"] = attrs["tz"].decode("utf-8")
2832 else:
2833 # created by python3
2834 kwargs["tz"] = attrs["tz"]
2835 assert index_class is DatetimeIndex # just checking
2837 return factory, kwargs
2839 def validate_read(self, columns, where) -> None:
2840 """
2841 raise if any keywords are passed which are not-None
2842 """
2843 if columns is not None:
2844 raise TypeError(
2845 "cannot pass a column specification when reading "
2846 "a Fixed format store. This store must be selected in its entirety"
2847 )
2848 if where is not None:
2849 raise TypeError(
2850 "cannot pass a where specification when reading "
2851 "from a Fixed format store. This store must be selected in its entirety"
2852 )
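In user-facing terms: fixed-format stores must be read whole. A hedged sketch of what validate_read rejects (file name illustrative):

import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df.to_hdf("demo.h5", "df", format="fixed")

pd.read_hdf("demo.h5", "df")                   # OK: the entire object
# pd.read_hdf("demo.h5", "df", columns=["A"])  # TypeError per the check above
# pd.read_hdf("demo.h5", "df", where="A > 1")  # TypeError per the check above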
2854 @property
2855 def is_exists(self) -> bool:
2856 return True
2858 def set_attrs(self) -> None:
2859 """set our object attributes"""
2860 self.attrs.encoding = self.encoding
2861 self.attrs.errors = self.errors
2863 def get_attrs(self) -> None:
2864 """retrieve our attributes"""
2865 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2866 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2867 for n in self.attributes:
2868 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2870 # error: Signature of "write" incompatible with supertype "Fixed"
2871 def write(self, obj, **kwargs) -> None: # type: ignore[override]
2872 self.set_attrs()
2874 def read_array(self, key: str, start: int | None = None, stop: int | None = None):
2875 """read an array for the specified node (off of group)"""
2876 import tables
2878 node = getattr(self.group, key)
2879 attrs = node._v_attrs
2881 transposed = getattr(attrs, "transposed", False)
2883 if isinstance(node, tables.VLArray):
2884 ret = node[0][start:stop]
2885 else:
2886 dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2887 shape = getattr(attrs, "shape", None)
2889 if shape is not None:
2890 # length 0 axis
2891 ret = np.empty(shape, dtype=dtype)
2892 else:
2893 ret = node[start:stop]
2895 if dtype == "datetime64":
2896 # reconstruct a timezone if indicated
2897 tz = getattr(attrs, "tz", None)
2898 ret = _set_tz(ret, tz, coerce=True)
2900 elif dtype == "timedelta64":
2901 ret = np.asarray(ret, dtype="m8[ns]")
2903 if transposed:
2904 return ret.T
2905 else:
2906 return ret
2908 def read_index(
2909 self, key: str, start: int | None = None, stop: int | None = None
2910 ) -> Index:
2911 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2913 if variety == "multi":
2914 return self.read_multi_index(key, start=start, stop=stop)
2915 elif variety == "regular":
2916 node = getattr(self.group, key)
2917 index = self.read_index_node(node, start=start, stop=stop)
2918 return index
2919 else: # pragma: no cover
2920 raise TypeError(f"unrecognized index variety: {variety}")
2922 def write_index(self, key: str, index: Index) -> None:
2923 if isinstance(index, MultiIndex):
2924 setattr(self.attrs, f"{key}_variety", "multi")
2925 self.write_multi_index(key, index)
2926 else:
2927 setattr(self.attrs, f"{key}_variety", "regular")
2928 converted = _convert_index("index", index, self.encoding, self.errors)
2930 self.write_array(key, converted.values)
2932 node = getattr(self.group, key)
2933 node._v_attrs.kind = converted.kind
2934 node._v_attrs.name = index.name
2936 if isinstance(index, (DatetimeIndex, PeriodIndex)):
2937 node._v_attrs.index_class = self._class_to_alias(type(index))
2939 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
2940 node._v_attrs.freq = index.freq
2942 if isinstance(index, DatetimeIndex) and index.tz is not None:
2943 node._v_attrs.tz = _get_tz(index.tz)
2945 def write_multi_index(self, key: str, index: MultiIndex) -> None:
2946 setattr(self.attrs, f"{key}_nlevels", index.nlevels)
2948 for i, (lev, level_codes, name) in enumerate(
2949 zip(index.levels, index.codes, index.names)
2950 ):
2951 # write the level
2952 if is_extension_array_dtype(lev):
2953 raise NotImplementedError(
2954 "Saving a MultiIndex with an extension dtype is not supported."
2955 )
2956 level_key = f"{key}_level{i}"
2957 conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
2958 self.write_array(level_key, conv_level.values)
2959 node = getattr(self.group, level_key)
2960 node._v_attrs.kind = conv_level.kind
2961 node._v_attrs.name = name
2963 # write the name
2964 setattr(node._v_attrs, f"{key}_name{name}", name)
2966 # write the labels
2967 label_key = f"{key}_label{i}"
2968 self.write_array(label_key, level_codes)
2970 def read_multi_index(
2971 self, key: str, start: int | None = None, stop: int | None = None
2972 ) -> MultiIndex:
2973 nlevels = getattr(self.attrs, f"{key}_nlevels")
2975 levels = []
2976 codes = []
2977 names: list[Hashable] = []
2978 for i in range(nlevels):
2979 level_key = f"{key}_level{i}"
2980 node = getattr(self.group, level_key)
2981 lev = self.read_index_node(node, start=start, stop=stop)
2982 levels.append(lev)
2983 names.append(lev.name)
2985 label_key = f"{key}_label{i}"
2986 level_codes = self.read_array(label_key, start=start, stop=stop)
2987 codes.append(level_codes)
2989 return MultiIndex(
2990 levels=levels, codes=codes, names=names, verify_integrity=True
2991 )
2993 def read_index_node(
2994 self, node: Node, start: int | None = None, stop: int | None = None
2995 ) -> Index:
2996 data = node[start:stop]
2997 # If the index was an empty array write_array_empty() will
2998 # have written a sentinel. Here we replace it with the original.
2999 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
3000 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
3001 kind = _ensure_decoded(node._v_attrs.kind)
3002 name = None
3004 if "name" in node._v_attrs:
3005 name = _ensure_str(node._v_attrs.name)
3006 name = _ensure_decoded(name)
3008 attrs = node._v_attrs
3009 factory, kwargs = self._get_index_factory(attrs)
3011 if kind == "date":
3012 index = factory(
3013 _unconvert_index(
3014 data, kind, encoding=self.encoding, errors=self.errors
3015 ),
3016 dtype=object,
3017 **kwargs,
3018 )
3019 else:
3020 index = factory(
3021 _unconvert_index(
3022 data, kind, encoding=self.encoding, errors=self.errors
3023 ),
3024 **kwargs,
3025 )
3027 index.name = name
3029 return index
3031 def write_array_empty(self, key: str, value: ArrayLike) -> None:
3032 """write a 0-len array"""
3033 # ugly hack for length 0 axes
3034 arr = np.empty((1,) * value.ndim)
3035 self._handle.create_array(self.group, key, arr)
3036 node = getattr(self.group, key)
3037 node._v_attrs.value_type = str(value.dtype)
3038 node._v_attrs.shape = value.shape
3040 def write_array(
3041 self, key: str, obj: AnyArrayLike, items: Index | None = None
3042 ) -> None:
3043 # TODO: we only have a few tests that get here, the only EA
3044 # that gets passed is DatetimeArray, and we never have
3045 # both self._filters and EA
3047 value = extract_array(obj, extract_numpy=True)
3049 if key in self.group:
3050 self._handle.remove_node(self.group, key)
3052 # Transform needed to interface with pytables row/col notation
3053 empty_array = value.size == 0
3054 transposed = False
3056 if is_categorical_dtype(value.dtype):
3057 raise NotImplementedError(
3058 "Cannot store a category dtype in a HDF5 dataset that uses format="
3059 '"fixed". Use format="table".'
3060 )
3061 if not empty_array:
3062 if hasattr(value, "T"):
3063 # ExtensionArrays (1d) may not have transpose.
3064 value = value.T
3065 transposed = True
3067 atom = None
3068 if self._filters is not None:
3069 with suppress(ValueError):
3070 # get the atom for this datatype
3071 atom = _tables().Atom.from_dtype(value.dtype)
3073 if atom is not None:
3074 # We only get here if self._filters is non-None and
3075 # the Atom.from_dtype call succeeded
3077 # create an empty chunked array and fill it from value
3078 if not empty_array:
3079 ca = self._handle.create_carray(
3080 self.group, key, atom, value.shape, filters=self._filters
3081 )
3082 ca[:] = value
3084 else:
3085 self.write_array_empty(key, value)
3087 elif value.dtype.type == np.object_:
3088 # infer the type, warn if we have a non-string type here (for
3089 # performance)
3090 inferred_type = lib.infer_dtype(value, skipna=False)
3091 if empty_array:
3092 pass
3093 elif inferred_type == "string":
3094 pass
3095 else:
3096 ws = performance_doc % (inferred_type, key, items)
3097 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3099 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3100 vlarr.append(value)
3102 elif is_datetime64_dtype(value.dtype):
3103 self._handle.create_array(self.group, key, value.view("i8"))
3104 getattr(self.group, key)._v_attrs.value_type = "datetime64"
3105 elif is_datetime64tz_dtype(value.dtype):
3106 # store as UTC
3107 # with a zone
3109 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3110 # attribute "asi8"
3111 self._handle.create_array(
3112 self.group, key, value.asi8 # type: ignore[union-attr]
3113 )
3115 node = getattr(self.group, key)
3116 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3117 # attribute "tz"
3118 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
3119 node._v_attrs.value_type = "datetime64"
3120 elif is_timedelta64_dtype(value.dtype):
3121 self._handle.create_array(self.group, key, value.view("i8"))
3122 getattr(self.group, key)._v_attrs.value_type = "timedelta64"
3123 elif empty_array:
3124 self.write_array_empty(key, value)
3125 else:
3126 self._handle.create_array(self.group, key, value)
3128 getattr(self.group, key)._v_attrs.transposed = transposed
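The object-dtype branch above is the source of the familiar PerformanceWarning: non-string object columns are pickled into a VLArray. A sketch that surfaces it (file name illustrative):

import warnings
import pandas as pd
from pandas.errors import PerformanceWarning

df = pd.DataFrame({"mixed": [1, "a", 3.0]})     # object dtype, not all strings
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df.to_hdf("demo.h5", "df", format="fixed")  # serialized via ObjectAtom
assert any(issubclass(w.category, PerformanceWarning) for w in caught)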
3131class SeriesFixed(GenericFixed):
3132 pandas_kind = "series"
3133 attributes = ["name"]
3135 name: Hashable
3137 @property
3138 def shape(self):
3139 try:
3140 return (len(self.group.values),)
3141 except (TypeError, AttributeError):
3142 return None
3144 def read(
3145 self,
3146 where=None,
3147 columns=None,
3148 start: int | None = None,
3149 stop: int | None = None,
3150 ) -> Series:
3151 self.validate_read(columns, where)
3152 index = self.read_index("index", start=start, stop=stop)
3153 values = self.read_array("values", start=start, stop=stop)
3154 return Series(values, index=index, name=self.name)
3156 # error: Signature of "write" incompatible with supertype "Fixed"
3157 def write(self, obj, **kwargs) -> None: # type: ignore[override]
3158 super().write(obj, **kwargs)
3159 self.write_index("index", obj.index)
3160 self.write_array("values", obj)
3161 self.attrs.name = obj.name
3164class BlockManagerFixed(GenericFixed):
3165 attributes = ["ndim", "nblocks"]
3167 nblocks: int
3169 @property
3170 def shape(self) -> Shape | None:
3171 try:
3172 ndim = self.ndim
3174 # items
3175 items = 0
3176 for i in range(self.nblocks):
3177 node = getattr(self.group, f"block{i}_items")
3178 shape = getattr(node, "shape", None)
3179 if shape is not None:
3180 items += shape[0]
3182 # data shape
3183 node = self.group.block0_values
3184 shape = getattr(node, "shape", None)
3185 if shape is not None:
3186 shape = list(shape[0 : (ndim - 1)])
3187 else:
3188 shape = []
3190 shape.append(items)
3192 return shape
3193 except AttributeError:
3194 return None
3196 def read(
3197 self,
3198 where=None,
3199 columns=None,
3200 start: int | None = None,
3201 stop: int | None = None,
3202 ) -> DataFrame:
3203 # start, stop applied to rows, so 0th axis only
3204 self.validate_read(columns, where)
3205 select_axis = self.obj_type()._get_block_manager_axis(0)
3207 axes = []
3208 for i in range(self.ndim):
3210 _start, _stop = (start, stop) if i == select_axis else (None, None)
3211 ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3212 axes.append(ax)
3214 items = axes[0]
3215 dfs = []
3217 for i in range(self.nblocks):
3219 blk_items = self.read_index(f"block{i}_items")
3220 values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3222 columns = items[items.get_indexer(blk_items)]
3223 df = DataFrame(values.T, columns=columns, index=axes[1])
3224 dfs.append(df)
3226 if len(dfs) > 0:
3227 out = concat(dfs, axis=1)
3228 out = out.reindex(columns=items, copy=False)
3229 return out
3231 return DataFrame(columns=axes[0], index=axes[1])
3233 # error: Signature of "write" incompatible with supertype "Fixed"
3234 def write(self, obj, **kwargs) -> None: # type: ignore[override]
3235 super().write(obj, **kwargs)
3237 # TODO(ArrayManager) HDFStore relies on accessing the blocks
3238 if isinstance(obj._mgr, ArrayManager):
3239 obj = obj._as_manager("block")
3241 data = obj._mgr
3242 if not data.is_consolidated():
3243 data = data.consolidate()
3245 self.attrs.ndim = data.ndim
3246 for i, ax in enumerate(data.axes):
3247 if i == 0 and (not ax.is_unique):
3248 raise ValueError("Columns index has to be unique for fixed format")
3249 self.write_index(f"axis{i}", ax)
3251 # Supporting mixed-type DataFrame objects...nontrivial
3252 self.attrs.nblocks = len(data.blocks)
3253 for i, blk in enumerate(data.blocks):
3254 # I have no idea why, but writing values before items fixed #2299
3255 blk_items = data.items.take(blk.mgr_locs)
3256 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3257 self.write_index(f"block{i}_items", blk_items)
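The uniqueness check above is easy to trip over: duplicate column labels cannot round-trip through fixed format. A sketch (file name illustrative):

import pandas as pd

df = pd.DataFrame([[1, 2]], columns=["A", "A"])   # duplicate labels
try:
    df.to_hdf("demo.h5", "df", format="fixed")
except ValueError as err:
    print(err)    # Columns index has to be unique for fixed format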
3260class FrameFixed(BlockManagerFixed):
3261 pandas_kind = "frame"
3262 obj_type = DataFrame
3265class Table(Fixed):
3266 """
3267 represent a table:
3268 facilitate read/write of various types of tables
3270 Attrs in Table Node
3271 -------------------
3272 These are attributes that are stored in the main table node; they are
3273 necessary to recreate these tables when read back in.
3275 index_axes : a list of tuples of the (original indexing axis and
3276 index column)
3277 non_index_axes: a list of tuples of the (original index axis and
3278 columns on a non-indexing axis)
3279 values_axes : a list of the columns which comprise the data of this
3280 table
3281 data_columns : a list of the columns that we are allowing indexing
3282 (these become single columns in values_axes)
3283 nan_rep : the string to use for nan representations for string
3284 objects
3285 levels : the names of levels
3286 metadata : the names of the metadata columns
3287 """
3289 pandas_kind = "wide_table"
3290 format_type: str = "table" # GH#30962 needed by dask
3291 table_type: str
3292 levels: int | list[Hashable] = 1
3293 is_table = True
3295 index_axes: list[IndexCol]
3296 non_index_axes: list[tuple[int, Any]]
3297 values_axes: list[DataCol]
3298 data_columns: list
3299 metadata: list
3300 info: dict
3302 def __init__(
3303 self,
3304 parent: HDFStore,
3305 group: Node,
3306 encoding=None,
3307 errors: str = "strict",
3308 index_axes=None,
3309 non_index_axes=None,
3310 values_axes=None,
3311 data_columns=None,
3312 info=None,
3313 nan_rep=None,
3314 ) -> None:
3315 super().__init__(parent, group, encoding=encoding, errors=errors)
3316 self.index_axes = index_axes or []
3317 self.non_index_axes = non_index_axes or []
3318 self.values_axes = values_axes or []
3319 self.data_columns = data_columns or []
3320 self.info = info or {}
3321 self.nan_rep = nan_rep
3323 @property
3324 def table_type_short(self) -> str:
3325 return self.table_type.split("_")[0]
3327 def __repr__(self) -> str:
3328 """return a pretty representation of myself"""
3329 self.infer_axes()
3330 jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3331 dc = f",dc->[{jdc}]"
3333 ver = ""
3334 if self.is_old_version:
3335 jver = ".".join([str(x) for x in self.version])
3336 ver = f"[{jver}]"
3338 jindex_axes = ",".join([a.name for a in self.index_axes])
3339 return (
3340 f"{self.pandas_type:12.12}{ver} "
3341 f"(typ->{self.table_type_short},nrows->{self.nrows},"
3342 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3343 )
3345 def __getitem__(self, c: str):
3346 """return the axis for c"""
3347 for a in self.axes:
3348 if c == a.name:
3349 return a
3350 return None
3352 def validate(self, other) -> None:
3353 """validate against an existing table"""
3354 if other is None:
3355 return
3357 if other.table_type != self.table_type:
3358 raise TypeError(
3359 "incompatible table_type with existing "
3360 f"[{other.table_type} - {self.table_type}]"
3361 )
3363 for c in ["index_axes", "non_index_axes", "values_axes"]:
3364 sv = getattr(self, c, None)
3365 ov = getattr(other, c, None)
3366 if sv != ov:
3368 # show the error for the specific axes
3369 # Argument 1 to "enumerate" has incompatible type
3370 # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
3371 for i, sax in enumerate(sv): # type: ignore[arg-type]
3372 # Value of type "Optional[Any]" is not indexable [index]
3373 oax = ov[i] # type: ignore[index]
3374 if sax != oax:
3375 raise ValueError(
3376 f"invalid combination of [{c}] on appending data "
3377 f"[{sax}] vs current table [{oax}]"
3378 )
3380 # should never get here
3381 raise Exception(
3382 f"invalid combination of [{c}] on appending data [{sv}] vs "
3383 f"current table [{ov}]"
3384 )
3386 @property
3387 def is_multi_index(self) -> bool:
3388 """the levels attribute is 1 or a list in the case of a multi-index"""
3389 return isinstance(self.levels, list)
3391 def validate_multiindex(
3392 self, obj: DataFrame | Series
3393 ) -> tuple[DataFrame, list[Hashable]]:
3394 """
3395 validate that we can store the multi-index; reset and return the
3396 new object
3397 """
3398 levels = com.fill_missing_names(obj.index.names)
3399 try:
3400 reset_obj = obj.reset_index()
3401 except ValueError as err:
3402 raise ValueError(
3403 "duplicate names/columns in the multi-index when storing as a table"
3404 ) from err
3405 assert isinstance(reset_obj, DataFrame) # for mypy
3406 return reset_obj, levels
3408 @property
3409 def nrows_expected(self) -> int:
3410 """based on our axes, compute the expected nrows"""
3411 return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3413 @property
3414 def is_exists(self) -> bool:
3415 """has this table been created"""
3416 return "table" in self.group
3418 @property
3419 def storable(self):
3420 return getattr(self.group, "table", None)
3422 @property
3423 def table(self):
3424 """return the table group (this is my storable)"""
3425 return self.storable
3427 @property
3428 def dtype(self):
3429 return self.table.dtype
3431 @property
3432 def description(self):
3433 return self.table.description
3435 @property
3436 def axes(self):
3437 return itertools.chain(self.index_axes, self.values_axes)
3439 @property
3440 def ncols(self) -> int:
3441 """the number of total columns in the values axes"""
3442 return sum(len(a.values) for a in self.values_axes)
3444 @property
3445 def is_transposed(self) -> bool:
3446 return False
3448 @property
3449 def data_orientation(self) -> tuple[int, ...]:
3450 """return a tuple of my permuted axes, non_indexable at the front"""
3451 return tuple(
3452 itertools.chain(
3453 [int(a[0]) for a in self.non_index_axes],
3454 [int(a.axis) for a in self.index_axes],
3455 )
3456 )
3458 def queryables(self) -> dict[str, Any]:
3459 """return a dict of the kinds allowable columns for this object"""
3460 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3461 axis_names = {0: "index", 1: "columns"}
3463 # compute the values_axes queryables
3464 d1 = [(a.cname, a) for a in self.index_axes]
3465 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3466 d3 = [
3467 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3468 ]
3470 # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" and
3471 # "List[Tuple[str, None]]")
3472 return dict(d1 + d2 + d3) # type: ignore[operator]
3474 def index_cols(self):
3475 """return a list of my index cols"""
3476 # Note: each `i.cname` below is assured to be a str.
3477 return [(i.axis, i.cname) for i in self.index_axes]
3479 def values_cols(self) -> list[str]:
3480 """return a list of my values cols"""
3481 return [i.cname for i in self.values_axes]
3483 def _get_metadata_path(self, key: str) -> str:
3484 """return the metadata pathname for this key"""
3485 group = self.group._v_pathname
3486 return f"{group}/meta/{key}/meta"
3488 def write_metadata(self, key: str, values: np.ndarray) -> None:
3489 """
3490 Write out a metadata array to the key as a fixed-format Series.
3492 Parameters
3493 ----------
3494 key : str
3495 values : ndarray
3496 """
3497 self.parent.put(
3498 self._get_metadata_path(key),
3499 Series(values),
3500 format="table",
3501 encoding=self.encoding,
3502 errors=self.errors,
3503 nan_rep=self.nan_rep,
3504 )
3506 def read_metadata(self, key: str):
3507 """return the meta data array for this key"""
3508 if getattr(getattr(self.group, "meta", None), key, None) is not None:
3509 return self.parent.select(self._get_metadata_path(key))
3510 return None
3512 def set_attrs(self) -> None:
3513 """set our table type & indexables"""
3514 self.attrs.table_type = str(self.table_type)
3515 self.attrs.index_cols = self.index_cols()
3516 self.attrs.values_cols = self.values_cols()
3517 self.attrs.non_index_axes = self.non_index_axes
3518 self.attrs.data_columns = self.data_columns
3519 self.attrs.nan_rep = self.nan_rep
3520 self.attrs.encoding = self.encoding
3521 self.attrs.errors = self.errors
3522 self.attrs.levels = self.levels
3523 self.attrs.info = self.info
3525 def get_attrs(self) -> None:
3526 """retrieve our attributes"""
3527 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3528 self.data_columns = getattr(self.attrs, "data_columns", None) or []
3529 self.info = getattr(self.attrs, "info", None) or {}
3530 self.nan_rep = getattr(self.attrs, "nan_rep", None)
3531 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3532 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3533 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
3534 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3535 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3537 def validate_version(self, where=None) -> None:
3538 """are we trying to operate on an old version?"""
3539 if where is not None:
3540 if self.is_old_version:
3541 ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3542 warnings.warn(
3543 ws,
3544 IncompatibilityWarning,
3545 stacklevel=find_stack_level(),
3546 )
3548 def validate_min_itemsize(self, min_itemsize) -> None:
3549 """
3550 validate the min_itemsize doesn't contain items that are not in the
3551 axes this needs data_columns to be defined
3552 """
3553 if min_itemsize is None:
3554 return
3555 if not isinstance(min_itemsize, dict):
3556 return
3558 q = self.queryables()
3559 for k in min_itemsize:
3561 # ok, apply generally
3562 if k == "values":
3563 continue
3564 if k not in q:
3565 raise ValueError(
3566 f"min_itemsize has the key [{k}] which is not an axis or "
3567 "data_column"
3568 )
3570 @cache_readonly
3571 def indexables(self):
3572 """create/cache the indexables if they don't exist"""
3573 _indexables = []
3575 desc = self.description
3576 table_attrs = self.table.attrs
3578 # Note: each of the `name` kwargs below are str, ensured
3579 # by the definition in index_cols.
3580 # index columns
3581 for i, (axis, name) in enumerate(self.attrs.index_cols):
3582 atom = getattr(desc, name)
3583 md = self.read_metadata(name)
3584 meta = "category" if md is not None else None
3586 kind_attr = f"{name}_kind"
3587 kind = getattr(table_attrs, kind_attr, None)
3589 index_col = IndexCol(
3590 name=name,
3591 axis=axis,
3592 pos=i,
3593 kind=kind,
3594 typ=atom,
3595 table=self.table,
3596 meta=meta,
3597 metadata=md,
3598 )
3599 _indexables.append(index_col)
3601 # values columns
3602 dc = set(self.data_columns)
3603 base_pos = len(_indexables)
3605 def f(i, c):
3606 assert isinstance(c, str)
3607 klass = DataCol
3608 if c in dc:
3609 klass = DataIndexableCol
3611 atom = getattr(desc, c)
3612 adj_name = _maybe_adjust_name(c, self.version)
3614 # TODO: why kind_attr here?
3615 values = getattr(table_attrs, f"{adj_name}_kind", None)
3616 dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3617 # Argument 1 to "_dtype_to_kind" has incompatible type
3618 # "Optional[Any]"; expected "str" [arg-type]
3619 kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
3621 md = self.read_metadata(c)
3622 # TODO: figure out why these two versions of `meta` don't always match.
3623 # meta = "category" if md is not None else None
3624 meta = getattr(table_attrs, f"{adj_name}_meta", None)
3626 obj = klass(
3627 name=adj_name,
3628 cname=c,
3629 values=values,
3630 kind=kind,
3631 pos=base_pos + i,
3632 typ=atom,
3633 table=self.table,
3634 meta=meta,
3635 metadata=md,
3636 dtype=dtype,
3637 )
3638 return obj
3640 # Note: the definition of `values_cols` ensures that each
3641 # `c` below is a str.
3642 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3644 return _indexables
3646 def create_index(
3647 self, columns=None, optlevel=None, kind: str | None = None
3648 ) -> None:
3649 """
3650 Create a pytables index on the specified columns.
3652 Parameters
3653 ----------
3654 columns : None, bool, or listlike[str]
3655 Indicate which columns to create an index on.
3657 * False : Do not create any indexes.
3658 * True : Create indexes on all columns.
3659 * None : Create indexes on all columns.
3660 * listlike : Create indexes on the given columns.
3662 optlevel : int or None, default None
3663 Optimization level, if None, pytables defaults to 6.
3664 kind : str or None, default None
3665 Kind of index, if None, pytables defaults to "medium".
3667 Raises
3668 ------
3669 TypeError if trying to create an index on a complex-type column.
3671 Notes
3672 -----
3673 Cannot index Time64Col or ComplexCol.
3674 Pytables must be >= 3.0.
3675 """
3676 if not self.infer_axes():
3677 return
3678 if columns is False:
3679 return
3681 # index all indexables and data_columns
3682 if columns is None or columns is True:
3683 columns = [a.cname for a in self.axes if a.is_data_indexable]
3684 if not isinstance(columns, (tuple, list)):
3685 columns = [columns]
3687 kw = {}
3688 if optlevel is not None:
3689 kw["optlevel"] = optlevel
3690 if kind is not None:
3691 kw["kind"] = kind
3693 table = self.table
3694 for c in columns:
3695 v = getattr(table.cols, c, None)
3696 if v is not None:
3697 # remove the index if the kind/optlevel have changed
3698 if v.is_indexed:
3699 index = v.index
3700 cur_optlevel = index.optlevel
3701 cur_kind = index.kind
3703 if kind is not None and cur_kind != kind:
3704 v.remove_index()
3705 else:
3706 kw["kind"] = cur_kind
3708 if optlevel is not None and cur_optlevel != optlevel:
3709 v.remove_index()
3710 else:
3711 kw["optlevel"] = cur_optlevel
3713 # create the index
3714 if not v.is_indexed:
3715 if v.type.startswith("complex"):
3716 raise TypeError(
3717 "Columns containing complex values can be stored but "
3718 "cannot be indexed when using table format. Either use "
3719 "fixed format, set index=False, or do not include "
3720 "the columns containing complex values to "
3721 "data_columns when initializing the table."
3722 )
3723 v.create_index(**kw)
3724 elif c in self.non_index_axes[0][1]:
3725 # GH 28156
3726 raise AttributeError(
3727 f"column {c} is not a data_column.\n"
3728 f"In order to read column {c} you must reload the dataframe \n"
3729 f"into HDFStore and include {c} with the data_columns argument."
3730 )
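The public entry point for this method is HDFStore.create_table_index, useful when writing with index=False and building the index once at the end. A sketch (file name illustrative):

import pandas as pd

df = pd.DataFrame({"A": range(5)})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df, data_columns=["A"], index=False)
    # build the PyTables index afterwards, with explicit tuning
    store.create_table_index("df", columns=["A"], optlevel=9, kind="full")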
3732 def _read_axes(
3733 self, where, start: int | None = None, stop: int | None = None
3734 ) -> list[tuple[ArrayLike, ArrayLike]]:
3735 """
3736 Create the axes sniffed from the table.
3738 Parameters
3739 ----------
3740 where : ???
3741 start : int or None, default None
3742 stop : int or None, default None
3744 Returns
3745 -------
3746 List[Tuple[index_values, column_values]]
3747 """
3748 # create the selection
3749 selection = Selection(self, where=where, start=start, stop=stop)
3750 values = selection.select()
3752 results = []
3753 # convert the data
3754 for a in self.axes:
3755 a.set_info(self.info)
3756 res = a.convert(
3757 values,
3758 nan_rep=self.nan_rep,
3759 encoding=self.encoding,
3760 errors=self.errors,
3761 )
3762 results.append(res)
3764 return results
3766 @classmethod
3767 def get_object(cls, obj, transposed: bool):
3768 """return the data for this obj"""
3769 return obj
3771 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3772 """
3773 take the input data_columns and min_itemsize and create a data
3774 columns spec
3775 """
3776 if not len(non_index_axes):
3777 return []
3779 axis, axis_labels = non_index_axes[0]
3780 info = self.info.get(axis, {})
3781 if info.get("type") == "MultiIndex" and data_columns:
3782 raise ValueError(
3783 f"cannot use a multi-index on axis [{axis}] with "
3784 f"data_columns {data_columns}"
3785 )
3787 # evaluate the passed data_columns, True == use all columns
3788 # take only valid axis labels
3789 if data_columns is True:
3790 data_columns = list(axis_labels)
3791 elif data_columns is None:
3792 data_columns = []
3794 # if min_itemsize is a dict, add the keys (exclude 'values')
3795 if isinstance(min_itemsize, dict):
3796 existing_data_columns = set(data_columns)
3797 data_columns = list(data_columns) # ensure we do not modify
3798 data_columns.extend(
3799 [
3800 k
3801 for k in min_itemsize.keys()
3802 if k != "values" and k not in existing_data_columns
3803 ]
3804 )
3806 # return valid columns in the order of our axis
3807 return [c for c in data_columns if c in axis_labels]
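Downstream, the data-columns spec determines which columns may appear in where clauses. A minimal sketch (file name illustrative):

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": list("xyz")})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df, data_columns=["A"])   # A becomes queryable
    out = store.select("df", "A > 1")            # rows where A is 2 or 3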
3809 def _create_axes(
3810 self,
3811 axes,
3812 obj: DataFrame,
3813 validate: bool = True,
3814 nan_rep=None,
3815 data_columns=None,
3816 min_itemsize=None,
3817 ):
3818 """
3819 Create and return the axes.
3821 Parameters
3822 ----------
3823 axes: list or None
3824 The names or numbers of the axes to create.
3825 obj : DataFrame
3826 The object to create axes on.
3827 validate: bool, default True
3828 Whether to validate the obj against an existing object already written.
3829 nan_rep :
3830 A value to use for string column nan_rep.
3831 data_columns : List[str], True, or None, default None
3832 Specify the columns that we want to create to allow indexing on.
3834 * True : Use all available columns.
3835 * None : Use no columns.
3836 * List[str] : Use the specified columns.
3838 min_itemsize: Dict[str, int] or None, default None
3839 The min itemsize for a column in bytes.
3840 """
3841 if not isinstance(obj, DataFrame):
3842 group = self.group._v_name
3843 raise TypeError(
3844 f"cannot properly create the storer for: [group->{group},"
3845 f"value->{type(obj)}]"
3846 )
3848 # set the default axes if needed
3849 if axes is None:
3850 axes = [0]
3852 # map axes to numbers
3853 axes = [obj._get_axis_number(a) for a in axes]
3855 # do we have an existing table (if so, use its axes & data_columns)
3856 if self.infer_axes():
3857 table_exists = True
3858 axes = [a.axis for a in self.index_axes]
3859 data_columns = list(self.data_columns)
3860 nan_rep = self.nan_rep
3861 # TODO: do we always have validate=True here?
3862 else:
3863 table_exists = False
3865 new_info = self.info
3867 assert self.ndim == 2 # with next check, we must have len(axes) == 1
3868 # currently only support ndim-1 axes
3869 if len(axes) != self.ndim - 1:
3870 raise ValueError(
3871 "currently only support ndim-1 indexers in an AppendableTable"
3872 )
3874 # create according to the new data
3875 new_non_index_axes: list = []
3877 # nan_representation
3878 if nan_rep is None:
3879 nan_rep = "nan"
3881 # We construct the non-index-axis first, since that alters new_info
3882 idx = [x for x in [0, 1] if x not in axes][0]
3884 a = obj.axes[idx]
3885 # we might be able to change the axes on the appending data if necessary
3886 append_axis = list(a)
3887 if table_exists:
3888 indexer = len(new_non_index_axes) # i.e. 0
3889 exist_axis = self.non_index_axes[indexer][1]
3890 if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
3892 # ahah! -> reindex
3893 if array_equivalent(
3894 np.array(sorted(append_axis)), np.array(sorted(exist_axis))
3895 ):
3896 append_axis = exist_axis
3898 # the non_index_axes info
3899 info = new_info.setdefault(idx, {})
3900 info["names"] = list(a.names)
3901 info["type"] = type(a).__name__
3903 new_non_index_axes.append((idx, append_axis))
3905 # Now we can construct our new index axis
3906 idx = axes[0]
3907 a = obj.axes[idx]
3908 axis_name = obj._get_axis_name(idx)
3909 new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3910 new_index.axis = idx
3912 # Because we are always 2D, there is only one new_index, so
3913 # we know it will have pos=0
3914 new_index.set_pos(0)
3915 new_index.update_info(new_info)
3916 new_index.maybe_set_size(min_itemsize) # check for column conflicts
3918 new_index_axes = [new_index]
3919 j = len(new_index_axes) # i.e. 1
3920 assert j == 1
3922 # reindex by our non_index_axes & compute data_columns
3923 assert len(new_non_index_axes) == 1
3924 for a in new_non_index_axes:
3925 obj = _reindex_axis(obj, a[0], a[1])
3927 transposed = new_index.axis == 1
3929 # figure out data_columns and get out blocks
3930 data_columns = self.validate_data_columns(
3931 data_columns, min_itemsize, new_non_index_axes
3932 )
3934 frame = self.get_object(obj, transposed)._consolidate()
3936 blocks, blk_items = self._get_blocks_and_items(
3937 frame, table_exists, new_non_index_axes, self.values_axes, data_columns
3938 )
3940 # add my values
3941 vaxes = []
3942 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
3944 # the shape of the data column is given by the indexable axes
3945 klass = DataCol
3946 name = None
3948 # we have a data_column
3949 if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
3950 klass = DataIndexableCol
3951 name = b_items[0]
3952 if not (name is None or isinstance(name, str)):
3953 # TODO: should the message here be more specifically non-str?
3954 raise ValueError("cannot have non-object label DataIndexableCol")
3956 # make sure that we match up the existing columns
3957 # if we have an existing table
3958 existing_col: DataCol | None
3960 if table_exists and validate:
3961 try:
3962 existing_col = self.values_axes[i]
3963 except (IndexError, KeyError) as err:
3964 raise ValueError(
3965 f"Incompatible appended table [{blocks}]"
3966 f"with existing table [{self.values_axes}]"
3967 ) from err
3968 else:
3969 existing_col = None
3971 new_name = name or f"values_block_{i}"
3972 data_converted = _maybe_convert_for_string_atom(
3973 new_name,
3974 blk.values,
3975 existing_col=existing_col,
3976 min_itemsize=min_itemsize,
3977 nan_rep=nan_rep,
3978 encoding=self.encoding,
3979 errors=self.errors,
3980 columns=b_items,
3981 )
3982 adj_name = _maybe_adjust_name(new_name, self.version)
3984 typ = klass._get_atom(data_converted)
3985 kind = _dtype_to_kind(data_converted.dtype.name)
3986 tz = None
3987 if getattr(data_converted, "tz", None) is not None:
3988 tz = _get_tz(data_converted.tz)
3990 meta = metadata = ordered = None
3991 if is_categorical_dtype(data_converted.dtype):
3992 ordered = data_converted.ordered
3993 meta = "category"
3994 metadata = np.array(data_converted.categories, copy=False).ravel()
3996 data, dtype_name = _get_data_and_dtype_name(data_converted)
3998 col = klass(
3999 name=adj_name,
4000 cname=new_name,
4001 values=list(b_items),
4002 typ=typ,
4003 pos=j,
4004 kind=kind,
4005 tz=tz,
4006 ordered=ordered,
4007 meta=meta,
4008 metadata=metadata,
4009 dtype=dtype_name,
4010 data=data,
4011 )
4012 col.update_info(new_info)
4014 vaxes.append(col)
4016 j += 1
4018 dcs = [col.name for col in vaxes if col.is_data_indexable]
4020 new_table = type(self)(
4021 parent=self.parent,
4022 group=self.group,
4023 encoding=self.encoding,
4024 errors=self.errors,
4025 index_axes=new_index_axes,
4026 non_index_axes=new_non_index_axes,
4027 values_axes=vaxes,
4028 data_columns=dcs,
4029 info=new_info,
4030 nan_rep=nan_rep,
4031 )
4032 if hasattr(self, "levels"):
4033 # TODO: get this into constructor, only for appropriate subclass
4034 new_table.levels = self.levels
4036 new_table.validate_min_itemsize(min_itemsize)
4038 if validate and table_exists:
4039 new_table.validate(self)
4041 return new_table
4043 @staticmethod
4044 def _get_blocks_and_items(
4045 frame: DataFrame,
4046 table_exists: bool,
4047 new_non_index_axes,
4048 values_axes,
4049 data_columns,
4050 ):
4051 # Helper to clarify non-state-altering parts of _create_axes
4053 # TODO(ArrayManager) HDFStore relies on accessing the blocks
4054 if isinstance(frame._mgr, ArrayManager):
4055 frame = frame._as_manager("block")
4057 def get_blk_items(mgr):
4058 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
4060 mgr = frame._mgr
4061 mgr = cast(BlockManager, mgr)
4062 blocks: list[Block] = list(mgr.blocks)
4063 blk_items: list[Index] = get_blk_items(mgr)
4065 if len(data_columns):
4066 axis, axis_labels = new_non_index_axes[0]
4067 new_labels = Index(axis_labels).difference(Index(data_columns))
4068 mgr = frame.reindex(new_labels, axis=axis)._mgr
4070 # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no
4071 # attribute "blocks"
4072 blocks = list(mgr.blocks) # type: ignore[union-attr]
4073 blk_items = get_blk_items(mgr)
4074 for c in data_columns:
4075 mgr = frame.reindex([c], axis=axis)._mgr
4076 # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has
4077 # no attribute "blocks"
4078 blocks.extend(mgr.blocks) # type: ignore[union-attr]
4079 blk_items.extend(get_blk_items(mgr))
4081 # reorder the blocks in the same order as the existing table if we can
4082 if table_exists:
4083 by_items = {
4084 tuple(b_items.tolist()): (b, b_items)
4085 for b, b_items in zip(blocks, blk_items)
4086 }
4087 new_blocks: list[Block] = []
4088 new_blk_items = []
4089 for ea in values_axes:
4090 items = tuple(ea.values)
4091 try:
4092 b, b_items = by_items.pop(items)
4093 new_blocks.append(b)
4094 new_blk_items.append(b_items)
4095 except (IndexError, KeyError) as err:
4096 jitems = ",".join([pprint_thing(item) for item in items])
4097 raise ValueError(
4098 f"cannot match existing table structure for [{jitems}] "
4099 "on appending data"
4100 ) from err
4101 blocks = new_blocks
4102 blk_items = new_blk_items
4104 return blocks, blk_items
4106 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
4107 """process axes filters"""
4108 # make a copy to avoid side effects
4109 if columns is not None:
4110 columns = list(columns)
4112 # make sure to include levels if we have them
4113 if columns is not None and self.is_multi_index:
4114 assert isinstance(self.levels, list) # assured by is_multi_index
4115 for n in self.levels:
4116 if n not in columns:
4117 columns.insert(0, n)
4119 # reorder by any non_index_axes & limit to the select columns
4120 for axis, labels in self.non_index_axes:
4121 obj = _reindex_axis(obj, axis, labels, columns)
4123 # apply the selection filters (but keep in the same order)
4124 if selection.filter is not None:
4125 for field, op, filt in selection.filter.format():
4127 def process_filter(field, filt):
4129 for axis_name in obj._AXIS_ORDERS:
4130 axis_number = obj._get_axis_number(axis_name)
4131 axis_values = obj._get_axis(axis_name)
4132 assert axis_number is not None
4134 # see if the field is the name of an axis
4135 if field == axis_name:
4137 # if we have a multi-index, then need to include
4138 # the levels
4139 if self.is_multi_index:
4140 filt = filt.union(Index(self.levels))
4142 takers = op(axis_values, filt)
4143 return obj.loc(axis=axis_number)[takers]
4145 # this might be the name of a field IN an axis
4146 elif field in axis_values:
4148 # we need to filter on this dimension
4149 values = ensure_index(getattr(obj, field).values)
4150 filt = ensure_index(filt)
4152 # hack until we support reversed dim flags
4153 if isinstance(obj, DataFrame):
4154 axis_number = 1 - axis_number
4155 takers = op(values, filt)
4156 return obj.loc(axis=axis_number)[takers]
4158 raise ValueError(f"cannot find the field [{field}] for filtering!")
4160 obj = process_filter(field, filt)
4162 return obj
4164 def create_description(
4165 self,
4166 complib,
4167 complevel: int | None,
4168 fletcher32: bool,
4169 expectedrows: int | None,
4170 ) -> dict[str, Any]:
4171 """create the description of the table from the axes & values"""
4172 # use the provided expectedrows if it was passed
4173 if expectedrows is None:
4174 expectedrows = max(self.nrows_expected, 10000)
4176 d = {"name": "table", "expectedrows": expectedrows}
4178 # description from the axes & values
4179 d["description"] = {a.cname: a.typ for a in self.axes}
4181 if complib:
4182 if complevel is None:
4183 complevel = self._complevel or 9
4184 filters = _tables().Filters(
4185 complevel=complevel,
4186 complib=complib,
4187 fletcher32=fletcher32 or self._fletcher32,
4188 )
4189 d["filters"] = filters
4190 elif self._filters is not None:
4191 d["filters"] = self._filters
4193 return d
4195 def read_coordinates(
4196 self, where=None, start: int | None = None, stop: int | None = None
4197 ):
4198 """
4199 select coordinates (row numbers) from a table; return the
4200 coordinates object
4201 """
4202 # validate the version
4203 self.validate_version(where)
4205 # infer the data kind
4206 if not self.infer_axes():
4207 return False
4209 # create the selection
4210 selection = Selection(self, where=where, start=start, stop=stop)
4211 coords = selection.select_coords()
4212 if selection.filter is not None:
4213 for field, op, filt in selection.filter.format():
4214 data = self.read_column(
4215 field, start=coords.min(), stop=coords.max() + 1
4216 )
4217 coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4219 return Index(coords)
4221 def read_column(
4222 self,
4223 column: str,
4224 where=None,
4225 start: int | None = None,
4226 stop: int | None = None,
4227 ):
4228 """
4229 return a single column from the table, generally only indexables
4230 are interesting
4231 """
4232 # validate the version
4233 self.validate_version()
4235 # infer the data kind
4236 if not self.infer_axes():
4237 return False
4239 if where is not None:
4240 raise TypeError("read_column does not currently accept a where clause")
4242 # find the axes
4243 for a in self.axes:
4244 if column == a.name:
4245 if not a.is_data_indexable:
4246 raise ValueError(
4247 f"column [{column}] can not be extracted individually; "
4248 "it is not data indexable"
4249 )
4251 # column must be an indexable or a data column
4252 c = getattr(self.table.cols, column)
4253 a.set_info(self.info)
4254 col_values = a.convert(
4255 c[start:stop],
4256 nan_rep=self.nan_rep,
4257 encoding=self.encoding,
4258 errors=self.errors,
4259 )
4260 return Series(_set_tz(col_values[1], a.tz), name=column)
4262 raise KeyError(f"column [{column}] not found in the table")
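# Hedged usage sketch (hypothetical helper, not part of pandas): read_column
# backs the public HDFStore.select_column, which only works for indexables
# and for columns created via data_columns=.
def _example_select_column():  # pragma: no cover - illustrative only
    import pandas as pd

    df = pd.DataFrame({"a": range(5), "b": list("vwxyz")})
    with pd.HDFStore("demo.h5") as store:
        store.append("df", df, data_columns=["b"])
        idx = store.select_column("df", "index")  # an indexable
        b = store.select_column("df", "b")  # a data column
        # store.select_column("df", "a") would raise ValueError here,
        # because "a" is not data indexable
    return idx, b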
4265class WORMTable(Table):
4266 """
4267 a write-once read-many table: this format DOES NOT ALLOW appending to a
4268 table. Writing is a one-time operation; the data are stored in a format
4269 that allows for searching the data on disk.
4270 """
4272 table_type = "worm"
4274 def read(
4275 self,
4276 where=None,
4277 columns=None,
4278 start: int | None = None,
4279 stop: int | None = None,
4280 ):
4281 """
4282 read the indices and the indexing array, calculate offset rows and return
4283 """
4284 raise NotImplementedError("WORMTable needs to implement read")
4286 def write(self, **kwargs) -> None:
4287 """
4288 write in a format that we can search later on (but cannot append
4289 to): write out the indices and the values using _write_array
4290 (e.g. a CArray), and create an indexing table so that we can search
4291 """
4292 raise NotImplementedError("WORMTable needs to implement write")
4295class AppendableTable(Table):
4296 """support the new appendable table formats"""
4298 table_type = "appendable"
4300 # error: Signature of "write" incompatible with supertype "Fixed"
4301 def write( # type: ignore[override]
4302 self,
4303 obj,
4304 axes=None,
4305 append: bool = False,
4306 complib=None,
4307 complevel=None,
4308 fletcher32=None,
4309 min_itemsize=None,
4310 chunksize=None,
4311 expectedrows=None,
4312 dropna: bool = False,
4313 nan_rep=None,
4314 data_columns=None,
4315 track_times=True,
4316 ) -> None:
4317 if not append and self.is_exists:
4318 self._handle.remove_node(self.group, "table")
4320 # create the axes
4321 table = self._create_axes(
4322 axes=axes,
4323 obj=obj,
4324 validate=append,
4325 min_itemsize=min_itemsize,
4326 nan_rep=nan_rep,
4327 data_columns=data_columns,
4328 )
4330 for a in table.axes:
4331 a.validate_names()
4333 if not table.is_exists:
4335 # create the table
4336 options = table.create_description(
4337 complib=complib,
4338 complevel=complevel,
4339 fletcher32=fletcher32,
4340 expectedrows=expectedrows,
4341 )
4343 # set the table attributes
4344 table.set_attrs()
4346 options["track_times"] = track_times
4348 # create the table
4349 table._handle.create_table(table.group, **options)
4351 # update my info
4352 table.attrs.info = table.info
4354 # validate the axes and set the kinds
4355 for a in table.axes:
4356 a.validate_and_set(table, append)
4358 # add the rows
4359 table.write_data(chunksize, dropna=dropna)
4361 def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
4362 """
4363 form the data into a 2-d structure of indexes, values and mask; write chunk-by-chunk
4364 """
4365 names = self.dtype.names
4366 nrows = self.nrows_expected
4368 # if dropna==True, then drop ALL nan rows
4369 masks = []
4370 if dropna:
4371 for a in self.values_axes:
4372 # figure the mask: only do if we can successfully process this
4373 # column, otherwise ignore the mask
4374 mask = isna(a.data).all(axis=0)
4375 if isinstance(mask, np.ndarray):
4376 masks.append(mask.astype("u1", copy=False))
4378 # consolidate masks
4379 if len(masks):
4380 mask = masks[0]
4381 for m in masks[1:]:
4382 mask = mask & m
4383 mask = mask.ravel()
4384 else:
4385 mask = None
4387 # broadcast the indexes if needed
4388 indexes = [a.cvalues for a in self.index_axes]
4389 nindexes = len(indexes)
4390 assert nindexes == 1, nindexes # ensures we don't need to broadcast
4392 # transpose the values so first dimension is last
4393 # reshape the values if needed
4394 values = [a.take_data() for a in self.values_axes]
4395 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4396 bvalues = []
4397 for i, v in enumerate(values):
4398 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4399 bvalues.append(values[i].reshape(new_shape))
4401 # write the chunks
4402 if chunksize is None:
4403 chunksize = 100000
4405 rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4406 chunks = nrows // chunksize + 1
4407 for i in range(chunks):
4408 start_i = i * chunksize
4409 end_i = min((i + 1) * chunksize, nrows)
4410 if start_i >= end_i:
4411 break
4413 self.write_data_chunk(
4414 rows,
4415 indexes=[a[start_i:end_i] for a in indexes],
4416 mask=mask[start_i:end_i] if mask is not None else None,
4417 values=[v[start_i:end_i] for v in bvalues],
4418 )
4420 def write_data_chunk(
4421 self,
4422 rows: np.ndarray,
4423 indexes: list[np.ndarray],
4424 mask: npt.NDArray[np.bool_] | None,
4425 values: list[np.ndarray],
4426 ) -> None:
4427 """
4428 Parameters
4429 ----------
4430 rows : an empty memory space where we are putting the chunk
4431 indexes : list of the index arrays
4432 mask : boolean mask array, or None
4433 values : list of the value arrays
4434 """
4435 # skip zero-length value chunks
4436 for v in values:
4437 if not np.prod(v.shape):
4438 return
4440 nrows = indexes[0].shape[0]
4441 if nrows != len(rows):
4442 rows = np.empty(nrows, dtype=self.dtype)
4443 names = self.dtype.names
4444 nindexes = len(indexes)
4446 # indexes
4447 for i, idx in enumerate(indexes):
4448 rows[names[i]] = idx
4450 # values
4451 for i, v in enumerate(values):
4452 rows[names[i + nindexes]] = v
4454 # mask
4455 if mask is not None:
4456 m = ~mask.ravel().astype(bool, copy=False)
4457 if not m.all():
4458 rows = rows[m]
4460 if len(rows):
4461 self.table.append(rows)
4462 self.table.flush()
4464 def delete(self, where=None, start: int | None = None, stop: int | None = None):
4466 # delete all rows (and return the nrows)
4467 if where is None or not len(where):
4468 if start is None and stop is None:
4469 nrows = self.nrows
4470 self._handle.remove_node(self.group, recursive=True)
4471 else:
4472 # pytables<3.0 would remove a single row with stop=None
4473 if stop is None:
4474 stop = self.nrows
4475 nrows = self.table.remove_rows(start=start, stop=stop)
4476 self.table.flush()
4477 return nrows
4479 # infer the data kind
4480 if not self.infer_axes():
4481 return None
4483 # create the selection
4484 table = self.table
4485 selection = Selection(self, where, start=start, stop=stop)
4486 values = selection.select_coords()
4488 # delete the rows in reverse order
4489 sorted_series = Series(values).sort_values()
4490 ln = len(sorted_series)
4492 if ln:
4494 # construct groups of consecutive rows
4495 diff = sorted_series.diff()
4496 groups = list(diff[diff > 1].index)
4498 # 1 group
4499 if not len(groups):
4500 groups = [0]
4502 # final element
4503 if groups[-1] != ln:
4504 groups.append(ln)
4506 # initial element
4507 if groups[0] != 0:
4508 groups.insert(0, 0)
4510 # we must remove in reverse order!
4511 pg = groups.pop()
4512 for g in reversed(groups):
4513 rows = sorted_series.take(range(g, pg))
4514 table.remove_rows(
4515 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4516 )
4517 pg = g
4519 self.table.flush()
4521 # return the number of rows removed
4522 return ln
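# Hedged usage sketch (hypothetical helper, not part of pandas): delete()
# backs HDFStore.remove, which accepts a where clause for table formats and
# removes the matching consecutive row groups in reverse order, as above.
def _example_remove_rows():  # pragma: no cover - illustrative only
    import pandas as pd

    idx = pd.date_range("2021-01-01", periods=10)
    df = pd.DataFrame({"a": range(10)}, index=idx)
    with pd.HDFStore("demo.h5") as store:
        store.append("df", df)
        n_removed = store.remove("df", where="index >= '2021-01-06'")
    return n_removed  # 5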
4525class AppendableFrameTable(AppendableTable):
4526 """support the new appendable table formats"""
4528 pandas_kind = "frame_table"
4529 table_type = "appendable_frame"
4530 ndim = 2
4531 obj_type: type[DataFrame | Series] = DataFrame
4533 @property
4534 def is_transposed(self) -> bool:
4535 return self.index_axes[0].axis == 1
4537 @classmethod
4538 def get_object(cls, obj, transposed: bool):
4539 """these are written transposed"""
4540 if transposed:
4541 obj = obj.T
4542 return obj
4544 def read(
4545 self,
4546 where=None,
4547 columns=None,
4548 start: int | None = None,
4549 stop: int | None = None,
4550 ):
4552 # validate the version
4553 self.validate_version(where)
4555 # infer the data kind
4556 if not self.infer_axes():
4557 return None
4559 result = self._read_axes(where=where, start=start, stop=stop)
4561 info = (
4562 self.info.get(self.non_index_axes[0][0], {})
4563 if len(self.non_index_axes)
4564 else {}
4565 )
4567 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4568 assert len(inds) == 1
4569 ind = inds[0]
4571 index = result[ind][0]
4573 frames = []
4574 for i, a in enumerate(self.axes):
4575 if a not in self.values_axes:
4576 continue
4577 index_vals, cvalues = result[i]
4579 # we could have a multi-index constructor here
4580 # ensure_index doesn't recognize our list-of-tuples here
4581 if info.get("type") != "MultiIndex":
4582 cols = Index(index_vals)
4583 else:
4584 cols = MultiIndex.from_tuples(index_vals)
4586 names = info.get("names")
4587 if names is not None:
4588 cols.set_names(names, inplace=True)
4590 if self.is_transposed:
4591 values = cvalues
4592 index_ = cols
4593 cols_ = Index(index, name=getattr(index, "name", None))
4594 else:
4595 values = cvalues.T
4596 index_ = Index(index, name=getattr(index, "name", None))
4597 cols_ = cols
4599 # if we have a DataIndexableCol, its shape will only be 1 dim
4600 if values.ndim == 1 and isinstance(values, np.ndarray):
4601 values = values.reshape((1, values.shape[0]))
4603 if isinstance(values, np.ndarray):
4604 df = DataFrame(values.T, columns=cols_, index=index_)
4605 elif isinstance(values, Index):
4606 df = DataFrame(values, columns=cols_, index=index_)
4607 else:
4608 # Categorical
4609 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
4610 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4611 frames.append(df)
4613 if len(frames) == 1:
4614 df = frames[0]
4615 else:
4616 df = concat(frames, axis=1)
4618 selection = Selection(self, where=where, start=start, stop=stop)
4619 # apply the selection filters & axis orderings
4620 df = self.process_axes(df, selection=selection, columns=columns)
4622 return df
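# Hedged usage sketch (hypothetical helper, not part of pandas): this read()
# path is what HDFStore.select exercises for table-format frames, including
# the where filtering and column selection applied by process_axes.
def _example_select_frame():  # pragma: no cover - illustrative only
    import pandas as pd

    df = pd.DataFrame({"a": range(5), "b": range(5)})
    with pd.HDFStore("demo.h5") as store:
        store.append("df", df, data_columns=["a"])
        return store.select("df", where="a > 2", columns=["a"])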
4625class AppendableSeriesTable(AppendableFrameTable):
4626 """support the new appendable table formats"""
4628 pandas_kind = "series_table"
4629 table_type = "appendable_series"
4630 ndim = 2
4631 obj_type = Series
4633 @property
4634 def is_transposed(self) -> bool:
4635 return False
4637 @classmethod
4638 def get_object(cls, obj, transposed: bool):
4639 return obj
4641 def write(self, obj, data_columns=None, **kwargs):
4642 """we are going to write this as a frame table"""
4643 if not isinstance(obj, DataFrame):
4644 name = obj.name or "values"
4645 obj = obj.to_frame(name)
4646 return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4648 def read(
4649 self,
4650 where=None,
4651 columns=None,
4652 start: int | None = None,
4653 stop: int | None = None,
4654 ) -> Series:
4656 is_multi_index = self.is_multi_index
4657 if columns is not None and is_multi_index:
4658 assert isinstance(self.levels, list) # needed for mypy
4659 for n in self.levels:
4660 if n not in columns:
4661 columns.insert(0, n)
4662 s = super().read(where=where, columns=columns, start=start, stop=stop)
4663 if is_multi_index:
4664 s.set_index(self.levels, inplace=True)
4666 s = s.iloc[:, 0]
4668 # remove the default name
4669 if s.name == "values":
4670 s.name = None
4671 return s
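# Hedged usage sketch (hypothetical helper, not part of pandas): a Series is
# written as a one-column frame table, and the default "values" name is
# stripped again on read, per the read() above.
def _example_series_roundtrip():  # pragma: no cover - illustrative only
    import pandas as pd

    s = pd.Series(range(3))  # unnamed, stored under the column "values"
    with pd.HDFStore("demo.h5") as store:
        store.put("s", s, format="table")
        out = store.select("s")
    return out.name  # None: the default name was removed on read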
4674class AppendableMultiSeriesTable(AppendableSeriesTable):
4675 """support the new appendable table formats"""
4677 pandas_kind = "series_table"
4678 table_type = "appendable_multiseries"
4680 def write(self, obj, **kwargs):
4681 """we are going to write this as a frame table"""
4682 name = obj.name or "values"
4683 newobj, self.levels = self.validate_multiindex(obj)
4684 assert isinstance(self.levels, list) # for mypy
4685 cols = list(self.levels)
4686 cols.append(name)
4687 newobj.columns = Index(cols)
4688 return super().write(obj=newobj, **kwargs)
4691class GenericTable(AppendableFrameTable):
4692 """a table that read/writes the generic pytables table format"""
4694 pandas_kind = "frame_table"
4695 table_type = "generic_table"
4696 ndim = 2
4697 obj_type = DataFrame
4698 levels: list[Hashable]
4700 @property
4701 def pandas_type(self) -> str:
4702 return self.pandas_kind
4704 @property
4705 def storable(self):
4706 return getattr(self.group, "table", None) or self.group
4708 def get_attrs(self) -> None:
4709 """retrieve our attributes"""
4710 self.non_index_axes = []
4711 self.nan_rep = None
4712 self.levels = []
4714 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4715 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4716 self.data_columns = [a.name for a in self.values_axes]
4718 @cache_readonly
4719 def indexables(self):
4720 """create the indexables from the table description"""
4721 d = self.description
4723 # TODO: can we get a typ for this? AFAICT it is the only place
4724 # where we aren't passing one
4725 # the index column is just a simple index
4726 md = self.read_metadata("index")
4727 meta = "category" if md is not None else None
4728 index_col = GenericIndexCol(
4729 name="index", axis=0, table=self.table, meta=meta, metadata=md
4730 )
4732 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
4734 for i, n in enumerate(d._v_names):
4735 assert isinstance(n, str)
4737 atom = getattr(d, n)
4738 md = self.read_metadata(n)
4739 meta = "category" if md is not None else None
4740 dc = GenericDataIndexableCol(
4741 name=n,
4742 pos=i,
4743 values=[n],
4744 typ=atom,
4745 table=self.table,
4746 meta=meta,
4747 metadata=md,
4748 )
4749 _indexables.append(dc)
4751 return _indexables
4753 def write(self, **kwargs):
4754 raise NotImplementedError("cannot write on a generic table")
4757class AppendableMultiFrameTable(AppendableFrameTable):
4758 """a frame with a multi-index"""
4760 table_type = "appendable_multiframe"
4761 obj_type = DataFrame
4762 ndim = 2
4763 _re_levels = re.compile(r"^level_\d+$")
4765 @property
4766 def table_type_short(self) -> str:
4767 return "appendable_multi"
4769 def write(self, obj, data_columns=None, **kwargs):
4770 if data_columns is None:
4771 data_columns = []
4772 elif data_columns is True:
4773 data_columns = obj.columns.tolist()
4774 obj, self.levels = self.validate_multiindex(obj)
4775 assert isinstance(self.levels, list) # for mypy
4776 for n in self.levels:
4777 if n not in data_columns:
4778 data_columns.insert(0, n)
4779 return super().write(obj=obj, data_columns=data_columns, **kwargs)
4781 def read(
4782 self,
4783 where=None,
4784 columns=None,
4785 start: int | None = None,
4786 stop: int | None = None,
4787 ):
4789 df = super().read(where=where, columns=columns, start=start, stop=stop)
4790 df = df.set_index(self.levels)
4792 # remove names for 'level_%d'
4793 df.index = df.index.set_names(
4794 [None if self._re_levels.search(name) else name for name in df.index.names]
4795 )
4797 return df
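# Hedged usage sketch (hypothetical helper, not part of pandas): frames with
# a MultiIndex are flattened into level columns on write and re-assembled by
# the read() above, with synthetic 'level_N' names stripped back off.
def _example_multiindex_roundtrip():  # pragma: no cover - illustrative only
    import pandas as pd

    mi = pd.MultiIndex.from_product([["x", "y"], [1, 2]])
    df = pd.DataFrame({"a": range(4)}, index=mi)
    with pd.HDFStore("demo.h5") as store:
        store.append("df", df)
        return store.select("df")  # MultiIndex restored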
4800def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame:
4801 ax = obj._get_axis(axis)
4802 labels = ensure_index(labels)
4804 # try not to reindex even if other is provided
4805 # if it equals our current index
4806 if other is not None:
4807 other = ensure_index(other)
4808 if (other is None or labels.equals(other)) and labels.equals(ax):
4809 return obj
4811 labels = ensure_index(labels.unique())
4812 if other is not None:
4813 labels = ensure_index(other.unique()).intersection(labels, sort=False)
4814 if not labels.equals(ax):
4815 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
4816 slicer[axis] = labels
4817 obj = obj.loc[tuple(slicer)]
4818 return obj
4821# tz to/from coercion
4824def _get_tz(tz: tzinfo) -> str | tzinfo:
4825 """for a tz-aware type, return an encoded zone"""
4826 zone = timezones.get_timezone(tz)
4827 return zone
4830@overload
4831def _set_tz(
4832 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
4833) -> DatetimeIndex:
4834 ...
4837@overload
4838def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
4839 ...
4842def _set_tz(
4843 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
4844) -> np.ndarray | DatetimeIndex:
4845 """
4846 coerce the values to a DatetimeIndex if tz is set
4847 preserve the input shape if possible
4849 Parameters
4850 ----------
4851 values : ndarray or Index
4852 tz : str or tzinfo
4853 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
4854 """
4855 if isinstance(values, DatetimeIndex):
4856 # If values is tzaware, the tz gets dropped in the values.ravel()
4857 # call below (which returns an ndarray). So we are only non-lossy
4858 # if `tz` matches `values.tz`.
4859 assert values.tz is None or values.tz == tz
4861 if tz is not None:
4862 if isinstance(values, DatetimeIndex):
4863 name = values.name
4864 values = values.asi8
4865 else:
4866 name = None
4867 values = values.ravel()
4869 tz = _ensure_decoded(tz)
4870 values = DatetimeIndex(values, name=name)
4871 values = values.tz_localize("UTC").tz_convert(tz)
4872 elif coerce:
4873 values = np.asarray(values, dtype="M8[ns]")
4875 # error: Incompatible return value type (got "Union[ndarray, Index]",
4876 # expected "Union[ndarray, DatetimeIndex]")
4877 return values # type: ignore[return-value]
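# Hedged sketch (assumed values, not part of pandas): tz-aware data is stored
# as i8 nanoseconds and re-localized on read via the UTC round trip above.
def _example_tz_roundtrip():  # pragma: no cover - illustrative only
    import numpy as np
    from pandas import DatetimeIndex

    i8 = np.array([1_600_000_000_000_000_000])  # ns since the epoch (UTC)
    idx = DatetimeIndex(i8).tz_localize("UTC").tz_convert("US/Eastern")
    return idx  # 2020-09-13 08:26:40-04:00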
4880def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4881 assert isinstance(name, str)
4883 index_name = index.name
4884 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
4885 # expected "Union[ExtensionArray, ndarray]"
4886 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
4887 kind = _dtype_to_kind(dtype_name)
4888 atom = DataIndexableCol._get_atom(converted)
4890 if (
4891 isinstance(index, Int64Index)
4892 or needs_i8_conversion(index.dtype)
4893 or is_bool_dtype(index.dtype)
4894 ):
4895 # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4896 # in which case "kind" is "integer", "integer", "datetime64",
4897 # "timedelta64", and "integer", respectively.
4898 return IndexCol(
4899 name,
4900 values=converted,
4901 kind=kind,
4902 typ=atom,
4903 freq=getattr(index, "freq", None),
4904 tz=getattr(index, "tz", None),
4905 index_name=index_name,
4906 )
4908 if isinstance(index, MultiIndex):
4909 raise TypeError("MultiIndex not supported here!")
4911 inferred_type = lib.infer_dtype(index, skipna=False)
4912 # we won't get inferred_type of "datetime64" or "timedelta64" as these
4913 # would go through the DatetimeIndex/TimedeltaIndex paths above
4915 values = np.asarray(index)
4917 if inferred_type == "date":
4918 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
4919 return IndexCol(
4920 name, converted, "date", _tables().Time32Col(), index_name=index_name
4921 )
4922 elif inferred_type == "string":
4924 converted = _convert_string_array(values, encoding, errors)
4925 itemsize = converted.dtype.itemsize
4926 return IndexCol(
4927 name,
4928 converted,
4929 "string",
4930 _tables().StringCol(itemsize),
4931 index_name=index_name,
4932 )
4934 elif inferred_type in ["integer", "floating"]:
4935 return IndexCol(
4936 name, values=converted, kind=kind, typ=atom, index_name=index_name
4937 )
4938 else:
4939 assert isinstance(converted, np.ndarray) and converted.dtype == object
4940 assert kind == "object", kind
4941 atom = _tables().ObjectAtom()
4942 return IndexCol(name, converted, kind, atom, index_name=index_name)
4945def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
4946 index: Index | np.ndarray
4948 if kind == "datetime64":
4949 index = DatetimeIndex(data)
4950 elif kind == "timedelta64":
4951 index = TimedeltaIndex(data)
4952 elif kind == "date":
4953 try:
4954 index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
4955 except ValueError:
4956 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
4957 elif kind in ("integer", "float", "bool"):
4958 index = np.asarray(data)
4959 elif kind == "string":
4960 index = _unconvert_string_array(
4961 data, nan_rep=None, encoding=encoding, errors=errors
4962 )
4963 elif kind == "object":
4964 index = np.asarray(data[0])
4965 else: # pragma: no cover
4966 raise ValueError(f"unrecognized index type {kind}")
4967 return index
4970def _maybe_convert_for_string_atom(
4971 name: str,
4972 bvalues: ArrayLike,
4973 existing_col,
4974 min_itemsize,
4975 nan_rep,
4976 encoding,
4977 errors,
4978 columns: list[str],
4979):
4981 if bvalues.dtype != object:
4982 return bvalues
4984 bvalues = cast(np.ndarray, bvalues)
4986 dtype_name = bvalues.dtype.name
4987 inferred_type = lib.infer_dtype(bvalues, skipna=False)
4989 if inferred_type == "date":
4990 raise TypeError("[date] is not implemented as a table column")
4991 elif inferred_type == "datetime":
4992 # after GH#8260
4993 # this only would be hit for a multi-timezone dtype which is an error
4994 raise TypeError(
4995 "too many timezones in this block, create separate data columns"
4996 )
4998 elif not (inferred_type == "string" or dtype_name == "object"):
4999 return bvalues
5001 mask = isna(bvalues)
5002 data = bvalues.copy()
5003 data[mask] = nan_rep
5005 # see if we have a valid string type
5006 inferred_type = lib.infer_dtype(data, skipna=False)
5007 if inferred_type != "string":
5009 # we cannot serialize this data, so report an exception on a column
5010 # by column basis
5012 # expected behaviour:
5013 # search block for a non-string object column by column
5014 for i in range(data.shape[0]):
5015 col = data[i]
5016 inferred_type = lib.infer_dtype(col, skipna=False)
5017 if inferred_type != "string":
5018 error_column_label = columns[i] if len(columns) > i else f"No.{i}"
5019 raise TypeError(
5020 f"Cannot serialize the column [{error_column_label}]\n"
5021 f"because its data contents are not [string] but "
5022 f"[{inferred_type}] object dtype"
5023 )
5025 # itemsize is the maximum length of a string (along any dimension)
5027 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
5028 itemsize = data_converted.itemsize
5030 # specified min_itemsize?
5031 if isinstance(min_itemsize, dict):
5032 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
5033 itemsize = max(min_itemsize or 0, itemsize)
5035 # check for conflicts with an existing values column
5036 if existing_col is not None:
5037 eci = existing_col.validate_col(itemsize)
5038 if eci is not None and eci > itemsize:
5039 itemsize = eci
5041 data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
5042 return data_converted
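# Hedged usage sketch (hypothetical helper, not part of pandas): the itemsize
# negotiation above is driven from the public API via min_itemsize, which
# reserves width for longer strings appended later.
def _example_min_itemsize():  # pragma: no cover - illustrative only
    import pandas as pd

    with pd.HDFStore("demo.h5") as store:
        store.append("df", pd.DataFrame({"s": ["ab"]}), min_itemsize={"s": 10})
        # Without min_itemsize the next append would fail: "abcdefghij" is
        # wider than the 2-byte column the first append would have created.
        store.append("df", pd.DataFrame({"s": ["abcdefghij"]}))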
5045def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
5046 """
5047 Take a string-like that is object dtype and coerce to a fixed size string type.
5049 Parameters
5050 ----------
5051 data : np.ndarray[object]
5052 encoding : str
5053 errors : str
5054 Handler for encoding errors.
5056 Returns
5057 -------
5058 np.ndarray[fixed-length-string]
5059 """
5060 # encode if needed
5061 if len(data):
5062 data = (
5063 Series(data.ravel())
5064 .str.encode(encoding, errors)
5065 ._values.reshape(data.shape)
5066 )
5068 # create the sized dtype
5069 ensured = ensure_object(data.ravel())
5070 itemsize = max(1, libwriters.max_len_string_array(ensured))
5072 data = np.asarray(data, dtype=f"S{itemsize}")
5073 return data
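# Hedged sketch (assumed values, not part of pandas): object-dtype strings
# become fixed-width bytes sized by the longest encoded element, mirroring
# the S{itemsize} cast above.
def _example_fixed_width_strings():  # pragma: no cover - illustrative only
    import numpy as np

    data = np.array(["a", "ab", "abc"], dtype=object)
    encoded = np.asarray([s.encode("utf-8") for s in data], dtype="S3")
    return encoded.dtype  # dtype('S3'); shorter values are null-padded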
5076def _unconvert_string_array(
5077 data: np.ndarray, nan_rep, encoding: str, errors: str
5078) -> np.ndarray:
5079 """
5080 Inverse of _convert_string_array.
5082 Parameters
5083 ----------
5084 data : np.ndarray[fixed-length-string]
5085 nan_rep : the storage repr of NaN
5086 encoding : str
5087 errors : str
5088 Handler for encoding errors.
5090 Returns
5091 -------
5092 np.ndarray[object]
5093 Decoded data.
5094 """
5095 shape = data.shape
5096 data = np.asarray(data.ravel(), dtype=object)
5098 if len(data):
5100 itemsize = libwriters.max_len_string_array(ensure_object(data))
5101 dtype = f"U{itemsize}"
5103 if isinstance(data[0], bytes):
5104 data = Series(data).str.decode(encoding, errors=errors)._values
5105 else:
5106 data = data.astype(dtype, copy=False).astype(object, copy=False)
5108 if nan_rep is None:
5109 nan_rep = "nan"
5111 libwriters.string_array_replace_from_nan_rep(data, nan_rep)
5112 return data.reshape(shape)
5115def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
5116 assert isinstance(val_kind, str), type(val_kind)
5117 if _need_convert(val_kind):
5118 conv = _get_converter(val_kind, encoding, errors)
5119 values = conv(values)
5120 return values
5123def _get_converter(kind: str, encoding: str, errors: str):
5124 if kind == "datetime64":
5125 return lambda x: np.asarray(x, dtype="M8[ns]")
5126 elif kind == "string":
5127 return lambda x: _unconvert_string_array(
5128 x, nan_rep=None, encoding=encoding, errors=errors
5129 )
5130 else: # pragma: no cover
5131 raise ValueError(f"invalid kind {kind}")
5134def _need_convert(kind: str) -> bool:
5135 if kind in ("datetime64", "string"):
5136 return True
5137 return False
5140def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
5141 """
5142 Prior to 0.10.1, we named values blocks like values_block_0 and the
5143 name values_0; adjust the given name if necessary.
5145 Parameters
5146 ----------
5147 name : str
5148 version : Sequence[int]
5150 Returns
5151 -------
5152 str
5153 """
5154 if isinstance(version, str) or len(version) < 3:
5155 raise ValueError("Version is incorrect, expected sequence of 3 integers.")
5157 if version[0] == 0 and version[1] <= 10 and version[2] == 0:
5158 m = re.search(r"values_block_(\d+)", name)
5159 if m:
5160 grp = m.groups()[0]
5161 name = f"values_{grp}"
5162 return name
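# Hedged sketch (not part of pandas): only files written by a 0.10.0-era
# version get the values_block_N -> values_N rename performed above.
def _example_maybe_adjust_name():  # pragma: no cover - illustrative only
    assert _maybe_adjust_name("values_block_0", (0, 10, 0)) == "values_0"
    assert _maybe_adjust_name("values_block_0", (0, 15, 2)) == "values_block_0"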
5165def _dtype_to_kind(dtype_str: str) -> str:
5166 """
5167 Find the "kind" string describing the given dtype name.
5168 """
5169 dtype_str = _ensure_decoded(dtype_str)
5171 if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
5172 kind = "string"
5173 elif dtype_str.startswith("float"):
5174 kind = "float"
5175 elif dtype_str.startswith("complex"):
5176 kind = "complex"
5177 elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
5178 kind = "integer"
5179 elif dtype_str.startswith("datetime64"):
5180 kind = "datetime64"
5181 elif dtype_str.startswith("timedelta"):
5182 kind = "timedelta64"
5183 elif dtype_str.startswith("bool"):
5184 kind = "bool"
5185 elif dtype_str.startswith("category"):
5186 kind = "category"
5187 elif dtype_str.startswith("period"):
5188 # We store the `freq` attr so we can restore from integers
5189 kind = "integer"
5190 elif dtype_str == "object":
5191 kind = "object"
5192 else:
5193 raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
5195 return kind
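# Hedged sketch (not part of pandas): a few dtype-name -> kind mappings the
# branches above produce.
def _example_dtype_to_kind():  # pragma: no cover - illustrative only
    assert _dtype_to_kind("int64") == "integer"
    assert _dtype_to_kind("datetime64[ns]") == "datetime64"
    assert _dtype_to_kind("bytes16") == "string"
    assert _dtype_to_kind("period[M]") == "integer"  # freq restored separately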
5198def _get_data_and_dtype_name(data: ArrayLike):
5199 """
5200 Convert the passed data into a storable form and a dtype string.
5201 """
5202 if isinstance(data, Categorical):
5203 data = data.codes
5205 # For datetime64tz we need to drop the TZ in tests. TODO: why?
5206 dtype_name = data.dtype.name.split("[")[0]
5208 if data.dtype.kind in ["m", "M"]:
5209 data = np.asarray(data.view("i8"))
5210 # TODO: we used to reshape for the dt64tz case, but no longer
5211 # doing that doesn't seem to break anything. why?
5213 elif isinstance(data, PeriodIndex):
5214 data = data.asi8
5216 data = np.asarray(data)
5217 return data, dtype_name
5220class Selection:
5221 """
5222 Carries out a selection operation on a tables.Table object.
5224 Parameters
5225 ----------
5226 table : a Table object
5227 where : list of Terms (or convertible to)
5228 start, stop: indices to start and/or stop selection
5230 """
5232 def __init__(
5233 self,
5234 table: Table,
5235 where=None,
5236 start: int | None = None,
5237 stop: int | None = None,
5238 ) -> None:
5239 self.table = table
5240 self.where = where
5241 self.start = start
5242 self.stop = stop
5243 self.condition = None
5244 self.filter = None
5245 self.terms = None
5246 self.coordinates = None
5248 if is_list_like(where):
5250 # see if we have a passed coordinate like
5251 with suppress(ValueError):
5252 inferred = lib.infer_dtype(where, skipna=False)
5253 if inferred == "integer" or inferred == "boolean":
5254 where = np.asarray(where)
5255 if where.dtype == np.bool_:
5256 start, stop = self.start, self.stop
5257 if start is None:
5258 start = 0
5259 if stop is None:
5260 stop = self.table.nrows
5261 self.coordinates = np.arange(start, stop)[where]
5262 elif issubclass(where.dtype.type, np.integer):
5263 if (self.start is not None and (where < self.start).any()) or (
5264 self.stop is not None and (where >= self.stop).any()
5265 ):
5266 raise ValueError(
5267 "where must have index locations >= start and < stop"
5268 )
5269 self.coordinates = where
5271 if self.coordinates is None:
5273 self.terms = self.generate(where)
5275 # create the numexpr & the filter
5276 if self.terms is not None:
5277 self.condition, self.filter = self.terms.evaluate()
5279 def generate(self, where):
5280 """where can be a dict, list, tuple, or string"""
5281 if where is None:
5282 return None
5284 q = self.table.queryables()
5285 try:
5286 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5287 except NameError as err:
5288 # raise a nice message, suggesting that the user should use
5289 # data_columns
5290 qkeys = ",".join(q.keys())
5291 msg = dedent(
5292 f"""\
5293 The passed where expression: {where}
5294 contains an invalid variable reference
5295 all of the variable references must be a reference to
5296 an axis (e.g. 'index' or 'columns'), or a data_column
5297 The currently defined references are: {qkeys}
5298 """
5299 )
5300 raise ValueError(msg) from err
5302 def select(self):
5303 """
5304 generate the selection
5305 """
5306 if self.condition is not None:
5307 return self.table.table.read_where(
5308 self.condition.format(), start=self.start, stop=self.stop
5309 )
5310 elif self.coordinates is not None:
5311 return self.table.table.read_coordinates(self.coordinates)
5312 return self.table.table.read(start=self.start, stop=self.stop)
5314 def select_coords(self):
5315 """
5316 generate the selection
5317 """
5318 start, stop = self.start, self.stop
5319 nrows = self.table.nrows
5320 if start is None:
5321 start = 0
5322 elif start < 0:
5323 start += nrows
5324 if stop is None:
5325 stop = nrows
5326 elif stop < 0:
5327 stop += nrows
5329 if self.condition is not None:
5330 return self.table.table.get_where_list(
5331 self.condition.format(), start=start, stop=stop, sort=True
5332 )
5333 elif self.coordinates is not None:
5334 return self.coordinates
5336 return np.arange(start, stop)
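# Hedged sketch (assumed values, not part of pandas): select_coords first
# normalizes negative start/stop against nrows, exactly as above.
def _example_coord_bounds(nrows: int = 100):  # pragma: no cover - illustrative
    start, stop = -10, None
    if start is not None and start < 0:
        start += nrows
    if stop is None:
        stop = nrows
    return start, stop  # (90, 100)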