Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/pytables.py: 14%

2279 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2High level interface to PyTables for reading and writing pandas data structures 

3to disk 

4""" 

5from __future__ import annotations 

6 

7from contextlib import suppress 

8import copy 

9from datetime import ( 

10 date, 

11 tzinfo, 

12) 

13import itertools 

14import os 

15import re 

16from textwrap import dedent 

17from typing import ( 

18 TYPE_CHECKING, 

19 Any, 

20 Callable, 

21 Final, 

22 Hashable, 

23 Iterator, 

24 Literal, 

25 Sequence, 

26 cast, 

27 overload, 

28) 

29import warnings 

30 

31import numpy as np 

32 

33from pandas._config import ( 

34 config, 

35 get_option, 

36) 

37 

38from pandas._libs import ( 

39 lib, 

40 writers as libwriters, 

41) 

42from pandas._libs.tslibs import timezones 

43from pandas._typing import ( 

44 AnyArrayLike, 

45 ArrayLike, 

46 DtypeArg, 

47 FilePath, 

48 Shape, 

49 npt, 

50) 

51from pandas.compat._optional import import_optional_dependency 

52from pandas.compat.pickle_compat import patch_pickle 

53from pandas.errors import ( 

54 AttributeConflictWarning, 

55 ClosedFileError, 

56 IncompatibilityWarning, 

57 PerformanceWarning, 

58 PossibleDataLossError, 

59) 

60from pandas.util._decorators import cache_readonly 

61from pandas.util._exceptions import find_stack_level 

62 

63from pandas.core.dtypes.common import ( 

64 ensure_object, 

65 is_bool_dtype, 

66 is_categorical_dtype, 

67 is_complex_dtype, 

68 is_datetime64_dtype, 

69 is_datetime64tz_dtype, 

70 is_extension_array_dtype, 

71 is_list_like, 

72 is_string_dtype, 

73 is_timedelta64_dtype, 

74 needs_i8_conversion, 

75) 

76from pandas.core.dtypes.missing import array_equivalent 

77 

78from pandas import ( 

79 DataFrame, 

80 DatetimeIndex, 

81 Index, 

82 MultiIndex, 

83 PeriodIndex, 

84 Series, 

85 TimedeltaIndex, 

86 concat, 

87 isna, 

88) 

89from pandas.core.api import Int64Index 

90from pandas.core.arrays import ( 

91 Categorical, 

92 DatetimeArray, 

93 PeriodArray, 

94) 

95import pandas.core.common as com 

96from pandas.core.computation.pytables import ( 

97 PyTablesExpr, 

98 maybe_expression, 

99) 

100from pandas.core.construction import extract_array 

101from pandas.core.indexes.api import ensure_index 

102from pandas.core.internals import ( 

103 ArrayManager, 

104 BlockManager, 

105) 

106 

107from pandas.io.common import stringify_path 

108from pandas.io.formats.printing import ( 

109 adjoin, 

110 pprint_thing, 

111) 

112 

113if TYPE_CHECKING:    [coverage: 113 ↛ 114, line 113 didn't jump to line 114 because the condition on line 113 was never true]

114 from tables import ( 

115 Col, 

116 File, 

117 Node, 

118 ) 

119 

120 from pandas.core.internals import Block 

121 

122 

123# versioning attribute 

124_version = "0.15.2" 

125 

126# encoding 

127_default_encoding = "UTF-8" 

128 

129 

130def _ensure_decoded(s): 

131 """if we have bytes, decode them to unicode""" 

132 if isinstance(s, np.bytes_): 

133 s = s.decode("UTF-8") 

134 return s 

135 

136 

137def _ensure_encoding(encoding): 

138 # set the encoding if we need 

139 if encoding is None: 

140 encoding = _default_encoding 

141 

142 return encoding 

143 

144 

145def _ensure_str(name): 

146 """ 

147 Ensure that an index / column name is a str (python 3); otherwise they 

148 may be np.string dtype. Non-string dtypes are passed through unchanged. 

149 

150 https://github.com/pandas-dev/pandas/issues/13492 

151 """ 

152 if isinstance(name, str): 

153 name = str(name) 

154 return name 

155 

156 

157Term = PyTablesExpr 

158 

159 

160def _ensure_term(where, scope_level: int): 

161 """ 

162 Ensure that the where is a Term or a list of Term. 

163 

164 This makes sure that we are capturing the scope of variables that are 

165 passed; the terms are created here with a frame_level=2 (we are 2 levels down) 

166 """ 

167 # only consider list/tuple here as an ndarray is automatically a coordinate 

168 # list 

169 level = scope_level + 1 

170 if isinstance(where, (list, tuple)): 

171 where = [ 

172 Term(term, scope_level=level + 1) if maybe_expression(term) else term 

173 for term in where 

174 if term is not None 

175 ] 

176 elif maybe_expression(where): 

177 where = Term(where, scope_level=level) 

178 return where if where is None or len(where) else None 

179 

180 

181incompatibility_doc: Final = """ 

182where criteria is being ignored as this version [%s] is too old (or 

183not-defined), read the file in and write it out to a new file to upgrade (with 

184the copy_to method) 

185""" 

186 

187attribute_conflict_doc: Final = """ 

188the [%s] attribute of the existing index is [%s] which conflicts with the new 

189[%s], resetting the attribute to None 

190""" 

191 

192performance_doc: Final = """ 

193your performance may suffer as PyTables will pickle object types that it cannot 

194map directly to c-types [inferred_type->%s,key->%s] [items->%s] 

195""" 

196 

197# formats 

198_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"} 

199 

200# axes map 

201_AXES_MAP = {DataFrame: [0]} 

202 

203# register our configuration options 

204dropna_doc: Final = """ 

205: boolean 

206 drop ALL nan rows when appending to a table 

207""" 

208format_doc: Final = """ 

209: format 

210 default format writing format, if None, then 

211 put will default to 'fixed' and append will default to 'table' 

212""" 

213 

214with config.config_prefix("io.hdf"): 

215 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool) 

216 config.register_option( 

217 "default_format", 

218 None, 

219 format_doc, 

220 validator=config.is_one_of_factory(["fixed", "table", None]), 

221 ) 

222 

223# oh the troubles to reduce import time 

224_table_mod = None 

225_table_file_open_policy_is_strict = False 

226 

227 

228def _tables(): 

229 global _table_mod 

230 global _table_file_open_policy_is_strict 

231 if _table_mod is None: 

232 import tables 

233 

234 _table_mod = tables 

235 

236 # set the file open policy 

237 # return the file open policy; this changes as of pytables 3.1 

238 # depending on the HDF5 version 

239 with suppress(AttributeError): 

240 _table_file_open_policy_is_strict = ( 

241 tables.file._FILE_OPEN_POLICY == "strict" 

242 ) 

243 

244 return _table_mod 

245 

246 

247# interface to/from ### 

248 

249 

250def to_hdf( 

251 path_or_buf: FilePath | HDFStore, 

252 key: str, 

253 value: DataFrame | Series, 

254 mode: str = "a", 

255 complevel: int | None = None, 

256 complib: str | None = None, 

257 append: bool = False, 

258 format: str | None = None, 

259 index: bool = True, 

260 min_itemsize: int | dict[str, int] | None = None, 

261 nan_rep=None, 

262 dropna: bool | None = None, 

263 data_columns: Literal[True] | list[str] | None = None, 

264 errors: str = "strict", 

265 encoding: str = "UTF-8", 

266) -> None: 

267 """store this object, close it if we opened it""" 

268 if append: 

269 f = lambda store: store.append( 

270 key, 

271 value, 

272 format=format, 

273 index=index, 

274 min_itemsize=min_itemsize, 

275 nan_rep=nan_rep, 

276 dropna=dropna, 

277 data_columns=data_columns, 

278 errors=errors, 

279 encoding=encoding, 

280 ) 

281 else: 

282 # NB: dropna is not passed to `put` 

283 f = lambda store: store.put( 

284 key, 

285 value, 

286 format=format, 

287 index=index, 

288 min_itemsize=min_itemsize, 

289 nan_rep=nan_rep, 

290 data_columns=data_columns, 

291 errors=errors, 

292 encoding=encoding, 

293 dropna=dropna, 

294 ) 

295 

296 path_or_buf = stringify_path(path_or_buf) 

297 if isinstance(path_or_buf, str): 

298 with HDFStore( 

299 path_or_buf, mode=mode, complevel=complevel, complib=complib 

300 ) as store: 

301 f(store) 

302 else: 

303 f(path_or_buf) 
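
A minimal usage sketch of the public wrapper that ends up in the to_hdf() helper above. The file name, key, and frame are illustrative, and the optional PyTables dependency must be installed:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    # DataFrame.to_hdf dispatches to the module-level to_hdf() shown above
    df.to_hdf("example.h5", key="df", mode="w", format="table")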

304 

305 

306def read_hdf( 

307 path_or_buf: FilePath | HDFStore, 

308 key=None, 

309 mode: str = "r", 

310 errors: str = "strict", 

311 where: str | list | None = None, 

312 start: int | None = None, 

313 stop: int | None = None, 

314 columns: list[str] | None = None, 

315 iterator: bool = False, 

316 chunksize: int | None = None, 

317 **kwargs, 

318): 

319 """ 

320 Read from the store, close it if we opened it. 

321 

322 Retrieve pandas object stored in file, optionally based on where 

323 criteria. 

324 

325 .. warning:: 

326 

327 Pandas uses PyTables for reading and writing HDF5 files, which allows 

328 serializing object-dtype data with pickle when using the "fixed" format. 

329 Loading pickled data received from untrusted sources can be unsafe. 

330 

331 See: https://docs.python.org/3/library/pickle.html for more. 

332 

333 Parameters 

334 ---------- 

335 path_or_buf : str, path object, pandas.HDFStore 

336 Any valid string path is acceptable. Only supports the local file system; 

337 remote URLs and file-like objects are not supported. 

338 

339 If you want to pass in a path object, pandas accepts any 

340 ``os.PathLike``. 

341 

342 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object. 

343 

344 key : object, optional 

345 The group identifier in the store. Can be omitted if the HDF file 

346 contains a single pandas object. 

347 mode : {'r', 'r+', 'a'}, default 'r' 

348 Mode to use when opening the file. Ignored if path_or_buf is a 

349 :class:`pandas.HDFStore`. Default is 'r'. 

350 errors : str, default 'strict' 

351 Specifies how encoding and decoding errors are to be handled. 

352 See the errors argument for :func:`open` for a full list 

353 of options. 

354 where : list, optional 

355 A list of Term (or convertible) objects. 

356 start : int, optional 

357 Row number to start selection. 

358 stop : int, optional 

359 Row number to stop selection. 

360 columns : list, optional 

361 A list of columns names to return. 

362 iterator : bool, optional 

363 Return an iterator object. 

364 chunksize : int, optional 

365 Number of rows to include in an iteration when using an iterator. 

366 **kwargs 

367 Additional keyword arguments passed to HDFStore. 

368 

369 Returns 

370 ------- 

371 item : object 

372 The selected object. Return type depends on the object stored. 

373 

374 See Also 

375 -------- 

376 DataFrame.to_hdf : Write a HDF file from a DataFrame. 

377 HDFStore : Low-level access to HDF files. 

378 

379 Examples 

380 -------- 

381 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP 

382 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP 

383 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP 

384 """ 

385 if mode not in ["r", "r+", "a"]: 

386 raise ValueError( 

387 f"mode {mode} is not allowed while performing a read. " 

388 f"Allowed modes are r, r+ and a." 

389 ) 

390 # grab the scope 

391 if where is not None: 

392 where = _ensure_term(where, scope_level=1) 

393 

394 if isinstance(path_or_buf, HDFStore): 

395 if not path_or_buf.is_open: 

396 raise OSError("The HDFStore must be open for reading.") 

397 

398 store = path_or_buf 

399 auto_close = False 

400 else: 

401 path_or_buf = stringify_path(path_or_buf) 

402 if not isinstance(path_or_buf, str): 

403 raise NotImplementedError( 

404 "Support for generic buffers has not been implemented." 

405 ) 

406 try: 

407 exists = os.path.exists(path_or_buf) 

408 

409 # if filepath is too long 

410 except (TypeError, ValueError): 

411 exists = False 

412 

413 if not exists: 

414 raise FileNotFoundError(f"File {path_or_buf} does not exist") 

415 

416 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs) 

417 # can't auto open/close if we are using an iterator 

418 # so delegate to the iterator 

419 auto_close = True 

420 

421 try: 

422 if key is None: 

423 groups = store.groups() 

424 if len(groups) == 0: 

425 raise ValueError( 

426 "Dataset(s) incompatible with Pandas data types, " 

427 "not table, or no datasets found in HDF5 file." 

428 ) 

429 candidate_only_group = groups[0] 

430 

431 # For the HDF file to have only one dataset, all other groups 

432 # should then be metadata groups for that candidate group. (This 

433 # assumes that the groups() method enumerates parent groups 

434 # before their children.) 

435 for group_to_check in groups[1:]: 

436 if not _is_metadata_of(group_to_check, candidate_only_group): 

437 raise ValueError( 

438 "key must be provided when HDF5 " 

439 "file contains multiple datasets." 

440 ) 

441 key = candidate_only_group._v_pathname 

442 return store.select( 

443 key, 

444 where=where, 

445 start=start, 

446 stop=stop, 

447 columns=columns, 

448 iterator=iterator, 

449 chunksize=chunksize, 

450 auto_close=auto_close, 

451 ) 

452 except (ValueError, TypeError, KeyError): 

453 if not isinstance(path_or_buf, HDFStore): 

454 # if there is an error, close the store if we opened it. 

455 with suppress(AttributeError): 

456 store.close() 

457 

458 raise 
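
A minimal read sketch (file and key names are illustrative). Note that `where` filtering only works for objects written in table format, and the key may be omitted only when the file holds a single pandas object:

    import pandas as pd

    pd.DataFrame({"a": [1, 2, 3]}).to_hdf("example.h5", key="df", mode="w", format="table")
    filtered = pd.read_hdf("example.h5", key="df", where="index < 2")
    only = pd.read_hdf("example.h5")   # key omitted: the file contains a single dataset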

459 

460 

461def _is_metadata_of(group: Node, parent_group: Node) -> bool: 

462 """Check if a given group is a metadata group for a given parent_group.""" 

463 if group._v_depth <= parent_group._v_depth: 

464 return False 

465 

466 current = group 

467 while current._v_depth > 1: 

468 parent = current._v_parent 

469 if parent == parent_group and current._v_name == "meta": 

470 return True 

471 current = current._v_parent 

472 return False 

473 

474 

475class HDFStore: 

476 """ 

477 Dict-like IO interface for storing pandas objects in PyTables. 

478 

479 Either Fixed or Table format. 

480 

481 .. warning:: 

482 

483 Pandas uses PyTables for reading and writing HDF5 files, which allows 

484 serializing object-dtype data with pickle when using the "fixed" format. 

485 Loading pickled data received from untrusted sources can be unsafe. 

486 

487 See: https://docs.python.org/3/library/pickle.html for more. 

488 

489 Parameters 

490 ---------- 

491 path : str 

492 File path to HDF5 file. 

493 mode : {'a', 'w', 'r', 'r+'}, default 'a' 

494 

495 ``'r'`` 

496 Read-only; no data can be modified. 

497 ``'w'`` 

498 Write; a new file is created (an existing file with the same 

499 name would be deleted). 

500 ``'a'`` 

501 Append; an existing file is opened for reading and writing, 

502 and if the file does not exist it is created. 

503 ``'r+'`` 

504 It is similar to ``'a'``, but the file must already exist. 

505 complevel : int, 0-9, default None 

506 Specifies a compression level for data. 

507 A value of 0 or None disables compression. 

508 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' 

509 Specifies the compression library to be used. 

510 As of v0.20.2 these additional compressors for Blosc are supported 

511 (default if no compressor specified: 'blosc:blosclz'): 

512 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 

513 'blosc:zlib', 'blosc:zstd'}. 

514 Specifying a compression library which is not available issues 

515 a ValueError. 

516 fletcher32 : bool, default False 

517 If applying compression use the fletcher32 checksum. 

518 **kwargs 

519 These parameters will be passed to the PyTables open_file method. 

520 

521 Examples 

522 -------- 

523 >>> bar = pd.DataFrame(np.random.randn(10, 4)) 

524 >>> store = pd.HDFStore('test.h5') 

525 >>> store['foo'] = bar # write to HDF5 

526 >>> bar = store['foo'] # retrieve 

527 >>> store.close() 

528 

529 **Create or load HDF5 file in-memory** 

530 

531 When passing the `driver` option to the PyTables open_file method through 

532 **kwargs, the HDF5 file is loaded or created in-memory and will only be 

533 written when closed: 

534 

535 >>> bar = pd.DataFrame(np.random.randn(10, 4)) 

536 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE') 

537 >>> store['foo'] = bar 

538 >>> store.close() # only now, data is written to disk 

539 """ 

540 

541 _handle: File | None 

542 _mode: str 

543 _complevel: int 

544 _fletcher32: bool 

545 

546 def __init__( 

547 self, 

548 path, 

549 mode: str = "a", 

550 complevel: int | None = None, 

551 complib=None, 

552 fletcher32: bool = False, 

553 **kwargs, 

554 ) -> None: 

555 

556 if "format" in kwargs: 

557 raise ValueError("format is not a defined argument for HDFStore") 

558 

559 tables = import_optional_dependency("tables") 

560 

561 if complib is not None and complib not in tables.filters.all_complibs: 

562 raise ValueError( 

563 f"complib only supports {tables.filters.all_complibs} compression." 

564 ) 

565 

566 if complib is None and complevel is not None: 

567 complib = tables.filters.default_complib 

568 

569 self._path = stringify_path(path) 

570 if mode is None: 

571 mode = "a" 

572 self._mode = mode 

573 self._handle = None 

574 self._complevel = complevel if complevel else 0 

575 self._complib = complib 

576 self._fletcher32 = fletcher32 

577 self._filters = None 

578 self.open(mode=mode, **kwargs) 

579 

580 def __fspath__(self) -> str: 

581 return self._path 

582 

583 @property 

584 def root(self): 

585 """return the root node""" 

586 self._check_if_open() 

587 assert self._handle is not None # for mypy 

588 return self._handle.root 

589 

590 @property 

591 def filename(self) -> str: 

592 return self._path 

593 

594 def __getitem__(self, key: str): 

595 return self.get(key) 

596 

597 def __setitem__(self, key: str, value) -> None: 

598 self.put(key, value) 

599 

600 def __delitem__(self, key: str) -> None: 

601 return self.remove(key) 

602 

603 def __getattr__(self, name: str): 

604 """allow attribute access to get stores""" 

605 try: 

606 return self.get(name) 

607 except (KeyError, ClosedFileError): 

608 pass 

609 raise AttributeError( 

610 f"'{type(self).__name__}' object has no attribute '{name}'" 

611 ) 

612 

613 def __contains__(self, key: str) -> bool: 

614 """ 

615 check for existence of this key 

616 can match the exact pathname or the pathname w/o the leading '/' 

617 """ 

618 node = self.get_node(key) 

619 if node is not None: 

620 name = node._v_pathname 

621 if name == key or name[1:] == key: 

622 return True 

623 return False 

624 

625 def __len__(self) -> int: 

626 return len(self.groups()) 

627 

628 def __repr__(self) -> str: 

629 pstr = pprint_thing(self._path) 

630 return f"{type(self)}\nFile path: {pstr}\n" 

631 

632 def __enter__(self) -> HDFStore: 

633 return self 

634 

635 def __exit__(self, exc_type, exc_value, traceback) -> None: 

636 self.close() 

637 

638 def keys(self, include: str = "pandas") -> list[str]: 

639 """ 

640 Return a list of keys corresponding to objects stored in HDFStore. 

641 

642 Parameters 

643 ---------- 

644 

645 include : str, default 'pandas' 

646 When include equals 'pandas' return pandas objects. 

647 When include equals 'native' return native HDF5 Table objects. 

648 

649 .. versionadded:: 1.1.0 

650 

651 Returns 

652 ------- 

653 list 

654 List of ABSOLUTE path-names (i.e. they have the leading '/'). 

655 

656 Raises 

657 ------ 

658 raises ValueError if include has an illegal value 

659 """ 

660 if include == "pandas": 

661 return [n._v_pathname for n in self.groups()] 

662 

663 elif include == "native": 

664 assert self._handle is not None # mypy 

665 return [ 

666 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table") 

667 ] 

668 raise ValueError( 

669 f"`include` should be either 'pandas' or 'native' but is '{include}'" 

670 ) 

671 

672 def __iter__(self) -> Iterator[str]: 

673 return iter(self.keys()) 

674 

675 def items(self) -> Iterator[tuple[str, list]]: 

676 """ 

677 iterate on key->group 

678 """ 

679 for g in self.groups(): 

680 yield g._v_pathname, g 

681 

682 def iteritems(self): 

683 """ 

684 iterate on key->group 

685 """ 

686 warnings.warn( 

687 "iteritems is deprecated and will be removed in a future version. " 

688 "Use .items instead.", 

689 FutureWarning, 

690 stacklevel=find_stack_level(), 

691 ) 

692 yield from self.items() 

693 

694 def open(self, mode: str = "a", **kwargs) -> None: 

695 """ 

696 Open the file in the specified mode 

697 

698 Parameters 

699 ---------- 

700 mode : {'a', 'w', 'r', 'r+'}, default 'a' 

701 See HDFStore docstring or tables.open_file for info about modes 

702 **kwargs 

703 These parameters will be passed to the PyTables open_file method. 

704 """ 

705 tables = _tables() 

706 

707 if self._mode != mode: 

708 # if we are changing a write mode to read, ok 

709 if self._mode in ["a", "w"] and mode in ["r", "r+"]: 

710 pass 

711 elif mode in ["w"]: 

712 # this would truncate, raise here 

713 if self.is_open: 

714 raise PossibleDataLossError( 

715 f"Re-opening the file [{self._path}] with mode [{self._mode}] " 

716 "will delete the current file!" 

717 ) 

718 

719 self._mode = mode 

720 

721 # close and reopen the handle 

722 if self.is_open: 

723 self.close() 

724 

725 if self._complevel and self._complevel > 0: 

726 self._filters = _tables().Filters( 

727 self._complevel, self._complib, fletcher32=self._fletcher32 

728 ) 

729 

730 if _table_file_open_policy_is_strict and self.is_open: 

731 msg = ( 

732 "Cannot open HDF5 file, which is already opened, " 

733 "even in read-only mode." 

734 ) 

735 raise ValueError(msg) 

736 

737 self._handle = tables.open_file(self._path, self._mode, **kwargs) 

738 

739 def close(self) -> None: 

740 """ 

741 Close the PyTables file handle 

742 """ 

743 if self._handle is not None: 

744 self._handle.close() 

745 self._handle = None 

746 

747 @property 

748 def is_open(self) -> bool: 

749 """ 

750 return a boolean indicating whether the file is open 

751 """ 

752 if self._handle is None: 

753 return False 

754 return bool(self._handle.isopen) 

755 

756 def flush(self, fsync: bool = False) -> None: 

757 """ 

758 Force all buffered modifications to be written to disk. 

759 

760 Parameters 

761 ---------- 

762 fsync : bool (default False) 

763 call ``os.fsync()`` on the file handle to force writing to disk. 

764 

765 Notes 

766 ----- 

767 Without ``fsync=True``, flushing may not guarantee that the OS writes 

768 to disk. With fsync, the operation will block until the OS claims the 

769 file has been written; however, other caching layers may still 

770 interfere. 

771 """ 

772 if self._handle is not None: 

773 self._handle.flush() 

774 if fsync: 

775 with suppress(OSError): 

776 os.fsync(self._handle.fileno()) 

777 

778 def get(self, key: str): 

779 """ 

780 Retrieve pandas object stored in file. 

781 

782 Parameters 

783 ---------- 

784 key : str 

785 

786 Returns 

787 ------- 

788 object 

789 Same type as object stored in file. 

790 """ 

791 with patch_pickle(): 

792 # GH#31167 Without this patch, pickle doesn't know how to unpickle 

793 # old DateOffset objects now that they are cdef classes. 

794 group = self.get_node(key) 

795 if group is None: 

796 raise KeyError(f"No object named {key} in the file") 

797 return self._read_group(group) 

798 

799 def select( 

800 self, 

801 key: str, 

802 where=None, 

803 start=None, 

804 stop=None, 

805 columns=None, 

806 iterator=False, 

807 chunksize=None, 

808 auto_close: bool = False, 

809 ): 

810 """ 

811 Retrieve pandas object stored in file, optionally based on where criteria. 

812 

813 .. warning:: 

814 

815 Pandas uses PyTables for reading and writing HDF5 files, which allows 

816 serializing object-dtype data with pickle when using the "fixed" format. 

817 Loading pickled data received from untrusted sources can be unsafe. 

818 

819 See: https://docs.python.org/3/library/pickle.html for more. 

820 

821 Parameters 

822 ---------- 

823 key : str 

824 Object being retrieved from file. 

825 where : list or None 

826 List of Term (or convertible) objects, optional. 

827 start : int or None 

828 Row number to start selection. 

829 stop : int, default None 

830 Row number to stop selection. 

831 columns : list or None 

832 A list of columns that if not None, will limit the return columns. 

833 iterator : bool, default False 

834 Returns an iterator. 

835 chunksize : int or None 

836 Number of rows to include in iteration; returns an iterator. 

837 auto_close : bool, default False 

838 Should automatically close the store when finished. 

839 

840 Returns 

841 ------- 

842 object 

843 Retrieved object from file. 

844 """ 

845 group = self.get_node(key) 

846 if group is None: 

847 raise KeyError(f"No object named {key} in the file") 

848 

849 # create the storer and axes 

850 where = _ensure_term(where, scope_level=1) 

851 s = self._create_storer(group) 

852 s.infer_axes() 

853 

854 # function to call on iteration 

855 def func(_start, _stop, _where): 

856 return s.read(start=_start, stop=_stop, where=_where, columns=columns) 

857 

858 # create the iterator 

859 it = TableIterator( 

860 self, 

861 s, 

862 func, 

863 where=where, 

864 nrows=s.nrows, 

865 start=start, 

866 stop=stop, 

867 iterator=iterator, 

868 chunksize=chunksize, 

869 auto_close=auto_close, 

870 ) 

871 

872 return it.get_result() 
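
An illustrative select() sketch; the frame is written in table format with 'a' as a data column, since otherwise the where clause could not be evaluated (file and key names are made up):

    import pandas as pd

    with pd.HDFStore("example.h5", mode="w") as store:
        store.put("df", pd.DataFrame({"a": [1, 2, 3]}), format="table", data_columns=["a"])
        subset = store.select("df", where="a > 1", columns=["a"])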

873 

874 def select_as_coordinates( 

875 self, 

876 key: str, 

877 where=None, 

878 start: int | None = None, 

879 stop: int | None = None, 

880 ): 

881 """ 

882 return the selection as an Index 

883 

884 .. warning:: 

885 

886 Pandas uses PyTables for reading and writing HDF5 files, which allows 

887 serializing object-dtype data with pickle when using the "fixed" format. 

888 Loading pickled data received from untrusted sources can be unsafe. 

889 

890 See: https://docs.python.org/3/library/pickle.html for more. 

891 

892 

893 Parameters 

894 ---------- 

895 key : str 

896 where : list of Term (or convertible) objects, optional 

897 start : integer (defaults to None), row number to start selection 

898 stop : integer (defaults to None), row number to stop selection 

899 """ 

900 where = _ensure_term(where, scope_level=1) 

901 tbl = self.get_storer(key) 

902 if not isinstance(tbl, Table): 

903 raise TypeError("can only read_coordinates with a table") 

904 return tbl.read_coordinates(where=where, start=start, stop=stop) 

905 

906 def select_column( 

907 self, 

908 key: str, 

909 column: str, 

910 start: int | None = None, 

911 stop: int | None = None, 

912 ): 

913 """ 

914 return a single column from the table. This is generally only useful to 

915 select an indexable 

916 

917 .. warning:: 

918 

919 Pandas uses PyTables for reading and writing HDF5 files, which allows 

920 serializing object-dtype data with pickle when using the "fixed" format. 

921 Loading pickled data received from untrusted sources can be unsafe. 

922 

923 See: https://docs.python.org/3/library/pickle.html for more. 

924 

925 Parameters 

926 ---------- 

927 key : str 

928 column : str 

929 The column of interest. 

930 start : int or None, default None 

931 stop : int or None, default None 

932 

933 Raises 

934 ------ 

935 raises KeyError if the column is not found (or key is not a valid 

936 store) 

937 raises ValueError if the column can not be extracted individually (it 

938 is part of a data block) 

939 

940 """ 

941 tbl = self.get_storer(key) 

942 if not isinstance(tbl, Table): 

943 raise TypeError("can only read_column with a table") 

944 return tbl.read_column(column=column, start=start, stop=stop) 

945 

946 def select_as_multiple( 

947 self, 

948 keys, 

949 where=None, 

950 selector=None, 

951 columns=None, 

952 start=None, 

953 stop=None, 

954 iterator=False, 

955 chunksize=None, 

956 auto_close: bool = False, 

957 ): 

958 """ 

959 Retrieve pandas objects from multiple tables. 

960 

961 .. warning:: 

962 

963 Pandas uses PyTables for reading and writing HDF5 files, which allows 

964 serializing object-dtype data with pickle when using the "fixed" format. 

965 Loading pickled data received from untrusted sources can be unsafe. 

966 

967 See: https://docs.python.org/3/library/pickle.html for more. 

968 

969 Parameters 

970 ---------- 

971 keys : a list of the tables 

972 selector : the table to apply the where criteria (defaults to keys[0] 

973 if not supplied) 

974 columns : the columns I want back 

975 start : integer (defaults to None), row number to start selection 

976 stop : integer (defaults to None), row number to stop selection 

977 iterator : bool, return an iterator, default False 

978 chunksize : nrows to include in iteration, return an iterator 

979 auto_close : bool, default False 

980 Should automatically close the store when finished. 

981 

982 Raises 

983 ------ 

984 raises KeyError if keys or selector is not found or keys is empty 

985 raises TypeError if keys is not a list or tuple 

986 raises ValueError if the tables are not ALL THE SAME DIMENSIONS 

987 """ 

988 # default to single select 

989 where = _ensure_term(where, scope_level=1) 

990 if isinstance(keys, (list, tuple)) and len(keys) == 1: 

991 keys = keys[0] 

992 if isinstance(keys, str): 

993 return self.select( 

994 key=keys, 

995 where=where, 

996 columns=columns, 

997 start=start, 

998 stop=stop, 

999 iterator=iterator, 

1000 chunksize=chunksize, 

1001 auto_close=auto_close, 

1002 ) 

1003 

1004 if not isinstance(keys, (list, tuple)): 

1005 raise TypeError("keys must be a list/tuple") 

1006 

1007 if not len(keys): 

1008 raise ValueError("keys must have a non-zero length") 

1009 

1010 if selector is None: 

1011 selector = keys[0] 

1012 

1013 # collect the tables 

1014 tbls = [self.get_storer(k) for k in keys] 

1015 s = self.get_storer(selector) 

1016 

1017 # validate rows 

1018 nrows = None 

1019 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): 

1020 if t is None: 

1021 raise KeyError(f"Invalid table [{k}]") 

1022 if not t.is_table: 

1023 raise TypeError( 

1024 f"object [{t.pathname}] is not a table, and cannot be used in all " 

1025 "select as multiple" 

1026 ) 

1027 

1028 if nrows is None: 

1029 nrows = t.nrows 

1030 elif t.nrows != nrows: 

1031 raise ValueError("all tables must have exactly the same nrows!") 

1032 

1033 # The isinstance checks here are redundant with the check above, 

1034 # but necessary for mypy; see GH#29757 

1035 _tbls = [x for x in tbls if isinstance(x, Table)] 

1036 

1037 # axis is the concatenation axis 

1038 axis = list({t.non_index_axes[0][0] for t in _tbls})[0] 

1039 

1040 def func(_start, _stop, _where): 

1041 

1042 # retrieve the objs, _where is always passed as a set of 

1043 # coordinates here 

1044 objs = [ 

1045 t.read(where=_where, columns=columns, start=_start, stop=_stop) 

1046 for t in tbls 

1047 ] 

1048 

1049 # concat and return 

1050 return concat(objs, axis=axis, verify_integrity=False)._consolidate() 

1051 

1052 # create the iterator 

1053 it = TableIterator( 

1054 self, 

1055 s, 

1056 func, 

1057 where=where, 

1058 nrows=nrows, 

1059 start=start, 

1060 stop=stop, 

1061 iterator=iterator, 

1062 chunksize=chunksize, 

1063 auto_close=auto_close, 

1064 ) 

1065 

1066 return it.get_result(coordinates=True) 
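
A sketch of selecting across two table nodes that share the same index and row count (names are illustrative; the selector table is where the where criteria are applied):

    import pandas as pd

    with pd.HDFStore("example.h5", mode="w") as store:
        df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
        store.append("df_a", df[["a"]])
        store.append("df_b", df[["b"]])
        wide = store.select_as_multiple(["df_a", "df_b"], where="index < 2", selector="df_a")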

1067 

1068 def put( 

1069 self, 

1070 key: str, 

1071 value: DataFrame | Series, 

1072 format=None, 

1073 index=True, 

1074 append=False, 

1075 complib=None, 

1076 complevel: int | None = None, 

1077 min_itemsize: int | dict[str, int] | None = None, 

1078 nan_rep=None, 

1079 data_columns: Literal[True] | list[str] | None = None, 

1080 encoding=None, 

1081 errors: str = "strict", 

1082 track_times: bool = True, 

1083 dropna: bool = False, 

1084 ) -> None: 

1085 """ 

1086 Store object in HDFStore. 

1087 

1088 Parameters 

1089 ---------- 

1090 key : str 

1091 value : {Series, DataFrame} 

1092 format : 'fixed(f)|table(t)', default is 'fixed' 

1093 Format to use when storing object in HDFStore. Value can be one of: 

1094 

1095 ``'fixed'`` 

1096 Fixed format. Fast writing/reading. Not-appendable, nor searchable. 

1097 ``'table'`` 

1098 Table format. Write as a PyTables Table structure which may perform 

1099 worse but allow more flexible operations like searching / selecting 

1100 subsets of the data. 

1101 index : bool, default True 

1102 Write DataFrame index as a column. 

1103 append : bool, default False 

1104 This will force Table format, append the input data to the existing. 

1105 data_columns : list of columns or True, default None 

1106 List of columns to create as data columns, or True to use all columns. 

1107 See `here 

1108 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__. 

1109 encoding : str, default None 

1110 Provide an encoding for strings. 

1111 track_times : bool, default True 

1112 Parameter is propagated to 'create_table' method of 'PyTables'. 

1113 If set to False it allows having identical h5 files (same hashes) 

1114 independent of creation time. 

1115 dropna : bool, default False, optional 

1116 Remove missing values. 

1117 

1118 .. versionadded:: 1.1.0 

1119 """ 

1120 if format is None: 

1121 format = get_option("io.hdf.default_format") or "fixed" 

1122 format = self._validate_format(format) 

1123 self._write_to_group( 

1124 key, 

1125 value, 

1126 format=format, 

1127 index=index, 

1128 append=append, 

1129 complib=complib, 

1130 complevel=complevel, 

1131 min_itemsize=min_itemsize, 

1132 nan_rep=nan_rep, 

1133 data_columns=data_columns, 

1134 encoding=encoding, 

1135 errors=errors, 

1136 track_times=track_times, 

1137 dropna=dropna, 

1138 ) 
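
An illustrative put() sketch contrasting the two formats (file and key names are made up):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    with pd.HDFStore("example.h5", mode="w") as store:
        store.put("df_fixed", df)                                       # 'fixed': fast, not queryable
        store.put("df_table", df, format="table", data_columns=True)    # queryable with where=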

1139 

1140 def remove(self, key: str, where=None, start=None, stop=None) -> None: 

1141 """ 

1142 Remove pandas object partially by specifying the where condition 

1143 

1144 Parameters 

1145 ---------- 

1146 key : str 

1147 Node to remove or delete rows from 

1148 where : list of Term (or convertible) objects, optional 

1149 start : integer (defaults to None), row number to start selection 

1150 stop : integer (defaults to None), row number to stop selection 

1151 

1152 Returns 

1153 ------- 

1154 number of rows removed (or None if not a Table) 

1155 

1156 Raises 

1157 ------ 

1158 raises KeyError if key is not a valid store 

1159 

1160 """ 

1161 where = _ensure_term(where, scope_level=1) 

1162 try: 

1163 s = self.get_storer(key) 

1164 except KeyError: 

1165 # the key is not a valid store, re-raising KeyError 

1166 raise 

1167 except AssertionError: 

1168 # surface any assertion errors for e.g. debugging 

1169 raise 

1170 except Exception as err: 

1171 # In tests we get here with ClosedFileError, TypeError, and 

1172 # _table_mod.NoSuchNodeError. TODO: Catch only these? 

1173 

1174 if where is not None: 

1175 raise ValueError( 

1176 "trying to remove a node with a non-None where clause!" 

1177 ) from err 

1178 

1179 # we are actually trying to remove a node (with children) 

1180 node = self.get_node(key) 

1181 if node is not None: 

1182 node._f_remove(recursive=True) 

1183 return None 

1184 

1185 # remove the node 

1186 if com.all_none(where, start, stop): 

1187 s.group._f_remove(recursive=True) 

1188 

1189 # delete from the table 

1190 else: 

1191 if not s.is_table: 

1192 raise ValueError( 

1193 "can only remove with where on objects written as tables" 

1194 ) 

1195 return s.delete(where=where, start=start, stop=stop) 

1196 

1197 def append( 

1198 self, 

1199 key: str, 

1200 value: DataFrame | Series, 

1201 format=None, 

1202 axes=None, 

1203 index=True, 

1204 append=True, 

1205 complib=None, 

1206 complevel: int | None = None, 

1207 columns=None, 

1208 min_itemsize: int | dict[str, int] | None = None, 

1209 nan_rep=None, 

1210 chunksize=None, 

1211 expectedrows=None, 

1212 dropna: bool | None = None, 

1213 data_columns: Literal[True] | list[str] | None = None, 

1214 encoding=None, 

1215 errors: str = "strict", 

1216 ) -> None: 

1217 """ 

1218 Append to Table in file. 

1219 

1220 Node must already exist and be Table format. 

1221 

1222 Parameters 

1223 ---------- 

1224 key : str 

1225 value : {Series, DataFrame} 

1226 format : 'table' is the default 

1227 Format to use when storing object in HDFStore. Value can be one of: 

1228 

1229 ``'table'`` 

1230 Table format. Write as a PyTables Table structure which may perform 

1231 worse but allow more flexible operations like searching / selecting 

1232 subsets of the data. 

1233 index : bool, default True 

1234 Write DataFrame index as a column. 

1235 append : bool, default True 

1236 Append the input data to the existing. 

1237 data_columns : list of columns, or True, default None 

1238 List of columns to create as indexed data columns for on-disk 

1239 queries, or True to use all columns. By default only the axes 

1240 of the object are indexed. See `here 

1241 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__. 

1242 min_itemsize : dict of columns that specify minimum str sizes 

1243 nan_rep : str to use as str nan representation 

1244 chunksize : size to chunk the writing 

1245 expectedrows : expected TOTAL row size of this table 

1246 encoding : default None, provide an encoding for str 

1247 dropna : bool, default False, optional 

1248 Do not write an ALL nan row to the store, settable 

1249 by the option 'io.hdf.dropna_table'. 

1250 

1251 Notes 

1252 ----- 

1253 Does *not* check if data being appended overlaps with existing 

1254 data in the table, so be careful 

1255 """ 

1256 if columns is not None: 

1257 raise TypeError( 

1258 "columns is not a supported keyword in append, try data_columns" 

1259 ) 

1260 

1261 if dropna is None: 

1262 dropna = get_option("io.hdf.dropna_table") 

1263 if format is None: 

1264 format = get_option("io.hdf.default_format") or "table" 

1265 format = self._validate_format(format) 

1266 self._write_to_group( 

1267 key, 

1268 value, 

1269 format=format, 

1270 axes=axes, 

1271 index=index, 

1272 append=append, 

1273 complib=complib, 

1274 complevel=complevel, 

1275 min_itemsize=min_itemsize, 

1276 nan_rep=nan_rep, 

1277 chunksize=chunksize, 

1278 expectedrows=expectedrows, 

1279 dropna=dropna, 

1280 data_columns=data_columns, 

1281 encoding=encoding, 

1282 errors=errors, 

1283 ) 
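
An illustrative append() sketch writing in chunks; min_itemsize reserves string column width up front so that later, longer strings still fit (names and sizes are made up):

    import pandas as pd

    with pd.HDFStore("example.h5", mode="w") as store:
        for i in range(3):
            chunk = pd.DataFrame({"a": [2 * i, 2 * i + 1], "s": ["x", "yy"]})
            store.append("df", chunk, data_columns=["a"], min_itemsize={"s": 10})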

1284 

1285 def append_to_multiple( 

1286 self, 

1287 d: dict, 

1288 value, 

1289 selector, 

1290 data_columns=None, 

1291 axes=None, 

1292 dropna=False, 

1293 **kwargs, 

1294 ) -> None: 

1295 """ 

1296 Append to multiple tables 

1297 

1298 Parameters 

1299 ---------- 

1300 d : a dict of table_name to table_columns, None is acceptable as the 

1301 values of one node (this will get all the remaining columns) 

1302 value : a pandas object 

1303 selector : a string that designates the indexable table; all of its 

1304 columns will be designated as data_columns, unless data_columns is 

1305 passed, in which case these are used 

1306 data_columns : list of columns to create as data columns, or True to 

1307 use all columns 

1308 dropna : if evaluates to True, drop rows from all tables if any single 

1309 row in each table has all NaN. Default False. 

1310 

1311 Notes 

1312 ----- 

1313 axes parameter is currently not accepted 

1314 

1315 """ 

1316 if axes is not None: 

1317 raise TypeError( 

1318 "axes is currently not accepted as a parameter to append_to_multiple; " 

1319 "you can create the tables independently instead" 

1320 ) 

1321 

1322 if not isinstance(d, dict): 

1323 raise ValueError( 

1324 "append_to_multiple must have a dictionary specified as the " 

1325 "way to split the value" 

1326 ) 

1327 

1328 if selector not in d: 

1329 raise ValueError( 

1330 "append_to_multiple requires a selector that is in passed dict" 

1331 ) 

1332 

1333 # figure out the splitting axis (the non_index_axis) 

1334 axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] 

1335 

1336 # figure out how to split the value 

1337 remain_key = None 

1338 remain_values: list = [] 

1339 for k, v in d.items(): 

1340 if v is None: 

1341 if remain_key is not None: 

1342 raise ValueError( 

1343 "append_to_multiple can only have one value in d that is None" 

1344 ) 

1345 remain_key = k 

1346 else: 

1347 remain_values.extend(v) 

1348 if remain_key is not None: 

1349 ordered = value.axes[axis] 

1350 ordd = ordered.difference(Index(remain_values)) 

1351 ordd = sorted(ordered.get_indexer(ordd)) 

1352 d[remain_key] = ordered.take(ordd) 

1353 

1354 # data_columns 

1355 if data_columns is None: 

1356 data_columns = d[selector] 

1357 

1358 # ensure rows are synchronized across the tables 

1359 if dropna: 

1360 idxs = (value[cols].dropna(how="all").index for cols in d.values()) 

1361 valid_index = next(idxs) 

1362 for index in idxs: 

1363 valid_index = valid_index.intersection(index) 

1364 value = value.loc[valid_index] 

1365 

1366 min_itemsize = kwargs.pop("min_itemsize", None) 

1367 

1368 # append 

1369 for k, v in d.items(): 

1370 dc = data_columns if k == selector else None 

1371 

1372 # compute the val 

1373 val = value.reindex(v, axis=axis) 

1374 

1375 filtered = ( 

1376 {key: value for (key, value) in min_itemsize.items() if key in v} 

1377 if min_itemsize is not None 

1378 else None 

1379 ) 

1380 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs) 
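
A minimal sketch of splitting one frame across two table nodes (names are illustrative; the None entry receives all remaining columns):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
    with pd.HDFStore("example.h5", mode="w") as store:
        store.append_to_multiple({"df_a": ["a"], "df_b": None}, df, selector="df_a")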

1381 

1382 def create_table_index( 

1383 self, 

1384 key: str, 

1385 columns=None, 

1386 optlevel: int | None = None, 

1387 kind: str | None = None, 

1388 ) -> None: 

1389 """ 

1390 Create a pytables index on the table. 

1391 

1392 Parameters 

1393 ---------- 

1394 key : str 

1395 columns : None, bool, or listlike[str] 

1396 Indicate which columns to create an index on. 

1397 

1398 * False : Do not create any indexes. 

1399 * True : Create indexes on all columns. 

1400 * None : Create indexes on all columns. 

1401 * listlike : Create indexes on the given columns. 

1402 

1403 optlevel : int or None, default None 

1404 Optimization level, if None, pytables defaults to 6. 

1405 kind : str or None, default None 

1406 Kind of index, if None, pytables defaults to "medium". 

1407 

1408 Raises 

1409 ------ 

1410 TypeError: raises if the node is not a table 

1411 """ 

1412 # version requirements 

1413 _tables() 

1414 s = self.get_storer(key) 

1415 if s is None: 

1416 return 

1417 

1418 if not isinstance(s, Table): 

1419 raise TypeError("cannot create table index on a Fixed format store") 

1420 s.create_index(columns=columns, optlevel=optlevel, kind=kind) 
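
An illustrative sketch of writing without indexes and indexing a data column afterwards, which can be faster for bulk loads (names are made up):

    import pandas as pd

    df = pd.DataFrame({"a": range(5), "b": list("vwxyz")})
    with pd.HDFStore("example.h5", mode="w") as store:
        store.append("df", df, data_columns=["a"], index=False)   # skip index creation on write
        store.create_table_index("df", columns=["a"], optlevel=9, kind="full")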

1421 

1422 def groups(self) -> list: 

1423 """ 

1424 Return a list of all the top-level nodes. 

1425 

1426 Each node returned is not a pandas storage object. 

1427 

1428 Returns 

1429 ------- 

1430 list 

1431 List of objects. 

1432 """ 

1433 _tables() 

1434 self._check_if_open() 

1435 assert self._handle is not None # for mypy 

1436 assert _table_mod is not None # for mypy 

1437 return [ 

1438 g 

1439 for g in self._handle.walk_groups() 

1440 if ( 

1441 not isinstance(g, _table_mod.link.Link) 

1442 and ( 

1443 getattr(g._v_attrs, "pandas_type", None) 

1444 or getattr(g, "table", None) 

1445 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table") 

1446 ) 

1447 ) 

1448 ] 

1449 

1450 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]: 

1451 """ 

1452 Walk the pytables group hierarchy for pandas objects. 

1453 

1454 This generator will yield the group path, subgroups and pandas object 

1455 names for each group. 

1456 

1457 Any non-pandas PyTables objects that are not a group will be ignored. 

1458 

1459 The `where` group itself is listed first (preorder), then each of its 

1460 child groups (following an alphanumerical order) is also traversed, 

1461 following the same procedure. 

1462 

1463 Parameters 

1464 ---------- 

1465 where : str, default "/" 

1466 Group where to start walking. 

1467 

1468 Yields 

1469 ------ 

1470 path : str 

1471 Full path to a group (without trailing '/'). 

1472 groups : list 

1473 Names (strings) of the groups contained in `path`. 

1474 leaves : list 

1475 Names (strings) of the pandas objects contained in `path`. 

1476 """ 

1477 _tables() 

1478 self._check_if_open() 

1479 assert self._handle is not None # for mypy 

1480 assert _table_mod is not None # for mypy 

1481 

1482 for g in self._handle.walk_groups(where): 

1483 if getattr(g._v_attrs, "pandas_type", None) is not None: 

1484 continue 

1485 

1486 groups = [] 

1487 leaves = [] 

1488 for child in g._v_children.values(): 

1489 pandas_type = getattr(child._v_attrs, "pandas_type", None) 

1490 if pandas_type is None: 

1491 if isinstance(child, _table_mod.group.Group): 

1492 groups.append(child._v_name) 

1493 else: 

1494 leaves.append(child._v_name) 

1495 

1496 yield (g._v_pathname.rstrip("/"), groups, leaves) 

1497 

1498 def get_node(self, key: str) -> Node | None: 

1499 """return the node with the key or None if it does not exist""" 

1500 self._check_if_open() 

1501 if not key.startswith("/"): 

1502 key = "/" + key 

1503 

1504 assert self._handle is not None 

1505 assert _table_mod is not None # for mypy 

1506 try: 

1507 node = self._handle.get_node(self.root, key) 

1508 except _table_mod.exceptions.NoSuchNodeError: 

1509 return None 

1510 

1511 assert isinstance(node, _table_mod.Node), type(node) 

1512 return node 

1513 

1514 def get_storer(self, key: str) -> GenericFixed | Table: 

1515 """return the storer object for a key, raise if not in the file""" 

1516 group = self.get_node(key) 

1517 if group is None: 

1518 raise KeyError(f"No object named {key} in the file") 

1519 

1520 s = self._create_storer(group) 

1521 s.infer_axes() 

1522 return s 

1523 

1524 def copy( 

1525 self, 

1526 file, 

1527 mode="w", 

1528 propindexes: bool = True, 

1529 keys=None, 

1530 complib=None, 

1531 complevel: int | None = None, 

1532 fletcher32: bool = False, 

1533 overwrite=True, 

1534 ) -> HDFStore: 

1535 """ 

1536 Copy the existing store to a new file, updating in place. 

1537 

1538 Parameters 

1539 ---------- 

1540 propindexes : bool, default True 

1541 Restore indexes in copied file. 

1542 keys : list, optional 

1543 List of keys to include in the copy (defaults to all). 

1544 overwrite : bool, default True 

1545 Whether to overwrite (remove and replace) existing nodes in the new store. 

1546 mode, complib, complevel, fletcher32 same as in HDFStore.__init__ 

1547 

1548 Returns 

1549 ------- 

1550 open file handle of the new store 

1551 """ 

1552 new_store = HDFStore( 

1553 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 

1554 ) 

1555 if keys is None: 

1556 keys = list(self.keys()) 

1557 if not isinstance(keys, (tuple, list)): 

1558 keys = [keys] 

1559 for k in keys: 

1560 s = self.get_storer(k) 

1561 if s is not None: 

1562 

1563 if k in new_store: 

1564 if overwrite: 

1565 new_store.remove(k) 

1566 

1567 data = self.select(k) 

1568 if isinstance(s, Table): 

1569 

1570 index: bool | list[str] = False 

1571 if propindexes: 

1572 index = [a.name for a in s.axes if a.is_indexed] 

1573 new_store.append( 

1574 k, 

1575 data, 

1576 index=index, 

1577 data_columns=getattr(s, "data_columns", None), 

1578 encoding=s.encoding, 

1579 ) 

1580 else: 

1581 new_store.put(k, data, encoding=s.encoding) 

1582 

1583 return new_store 
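
A sketch of copying a store into a new, compressed file (names are illustrative; the returned HDFStore is open and should be closed by the caller):

    import pandas as pd

    with pd.HDFStore("example.h5", mode="w") as src:
        src.put("df", pd.DataFrame({"a": [1, 2]}), format="table")
        dst = src.copy("compressed.h5", complib="blosc", complevel=9, propindexes=True)
        dst.close()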

1584 

1585 def info(self) -> str: 

1586 """ 

1587 Print detailed information on the store. 

1588 

1589 Returns 

1590 ------- 

1591 str 

1592 """ 

1593 path = pprint_thing(self._path) 

1594 output = f"{type(self)}\nFile path: {path}\n" 

1595 

1596 if self.is_open: 

1597 lkeys = sorted(self.keys()) 

1598 if len(lkeys): 

1599 keys = [] 

1600 values = [] 

1601 

1602 for k in lkeys: 

1603 try: 

1604 s = self.get_storer(k) 

1605 if s is not None: 

1606 keys.append(pprint_thing(s.pathname or k)) 

1607 values.append(pprint_thing(s or "invalid_HDFStore node")) 

1608 except AssertionError: 

1609 # surface any assertion errors for e.g. debugging 

1610 raise 

1611 except Exception as detail: 

1612 keys.append(k) 

1613 dstr = pprint_thing(detail) 

1614 values.append(f"[invalid_HDFStore node: {dstr}]") 

1615 

1616 output += adjoin(12, keys, values) 

1617 else: 

1618 output += "Empty" 

1619 else: 

1620 output += "File is CLOSED" 

1621 

1622 return output 

1623 

1624 # ------------------------------------------------------------------------ 

1625 # private methods 

1626 

1627 def _check_if_open(self): 

1628 if not self.is_open: 

1629 raise ClosedFileError(f"{self._path} file is not open!") 

1630 

1631 def _validate_format(self, format: str) -> str: 

1632 """validate / deprecate formats""" 

1633 # validate 

1634 try: 

1635 format = _FORMAT_MAP[format.lower()] 

1636 except KeyError as err: 

1637 raise TypeError(f"invalid HDFStore format specified [{format}]") from err 

1638 

1639 return format 

1640 

1641 def _create_storer( 

1642 self, 

1643 group, 

1644 format=None, 

1645 value: DataFrame | Series | None = None, 

1646 encoding: str = "UTF-8", 

1647 errors: str = "strict", 

1648 ) -> GenericFixed | Table: 

1649 """return a suitable class to operate""" 

1650 cls: type[GenericFixed] | type[Table] 

1651 

1652 if value is not None and not isinstance(value, (Series, DataFrame)): 

1653 raise TypeError("value must be None, Series, or DataFrame") 

1654 

1655 def error(t): 

1656 # return instead of raising so mypy can tell where we are raising 

1657 return TypeError( 

1658 f"cannot properly create the storer for: [{t}] [group->" 

1659 f"{group},value->{type(value)},format->{format}" 

1660 ) 

1661 

1662 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) 

1663 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) 

1664 

1665 # infer the pt from the passed value 

1666 if pt is None: 

1667 if value is None: 

1668 _tables() 

1669 assert _table_mod is not None # for mypy 

1670 if getattr(group, "table", None) or isinstance( 

1671 group, _table_mod.table.Table 

1672 ): 

1673 pt = "frame_table" 

1674 tt = "generic_table" 

1675 else: 

1676 raise TypeError( 

1677 "cannot create a storer if the object is not existing " 

1678 "nor a value are passed" 

1679 ) 

1680 else: 

1681 if isinstance(value, Series): 

1682 pt = "series" 

1683 else: 

1684 pt = "frame" 

1685 

1686 # we are actually a table 

1687 if format == "table": 

1688 pt += "_table" 

1689 

1690 # a storer node 

1691 if "table" not in pt: 

1692 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed} 

1693 try: 

1694 cls = _STORER_MAP[pt] 

1695 except KeyError as err: 

1696 raise error("_STORER_MAP") from err 

1697 return cls(self, group, encoding=encoding, errors=errors) 

1698 

1699 # existing node (and must be a table) 

1700 if tt is None: 

1701 # if we are a writer, determine the tt 

1702 if value is not None: 

1703 if pt == "series_table": 

1704 index = getattr(value, "index", None) 

1705 if index is not None: 

1706 if index.nlevels == 1: 

1707 tt = "appendable_series" 

1708 elif index.nlevels > 1: 

1709 tt = "appendable_multiseries" 

1710 elif pt == "frame_table": 

1711 index = getattr(value, "index", None) 

1712 if index is not None: 

1713 if index.nlevels == 1: 

1714 tt = "appendable_frame" 

1715 elif index.nlevels > 1: 

1716 tt = "appendable_multiframe" 

1717 

1718 _TABLE_MAP = { 

1719 "generic_table": GenericTable, 

1720 "appendable_series": AppendableSeriesTable, 

1721 "appendable_multiseries": AppendableMultiSeriesTable, 

1722 "appendable_frame": AppendableFrameTable, 

1723 "appendable_multiframe": AppendableMultiFrameTable, 

1724 "worm": WORMTable, 

1725 } 

1726 try: 

1727 cls = _TABLE_MAP[tt] 

1728 except KeyError as err: 

1729 raise error("_TABLE_MAP") from err 

1730 

1731 return cls(self, group, encoding=encoding, errors=errors) 

1732 

1733 def _write_to_group( 

1734 self, 

1735 key: str, 

1736 value: DataFrame | Series, 

1737 format, 

1738 axes=None, 

1739 index=True, 

1740 append=False, 

1741 complib=None, 

1742 complevel: int | None = None, 

1743 fletcher32=None, 

1744 min_itemsize: int | dict[str, int] | None = None, 

1745 chunksize=None, 

1746 expectedrows=None, 

1747 dropna=False, 

1748 nan_rep=None, 

1749 data_columns=None, 

1750 encoding=None, 

1751 errors: str = "strict", 

1752 track_times: bool = True, 

1753 ) -> None: 

1754 # we don't want to store a table node at all if our object is 0-len 

1755 # as there are no dtypes 

1756 if getattr(value, "empty", None) and (format == "table" or append): 

1757 return 

1758 

1759 group = self._identify_group(key, append) 

1760 

1761 s = self._create_storer(group, format, value, encoding=encoding, errors=errors) 

1762 if append: 

1763 # raise if we are trying to append to a Fixed format, 

1764 # or a table that exists (and we are putting) 

1765 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists): 

1766 raise ValueError("Can only append to Tables") 

1767 if not s.is_exists: 

1768 s.set_object_info() 

1769 else: 

1770 s.set_object_info() 

1771 

1772 if not s.is_table and complib: 

1773 raise ValueError("Compression not supported on Fixed format stores") 

1774 

1775 # write the object 

1776 s.write( 

1777 obj=value, 

1778 axes=axes, 

1779 append=append, 

1780 complib=complib, 

1781 complevel=complevel, 

1782 fletcher32=fletcher32, 

1783 min_itemsize=min_itemsize, 

1784 chunksize=chunksize, 

1785 expectedrows=expectedrows, 

1786 dropna=dropna, 

1787 nan_rep=nan_rep, 

1788 data_columns=data_columns, 

1789 track_times=track_times, 

1790 ) 

1791 

1792 if isinstance(s, Table) and index: 

1793 s.create_index(columns=index) 

1794 

1795 def _read_group(self, group: Node): 

1796 s = self._create_storer(group) 

1797 s.infer_axes() 

1798 return s.read() 

1799 

1800 def _identify_group(self, key: str, append: bool) -> Node: 

1801 """Identify HDF5 group based on key, delete/create group if needed.""" 

1802 group = self.get_node(key) 

1803 

1804 # we make this assertion for mypy; the get_node call will already 

1805 # have raised if this is incorrect 

1806 assert self._handle is not None 

1807 

1808 # remove the node if we are not appending 

1809 if group is not None and not append: 

1810 self._handle.remove_node(group, recursive=True) 

1811 group = None 

1812 

1813 if group is None: 

1814 group = self._create_nodes_and_group(key) 

1815 

1816 return group 

1817 

1818 def _create_nodes_and_group(self, key: str) -> Node: 

1819 """Create nodes from key and return group name.""" 

1820 # assertion for mypy 

1821 assert self._handle is not None 

1822 

1823 paths = key.split("/") 

1824 # recursively create the groups 

1825 path = "/" 

1826 for p in paths: 

1827 if not len(p): 

1828 continue 

1829 new_path = path 

1830 if not path.endswith("/"): 

1831 new_path += "/" 

1832 new_path += p 

1833 group = self.get_node(new_path) 

1834 if group is None: 

1835 group = self._handle.create_group(path, p) 

1836 path = new_path 

1837 return group 

1838 

1839 

1840class TableIterator: 

1841 """ 

1842 Define the iteration interface on a table 

1843 

1844 Parameters 

1845 ---------- 

1846 store : HDFStore 

1847 s : the referred storer 

1848 func : the function to execute the query 

1849 where : the where of the query 

1850 nrows : the rows to iterate on 

1851 start : the passed start value (default is None) 

1852 stop : the passed stop value (default is None) 

1853 iterator : bool, default False 

1854 Whether to use the default iterator. 

1855 chunksize : the passed chunking value (default is 100000) 

1856 auto_close : bool, default False 

1857 Whether to automatically close the store at the end of iteration. 

1858 """ 

1859 

1860 chunksize: int | None 

1861 store: HDFStore 

1862 s: GenericFixed | Table 

1863 

1864 def __init__( 

1865 self, 

1866 store: HDFStore, 

1867 s: GenericFixed | Table, 

1868 func, 

1869 where, 

1870 nrows, 

1871 start=None, 

1872 stop=None, 

1873 iterator: bool = False, 

1874 chunksize: int | None = None, 

1875 auto_close: bool = False, 

1876 ) -> None: 

1877 self.store = store 

1878 self.s = s 

1879 self.func = func 

1880 self.where = where 

1881 

1882 # set start/stop if they are not set if we are a table 

1883 if self.s.is_table: 

1884 if nrows is None: 

1885 nrows = 0 

1886 if start is None: 

1887 start = 0 

1888 if stop is None: 

1889 stop = nrows 

1890 stop = min(nrows, stop) 

1891 

1892 self.nrows = nrows 

1893 self.start = start 

1894 self.stop = stop 

1895 

1896 self.coordinates = None 

1897 if iterator or chunksize is not None: 

1898 if chunksize is None: 

1899 chunksize = 100000 

1900 self.chunksize = int(chunksize) 

1901 else: 

1902 self.chunksize = None 

1903 

1904 self.auto_close = auto_close 

1905 

1906 def __iter__(self): 

1907 # iterate 

1908 current = self.start 

1909 if self.coordinates is None: 

1910 raise ValueError("Cannot iterate until get_result is called.") 

1911 while current < self.stop: 

1912 stop = min(current + self.chunksize, self.stop) 

1913 value = self.func(None, None, self.coordinates[current:stop]) 

1914 current = stop 

1915 if value is None or not len(value): 

1916 continue 

1917 

1918 yield value 

1919 

1920 self.close() 

1921 

1922 def close(self) -> None: 

1923 if self.auto_close: 

1924 self.store.close() 

1925 

1926 def get_result(self, coordinates: bool = False): 

1927 # return the actual iterator 

1928 if self.chunksize is not None: 

1929 if not isinstance(self.s, Table): 

1930 raise TypeError("can only use an iterator or chunksize on a table") 

1931 

1932 self.coordinates = self.s.read_coordinates(where=self.where) 

1933 

1934 return self 

1935 

1936 # if specified, read via coordinates (necessary for multiple selections)

1937 if coordinates: 

1938 if not isinstance(self.s, Table): 

1939 raise TypeError("can only read_coordinates on a table") 

1940 where = self.s.read_coordinates( 

1941 where=self.where, start=self.start, stop=self.stop 

1942 ) 

1943 else: 

1944 where = self.where 

1945 

1946 # directly return the result 

1947 results = self.func(self.start, self.stop, where) 

1948 self.close() 

1949 return results 
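# Usage sketch for the chunked-read path implemented by TableIterator above:
# HDFStore.select(..., chunksize=...) returns a TableIterator whose
# get_result() hands back `self`, so iterating over it yields DataFrame
# chunks read via coordinates. Assumes PyTables is installed; the file name
# and column names are hypothetical.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": np.arange(1000), "b": np.random.randn(1000)})
with pd.HDFStore("example.h5", mode="w") as store:
    store.append("df", df, data_columns=["a"])
    n = 0
    for chunk in store.select("df", where="a >= 500", chunksize=100):
        n += len(chunk)  # each chunk is a DataFrame of at most 100 rows
    assert n == 500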

1950 

1951 

1952class IndexCol: 

1953 """ 

1954 an index column description class 

1955 

1956 Parameters 

1957 ---------- 

1958 axis : axis which I reference 

1959 values : the ndarray like converted values 

1960 kind : a string description of this type 

1961 typ : the pytables type 

1962 pos : the position in the pytables 

1963 

1964 """ 

1965 

1966 is_an_indexable: bool = True 

1967 is_data_indexable: bool = True 

1968 _info_fields = ["freq", "tz", "index_name"] 

1969 

1970 name: str 

1971 cname: str 

1972 

1973 def __init__( 

1974 self, 

1975 name: str, 

1976 values=None, 

1977 kind=None, 

1978 typ=None, 

1979 cname: str | None = None, 

1980 axis=None, 

1981 pos=None, 

1982 freq=None, 

1983 tz=None, 

1984 index_name=None, 

1985 ordered=None, 

1986 table=None, 

1987 meta=None, 

1988 metadata=None, 

1989 ) -> None: 

1990 

1991 if not isinstance(name, str): 

1992 raise ValueError("`name` must be a str.") 

1993 

1994 self.values = values 

1995 self.kind = kind 

1996 self.typ = typ 

1997 self.name = name 

1998 self.cname = cname or name 

1999 self.axis = axis 

2000 self.pos = pos 

2001 self.freq = freq 

2002 self.tz = tz 

2003 self.index_name = index_name 

2004 self.ordered = ordered 

2005 self.table = table 

2006 self.meta = meta 

2007 self.metadata = metadata 

2008 

2009 if pos is not None: 

2010 self.set_pos(pos) 

2011 

2012 # These are ensured as long as the passed arguments match the 

2013 # constructor annotations. 

2014 assert isinstance(self.name, str) 

2015 assert isinstance(self.cname, str) 

2016 

2017 @property 

2018 def itemsize(self) -> int: 

2019 # Assumes self.typ has already been initialized 

2020 return self.typ.itemsize 

2021 

2022 @property 

2023 def kind_attr(self) -> str: 

2024 return f"{self.name}_kind" 

2025 

2026 def set_pos(self, pos: int) -> None: 

2027 """set the position of this column in the Table""" 

2028 self.pos = pos 

2029 if pos is not None and self.typ is not None: 

2030 self.typ._v_pos = pos 

2031 

2032 def __repr__(self) -> str: 

2033 temp = tuple( 

2034 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) 

2035 ) 

2036 return ",".join( 

2037 [ 

2038 f"{key}->{value}" 

2039 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) 

2040 ] 

2041 ) 

2042 

2043 def __eq__(self, other: Any) -> bool: 

2044 """compare 2 col items""" 

2045 return all( 

2046 getattr(self, a, None) == getattr(other, a, None) 

2047 for a in ["name", "cname", "axis", "pos"] 

2048 ) 

2049 

2050 def __ne__(self, other) -> bool: 

2051 return not self.__eq__(other) 

2052 

2053 @property 

2054 def is_indexed(self) -> bool: 

2055 """return whether I am an indexed column""" 

2056 if not hasattr(self.table, "cols"): 

2057 # e.g. if infer hasn't been called yet, self.table will be None. 

2058 return False 

2059 return getattr(self.table.cols, self.cname).is_indexed 

2060 

2061 def convert( 

2062 self, values: np.ndarray, nan_rep, encoding: str, errors: str 

2063 ) -> tuple[np.ndarray, np.ndarray] | tuple[DatetimeIndex, DatetimeIndex]: 

2064 """ 

2065 Convert the data from this selection to the appropriate pandas type. 

2066 """ 

2067 assert isinstance(values, np.ndarray), type(values) 

2068 

2069 # values is a recarray 

2070 if values.dtype.fields is not None: 

2071 values = values[self.cname] 

2072 

2073 val_kind = _ensure_decoded(self.kind) 

2074 values = _maybe_convert(values, val_kind, encoding, errors) 

2075 

2076 kwargs = {} 

2077 kwargs["name"] = _ensure_decoded(self.index_name) 

2078 

2079 if self.freq is not None: 

2080 kwargs["freq"] = _ensure_decoded(self.freq) 

2081 

2082 factory: type[Index] | type[DatetimeIndex] = Index 

2083 if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype): 

2084 factory = DatetimeIndex 

2085 elif values.dtype == "i8" and "freq" in kwargs: 

2086 # PeriodIndex data is stored as i8 

2087 # error: Incompatible types in assignment (expression has type 

2088 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type 

2089 # "Union[Type[Index], Type[DatetimeIndex]]") 

2090 factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment] 

2091 ordinal=x, **kwds 

2092 ) 

2093 

2094 # making an Index instance could throw a number of different errors 

2095 try: 

2096 new_pd_index = factory(values, **kwargs) 

2097 except ValueError: 

2098 # if the output freq is different than what we recorded,

2099 # it should be None (see also 'doc example part 2') 

2100 if "freq" in kwargs: 

2101 kwargs["freq"] = None 

2102 new_pd_index = factory(values, **kwargs) 

2103 final_pd_index = _set_tz(new_pd_index, self.tz) 

2104 return final_pd_index, final_pd_index 
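# Sketch of the ordinal -> PeriodIndex reconstruction used above: when the
# stored values are int64 and a freq attribute was recorded, the raw ordinals
# are rebuilt into a PeriodIndex (illustration only; the ordinals here are
# arbitrary).
import numpy as np
import pandas as pd

ordinals = np.array([600, 601, 602], dtype="i8")
pi = pd.PeriodIndex(ordinal=ordinals, freq="M")
assert pi.freqstr == "M" and len(pi) == 3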

2105 

2106 def take_data(self): 

2107 """return the values""" 

2108 return self.values 

2109 

2110 @property 

2111 def attrs(self): 

2112 return self.table._v_attrs 

2113 

2114 @property 

2115 def description(self): 

2116 return self.table.description 

2117 

2118 @property 

2119 def col(self): 

2120 """return my current col description""" 

2121 return getattr(self.description, self.cname, None) 

2122 

2123 @property 

2124 def cvalues(self): 

2125 """return my cython values""" 

2126 return self.values 

2127 

2128 def __iter__(self): 

2129 return iter(self.values) 

2130 

2131 def maybe_set_size(self, min_itemsize=None) -> None: 

2132 """ 

2133 maybe set a string col itemsize: 

2134 min_itemsize can be an integer or a dict keyed by this column's name

2135 with an integer size 

2136 """ 

2137 if _ensure_decoded(self.kind) == "string": 

2138 if isinstance(min_itemsize, dict): 

2139 min_itemsize = min_itemsize.get(self.name) 

2140 

2141 if min_itemsize is not None and self.typ.itemsize < min_itemsize: 

2142 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos) 
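# min_itemsize sketch: pre-sizing a string column so later appends with
# longer strings do not overflow the column's itemsize. Assumes PyTables is
# installed; the file name and key are hypothetical.
import pandas as pd

df1 = pd.DataFrame({"s": ["abc"]})
df2 = pd.DataFrame({"s": ["a considerably longer string than before"]})
with pd.HDFStore("strings.h5", mode="w") as store:
    store.append("t", df1, data_columns=["s"], min_itemsize={"s": 50})
    store.append("t", df2)  # fits within the 50-byte column, no ValueError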

2143 

2144 def validate_names(self) -> None: 

2145 pass 

2146 

2147 def validate_and_set(self, handler: AppendableTable, append: bool) -> None: 

2148 self.table = handler.table 

2149 self.validate_col() 

2150 self.validate_attr(append) 

2151 self.validate_metadata(handler) 

2152 self.write_metadata(handler) 

2153 self.set_attr() 

2154 

2155 def validate_col(self, itemsize=None): 

2156 """validate this column: return the compared against itemsize""" 

2157 # validate this column for string truncation (or reset to the max size) 

2158 if _ensure_decoded(self.kind) == "string": 

2159 c = self.col 

2160 if c is not None: 

2161 if itemsize is None: 

2162 itemsize = self.itemsize 

2163 if c.itemsize < itemsize: 

2164 raise ValueError( 

2165 f"Trying to store a string with len [{itemsize}] in " 

2166 f"[{self.cname}] column but\nthis column has a limit of " 

2167 f"[{c.itemsize}]!\nConsider using min_itemsize to " 

2168 "preset the sizes on these columns" 

2169 ) 

2170 return c.itemsize 

2171 

2172 return None 

2173 

2174 def validate_attr(self, append: bool) -> None: 

2175 # check for backwards incompatibility 

2176 if append: 

2177 existing_kind = getattr(self.attrs, self.kind_attr, None) 

2178 if existing_kind is not None and existing_kind != self.kind: 

2179 raise TypeError( 

2180 f"incompatible kind in col [{existing_kind} - {self.kind}]" 

2181 ) 

2182 

2183 def update_info(self, info) -> None: 

2184 """ 

2185 set/update the info for this indexable with the key/value 

2186 if there is a conflict raise/warn as needed 

2187 """ 

2188 for key in self._info_fields: 

2189 

2190 value = getattr(self, key, None) 

2191 idx = info.setdefault(self.name, {}) 

2192 

2193 existing_value = idx.get(key) 

2194 if key in idx and value is not None and existing_value != value: 

2195 # frequency/name just warn 

2196 if key in ["freq", "index_name"]: 

2197 ws = attribute_conflict_doc % (key, existing_value, value) 

2198 warnings.warn( 

2199 ws, AttributeConflictWarning, stacklevel=find_stack_level() 

2200 ) 

2201 

2202 # reset 

2203 idx[key] = None 

2204 setattr(self, key, None) 

2205 

2206 else: 

2207 raise ValueError( 

2208 f"invalid info for [{self.name}] for [{key}], " 

2209 f"existing_value [{existing_value}] conflicts with " 

2210 f"new value [{value}]" 

2211 ) 

2212 else: 

2213 if value is not None or existing_value is not None: 

2214 idx[key] = value 

2215 

2216 def set_info(self, info) -> None: 

2217 """set my state from the passed info""" 

2218 idx = info.get(self.name) 

2219 if idx is not None: 

2220 self.__dict__.update(idx) 

2221 

2222 def set_attr(self) -> None: 

2223 """set the kind for this column""" 

2224 setattr(self.attrs, self.kind_attr, self.kind) 

2225 

2226 def validate_metadata(self, handler: AppendableTable) -> None: 

2227 """validate that kind=category does not change the categories""" 

2228 if self.meta == "category": 

2229 new_metadata = self.metadata 

2230 cur_metadata = handler.read_metadata(self.cname) 

2231 if ( 

2232 new_metadata is not None 

2233 and cur_metadata is not None 

2234 and not array_equivalent(new_metadata, cur_metadata) 

2235 ): 

2236 raise ValueError( 

2237 "cannot append a categorical with " 

2238 "different categories to the existing" 

2239 ) 

2240 

2241 def write_metadata(self, handler: AppendableTable) -> None: 

2242 """set the meta data""" 

2243 if self.metadata is not None: 

2244 handler.write_metadata(self.cname, self.metadata) 
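# Sketch of the category-metadata check above: appending a categorical whose
# category set differs from what is already stored raises. Assumes PyTables;
# the file name is hypothetical.
import pandas as pd

a = pd.DataFrame({"c": pd.Categorical(["x", "y"])})
b = pd.DataFrame({"c": pd.Categorical(["x", "z"])})  # introduces category "z"
with pd.HDFStore("cats.h5", mode="w") as store:
    store.append("t", a, format="table")
    try:
        store.append("t", b)
    except ValueError:
        pass  # "cannot append a categorical with different categories ..."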

2245 

2246 

2247class GenericIndexCol(IndexCol): 

2248 """an index which is not represented in the data of the table""" 

2249 

2250 @property 

2251 def is_indexed(self) -> bool: 

2252 return False 

2253 

2254 # error: Return type "Tuple[Int64Index, Int64Index]" of "convert" 

2255 # incompatible with return type "Union[Tuple[ndarray[Any, Any], 

2256 # ndarray[Any, Any]], Tuple[DatetimeIndex, DatetimeIndex]]" in 

2257 # supertype "IndexCol" 

2258 def convert( # type: ignore[override] 

2259 self, values: np.ndarray, nan_rep, encoding: str, errors: str 

2260 ) -> tuple[Int64Index, Int64Index]: 

2261 """ 

2262 Convert the data from this selection to the appropriate pandas type. 

2263 

2264 Parameters 

2265 ---------- 

2266 values : np.ndarray 

2267 nan_rep : str 

2268 encoding : str 

2269 errors : str 

2270 """ 

2271 assert isinstance(values, np.ndarray), type(values) 

2272 

2273 index = Int64Index(np.arange(len(values))) 

2274 return index, index 

2275 

2276 def set_attr(self) -> None: 

2277 pass 

2278 

2279 

2280class DataCol(IndexCol): 

2281 """ 

2282 a data holding column, by definition this is not indexable 

2283 

2284 Parameters 

2285 ---------- 

2286 data : the actual data 

2287 cname : the column name in the table to hold the data (typically 

2288 values) 

2289 meta : a string description of the metadata 

2290 metadata : the actual metadata 

2291 """ 

2292 

2293 is_an_indexable = False 

2294 is_data_indexable = False 

2295 _info_fields = ["tz", "ordered"] 

2296 

2297 def __init__( 

2298 self, 

2299 name: str, 

2300 values=None, 

2301 kind=None, 

2302 typ=None, 

2303 cname=None, 

2304 pos=None, 

2305 tz=None, 

2306 ordered=None, 

2307 table=None, 

2308 meta=None, 

2309 metadata=None, 

2310 dtype: DtypeArg | None = None, 

2311 data=None, 

2312 ) -> None: 

2313 super().__init__( 

2314 name=name, 

2315 values=values, 

2316 kind=kind, 

2317 typ=typ, 

2318 pos=pos, 

2319 cname=cname, 

2320 tz=tz, 

2321 ordered=ordered, 

2322 table=table, 

2323 meta=meta, 

2324 metadata=metadata, 

2325 ) 

2326 self.dtype = dtype 

2327 self.data = data 

2328 

2329 @property 

2330 def dtype_attr(self) -> str: 

2331 return f"{self.name}_dtype" 

2332 

2333 @property 

2334 def meta_attr(self) -> str: 

2335 return f"{self.name}_meta" 

2336 

2337 def __repr__(self) -> str: 

2338 temp = tuple( 

2339 map( 

2340 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape) 

2341 ) 

2342 ) 

2343 return ",".join( 

2344 [ 

2345 f"{key}->{value}" 

2346 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) 

2347 ] 

2348 ) 

2349 

2350 def __eq__(self, other: Any) -> bool: 

2351 """compare 2 col items""" 

2352 return all( 

2353 getattr(self, a, None) == getattr(other, a, None) 

2354 for a in ["name", "cname", "dtype", "pos"] 

2355 ) 

2356 

2357 def set_data(self, data: ArrayLike) -> None: 

2358 assert data is not None 

2359 assert self.dtype is None 

2360 

2361 data, dtype_name = _get_data_and_dtype_name(data) 

2362 

2363 self.data = data 

2364 self.dtype = dtype_name 

2365 self.kind = _dtype_to_kind(dtype_name) 

2366 

2367 def take_data(self): 

2368 """return the data""" 

2369 return self.data 

2370 

2371 @classmethod 

2372 def _get_atom(cls, values: ArrayLike) -> Col: 

2373 """ 

2374 Get an appropriately typed and shaped pytables.Col object for values. 

2375 """ 

2376 dtype = values.dtype 

2377 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no 

2378 # attribute "itemsize" 

2379 itemsize = dtype.itemsize # type: ignore[union-attr] 

2380 

2381 shape = values.shape 

2382 if values.ndim == 1: 

2383 # EA, use block shape pretending it is 2D 

2384 # TODO(EA2D): not necessary with 2D EAs 

2385 shape = (1, values.size) 

2386 

2387 if isinstance(values, Categorical): 

2388 codes = values.codes 

2389 atom = cls.get_atom_data(shape, kind=codes.dtype.name) 

2390 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): 

2391 atom = cls.get_atom_datetime64(shape) 

2392 elif is_timedelta64_dtype(dtype): 

2393 atom = cls.get_atom_timedelta64(shape) 

2394 elif is_complex_dtype(dtype): 

2395 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0]) 

2396 elif is_string_dtype(dtype): 

2397 atom = cls.get_atom_string(shape, itemsize) 

2398 else: 

2399 atom = cls.get_atom_data(shape, kind=dtype.name) 

2400 

2401 return atom 

2402 

2403 @classmethod 

2404 def get_atom_string(cls, shape, itemsize): 

2405 return _tables().StringCol(itemsize=itemsize, shape=shape[0]) 

2406 

2407 @classmethod 

2408 def get_atom_coltype(cls, kind: str) -> type[Col]: 

2409 """return the PyTables column class for this column""" 

2410 if kind.startswith("uint"): 

2411 k4 = kind[4:] 

2412 col_name = f"UInt{k4}Col" 

2413 elif kind.startswith("period"): 

2414 # we store as integer 

2415 col_name = "Int64Col" 

2416 else: 

2417 kcap = kind.capitalize() 

2418 col_name = f"{kcap}Col" 

2419 

2420 return getattr(_tables(), col_name) 
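# Pure-string sketch of the kind -> PyTables column-class-name mapping used
# by get_atom_coltype above (illustration only; no PyTables import needed).
def _col_name_for_kind(kind):
    if kind.startswith("uint"):
        return f"UInt{kind[4:]}Col"
    if kind.startswith("period"):
        return "Int64Col"  # periods are stored as int64 ordinals
    return f"{kind.capitalize()}Col"

assert _col_name_for_kind("uint32") == "UInt32Col"
assert _col_name_for_kind("float64") == "Float64Col"
assert _col_name_for_kind("period[M]") == "Int64Col"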

2421 

2422 @classmethod 

2423 def get_atom_data(cls, shape, kind: str) -> Col: 

2424 return cls.get_atom_coltype(kind=kind)(shape=shape[0]) 

2425 

2426 @classmethod 

2427 def get_atom_datetime64(cls, shape): 

2428 return _tables().Int64Col(shape=shape[0]) 

2429 

2430 @classmethod 

2431 def get_atom_timedelta64(cls, shape): 

2432 return _tables().Int64Col(shape=shape[0]) 

2433 

2434 @property 

2435 def shape(self): 

2436 return getattr(self.data, "shape", None) 

2437 

2438 @property 

2439 def cvalues(self): 

2440 """return my cython values""" 

2441 return self.data 

2442 

2443 def validate_attr(self, append) -> None: 

2444 """validate that we have the same order as the existing & same dtype""" 

2445 if append: 

2446 existing_fields = getattr(self.attrs, self.kind_attr, None) 

2447 if existing_fields is not None and existing_fields != list(self.values): 

2448 raise ValueError("appended items do not match existing items in table!") 

2449 

2450 existing_dtype = getattr(self.attrs, self.dtype_attr, None) 

2451 if existing_dtype is not None and existing_dtype != self.dtype: 

2452 raise ValueError( 

2453 "appended items dtype do not match existing items dtype in table!" 

2454 ) 

2455 

2456 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): 

2457 """ 

2458 Convert the data from this selection to the appropriate pandas type. 

2459 

2460 Parameters 

2461 ---------- 

2462 values : np.ndarray 

2463 nan_rep : 

2464 encoding : str 

2465 errors : str 

2466 

2467 Returns 

2468 ------- 

2469 index : listlike to become an Index 

2470 data : ndarraylike to become a column 

2471 """ 

2472 assert isinstance(values, np.ndarray), type(values) 

2473 

2474 # values is a recarray 

2475 if values.dtype.fields is not None: 

2476 values = values[self.cname] 

2477 

2478 assert self.typ is not None 

2479 if self.dtype is None: 

2480 # Note: in tests we never have timedelta64 or datetime64, 

2481 # so the _get_data_and_dtype_name may be unnecessary 

2482 converted, dtype_name = _get_data_and_dtype_name(values) 

2483 kind = _dtype_to_kind(dtype_name) 

2484 else: 

2485 converted = values 

2486 dtype_name = self.dtype 

2487 kind = self.kind 

2488 

2489 assert isinstance(converted, np.ndarray) # for mypy 

2490 

2491 # use the meta if needed 

2492 meta = _ensure_decoded(self.meta) 

2493 metadata = self.metadata 

2494 ordered = self.ordered 

2495 tz = self.tz 

2496 

2497 assert dtype_name is not None 

2498 # convert to the correct dtype 

2499 dtype = _ensure_decoded(dtype_name) 

2500 

2501 # reverse converts 

2502 if dtype == "datetime64": 

2503 # recreate with tz if indicated 

2504 converted = _set_tz(converted, tz, coerce=True) 

2505 

2506 elif dtype == "timedelta64": 

2507 converted = np.asarray(converted, dtype="m8[ns]") 

2508 elif dtype == "date": 

2509 try: 

2510 converted = np.asarray( 

2511 [date.fromordinal(v) for v in converted], dtype=object 

2512 ) 

2513 except ValueError: 

2514 converted = np.asarray( 

2515 [date.fromtimestamp(v) for v in converted], dtype=object 

2516 ) 

2517 

2518 elif meta == "category": 

2519 # we have a categorical 

2520 categories = metadata 

2521 codes = converted.ravel() 

2522 

2523 # if we have stored a NaN in the categories 

2524 # then strip it; in theory we could have BOTH 

2525 # -1s in the codes and nulls :< 

2526 if categories is None: 

2527 # Handle case of NaN-only categorical columns in which case 

2528 # the categories are an empty array; when this is stored, 

2529 # pytables cannot write a zero-len array, so on readback 

2530 # the categories would be None and `read_hdf()` would fail. 

2531 categories = Index([], dtype=np.float64) 

2532 else: 

2533 mask = isna(categories) 

2534 if mask.any(): 

2535 categories = categories[~mask] 

2536 codes[codes != -1] -= mask.astype(int).cumsum()._values 

2537 

2538 converted = Categorical.from_codes( 

2539 codes, categories=categories, ordered=ordered 

2540 ) 

2541 

2542 else: 

2543 

2544 try: 

2545 converted = converted.astype(dtype, copy=False) 

2546 except TypeError: 

2547 converted = converted.astype("O", copy=False) 

2548 

2549 # convert nans / decode 

2550 if _ensure_decoded(kind) == "string": 

2551 converted = _unconvert_string_array( 

2552 converted, nan_rep=nan_rep, encoding=encoding, errors=errors 

2553 ) 

2554 

2555 return self.values, converted 

2556 

2557 def set_attr(self) -> None: 

2558 """set the data for this column""" 

2559 setattr(self.attrs, self.kind_attr, self.values) 

2560 setattr(self.attrs, self.meta_attr, self.meta) 

2561 assert self.dtype is not None 

2562 setattr(self.attrs, self.dtype_attr, self.dtype) 

2563 

2564 

2565class DataIndexableCol(DataCol): 

2566 """represent a data column that can be indexed""" 

2567 

2568 is_data_indexable = True 

2569 

2570 def validate_names(self) -> None: 

2571 if not Index(self.values).is_object(): 

2572 # TODO: should the message here be more specifically non-str? 

2573 raise ValueError("cannot have non-object label DataIndexableCol") 

2574 

2575 @classmethod 

2576 def get_atom_string(cls, shape, itemsize): 

2577 return _tables().StringCol(itemsize=itemsize) 

2578 

2579 @classmethod 

2580 def get_atom_data(cls, shape, kind: str) -> Col: 

2581 return cls.get_atom_coltype(kind=kind)() 

2582 

2583 @classmethod 

2584 def get_atom_datetime64(cls, shape): 

2585 return _tables().Int64Col() 

2586 

2587 @classmethod 

2588 def get_atom_timedelta64(cls, shape): 

2589 return _tables().Int64Col() 

2590 

2591 

2592class GenericDataIndexableCol(DataIndexableCol): 

2593 """represent a generic pytables data column""" 

2594 

2595 pass 

2596 

2597 

2598class Fixed: 

2599 """ 

2600 represent an object in my store 

2601 facilitate read/write of various types of objects 

2602 this is an abstract base class 

2603 

2604 Parameters 

2605 ---------- 

2606 parent : HDFStore 

2607 group : Node 

2608 The group node where the table resides. 

2609 """ 

2610 

2611 pandas_kind: str 

2612 format_type: str = "fixed" # GH#30962 needed by dask 

2613 obj_type: type[DataFrame | Series] 

2614 ndim: int 

2615 encoding: str 

2616 parent: HDFStore 

2617 group: Node 

2618 errors: str 

2619 is_table: bool = False 

2620 

2621 def __init__( 

2622 self, 

2623 parent: HDFStore, 

2624 group: Node, 

2625 encoding: str = "UTF-8", 

2626 errors: str = "strict", 

2627 ) -> None: 

2628 assert isinstance(parent, HDFStore), type(parent) 

2629 assert _table_mod is not None # needed for mypy 

2630 assert isinstance(group, _table_mod.Node), type(group) 

2631 self.parent = parent 

2632 self.group = group 

2633 self.encoding = _ensure_encoding(encoding) 

2634 self.errors = errors 

2635 

2636 @property 

2637 def is_old_version(self) -> bool: 

2638 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 

2639 

2640 @property 

2641 def version(self) -> tuple[int, int, int]: 

2642 """compute and set our version""" 

2643 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) 

2644 try: 

2645 version = tuple(int(x) for x in version.split(".")) 

2646 if len(version) == 2: 

2647 version = version + (0,) 

2648 except AttributeError: 

2649 version = (0, 0, 0) 

2650 return version 

2651 

2652 @property 

2653 def pandas_type(self): 

2654 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) 

2655 

2656 def __repr__(self) -> str: 

2657 """return a pretty representation of myself""" 

2658 self.infer_axes() 

2659 s = self.shape 

2660 if s is not None: 

2661 if isinstance(s, (list, tuple)): 

2662 jshape = ",".join([pprint_thing(x) for x in s]) 

2663 s = f"[{jshape}]" 

2664 return f"{self.pandas_type:12.12} (shape->{s})" 

2665 return self.pandas_type 

2666 

2667 def set_object_info(self) -> None: 

2668 """set my pandas type & version""" 

2669 self.attrs.pandas_type = str(self.pandas_kind) 

2670 self.attrs.pandas_version = str(_version) 

2671 

2672 def copy(self) -> Fixed: 

2673 new_self = copy.copy(self) 

2674 return new_self 

2675 

2676 @property 

2677 def shape(self): 

2678 return self.nrows 

2679 

2680 @property 

2681 def pathname(self): 

2682 return self.group._v_pathname 

2683 

2684 @property 

2685 def _handle(self): 

2686 return self.parent._handle 

2687 

2688 @property 

2689 def _filters(self): 

2690 return self.parent._filters 

2691 

2692 @property 

2693 def _complevel(self) -> int: 

2694 return self.parent._complevel 

2695 

2696 @property 

2697 def _fletcher32(self) -> bool: 

2698 return self.parent._fletcher32 

2699 

2700 @property 

2701 def attrs(self): 

2702 return self.group._v_attrs 

2703 

2704 def set_attrs(self) -> None: 

2705 """set our object attributes""" 

2706 pass 

2707 

2708 def get_attrs(self) -> None: 

2709 """get our object attributes""" 

2710 pass 

2711 

2712 @property 

2713 def storable(self): 

2714 """return my storable""" 

2715 return self.group 

2716 

2717 @property 

2718 def is_exists(self) -> bool: 

2719 return False 

2720 

2721 @property 

2722 def nrows(self): 

2723 return getattr(self.storable, "nrows", None) 

2724 

2725 def validate(self, other) -> Literal[True] | None: 

2726 """validate against an existing storable""" 

2727 if other is None: 

2728 return None 

2729 return True 

2730 

2731 def validate_version(self, where=None) -> None: 

2732 """are we trying to operate on an old version?""" 

2733 pass 

2734 

2735 def infer_axes(self) -> bool: 

2736 """ 

2737 infer the axes of my storer 

2738 return a boolean indicating if we have a valid storer or not 

2739 """ 

2740 s = self.storable 

2741 if s is None: 

2742 return False 

2743 self.get_attrs() 

2744 return True 

2745 

2746 def read( 

2747 self, 

2748 where=None, 

2749 columns=None, 

2750 start: int | None = None, 

2751 stop: int | None = None, 

2752 ): 

2753 raise NotImplementedError( 

2754 "cannot read on an abstract storer: subclasses should implement" 

2755 ) 

2756 

2757 def write(self, **kwargs): 

2758 raise NotImplementedError( 

2759 "cannot write on an abstract storer: subclasses should implement" 

2760 ) 

2761 

2762 def delete( 

2763 self, where=None, start: int | None = None, stop: int | None = None 

2764 ) -> None: 

2765 """ 

2766 support deleting the node in its entirety only; the where

2767 specification must be None

2768 """ 

2769 if com.all_none(where, start, stop): 

2770 self._handle.remove_node(self.group, recursive=True) 

2771 return None 

2772 

2773 raise TypeError("cannot delete on an abstract storer") 
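# Sketch of the delete behavior above: a Fixed storer can only be removed in
# its entirety (where/start/stop all None); partial deletion requires
# format="table". Assumes PyTables; the file name is hypothetical.
import pandas as pd

with pd.HDFStore("fixed.h5", mode="w") as store:
    store.put("s", pd.Series([1, 2, 3]), format="fixed")
    store.remove("s")  # fine: removes the whole node
    # store.remove("s", where="index > 0") on a fixed store would raise,
    # because row-wise deletion is only supported for table-format storers.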

2774 

2775 

2776class GenericFixed(Fixed): 

2777 """a generified fixed version""" 

2778 

2779 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} 

2780 _reverse_index_map = {v: k for k, v in _index_type_map.items()} 

2781 attributes: list[str] = [] 

2782 

2783 # indexer helpers 

2784 def _class_to_alias(self, cls) -> str: 

2785 return self._index_type_map.get(cls, "") 

2786 

2787 def _alias_to_class(self, alias): 

2788 if isinstance(alias, type): # pragma: no cover 

2789 # compat: for a short period of time master stored types 

2790 return alias 

2791 return self._reverse_index_map.get(alias, Index) 

2792 

2793 def _get_index_factory(self, attrs): 

2794 index_class = self._alias_to_class( 

2795 _ensure_decoded(getattr(attrs, "index_class", "")) 

2796 ) 

2797 

2798 factory: Callable 

2799 

2800 if index_class == DatetimeIndex: 

2801 

2802 def f(values, freq=None, tz=None): 

2803 # data are already in UTC, localize and convert if tz present 

2804 dta = DatetimeArray._simple_new(values.values, freq=freq) 

2805 result = DatetimeIndex._simple_new(dta, name=None) 

2806 if tz is not None: 

2807 result = result.tz_localize("UTC").tz_convert(tz) 

2808 return result 

2809 

2810 factory = f 

2811 elif index_class == PeriodIndex: 

2812 

2813 def f(values, freq=None, tz=None): 

2814 parr = PeriodArray._simple_new(values, freq=freq) 

2815 return PeriodIndex._simple_new(parr, name=None) 

2816 

2817 factory = f 

2818 else: 

2819 factory = index_class 

2820 

2821 kwargs = {} 

2822 if "freq" in attrs: 

2823 kwargs["freq"] = attrs["freq"] 

2824 if index_class is Index: 

2825 # DTI/PI would be gotten by _alias_to_class 

2826 factory = TimedeltaIndex 

2827 

2828 if "tz" in attrs: 

2829 if isinstance(attrs["tz"], bytes): 

2830 # created by python2 

2831 kwargs["tz"] = attrs["tz"].decode("utf-8") 

2832 else: 

2833 # created by python3 

2834 kwargs["tz"] = attrs["tz"] 

2835 assert index_class is DatetimeIndex # just checking 

2836 

2837 return factory, kwargs 

2838 

2839 def validate_read(self, columns, where) -> None: 

2840 """ 

2841 raise if any keywords are passed which are not None

2842 """ 

2843 if columns is not None: 

2844 raise TypeError( 

2845 "cannot pass a column specification when reading " 

2846 "a Fixed format store. this store must be selected in its entirety" 

2847 ) 

2848 if where is not None: 

2849 raise TypeError( 

2850 "cannot pass a where specification when reading " 

2851 "from a Fixed format store. this store must be selected in its entirety" 

2852 ) 
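# Sketch of validate_read above: passing `where` or `columns` when reading a
# fixed-format store raises TypeError, since such a store can only be read in
# its entirety. Assumes PyTables; the file name is hypothetical.
import pandas as pd

pd.Series(range(3)).to_hdf("f.h5", key="s", format="fixed")
try:
    pd.read_hdf("f.h5", "s", where="index > 0")
except TypeError:
    pass  # fixed stores must be selected in their entirety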

2853 

2854 @property 

2855 def is_exists(self) -> bool: 

2856 return True 

2857 

2858 def set_attrs(self) -> None: 

2859 """set our object attributes""" 

2860 self.attrs.encoding = self.encoding 

2861 self.attrs.errors = self.errors 

2862 

2863 def get_attrs(self) -> None: 

2864 """retrieve our attributes""" 

2865 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) 

2866 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) 

2867 for n in self.attributes: 

2868 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) 

2869 

2870 # error: Signature of "write" incompatible with supertype "Fixed" 

2871 def write(self, obj, **kwargs) -> None: # type: ignore[override] 

2872 self.set_attrs() 

2873 

2874 def read_array(self, key: str, start: int | None = None, stop: int | None = None): 

2875 """read an array for the specified node (off of group""" 

2876 import tables 

2877 

2878 node = getattr(self.group, key) 

2879 attrs = node._v_attrs 

2880 

2881 transposed = getattr(attrs, "transposed", False) 

2882 

2883 if isinstance(node, tables.VLArray): 

2884 ret = node[0][start:stop] 

2885 else: 

2886 dtype = _ensure_decoded(getattr(attrs, "value_type", None)) 

2887 shape = getattr(attrs, "shape", None) 

2888 

2889 if shape is not None: 

2890 # length 0 axis 

2891 ret = np.empty(shape, dtype=dtype) 

2892 else: 

2893 ret = node[start:stop] 

2894 

2895 if dtype == "datetime64": 

2896 # reconstruct a timezone if indicated 

2897 tz = getattr(attrs, "tz", None) 

2898 ret = _set_tz(ret, tz, coerce=True) 

2899 

2900 elif dtype == "timedelta64": 

2901 ret = np.asarray(ret, dtype="m8[ns]") 

2902 

2903 if transposed: 

2904 return ret.T 

2905 else: 

2906 return ret 

2907 

2908 def read_index( 

2909 self, key: str, start: int | None = None, stop: int | None = None 

2910 ) -> Index: 

2911 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) 

2912 

2913 if variety == "multi": 

2914 return self.read_multi_index(key, start=start, stop=stop) 

2915 elif variety == "regular": 

2916 node = getattr(self.group, key) 

2917 index = self.read_index_node(node, start=start, stop=stop) 

2918 return index 

2919 else: # pragma: no cover 

2920 raise TypeError(f"unrecognized index variety: {variety}") 

2921 

2922 def write_index(self, key: str, index: Index) -> None: 

2923 if isinstance(index, MultiIndex): 

2924 setattr(self.attrs, f"{key}_variety", "multi") 

2925 self.write_multi_index(key, index) 

2926 else: 

2927 setattr(self.attrs, f"{key}_variety", "regular") 

2928 converted = _convert_index("index", index, self.encoding, self.errors) 

2929 

2930 self.write_array(key, converted.values) 

2931 

2932 node = getattr(self.group, key) 

2933 node._v_attrs.kind = converted.kind 

2934 node._v_attrs.name = index.name 

2935 

2936 if isinstance(index, (DatetimeIndex, PeriodIndex)): 

2937 node._v_attrs.index_class = self._class_to_alias(type(index)) 

2938 

2939 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): 

2940 node._v_attrs.freq = index.freq 

2941 

2942 if isinstance(index, DatetimeIndex) and index.tz is not None: 

2943 node._v_attrs.tz = _get_tz(index.tz) 

2944 

2945 def write_multi_index(self, key: str, index: MultiIndex) -> None: 

2946 setattr(self.attrs, f"{key}_nlevels", index.nlevels) 

2947 

2948 for i, (lev, level_codes, name) in enumerate( 

2949 zip(index.levels, index.codes, index.names) 

2950 ): 

2951 # write the level 

2952 if is_extension_array_dtype(lev): 

2953 raise NotImplementedError( 

2954 "Saving a MultiIndex with an extension dtype is not supported." 

2955 ) 

2956 level_key = f"{key}_level{i}" 

2957 conv_level = _convert_index(level_key, lev, self.encoding, self.errors) 

2958 self.write_array(level_key, conv_level.values) 

2959 node = getattr(self.group, level_key) 

2960 node._v_attrs.kind = conv_level.kind 

2961 node._v_attrs.name = name 

2962 

2963 # write the name 

2964 setattr(node._v_attrs, f"{key}_name{name}", name) 

2965 

2966 # write the labels 

2967 label_key = f"{key}_label{i}" 

2968 self.write_array(label_key, level_codes) 

2969 

2970 def read_multi_index( 

2971 self, key: str, start: int | None = None, stop: int | None = None 

2972 ) -> MultiIndex: 

2973 nlevels = getattr(self.attrs, f"{key}_nlevels") 

2974 

2975 levels = [] 

2976 codes = [] 

2977 names: list[Hashable] = [] 

2978 for i in range(nlevels): 

2979 level_key = f"{key}_level{i}" 

2980 node = getattr(self.group, level_key) 

2981 lev = self.read_index_node(node, start=start, stop=stop) 

2982 levels.append(lev) 

2983 names.append(lev.name) 

2984 

2985 label_key = f"{key}_label{i}" 

2986 level_codes = self.read_array(label_key, start=start, stop=stop) 

2987 codes.append(level_codes) 

2988 

2989 return MultiIndex( 

2990 levels=levels, codes=codes, names=names, verify_integrity=True 

2991 ) 

2992 

2993 def read_index_node( 

2994 self, node: Node, start: int | None = None, stop: int | None = None 

2995 ) -> Index: 

2996 data = node[start:stop] 

2997 # If the index was an empty array write_array_empty() will 

2998 # have written a sentinel. Here we replace it with the original. 

2999 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: 

3000 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type) 

3001 kind = _ensure_decoded(node._v_attrs.kind) 

3002 name = None 

3003 

3004 if "name" in node._v_attrs: 

3005 name = _ensure_str(node._v_attrs.name) 

3006 name = _ensure_decoded(name) 

3007 

3008 attrs = node._v_attrs 

3009 factory, kwargs = self._get_index_factory(attrs) 

3010 

3011 if kind == "date": 

3012 index = factory( 

3013 _unconvert_index( 

3014 data, kind, encoding=self.encoding, errors=self.errors 

3015 ), 

3016 dtype=object, 

3017 **kwargs, 

3018 ) 

3019 else: 

3020 index = factory( 

3021 _unconvert_index( 

3022 data, kind, encoding=self.encoding, errors=self.errors 

3023 ), 

3024 **kwargs, 

3025 ) 

3026 

3027 index.name = name 

3028 

3029 return index 

3030 

3031 def write_array_empty(self, key: str, value: ArrayLike) -> None: 

3032 """write a 0-len array""" 

3033 # ugly hack for length 0 axes 

3034 arr = np.empty((1,) * value.ndim) 

3035 self._handle.create_array(self.group, key, arr) 

3036 node = getattr(self.group, key) 

3037 node._v_attrs.value_type = str(value.dtype) 

3038 node._v_attrs.shape = value.shape 

3039 

3040 def write_array( 

3041 self, key: str, obj: AnyArrayLike, items: Index | None = None 

3042 ) -> None: 

3043 # TODO: we only have a few tests that get here, the only EA 

3044 # that gets passed is DatetimeArray, and we never have 

3045 # both self._filters and EA 

3046 

3047 value = extract_array(obj, extract_numpy=True) 

3048 

3049 if key in self.group: 

3050 self._handle.remove_node(self.group, key) 

3051 

3052 # Transform needed to interface with pytables row/col notation 

3053 empty_array = value.size == 0 

3054 transposed = False 

3055 

3056 if is_categorical_dtype(value.dtype): 

3057 raise NotImplementedError( 

3058 "Cannot store a category dtype in a HDF5 dataset that uses format=" 

3059 '"fixed". Use format="table".' 

3060 ) 

3061 if not empty_array: 

3062 if hasattr(value, "T"): 

3063 # ExtensionArrays (1d) may not have transpose. 

3064 value = value.T 

3065 transposed = True 

3066 

3067 atom = None 

3068 if self._filters is not None: 

3069 with suppress(ValueError): 

3070 # get the atom for this datatype 

3071 atom = _tables().Atom.from_dtype(value.dtype) 

3072 

3073 if atom is not None: 

3074 # We only get here if self._filters is non-None and 

3075 # the Atom.from_dtype call succeeded 

3076 

3077 # create an empty chunked array and fill it from value 

3078 if not empty_array: 

3079 ca = self._handle.create_carray( 

3080 self.group, key, atom, value.shape, filters=self._filters 

3081 ) 

3082 ca[:] = value 

3083 

3084 else: 

3085 self.write_array_empty(key, value) 

3086 

3087 elif value.dtype.type == np.object_: 

3088 # infer the type, warn if we have a non-string type here (for 

3089 # performance) 

3090 inferred_type = lib.infer_dtype(value, skipna=False) 

3091 if empty_array: 

3092 pass 

3093 elif inferred_type == "string": 

3094 pass 

3095 else: 

3096 ws = performance_doc % (inferred_type, key, items) 

3097 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level()) 

3098 

3099 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) 

3100 vlarr.append(value) 

3101 

3102 elif is_datetime64_dtype(value.dtype): 

3103 self._handle.create_array(self.group, key, value.view("i8")) 

3104 getattr(self.group, key)._v_attrs.value_type = "datetime64" 

3105 elif is_datetime64tz_dtype(value.dtype): 

3106 # store as UTC 

3107 # with a zone 

3108 

3109 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no 

3110 # attribute "asi8" 

3111 self._handle.create_array( 

3112 self.group, key, value.asi8 # type: ignore[union-attr] 

3113 ) 

3114 

3115 node = getattr(self.group, key) 

3116 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no 

3117 # attribute "tz" 

3118 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr] 

3119 node._v_attrs.value_type = "datetime64" 

3120 elif is_timedelta64_dtype(value.dtype): 

3121 self._handle.create_array(self.group, key, value.view("i8")) 

3122 getattr(self.group, key)._v_attrs.value_type = "timedelta64" 

3123 elif empty_array: 

3124 self.write_array_empty(key, value) 

3125 else: 

3126 self._handle.create_array(self.group, key, value) 

3127 

3128 getattr(self.group, key)._v_attrs.transposed = transposed 
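# Two behaviors of the fixed-format write path above, sketched under the
# assumption that PyTables is installed; file names are hypothetical.
import pandas as pd

df = pd.DataFrame({"c": pd.Categorical(["x", "y"])})
try:
    df.to_hdf("fixed.h5", key="df", format="fixed")
except NotImplementedError:
    pass  # category dtype requires format="table"

# Non-string object columns are written via an ObjectAtom (pickled) and emit
# a PerformanceWarning rather than failing outright.
pd.DataFrame({"o": [1, "a"]}).to_hdf("fixed2.h5", key="df", format="fixed")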

3129 

3130 

3131class SeriesFixed(GenericFixed): 

3132 pandas_kind = "series" 

3133 attributes = ["name"] 

3134 

3135 name: Hashable 

3136 

3137 @property 

3138 def shape(self): 

3139 try: 

3140 return (len(self.group.values),) 

3141 except (TypeError, AttributeError): 

3142 return None 

3143 

3144 def read( 

3145 self, 

3146 where=None, 

3147 columns=None, 

3148 start: int | None = None, 

3149 stop: int | None = None, 

3150 ) -> Series: 

3151 self.validate_read(columns, where) 

3152 index = self.read_index("index", start=start, stop=stop) 

3153 values = self.read_array("values", start=start, stop=stop) 

3154 return Series(values, index=index, name=self.name) 

3155 

3156 # error: Signature of "write" incompatible with supertype "Fixed" 

3157 def write(self, obj, **kwargs) -> None: # type: ignore[override] 

3158 super().write(obj, **kwargs) 

3159 self.write_index("index", obj.index) 

3160 self.write_array("values", obj) 

3161 self.attrs.name = obj.name 

3162 

3163 

3164class BlockManagerFixed(GenericFixed): 

3165 attributes = ["ndim", "nblocks"] 

3166 

3167 nblocks: int 

3168 

3169 @property 

3170 def shape(self) -> Shape | None: 

3171 try: 

3172 ndim = self.ndim 

3173 

3174 # items 

3175 items = 0 

3176 for i in range(self.nblocks): 

3177 node = getattr(self.group, f"block{i}_items") 

3178 shape = getattr(node, "shape", None) 

3179 if shape is not None: 

3180 items += shape[0] 

3181 

3182 # data shape 

3183 node = self.group.block0_values 

3184 shape = getattr(node, "shape", None) 

3185 if shape is not None: 

3186 shape = list(shape[0 : (ndim - 1)]) 

3187 else: 

3188 shape = [] 

3189 

3190 shape.append(items) 

3191 

3192 return shape 

3193 except AttributeError: 

3194 return None 

3195 

3196 def read( 

3197 self, 

3198 where=None, 

3199 columns=None, 

3200 start: int | None = None, 

3201 stop: int | None = None, 

3202 ) -> DataFrame: 

3203 # start, stop applied to rows, so 0th axis only 

3204 self.validate_read(columns, where) 

3205 select_axis = self.obj_type()._get_block_manager_axis(0) 

3206 

3207 axes = [] 

3208 for i in range(self.ndim): 

3209 

3210 _start, _stop = (start, stop) if i == select_axis else (None, None) 

3211 ax = self.read_index(f"axis{i}", start=_start, stop=_stop) 

3212 axes.append(ax) 

3213 

3214 items = axes[0] 

3215 dfs = [] 

3216 

3217 for i in range(self.nblocks): 

3218 

3219 blk_items = self.read_index(f"block{i}_items") 

3220 values = self.read_array(f"block{i}_values", start=_start, stop=_stop) 

3221 

3222 columns = items[items.get_indexer(blk_items)] 

3223 df = DataFrame(values.T, columns=columns, index=axes[1]) 

3224 dfs.append(df) 

3225 

3226 if len(dfs) > 0: 

3227 out = concat(dfs, axis=1) 

3228 out = out.reindex(columns=items, copy=False) 

3229 return out 

3230 

3231 return DataFrame(columns=axes[0], index=axes[1]) 

3232 

3233 # error: Signature of "write" incompatible with supertype "Fixed" 

3234 def write(self, obj, **kwargs) -> None: # type: ignore[override] 

3235 super().write(obj, **kwargs) 

3236 

3237 # TODO(ArrayManager) HDFStore relies on accessing the blocks 

3238 if isinstance(obj._mgr, ArrayManager): 

3239 obj = obj._as_manager("block") 

3240 

3241 data = obj._mgr 

3242 if not data.is_consolidated(): 

3243 data = data.consolidate() 

3244 

3245 self.attrs.ndim = data.ndim 

3246 for i, ax in enumerate(data.axes): 

3247 if i == 0 and (not ax.is_unique): 

3248 raise ValueError("Columns index has to be unique for fixed format") 

3249 self.write_index(f"axis{i}", ax) 

3250 

3251 # Supporting mixed-type DataFrame objects...nontrivial 

3252 self.attrs.nblocks = len(data.blocks) 

3253 for i, blk in enumerate(data.blocks): 

3254 # I have no idea why, but writing values before items fixed #2299 

3255 blk_items = data.items.take(blk.mgr_locs) 

3256 self.write_array(f"block{i}_values", blk.values, items=blk_items) 

3257 self.write_index(f"block{i}_items", blk_items) 
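# The fixed-format writer above rejects non-unique column labels; a sketch
# (assumes PyTables; the file name is hypothetical):
import pandas as pd

dup = pd.DataFrame([[1, 2]], columns=["a", "a"])
try:
    dup.to_hdf("dup.h5", key="df", format="fixed")
except ValueError:
    pass  # "Columns index has to be unique for fixed format"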

3258 

3259 

3260class FrameFixed(BlockManagerFixed): 

3261 pandas_kind = "frame" 

3262 obj_type = DataFrame 

3263 

3264 

3265class Table(Fixed): 

3266 """ 

3267 represent a table: 

3268 facilitate read/write of various types of tables 

3269 

3270 Attrs in Table Node 

3271 ------------------- 

3272 These are attributes that are stored in the main table node; they are

3273 necessary to recreate these tables when read back in. 

3274 

3275 index_axes : a list of tuples of the (original indexing axis and 

3276 index column) 

3277 non_index_axes: a list of tuples of the (original index axis and 

3278 columns on a non-indexing axis) 

3279 values_axes : a list of the columns which comprise the data of this 

3280 table 

3281 data_columns : a list of the columns that we are allowing indexing 

3282 (these become single columns in values_axes) 

3283 nan_rep : the string to use for nan representations for string 

3284 objects 

3285 levels : the names of levels 

3286 metadata : the names of the metadata columns 

3287 """ 

3288 

3289 pandas_kind = "wide_table" 

3290 format_type: str = "table" # GH#30962 needed by dask 

3291 table_type: str 

3292 levels: int | list[Hashable] = 1 

3293 is_table = True 

3294 

3295 index_axes: list[IndexCol] 

3296 non_index_axes: list[tuple[int, Any]] 

3297 values_axes: list[DataCol] 

3298 data_columns: list 

3299 metadata: list 

3300 info: dict 

3301 

3302 def __init__( 

3303 self, 

3304 parent: HDFStore, 

3305 group: Node, 

3306 encoding=None, 

3307 errors: str = "strict", 

3308 index_axes=None, 

3309 non_index_axes=None, 

3310 values_axes=None, 

3311 data_columns=None, 

3312 info=None, 

3313 nan_rep=None, 

3314 ) -> None: 

3315 super().__init__(parent, group, encoding=encoding, errors=errors) 

3316 self.index_axes = index_axes or [] 

3317 self.non_index_axes = non_index_axes or [] 

3318 self.values_axes = values_axes or [] 

3319 self.data_columns = data_columns or [] 

3320 self.info = info or {} 

3321 self.nan_rep = nan_rep 

3322 

3323 @property 

3324 def table_type_short(self) -> str: 

3325 return self.table_type.split("_")[0] 

3326 

3327 def __repr__(self) -> str: 

3328 """return a pretty representation of myself""" 

3329 self.infer_axes() 

3330 jdc = ",".join(self.data_columns) if len(self.data_columns) else "" 

3331 dc = f",dc->[{jdc}]" 

3332 

3333 ver = "" 

3334 if self.is_old_version: 

3335 jver = ".".join([str(x) for x in self.version]) 

3336 ver = f"[{jver}]" 

3337 

3338 jindex_axes = ",".join([a.name for a in self.index_axes]) 

3339 return ( 

3340 f"{self.pandas_type:12.12}{ver} " 

3341 f"(typ->{self.table_type_short},nrows->{self.nrows}," 

3342 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})" 

3343 ) 

3344 

3345 def __getitem__(self, c: str): 

3346 """return the axis for c""" 

3347 for a in self.axes: 

3348 if c == a.name: 

3349 return a 

3350 return None 

3351 

3352 def validate(self, other) -> None: 

3353 """validate against an existing table""" 

3354 if other is None: 

3355 return 

3356 

3357 if other.table_type != self.table_type: 

3358 raise TypeError( 

3359 "incompatible table_type with existing " 

3360 f"[{other.table_type} - {self.table_type}]" 

3361 ) 

3362 

3363 for c in ["index_axes", "non_index_axes", "values_axes"]: 

3364 sv = getattr(self, c, None) 

3365 ov = getattr(other, c, None) 

3366 if sv != ov: 

3367 

3368 # show the error for the specific axes 

3369 # Argument 1 to "enumerate" has incompatible type 

3370 # "Optional[Any]"; expected "Iterable[Any]" [arg-type] 

3371 for i, sax in enumerate(sv): # type: ignore[arg-type] 

3372 # Value of type "Optional[Any]" is not indexable [index] 

3373 oax = ov[i] # type: ignore[index] 

3374 if sax != oax: 

3375 raise ValueError( 

3376 f"invalid combination of [{c}] on appending data " 

3377 f"[{sax}] vs current table [{oax}]" 

3378 ) 

3379 

3380 # should never get here 

3381 raise Exception( 

3382 f"invalid combination of [{c}] on appending data [{sv}] vs " 

3383 f"current table [{ov}]" 

3384 ) 

3385 

3386 @property 

3387 def is_multi_index(self) -> bool: 

3388 """the levels attribute is 1 or a list in the case of a multi-index""" 

3389 return isinstance(self.levels, list) 

3390 

3391 def validate_multiindex( 

3392 self, obj: DataFrame | Series 

3393 ) -> tuple[DataFrame, list[Hashable]]: 

3394 """ 

3395 validate that we can store the multi-index; reset and return the 

3396 new object 

3397 """ 

3398 levels = com.fill_missing_names(obj.index.names) 

3399 try: 

3400 reset_obj = obj.reset_index() 

3401 except ValueError as err: 

3402 raise ValueError( 

3403 "duplicate names/columns in the multi-index when storing as a table" 

3404 ) from err 

3405 assert isinstance(reset_obj, DataFrame) # for mypy 

3406 return reset_obj, levels 
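# Sketch of the MultiIndex handling above: for format="table" the MultiIndex
# is reset into ordinary columns and the level names are recorded so the
# index can be rebuilt on read. Assumes PyTables; names are hypothetical.
import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["k1", "k2"])
df = pd.DataFrame({"v": range(4)}, index=idx)
df.to_hdf("mi.h5", key="df", format="table")
back = pd.read_hdf("mi.h5", "df")
assert list(back.index.names) == ["k1", "k2"]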

3407 

3408 @property 

3409 def nrows_expected(self) -> int: 

3410 """based on our axes, compute the expected nrows""" 

3411 return np.prod([i.cvalues.shape[0] for i in self.index_axes]) 

3412 

3413 @property 

3414 def is_exists(self) -> bool: 

3415 """has this table been created""" 

3416 return "table" in self.group 

3417 

3418 @property 

3419 def storable(self): 

3420 return getattr(self.group, "table", None) 

3421 

3422 @property 

3423 def table(self): 

3424 """return the table group (this is my storable)""" 

3425 return self.storable 

3426 

3427 @property 

3428 def dtype(self): 

3429 return self.table.dtype 

3430 

3431 @property 

3432 def description(self): 

3433 return self.table.description 

3434 

3435 @property 

3436 def axes(self): 

3437 return itertools.chain(self.index_axes, self.values_axes) 

3438 

3439 @property 

3440 def ncols(self) -> int: 

3441 """the number of total columns in the values axes""" 

3442 return sum(len(a.values) for a in self.values_axes) 

3443 

3444 @property 

3445 def is_transposed(self) -> bool: 

3446 return False 

3447 

3448 @property 

3449 def data_orientation(self) -> tuple[int, ...]: 

3450 """return a tuple of my permutated axes, non_indexable at the front""" 

3451 return tuple( 

3452 itertools.chain( 

3453 [int(a[0]) for a in self.non_index_axes], 

3454 [int(a.axis) for a in self.index_axes], 

3455 ) 

3456 ) 

3457 

3458 def queryables(self) -> dict[str, Any]: 

3459 """return a dict of the kinds allowable columns for this object""" 

3460 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here 

3461 axis_names = {0: "index", 1: "columns"} 

3462 

3463 # compute the values_axes queryables 

3464 d1 = [(a.cname, a) for a in self.index_axes] 

3465 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes] 

3466 d3 = [ 

3467 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) 

3468 ] 

3469 

3470 # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" and 

3471 # "List[Tuple[str, None]]") 

3472 return dict(d1 + d2 + d3) # type: ignore[operator] 
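# Only indexables and data_columns are queryable with `where=`; other value
# columns live inside a shared values block and cannot be filtered in the
# query. Assumes PyTables; names are hypothetical.
import pandas as pd

df = pd.DataFrame({"a": range(5), "b": range(5)})
with pd.HDFStore("q.h5", mode="w") as store:
    store.append("df", df, data_columns=["a"])
    store.select("df", where="a > 2")     # ok: "a" is a data column
    # store.select("df", where="b > 2")   # would raise: "b" is not queryable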

3473 

3474 def index_cols(self): 

3475 """return a list of my index cols""" 

3476 # Note: each `i.cname` below is assured to be a str. 

3477 return [(i.axis, i.cname) for i in self.index_axes] 

3478 

3479 def values_cols(self) -> list[str]: 

3480 """return a list of my values cols""" 

3481 return [i.cname for i in self.values_axes] 

3482 

3483 def _get_metadata_path(self, key: str) -> str: 

3484 """return the metadata pathname for this key""" 

3485 group = self.group._v_pathname 

3486 return f"{group}/meta/{key}/meta" 

3487 

3488 def write_metadata(self, key: str, values: np.ndarray) -> None: 

3489 """ 

3490 Write out a metadata array to the key as a table-format Series.

3491 

3492 Parameters 

3493 ---------- 

3494 key : str 

3495 values : ndarray 

3496 """ 

3497 self.parent.put( 

3498 self._get_metadata_path(key), 

3499 Series(values), 

3500 format="table", 

3501 encoding=self.encoding, 

3502 errors=self.errors, 

3503 nan_rep=self.nan_rep, 

3504 ) 

3505 

3506 def read_metadata(self, key: str): 

3507 """return the meta data array for this key""" 

3508 if getattr(getattr(self.group, "meta", None), key, None) is not None: 

3509 return self.parent.select(self._get_metadata_path(key)) 

3510 return None 

3511 

3512 def set_attrs(self) -> None: 

3513 """set our table type & indexables""" 

3514 self.attrs.table_type = str(self.table_type) 

3515 self.attrs.index_cols = self.index_cols() 

3516 self.attrs.values_cols = self.values_cols() 

3517 self.attrs.non_index_axes = self.non_index_axes 

3518 self.attrs.data_columns = self.data_columns 

3519 self.attrs.nan_rep = self.nan_rep 

3520 self.attrs.encoding = self.encoding 

3521 self.attrs.errors = self.errors 

3522 self.attrs.levels = self.levels 

3523 self.attrs.info = self.info 

3524 

3525 def get_attrs(self) -> None: 

3526 """retrieve our attributes""" 

3527 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] 

3528 self.data_columns = getattr(self.attrs, "data_columns", None) or [] 

3529 self.info = getattr(self.attrs, "info", None) or {} 

3530 self.nan_rep = getattr(self.attrs, "nan_rep", None) 

3531 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) 

3532 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) 

3533 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or [] 

3534 self.index_axes = [a for a in self.indexables if a.is_an_indexable] 

3535 self.values_axes = [a for a in self.indexables if not a.is_an_indexable] 

3536 

3537 def validate_version(self, where=None) -> None: 

3538 """are we trying to operate on an old version?""" 

3539 if where is not None: 

3540 if self.is_old_version: 

3541 ws = incompatibility_doc % ".".join([str(x) for x in self.version]) 

3542 warnings.warn( 

3543 ws, 

3544 IncompatibilityWarning, 

3545 stacklevel=find_stack_level(), 

3546 ) 

3547 

3548 def validate_min_itemsize(self, min_itemsize) -> None: 

3549 """ 

3550 validate that min_itemsize doesn't contain items that are not in the

3551 axes; this needs data_columns to be defined

3552 """ 

3553 if min_itemsize is None: 

3554 return 

3555 if not isinstance(min_itemsize, dict): 

3556 return 

3557 

3558 q = self.queryables() 

3559 for k in min_itemsize: 

3560 

3561 # ok, apply generally 

3562 if k == "values": 

3563 continue 

3564 if k not in q: 

3565 raise ValueError( 

3566 f"min_itemsize has the key [{k}] which is not an axis or " 

3567 "data_column" 

3568 ) 

3569 

3570 @cache_readonly 

3571 def indexables(self): 

3572 """create/cache the indexables if they don't exist""" 

3573 _indexables = [] 

3574 

3575 desc = self.description 

3576 table_attrs = self.table.attrs 

3577 

3578 # Note: each of the `name` kwargs below are str, ensured 

3579 # by the definition in index_cols. 

3580 # index columns 

3581 for i, (axis, name) in enumerate(self.attrs.index_cols): 

3582 atom = getattr(desc, name) 

3583 md = self.read_metadata(name) 

3584 meta = "category" if md is not None else None 

3585 

3586 kind_attr = f"{name}_kind" 

3587 kind = getattr(table_attrs, kind_attr, None) 

3588 

3589 index_col = IndexCol( 

3590 name=name, 

3591 axis=axis, 

3592 pos=i, 

3593 kind=kind, 

3594 typ=atom, 

3595 table=self.table, 

3596 meta=meta, 

3597 metadata=md, 

3598 ) 

3599 _indexables.append(index_col) 

3600 

3601 # values columns 

3602 dc = set(self.data_columns) 

3603 base_pos = len(_indexables) 

3604 

3605 def f(i, c): 

3606 assert isinstance(c, str) 

3607 klass = DataCol 

3608 if c in dc: 

3609 klass = DataIndexableCol 

3610 

3611 atom = getattr(desc, c) 

3612 adj_name = _maybe_adjust_name(c, self.version) 

3613 

3614 # TODO: why kind_attr here? 

3615 values = getattr(table_attrs, f"{adj_name}_kind", None) 

3616 dtype = getattr(table_attrs, f"{adj_name}_dtype", None) 

3617 # Argument 1 to "_dtype_to_kind" has incompatible type 

3618 # "Optional[Any]"; expected "str" [arg-type] 

3619 kind = _dtype_to_kind(dtype) # type: ignore[arg-type] 

3620 

3621 md = self.read_metadata(c) 

3622 # TODO: figure out why these two versions of `meta` don't always match.

3623 # meta = "category" if md is not None else None 

3624 meta = getattr(table_attrs, f"{adj_name}_meta", None) 

3625 

3626 obj = klass( 

3627 name=adj_name, 

3628 cname=c, 

3629 values=values, 

3630 kind=kind, 

3631 pos=base_pos + i, 

3632 typ=atom, 

3633 table=self.table, 

3634 meta=meta, 

3635 metadata=md, 

3636 dtype=dtype, 

3637 ) 

3638 return obj 

3639 

3640 # Note: the definition of `values_cols` ensures that each 

3641 # `c` below is a str. 

3642 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)]) 

3643 

3644 return _indexables 

3645 

3646 def create_index( 

3647 self, columns=None, optlevel=None, kind: str | None = None 

3648 ) -> None: 

3649 """ 

3650 Create a pytables index on the specified columns. 

3651 

3652 Parameters 

3653 ---------- 

3654 columns : None, bool, or listlike[str] 

3655 Indicate which columns to create an index on. 

3656 

3657 * False : Do not create any indexes. 

3658 * True : Create indexes on all columns. 

3659 * None : Create indexes on all columns. 

3660 * listlike : Create indexes on the given columns. 

3661 

3662 optlevel : int or None, default None 

3663 Optimization level, if None, pytables defaults to 6. 

3664 kind : str or None, default None 

3665 Kind of index, if None, pytables defaults to "medium". 

3666 

3667 Raises 

3668 ------ 

3669 TypeError if trying to create an index on a complex-type column. 

3670 

3671 Notes 

3672 ----- 

3673 Cannot index Time64Col or ComplexCol. 

3674 Pytables must be >= 3.0. 

3675 """ 

3676 if not self.infer_axes(): 

3677 return 

3678 if columns is False: 

3679 return 

3680 

3681 # index all indexables and data_columns 

3682 if columns is None or columns is True: 

3683 columns = [a.cname for a in self.axes if a.is_data_indexable] 

3684 if not isinstance(columns, (tuple, list)): 

3685 columns = [columns] 

3686 

3687 kw = {} 

3688 if optlevel is not None: 

3689 kw["optlevel"] = optlevel 

3690 if kind is not None: 

3691 kw["kind"] = kind 

3692 

3693 table = self.table 

3694 for c in columns: 

3695 v = getattr(table.cols, c, None) 

3696 if v is not None: 

3697 # remove the index if the kind/optlevel have changed 

3698 if v.is_indexed: 

3699 index = v.index 

3700 cur_optlevel = index.optlevel 

3701 cur_kind = index.kind 

3702 

3703 if kind is not None and cur_kind != kind: 

3704 v.remove_index() 

3705 else: 

3706 kw["kind"] = cur_kind 

3707 

3708 if optlevel is not None and cur_optlevel != optlevel: 

3709 v.remove_index() 

3710 else: 

3711 kw["optlevel"] = cur_optlevel 

3712 

3713 # create the index 

3714 if not v.is_indexed: 

3715 if v.type.startswith("complex"): 

3716 raise TypeError( 

3717 "Columns containing complex values can be stored but " 

3718 "cannot be indexed when using table format. Either use " 

3719 "fixed format, set index=False, or do not include " 

3720 "the columns containing complex values to " 

3721 "data_columns when initializing the table." 

3722 ) 

3723 v.create_index(**kw) 

3724 elif c in self.non_index_axes[0][1]: 

3725 # GH 28156 

3726 raise AttributeError( 

3727 f"column {c} is not a data_column.\n" 

3728 f"In order to read column {c} you must reload the dataframe \n" 

3729 f"into HDFStore and include {c} with the data_columns argument." 

3730 ) 

3731 
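# Illustrative sketch (not part of pytables.py): creating PyTables indexes on a
# table's data_columns through HDFStore. The file and column names are
# hypothetical.
import pandas as pd

df = pd.DataFrame({"A": range(5), "B": list("abcde")})
with pd.HDFStore("example.h5") as store:
    store.put("df", df, format="table", data_columns=["A"], index=False)
    # index the "A" data_column with a full, optlevel-9 index
    store.create_table_index("df", columns=["A"], optlevel=9, kind="full")
    # asking for a column that is not a data_column (e.g. "B") raises AttributeError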

3732 def _read_axes( 

3733 self, where, start: int | None = None, stop: int | None = None 

3734 ) -> list[tuple[ArrayLike, ArrayLike]]: 

3735 """ 

3736 Create the axes sniffed from the table. 

3737 

3738 Parameters 

3739 ---------- 

3740 where : ??? 

3741 start : int or None, default None 

3742 stop : int or None, default None 

3743 

3744 Returns 

3745 ------- 

3746 List[Tuple[index_values, column_values]] 

3747 """ 

3748 # create the selection 

3749 selection = Selection(self, where=where, start=start, stop=stop) 

3750 values = selection.select() 

3751 

3752 results = [] 

3753 # convert the data 

3754 for a in self.axes: 

3755 a.set_info(self.info) 

3756 res = a.convert( 

3757 values, 

3758 nan_rep=self.nan_rep, 

3759 encoding=self.encoding, 

3760 errors=self.errors, 

3761 ) 

3762 results.append(res) 

3763 

3764 return results 

3765 

3766 @classmethod 

3767 def get_object(cls, obj, transposed: bool): 

3768 """return the data for this obj""" 

3769 return obj 

3770 

3771 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): 

3772 """ 

3773 take the input data_columns and min_itemsize and create a data 

3774 columns spec 

3775 """ 

3776 if not len(non_index_axes): 

3777 return [] 

3778 

3779 axis, axis_labels = non_index_axes[0] 

3780 info = self.info.get(axis, {}) 

3781 if info.get("type") == "MultiIndex" and data_columns: 

3782 raise ValueError( 

3783 f"cannot use a multi-index on axis [{axis}] with " 

3784 f"data_columns {data_columns}" 

3785 ) 

3786 

3787 # evaluate the passed data_columns, True == use all columns 

3788 # take only valid axis labels 

3789 if data_columns is True: 

3790 data_columns = list(axis_labels) 

3791 elif data_columns is None: 

3792 data_columns = [] 

3793 

3794 # if min_itemsize is a dict, add the keys (exclude 'values') 

3795 if isinstance(min_itemsize, dict): 

3796 existing_data_columns = set(data_columns) 

3797 data_columns = list(data_columns) # ensure we do not modify 

3798 data_columns.extend( 

3799 [ 

3800 k 

3801 for k in min_itemsize.keys() 

3802 if k != "values" and k not in existing_data_columns 

3803 ] 

3804 ) 

3805 

3806 # return valid columns in the order of our axis 

3807 return [c for c in data_columns if c in axis_labels] 

3808 
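# Illustrative sketch (not part of pytables.py): how data_columns is resolved
# when writing. data_columns=True uses every column, None uses none, and a
# dict-style min_itemsize implicitly promotes its keys to data_columns.
import pandas as pd

df = pd.DataFrame({"A": ["x", "y"], "B": ["u", "v"]})
with pd.HDFStore("example.h5") as store:
    # only "A" becomes individually queryable...
    store.append("d1", df, data_columns=["A"])
    # ...but naming "B" in min_itemsize adds it to the data_columns spec as well
    store.append("d2", df, data_columns=["A"], min_itemsize={"B": 10})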

3809 def _create_axes( 

3810 self, 

3811 axes, 

3812 obj: DataFrame, 

3813 validate: bool = True, 

3814 nan_rep=None, 

3815 data_columns=None, 

3816 min_itemsize=None, 

3817 ): 

3818 """ 

3819 Create and return the axes. 

3820 

3821 Parameters 

3822 ---------- 

3823 axes: list or None 

3824 The names or numbers of the axes to create. 

3825 obj : DataFrame 

3826 The object to create axes on. 

3827 validate: bool, default True 

3828 Whether to validate the obj against an existing object already written. 

3829 nan_rep : 

3830 A value to use for string column nan_rep. 

3831 data_columns : List[str], True, or None, default None 

3832 Specify the columns that we want to create to allow indexing on. 

3833 

3834 * True : Use all available columns. 

3835 * None : Use no columns. 

3836 * List[str] : Use the specified columns. 

3837 

3838 min_itemsize: Dict[str, int] or None, default None 

3839 The min itemsize for a column in bytes. 

3840 """ 

3841 if not isinstance(obj, DataFrame): 

3842 group = self.group._v_name 

3843 raise TypeError( 

3844 f"cannot properly create the storer for: [group->{group}," 

3845 f"value->{type(obj)}]" 

3846 ) 

3847 

3848 # set the default axes if needed 

3849 if axes is None: 

3850 axes = [0] 

3851 

3852 # map axes to numbers 

3853 axes = [obj._get_axis_number(a) for a in axes] 

3854 

3855 # do we have an existing table (if so, use its axes & data_columns) 

3856 if self.infer_axes(): 

3857 table_exists = True 

3858 axes = [a.axis for a in self.index_axes] 

3859 data_columns = list(self.data_columns) 

3860 nan_rep = self.nan_rep 

3861 # TODO: do we always have validate=True here? 

3862 else: 

3863 table_exists = False 

3864 

3865 new_info = self.info 

3866 

3867 assert self.ndim == 2 # with next check, we must have len(axes) == 1 

3868 # currently only support indexers on ndim-1 axes 

3869 if len(axes) != self.ndim - 1: 

3870 raise ValueError( 

3871 "currently only support ndim-1 indexers in an AppendableTable" 

3872 ) 

3873 

3874 # create according to the new data 

3875 new_non_index_axes: list = [] 

3876 

3877 # nan_representation 

3878 if nan_rep is None: 

3879 nan_rep = "nan" 

3880 

3881 # We construct the non-index-axis first, since that alters new_info 

3882 idx = [x for x in [0, 1] if x not in axes][0] 

3883 

3884 a = obj.axes[idx] 

3885 # we might be able to change the axes on the appending data if necessary 

3886 append_axis = list(a) 

3887 if table_exists: 

3888 indexer = len(new_non_index_axes) # i.e. 0 

3889 exist_axis = self.non_index_axes[indexer][1] 

3890 if not array_equivalent(np.array(append_axis), np.array(exist_axis)): 

3891 

3892 # ahah! -> reindex 

3893 if array_equivalent( 

3894 np.array(sorted(append_axis)), np.array(sorted(exist_axis)) 

3895 ): 

3896 append_axis = exist_axis 

3897 

3898 # the non_index_axes info 

3899 info = new_info.setdefault(idx, {}) 

3900 info["names"] = list(a.names) 

3901 info["type"] = type(a).__name__ 

3902 

3903 new_non_index_axes.append((idx, append_axis)) 

3904 

3905 # Now we can construct our new index axis 

3906 idx = axes[0] 

3907 a = obj.axes[idx] 

3908 axis_name = obj._get_axis_name(idx) 

3909 new_index = _convert_index(axis_name, a, self.encoding, self.errors) 

3910 new_index.axis = idx 

3911 

3912 # Because we are always 2D, there is only one new_index, so 

3913 # we know it will have pos=0 

3914 new_index.set_pos(0) 

3915 new_index.update_info(new_info) 

3916 new_index.maybe_set_size(min_itemsize) # check for column conflicts 

3917 

3918 new_index_axes = [new_index] 

3919 j = len(new_index_axes) # i.e. 1 

3920 assert j == 1 

3921 

3922 # reindex by our non_index_axes & compute data_columns 

3923 assert len(new_non_index_axes) == 1 

3924 for a in new_non_index_axes: 

3925 obj = _reindex_axis(obj, a[0], a[1]) 

3926 

3927 transposed = new_index.axis == 1 

3928 

3929 # figure out data_columns and get out blocks 

3930 data_columns = self.validate_data_columns( 

3931 data_columns, min_itemsize, new_non_index_axes 

3932 ) 

3933 

3934 frame = self.get_object(obj, transposed)._consolidate() 

3935 

3936 blocks, blk_items = self._get_blocks_and_items( 

3937 frame, table_exists, new_non_index_axes, self.values_axes, data_columns 

3938 ) 

3939 

3940 # add my values 

3941 vaxes = [] 

3942 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)): 

3943 

3944 # the shape of the data columns is given by the indexable axes 

3945 klass = DataCol 

3946 name = None 

3947 

3948 # we have a data_column 

3949 if data_columns and len(b_items) == 1 and b_items[0] in data_columns: 

3950 klass = DataIndexableCol 

3951 name = b_items[0] 

3952 if not (name is None or isinstance(name, str)): 

3953 # TODO: should the message here be more specifically non-str? 

3954 raise ValueError("cannot have non-object label DataIndexableCol") 

3955 

3956 # make sure that we match up the existing columns 

3957 # if we have an existing table 

3958 existing_col: DataCol | None 

3959 

3960 if table_exists and validate: 

3961 try: 

3962 existing_col = self.values_axes[i] 

3963 except (IndexError, KeyError) as err: 

3964 raise ValueError( 

3965 f"Incompatible appended table [{blocks}]" 

3966 f"with existing table [{self.values_axes}]" 

3967 ) from err 

3968 else: 

3969 existing_col = None 

3970 

3971 new_name = name or f"values_block_{i}" 

3972 data_converted = _maybe_convert_for_string_atom( 

3973 new_name, 

3974 blk.values, 

3975 existing_col=existing_col, 

3976 min_itemsize=min_itemsize, 

3977 nan_rep=nan_rep, 

3978 encoding=self.encoding, 

3979 errors=self.errors, 

3980 columns=b_items, 

3981 ) 

3982 adj_name = _maybe_adjust_name(new_name, self.version) 

3983 

3984 typ = klass._get_atom(data_converted) 

3985 kind = _dtype_to_kind(data_converted.dtype.name) 

3986 tz = None 

3987 if getattr(data_converted, "tz", None) is not None: 

3988 tz = _get_tz(data_converted.tz) 

3989 

3990 meta = metadata = ordered = None 

3991 if is_categorical_dtype(data_converted.dtype): 

3992 ordered = data_converted.ordered 

3993 meta = "category" 

3994 metadata = np.array(data_converted.categories, copy=False).ravel() 

3995 

3996 data, dtype_name = _get_data_and_dtype_name(data_converted) 

3997 

3998 col = klass( 

3999 name=adj_name, 

4000 cname=new_name, 

4001 values=list(b_items), 

4002 typ=typ, 

4003 pos=j, 

4004 kind=kind, 

4005 tz=tz, 

4006 ordered=ordered, 

4007 meta=meta, 

4008 metadata=metadata, 

4009 dtype=dtype_name, 

4010 data=data, 

4011 ) 

4012 col.update_info(new_info) 

4013 

4014 vaxes.append(col) 

4015 

4016 j += 1 

4017 

4018 dcs = [col.name for col in vaxes if col.is_data_indexable] 

4019 

4020 new_table = type(self)( 

4021 parent=self.parent, 

4022 group=self.group, 

4023 encoding=self.encoding, 

4024 errors=self.errors, 

4025 index_axes=new_index_axes, 

4026 non_index_axes=new_non_index_axes, 

4027 values_axes=vaxes, 

4028 data_columns=dcs, 

4029 info=new_info, 

4030 nan_rep=nan_rep, 

4031 ) 

4032 if hasattr(self, "levels"): 

4033 # TODO: get this into constructor, only for appropriate subclass 

4034 new_table.levels = self.levels 

4035 

4036 new_table.validate_min_itemsize(min_itemsize) 

4037 

4038 if validate and table_exists: 

4039 new_table.validate(self) 

4040 

4041 return new_table 

4042 

4043 @staticmethod 

4044 def _get_blocks_and_items( 

4045 frame: DataFrame, 

4046 table_exists: bool, 

4047 new_non_index_axes, 

4048 values_axes, 

4049 data_columns, 

4050 ): 

4051 # Helper to clarify non-state-altering parts of _create_axes 

4052 

4053 # TODO(ArrayManager) HDFStore relies on accessing the blocks 

4054 if isinstance(frame._mgr, ArrayManager): 

4055 frame = frame._as_manager("block") 

4056 

4057 def get_blk_items(mgr): 

4058 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks] 

4059 

4060 mgr = frame._mgr 

4061 mgr = cast(BlockManager, mgr) 

4062 blocks: list[Block] = list(mgr.blocks) 

4063 blk_items: list[Index] = get_blk_items(mgr) 

4064 

4065 if len(data_columns): 

4066 axis, axis_labels = new_non_index_axes[0] 

4067 new_labels = Index(axis_labels).difference(Index(data_columns)) 

4068 mgr = frame.reindex(new_labels, axis=axis)._mgr 

4069 

4070 # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no 

4071 # attribute "blocks" 

4072 blocks = list(mgr.blocks) # type: ignore[union-attr] 

4073 blk_items = get_blk_items(mgr) 

4074 for c in data_columns: 

4075 mgr = frame.reindex([c], axis=axis)._mgr 

4076 # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has 

4077 # no attribute "blocks" 

4078 blocks.extend(mgr.blocks) # type: ignore[union-attr] 

4079 blk_items.extend(get_blk_items(mgr)) 

4080 

4081 # reorder the blocks in the same order as the existing table if we can 

4082 if table_exists: 

4083 by_items = { 

4084 tuple(b_items.tolist()): (b, b_items) 

4085 for b, b_items in zip(blocks, blk_items) 

4086 } 

4087 new_blocks: list[Block] = [] 

4088 new_blk_items = [] 

4089 for ea in values_axes: 

4090 items = tuple(ea.values) 

4091 try: 

4092 b, b_items = by_items.pop(items) 

4093 new_blocks.append(b) 

4094 new_blk_items.append(b_items) 

4095 except (IndexError, KeyError) as err: 

4096 jitems = ",".join([pprint_thing(item) for item in items]) 

4097 raise ValueError( 

4098 f"cannot match existing table structure for [{jitems}] " 

4099 "on appending data" 

4100 ) from err 

4101 blocks = new_blocks 

4102 blk_items = new_blk_items 

4103 

4104 return blocks, blk_items 

4105 

4106 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame: 

4107 """process axes filters""" 

4108 # make a copy to avoid side effects 

4109 if columns is not None: 

4110 columns = list(columns) 

4111 

4112 # make sure to include levels if we have them 

4113 if columns is not None and self.is_multi_index: 

4114 assert isinstance(self.levels, list) # assured by is_multi_index 

4115 for n in self.levels: 

4116 if n not in columns: 

4117 columns.insert(0, n) 

4118 

4119 # reorder by any non_index_axes & limit to the select columns 

4120 for axis, labels in self.non_index_axes: 

4121 obj = _reindex_axis(obj, axis, labels, columns) 

4122 

4123 # apply the selection filters (but keep in the same order) 

4124 if selection.filter is not None: 

4125 for field, op, filt in selection.filter.format(): 

4126 

4127 def process_filter(field, filt): 

4128 

4129 for axis_name in obj._AXIS_ORDERS: 

4130 axis_number = obj._get_axis_number(axis_name) 

4131 axis_values = obj._get_axis(axis_name) 

4132 assert axis_number is not None 

4133 

4134 # see if the field is the name of an axis 

4135 if field == axis_name: 

4136 

4137 # if we have a multi-index, then need to include 

4138 # the levels 

4139 if self.is_multi_index: 

4140 filt = filt.union(Index(self.levels)) 

4141 

4142 takers = op(axis_values, filt) 

4143 return obj.loc(axis=axis_number)[takers] 

4144 

4145 # this might be the name of a field IN an axis 

4146 elif field in axis_values: 

4147 

4148 # we need to filter on this dimension 

4149 values = ensure_index(getattr(obj, field).values) 

4150 filt = ensure_index(filt) 

4151 

4152 # hack until we support reversed dim flags 

4153 if isinstance(obj, DataFrame): 

4154 axis_number = 1 - axis_number 

4155 takers = op(values, filt) 

4156 return obj.loc(axis=axis_number)[takers] 

4157 

4158 raise ValueError(f"cannot find the field [{field}] for filtering!") 

4159 

4160 obj = process_filter(field, filt) 

4161 

4162 return obj 

4163 

4164 def create_description( 

4165 self, 

4166 complib, 

4167 complevel: int | None, 

4168 fletcher32: bool, 

4169 expectedrows: int | None, 

4170 ) -> dict[str, Any]: 

4171 """create the description of the table from the axes & values""" 

4172 # use the provided expectedrows if it's passed 

4173 if expectedrows is None: 

4174 expectedrows = max(self.nrows_expected, 10000) 

4175 

4176 d = {"name": "table", "expectedrows": expectedrows} 

4177 

4178 # description from the axes & values 

4179 d["description"] = {a.cname: a.typ for a in self.axes} 

4180 

4181 if complib: 

4182 if complevel is None: 

4183 complevel = self._complevel or 9 

4184 filters = _tables().Filters( 

4185 complevel=complevel, 

4186 complib=complib, 

4187 fletcher32=fletcher32 or self._fletcher32, 

4188 ) 

4189 d["filters"] = filters 

4190 elif self._filters is not None: 

4191 d["filters"] = self._filters 

4192 

4193 return d 

4194 

4195 def read_coordinates( 

4196 self, where=None, start: int | None = None, stop: int | None = None 

4197 ): 

4198 """ 

4199 select coordinates (row numbers) from a table; return the 

4200 coordinates object 

4201 """ 

4202 # validate the version 

4203 self.validate_version(where) 

4204 

4205 # infer the data kind 

4206 if not self.infer_axes(): 

4207 return False 

4208 

4209 # create the selection 

4210 selection = Selection(self, where=where, start=start, stop=stop) 

4211 coords = selection.select_coords() 

4212 if selection.filter is not None: 

4213 for field, op, filt in selection.filter.format(): 

4214 data = self.read_column( 

4215 field, start=coords.min(), stop=coords.max() + 1 

4216 ) 

4217 coords = coords[op(data.iloc[coords - coords.min()], filt).values] 

4218 

4219 return Index(coords) 

4220 
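# Illustrative sketch (not part of pytables.py): row coordinates can be
# selected first and then reused for the actual read. Hypothetical file/columns.
import pandas as pd

df = pd.DataFrame({"A": range(10)})
with pd.HDFStore("example.h5") as store:
    store.append("df", df, data_columns=["A"])
    coords = store.select_as_coordinates("df", "A > 6")   # an Index of row numbers
    subset = store.select("df", where=coords)             # same rows as "A > 6"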

4221 def read_column( 

4222 self, 

4223 column: str, 

4224 where=None, 

4225 start: int | None = None, 

4226 stop: int | None = None, 

4227 ): 

4228 """ 

4229 return a single column from the table, generally only indexables 

4230 are interesting 

4231 """ 

4232 # validate the version 

4233 self.validate_version() 

4234 

4235 # infer the data kind 

4236 if not self.infer_axes(): 

4237 return False 

4238 

4239 if where is not None: 

4240 raise TypeError("read_column does not currently accept a where clause") 

4241 

4242 # find the axes 

4243 for a in self.axes: 

4244 if column == a.name: 

4245 if not a.is_data_indexable: 

4246 raise ValueError( 

4247 f"column [{column}] can not be extracted individually; " 

4248 "it is not data indexable" 

4249 ) 

4250 

4251 # column must be an indexable or a data column 

4252 c = getattr(self.table.cols, column) 

4253 a.set_info(self.info) 

4254 col_values = a.convert( 

4255 c[start:stop], 

4256 nan_rep=self.nan_rep, 

4257 encoding=self.encoding, 

4258 errors=self.errors, 

4259 ) 

4260 return Series(_set_tz(col_values[1], a.tz), name=column) 

4261 

4262 raise KeyError(f"column [{column}] not found in the table") 

4263 
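# Illustrative sketch (not part of pytables.py): read_column is exposed as
# HDFStore.select_column and only works for indexables / data_columns.
import pandas as pd

df = pd.DataFrame({"A": range(3), "B": list("xyz")})
with pd.HDFStore("example.h5") as store:
    store.append("df", df, data_columns=["A"])
    idx = store.select_column("df", "index")   # the stored index, as a Series
    a = store.select_column("df", "A")         # works: "A" is a data_column
    # store.select_column("df", "B") would raise: "B" is not data indexable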

4264 

4265class WORMTable(Table): 

4266 """ 

4267 a write-once read-many table: this format DOES NOT ALLOW appending to a 

4268 table. Writing is a one-time operation; the data are stored in a format 

4269 that allows for searching the data on disk 

4270 """ 

4271 

4272 table_type = "worm" 

4273 

4274 def read( 

4275 self, 

4276 where=None, 

4277 columns=None, 

4278 start: int | None = None, 

4279 stop: int | None = None, 

4280 ): 

4281 """ 

4282 read the indices and the indexing array, calculate offset rows and return 

4283 """ 

4284 raise NotImplementedError("WORMTable needs to implement read") 

4285 

4286 def write(self, **kwargs) -> None: 

4287 """ 

4288 write in a format that we can search later on (but cannot append 

4289 to): write out the indices and the values using _write_array 

4290 (e.g. a CArray); create an indexing table so that we can search 

4291 """ 

4292 raise NotImplementedError("WORMTable needs to implement write") 

4293 

4294 

4295class AppendableTable(Table): 

4296 """support the new appendable table formats""" 

4297 

4298 table_type = "appendable" 

4299 

4300 # error: Signature of "write" incompatible with supertype "Fixed" 

4301 def write( # type: ignore[override] 

4302 self, 

4303 obj, 

4304 axes=None, 

4305 append: bool = False, 

4306 complib=None, 

4307 complevel=None, 

4308 fletcher32=None, 

4309 min_itemsize=None, 

4310 chunksize=None, 

4311 expectedrows=None, 

4312 dropna: bool = False, 

4313 nan_rep=None, 

4314 data_columns=None, 

4315 track_times=True, 

4316 ) -> None: 

4317 if not append and self.is_exists: 

4318 self._handle.remove_node(self.group, "table") 

4319 

4320 # create the axes 

4321 table = self._create_axes( 

4322 axes=axes, 

4323 obj=obj, 

4324 validate=append, 

4325 min_itemsize=min_itemsize, 

4326 nan_rep=nan_rep, 

4327 data_columns=data_columns, 

4328 ) 

4329 

4330 for a in table.axes: 

4331 a.validate_names() 

4332 

4333 if not table.is_exists: 

4334 

4335 # create the table 

4336 options = table.create_description( 

4337 complib=complib, 

4338 complevel=complevel, 

4339 fletcher32=fletcher32, 

4340 expectedrows=expectedrows, 

4341 ) 

4342 

4343 # set the table attributes 

4344 table.set_attrs() 

4345 

4346 options["track_times"] = track_times 

4347 

4348 # create the table 

4349 table._handle.create_table(table.group, **options) 

4350 

4351 # update my info 

4352 table.attrs.info = table.info 

4353 

4354 # validate the axes and set the kinds 

4355 for a in table.axes: 

4356 a.validate_and_set(table, append) 

4357 

4358 # add the rows 

4359 table.write_data(chunksize, dropna=dropna) 

4360 

4361 def write_data(self, chunksize: int | None, dropna: bool = False) -> None: 

4362 """ 

4363 we form the data into a 2-d array including indexes, values, and mask, and write it chunk-by-chunk 

4364 """ 

4365 names = self.dtype.names 

4366 nrows = self.nrows_expected 

4367 

4368 # if dropna==True, then drop ALL nan rows 

4369 masks = [] 

4370 if dropna: 

4371 for a in self.values_axes: 

4372 # figure the mask: only do if we can successfully process this 

4373 # column, otherwise ignore the mask 

4374 mask = isna(a.data).all(axis=0) 

4375 if isinstance(mask, np.ndarray): 

4376 masks.append(mask.astype("u1", copy=False)) 

4377 

4378 # consolidate masks 

4379 if len(masks): 

4380 mask = masks[0] 

4381 for m in masks[1:]: 

4382 mask = mask & m 

4383 mask = mask.ravel() 

4384 else: 

4385 mask = None 

4386 

4387 # broadcast the indexes if needed 

4388 indexes = [a.cvalues for a in self.index_axes] 

4389 nindexes = len(indexes) 

4390 assert nindexes == 1, nindexes # ensures we don't need to broadcast 

4391 

4392 # transpose the values so first dimension is last 

4393 # reshape the values if needed 

4394 values = [a.take_data() for a in self.values_axes] 

4395 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values] 

4396 bvalues = [] 

4397 for i, v in enumerate(values): 

4398 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape 

4399 bvalues.append(values[i].reshape(new_shape)) 

4400 

4401 # write the chunks 

4402 if chunksize is None: 

4403 chunksize = 100000 

4404 

4405 rows = np.empty(min(chunksize, nrows), dtype=self.dtype) 

4406 chunks = nrows // chunksize + 1 

4407 for i in range(chunks): 

4408 start_i = i * chunksize 

4409 end_i = min((i + 1) * chunksize, nrows) 

4410 if start_i >= end_i: 

4411 break 

4412 

4413 self.write_data_chunk( 

4414 rows, 

4415 indexes=[a[start_i:end_i] for a in indexes], 

4416 mask=mask[start_i:end_i] if mask is not None else None, 

4417 values=[v[start_i:end_i] for v in bvalues], 

4418 ) 

4419 
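# Illustrative sketch (not part of pytables.py): the chunk boundaries written
# by write_data for a hypothetical nrows=250_000 and the default chunksize of 100_000.
nrows, chunksize = 250_000, 100_000
for i in range(nrows // chunksize + 1):
    start_i, end_i = i * chunksize, min((i + 1) * chunksize, nrows)
    if start_i >= end_i:
        break
    print((start_i, end_i))   # (0, 100000), (100000, 200000), (200000, 250000)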

4420 def write_data_chunk( 

4421 self, 

4422 rows: np.ndarray, 

4423 indexes: list[np.ndarray], 

4424 mask: npt.NDArray[np.bool_] | None, 

4425 values: list[np.ndarray], 

4426 ) -> None: 

4427 """ 

4428 Parameters 

4429 ---------- 

4430 rows : an empty memory space where we are putting the chunk 

4431 indexes : an array of the indexes 

4432 mask : an array of the masks 

4433 values : an array of the values 

4434 """ 

4435 # 0 len 

4436 for v in values: 

4437 if not np.prod(v.shape): 

4438 return 

4439 

4440 nrows = indexes[0].shape[0] 

4441 if nrows != len(rows): 

4442 rows = np.empty(nrows, dtype=self.dtype) 

4443 names = self.dtype.names 

4444 nindexes = len(indexes) 

4445 

4446 # indexes 

4447 for i, idx in enumerate(indexes): 

4448 rows[names[i]] = idx 

4449 

4450 # values 

4451 for i, v in enumerate(values): 

4452 rows[names[i + nindexes]] = v 

4453 

4454 # mask 

4455 if mask is not None: 

4456 m = ~mask.ravel().astype(bool, copy=False) 

4457 if not m.all(): 

4458 rows = rows[m] 

4459 

4460 if len(rows): 

4461 self.table.append(rows) 

4462 self.table.flush() 

4463 

4464 def delete(self, where=None, start: int | None = None, stop: int | None = None): 

4465 

4466 # delete all rows (and return the nrows) 

4467 if where is None or not len(where): 

4468 if start is None and stop is None: 

4469 nrows = self.nrows 

4470 self._handle.remove_node(self.group, recursive=True) 

4471 else: 

4472 # pytables<3.0 would remove a single row with stop=None 

4473 if stop is None: 

4474 stop = self.nrows 

4475 nrows = self.table.remove_rows(start=start, stop=stop) 

4476 self.table.flush() 

4477 return nrows 

4478 

4479 # infer the data kind 

4480 if not self.infer_axes(): 

4481 return None 

4482 

4483 # create the selection 

4484 table = self.table 

4485 selection = Selection(self, where, start=start, stop=stop) 

4486 values = selection.select_coords() 

4487 

4488 # delete the rows in reverse order 

4489 sorted_series = Series(values).sort_values() 

4490 ln = len(sorted_series) 

4491 

4492 if ln: 

4493 

4494 # construct groups of consecutive rows 

4495 diff = sorted_series.diff() 

4496 groups = list(diff[diff > 1].index) 

4497 

4498 # 1 group 

4499 if not len(groups): 

4500 groups = [0] 

4501 

4502 # final element 

4503 if groups[-1] != ln: 

4504 groups.append(ln) 

4505 

4506 # initial element 

4507 if groups[0] != 0: 

4508 groups.insert(0, 0) 

4509 

4510 # we must remove in reverse order! 

4511 pg = groups.pop() 

4512 for g in reversed(groups): 

4513 rows = sorted_series.take(range(g, pg)) 

4514 table.remove_rows( 

4515 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1 

4516 ) 

4517 pg = g 

4518 

4519 self.table.flush() 

4520 

4521 # return the number of rows removed 

4522 return ln 

4523 
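# Illustrative sketch (not part of pytables.py): AppendableTable.delete backs
# HDFStore.remove; with a where clause only the matching rows are dropped.
# Hypothetical file and column names.
import pandas as pd

df = pd.DataFrame({"A": range(10)})
with pd.HDFStore("example.h5") as store:
    store.append("df", df, data_columns=["A"])
    n = store.remove("df", where="A >= 7")   # returns the number of rows removed
    assert n == 3 and len(store.select("df")) == 7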

4524 

4525class AppendableFrameTable(AppendableTable): 

4526 """support the new appendable table formats""" 

4527 

4528 pandas_kind = "frame_table" 

4529 table_type = "appendable_frame" 

4530 ndim = 2 

4531 obj_type: type[DataFrame | Series] = DataFrame 

4532 

4533 @property 

4534 def is_transposed(self) -> bool: 

4535 return self.index_axes[0].axis == 1 

4536 

4537 @classmethod 

4538 def get_object(cls, obj, transposed: bool): 

4539 """these are written transposed""" 

4540 if transposed: 

4541 obj = obj.T 

4542 return obj 

4543 

4544 def read( 

4545 self, 

4546 where=None, 

4547 columns=None, 

4548 start: int | None = None, 

4549 stop: int | None = None, 

4550 ): 

4551 

4552 # validate the version 

4553 self.validate_version(where) 

4554 

4555 # infer the data kind 

4556 if not self.infer_axes(): 

4557 return None 

4558 

4559 result = self._read_axes(where=where, start=start, stop=stop) 

4560 

4561 info = ( 

4562 self.info.get(self.non_index_axes[0][0], {}) 

4563 if len(self.non_index_axes) 

4564 else {} 

4565 ) 

4566 

4567 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]] 

4568 assert len(inds) == 1 

4569 ind = inds[0] 

4570 

4571 index = result[ind][0] 

4572 

4573 frames = [] 

4574 for i, a in enumerate(self.axes): 

4575 if a not in self.values_axes: 

4576 continue 

4577 index_vals, cvalues = result[i] 

4578 

4579 # we could have a multi-index constructor here 

4580 # ensure_index doesn't recognize our list-of-tuples here 

4581 if info.get("type") != "MultiIndex": 

4582 cols = Index(index_vals) 

4583 else: 

4584 cols = MultiIndex.from_tuples(index_vals) 

4585 

4586 names = info.get("names") 

4587 if names is not None: 

4588 cols.set_names(names, inplace=True) 

4589 

4590 if self.is_transposed: 

4591 values = cvalues 

4592 index_ = cols 

4593 cols_ = Index(index, name=getattr(index, "name", None)) 

4594 else: 

4595 values = cvalues.T 

4596 index_ = Index(index, name=getattr(index, "name", None)) 

4597 cols_ = cols 

4598 

4599 # if we have a DataIndexableCol, its shape will only be 1 dim 

4600 if values.ndim == 1 and isinstance(values, np.ndarray): 

4601 values = values.reshape((1, values.shape[0])) 

4602 

4603 if isinstance(values, np.ndarray): 

4604 df = DataFrame(values.T, columns=cols_, index=index_) 

4605 elif isinstance(values, Index): 

4606 df = DataFrame(values, columns=cols_, index=index_) 

4607 else: 

4608 # Categorical 

4609 df = DataFrame._from_arrays([values], columns=cols_, index=index_) 

4610 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) 

4611 frames.append(df) 

4612 

4613 if len(frames) == 1: 

4614 df = frames[0] 

4615 else: 

4616 df = concat(frames, axis=1) 

4617 

4618 selection = Selection(self, where=where, start=start, stop=stop) 

4619 # apply the selection filters & axis orderings 

4620 df = self.process_axes(df, selection=selection, columns=columns) 

4621 

4622 return df 

4623 

4624 

4625class AppendableSeriesTable(AppendableFrameTable): 

4626 """support the new appendable table formats""" 

4627 

4628 pandas_kind = "series_table" 

4629 table_type = "appendable_series" 

4630 ndim = 2 

4631 obj_type = Series 

4632 

4633 @property 

4634 def is_transposed(self) -> bool: 

4635 return False 

4636 

4637 @classmethod 

4638 def get_object(cls, obj, transposed: bool): 

4639 return obj 

4640 

4641 def write(self, obj, data_columns=None, **kwargs): 

4642 """we are going to write this as a frame table""" 

4643 if not isinstance(obj, DataFrame): 

4644 name = obj.name or "values" 

4645 obj = obj.to_frame(name) 

4646 return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) 

4647 

4648 def read( 

4649 self, 

4650 where=None, 

4651 columns=None, 

4652 start: int | None = None, 

4653 stop: int | None = None, 

4654 ) -> Series: 

4655 

4656 is_multi_index = self.is_multi_index 

4657 if columns is not None and is_multi_index: 

4658 assert isinstance(self.levels, list) # needed for mypy 

4659 for n in self.levels: 

4660 if n not in columns: 

4661 columns.insert(0, n) 

4662 s = super().read(where=where, columns=columns, start=start, stop=stop) 

4663 if is_multi_index: 

4664 s.set_index(self.levels, inplace=True) 

4665 

4666 s = s.iloc[:, 0] 

4667 

4668 # remove the default name 

4669 if s.name == "values": 

4670 s.name = None 

4671 return s 

4672 

4673 

4674class AppendableMultiSeriesTable(AppendableSeriesTable): 

4675 """support the new appendable table formats""" 

4676 

4677 pandas_kind = "series_table" 

4678 table_type = "appendable_multiseries" 

4679 

4680 def write(self, obj, **kwargs): 

4681 """we are going to write this as a frame table""" 

4682 name = obj.name or "values" 

4683 newobj, self.levels = self.validate_multiindex(obj) 

4684 assert isinstance(self.levels, list) # for mypy 

4685 cols = list(self.levels) 

4686 cols.append(name) 

4687 newobj.columns = Index(cols) 

4688 return super().write(obj=newobj, **kwargs) 

4689 

4690 

4691class GenericTable(AppendableFrameTable): 

4692 """a table that read/writes the generic pytables table format""" 

4693 

4694 pandas_kind = "frame_table" 

4695 table_type = "generic_table" 

4696 ndim = 2 

4697 obj_type = DataFrame 

4698 levels: list[Hashable] 

4699 

4700 @property 

4701 def pandas_type(self) -> str: 

4702 return self.pandas_kind 

4703 

4704 @property 

4705 def storable(self): 

4706 return getattr(self.group, "table", None) or self.group 

4707 

4708 def get_attrs(self) -> None: 

4709 """retrieve our attributes""" 

4710 self.non_index_axes = [] 

4711 self.nan_rep = None 

4712 self.levels = [] 

4713 

4714 self.index_axes = [a for a in self.indexables if a.is_an_indexable] 

4715 self.values_axes = [a for a in self.indexables if not a.is_an_indexable] 

4716 self.data_columns = [a.name for a in self.values_axes] 

4717 

4718 @cache_readonly 

4719 def indexables(self): 

4720 """create the indexables from the table description""" 

4721 d = self.description 

4722 

4723 # TODO: can we get a typ for this? AFAICT it is the only place 

4724 # where we aren't passing one 

4725 # the index columns is just a simple index 

4726 md = self.read_metadata("index") 

4727 meta = "category" if md is not None else None 

4728 index_col = GenericIndexCol( 

4729 name="index", axis=0, table=self.table, meta=meta, metadata=md 

4730 ) 

4731 

4732 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col] 

4733 

4734 for i, n in enumerate(d._v_names): 

4735 assert isinstance(n, str) 

4736 

4737 atom = getattr(d, n) 

4738 md = self.read_metadata(n) 

4739 meta = "category" if md is not None else None 

4740 dc = GenericDataIndexableCol( 

4741 name=n, 

4742 pos=i, 

4743 values=[n], 

4744 typ=atom, 

4745 table=self.table, 

4746 meta=meta, 

4747 metadata=md, 

4748 ) 

4749 _indexables.append(dc) 

4750 

4751 return _indexables 

4752 

4753 def write(self, **kwargs): 

4754 raise NotImplementedError("cannot write on a generic table") 

4755 

4756 

4757class AppendableMultiFrameTable(AppendableFrameTable): 

4758 """a frame with a multi-index""" 

4759 

4760 table_type = "appendable_multiframe" 

4761 obj_type = DataFrame 

4762 ndim = 2 

4763 _re_levels = re.compile(r"^level_\d+$") 

4764 

4765 @property 

4766 def table_type_short(self) -> str: 

4767 return "appendable_multi" 

4768 

4769 def write(self, obj, data_columns=None, **kwargs): 

4770 if data_columns is None: 

4771 data_columns = [] 

4772 elif data_columns is True: 

4773 data_columns = obj.columns.tolist() 

4774 obj, self.levels = self.validate_multiindex(obj) 

4775 assert isinstance(self.levels, list) # for mypy 

4776 for n in self.levels: 

4777 if n not in data_columns: 

4778 data_columns.insert(0, n) 

4779 return super().write(obj=obj, data_columns=data_columns, **kwargs) 

4780 

4781 def read( 

4782 self, 

4783 where=None, 

4784 columns=None, 

4785 start: int | None = None, 

4786 stop: int | None = None, 

4787 ): 

4788 

4789 df = super().read(where=where, columns=columns, start=start, stop=stop) 

4790 df = df.set_index(self.levels) 

4791 

4792 # remove names for 'level_%d' 

4793 df.index = df.index.set_names( 

4794 [None if self._re_levels.search(name) else name for name in df.index.names] 

4795 ) 

4796 

4797 return df 

4798 

4799 

4800def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame: 

4801 ax = obj._get_axis(axis) 

4802 labels = ensure_index(labels) 

4803 

4804 # try not to reindex even if other is provided 

4805 # if it equals our current index 

4806 if other is not None: 

4807 other = ensure_index(other) 

4808 if (other is None or labels.equals(other)) and labels.equals(ax): 

4809 return obj 

4810 

4811 labels = ensure_index(labels.unique()) 

4812 if other is not None: 

4813 labels = ensure_index(other.unique()).intersection(labels, sort=False) 

4814 if not labels.equals(ax): 

4815 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim 

4816 slicer[axis] = labels 

4817 obj = obj.loc[tuple(slicer)] 

4818 return obj 

4819 

4820 

4821# tz to/from coercion 

4822 

4823 

4824def _get_tz(tz: tzinfo) -> str | tzinfo: 

4825 """for a tz-aware type, return an encoded zone""" 

4826 zone = timezones.get_timezone(tz) 

4827 return zone 

4828 

4829 

4830@overload 

4831def _set_tz( 

4832 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False 

4833) -> DatetimeIndex: 

4834 ... 

4835 

4836 

4837@overload 

4838def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray: 

4839 ... 

4840 

4841 

4842def _set_tz( 

4843 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False 

4844) -> np.ndarray | DatetimeIndex: 

4845 """ 

4846 coerce the values to a DatetimeIndex if tz is set 

4847 preserve the input shape if possible 

4848 

4849 Parameters 

4850 ---------- 

4851 values : ndarray or Index 

4852 tz : str or tzinfo 

4853 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray 

4854 """ 

4855 if isinstance(values, DatetimeIndex): 

4856 # If values is tzaware, the tz gets dropped in the values.ravel() 

4857 # call below (which returns an ndarray). So we are only non-lossy 

4858 # if `tz` matches `values.tz`. 

4859 assert values.tz is None or values.tz == tz 

4860 

4861 if tz is not None: 

4862 if isinstance(values, DatetimeIndex): 

4863 name = values.name 

4864 values = values.asi8 

4865 else: 

4866 name = None 

4867 values = values.ravel() 

4868 

4869 tz = _ensure_decoded(tz) 

4870 values = DatetimeIndex(values, name=name) 

4871 values = values.tz_localize("UTC").tz_convert(tz) 

4872 elif coerce: 

4873 values = np.asarray(values, dtype="M8[ns]") 

4874 

4875 # error: Incompatible return value type (got "Union[ndarray, Index]", 

4876 # expected "Union[ndarray, DatetimeIndex]") 

4877 return values # type: ignore[return-value] 

4878 

4879 

4880def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol: 

4881 assert isinstance(name, str) 

4882 

4883 index_name = index.name 

4884 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index"; 

4885 # expected "Union[ExtensionArray, ndarray]" 

4886 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type] 

4887 kind = _dtype_to_kind(dtype_name) 

4888 atom = DataIndexableCol._get_atom(converted) 

4889 

4890 if ( 

4891 isinstance(index, Int64Index) 

4892 or needs_i8_conversion(index.dtype) 

4893 or is_bool_dtype(index.dtype) 

4894 ): 

4895 # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, 

4896 # in which case "kind" is "integer", "integer", "datetime64", 

4897 # "timedelta64", and "integer", respectively. 

4898 return IndexCol( 

4899 name, 

4900 values=converted, 

4901 kind=kind, 

4902 typ=atom, 

4903 freq=getattr(index, "freq", None), 

4904 tz=getattr(index, "tz", None), 

4905 index_name=index_name, 

4906 ) 

4907 

4908 if isinstance(index, MultiIndex): 

4909 raise TypeError("MultiIndex not supported here!") 

4910 

4911 inferred_type = lib.infer_dtype(index, skipna=False) 

4912 # we won't get inferred_type of "datetime64" or "timedelta64" as these 

4913 # would go through the DatetimeIndex/TimedeltaIndex paths above 

4914 

4915 values = np.asarray(index) 

4916 

4917 if inferred_type == "date": 

4918 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) 

4919 return IndexCol( 

4920 name, converted, "date", _tables().Time32Col(), index_name=index_name 

4921 ) 

4922 elif inferred_type == "string": 

4923 

4924 converted = _convert_string_array(values, encoding, errors) 

4925 itemsize = converted.dtype.itemsize 

4926 return IndexCol( 

4927 name, 

4928 converted, 

4929 "string", 

4930 _tables().StringCol(itemsize), 

4931 index_name=index_name, 

4932 ) 

4933 

4934 elif inferred_type in ["integer", "floating"]: 

4935 return IndexCol( 

4936 name, values=converted, kind=kind, typ=atom, index_name=index_name 

4937 ) 

4938 else: 

4939 assert isinstance(converted, np.ndarray) and converted.dtype == object 

4940 assert kind == "object", kind 

4941 atom = _tables().ObjectAtom() 

4942 return IndexCol(name, converted, kind, atom, index_name=index_name) 

4943 

4944 

4945def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index: 

4946 index: Index | np.ndarray 

4947 

4948 if kind == "datetime64": 

4949 index = DatetimeIndex(data) 

4950 elif kind == "timedelta64": 

4951 index = TimedeltaIndex(data) 

4952 elif kind == "date": 

4953 try: 

4954 index = np.asarray([date.fromordinal(v) for v in data], dtype=object) 

4955 except ValueError: 

4956 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object) 

4957 elif kind in ("integer", "float", "bool"): 

4958 index = np.asarray(data) 

4959 elif kind == "string": 

4960 index = _unconvert_string_array( 

4961 data, nan_rep=None, encoding=encoding, errors=errors 

4962 ) 

4963 elif kind == "object": 

4964 index = np.asarray(data[0]) 

4965 else: # pragma: no cover 

4966 raise ValueError(f"unrecognized index type {kind}") 

4967 return index 

4968 

4969 

4970def _maybe_convert_for_string_atom( 

4971 name: str, 

4972 bvalues: ArrayLike, 

4973 existing_col, 

4974 min_itemsize, 

4975 nan_rep, 

4976 encoding, 

4977 errors, 

4978 columns: list[str], 

4979): 

4980 

4981 if bvalues.dtype != object: 

4982 return bvalues 

4983 

4984 bvalues = cast(np.ndarray, bvalues) 

4985 

4986 dtype_name = bvalues.dtype.name 

4987 inferred_type = lib.infer_dtype(bvalues, skipna=False) 

4988 

4989 if inferred_type == "date": 

4990 raise TypeError("[date] is not implemented as a table column") 

4991 elif inferred_type == "datetime": 

4992 # after GH#8260 

4993 # this only would be hit for a multi-timezone dtype which is an error 

4994 raise TypeError( 

4995 "too many timezones in this block, create separate data columns" 

4996 ) 

4997 

4998 elif not (inferred_type == "string" or dtype_name == "object"): 

4999 return bvalues 

5000 

5001 mask = isna(bvalues) 

5002 data = bvalues.copy() 

5003 data[mask] = nan_rep 

5004 

5005 # see if we have a valid string type 

5006 inferred_type = lib.infer_dtype(data, skipna=False) 

5007 if inferred_type != "string": 

5008 

5009 # we cannot serialize this data, so report an exception on a column 

5010 # by column basis 

5011 

5012 # expected behaviour: 

5013 # search block for a non-string object column by column 

5014 for i in range(data.shape[0]): 

5015 col = data[i] 

5016 inferred_type = lib.infer_dtype(col, skipna=False) 

5017 if inferred_type != "string": 

5018 error_column_label = columns[i] if len(columns) > i else f"No.{i}" 

5019 raise TypeError( 

5020 f"Cannot serialize the column [{error_column_label}]\n" 

5021 f"because its data contents are not [string] but " 

5022 f"[{inferred_type}] object dtype" 

5023 ) 

5024 

5025 # itemsize is the maximum length of a string (along any dimension) 

5026 

5027 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) 

5028 itemsize = data_converted.itemsize 

5029 

5030 # specified min_itemsize? 

5031 if isinstance(min_itemsize, dict): 

5032 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0) 

5033 itemsize = max(min_itemsize or 0, itemsize) 

5034 

5035 # check for column in the values conflicts 

5036 if existing_col is not None: 

5037 eci = existing_col.validate_col(itemsize) 

5038 if eci is not None and eci > itemsize: 

5039 itemsize = eci 

5040 

5041 data_converted = data_converted.astype(f"|S{itemsize}", copy=False) 

5042 return data_converted 

5043 

5044 

5045def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray: 

5046 """ 

5047 Take a string-like that is object dtype and coerce to a fixed size string type. 

5048 

5049 Parameters 

5050 ---------- 

5051 data : np.ndarray[object] 

5052 encoding : str 

5053 errors : str 

5054 Handler for encoding errors. 

5055 

5056 Returns 

5057 ------- 

5058 np.ndarray[fixed-length-string] 

5059 """ 

5060 # encode if needed 

5061 if len(data): 

5062 data = ( 

5063 Series(data.ravel()) 

5064 .str.encode(encoding, errors) 

5065 ._values.reshape(data.shape) 

5066 ) 

5067 

5068 # create the sized dtype 

5069 ensured = ensure_object(data.ravel()) 

5070 itemsize = max(1, libwriters.max_len_string_array(ensured)) 

5071 

5072 data = np.asarray(data, dtype=f"S{itemsize}") 

5073 return data 

5074 
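# Illustrative sketch (not part of pytables.py): object-dtype strings are
# encoded and stored as a fixed-width bytes array sized to the longest value.
import numpy as np

arr = np.array(["a", "ab", "abcd"], dtype=object)
out = _convert_string_array(arr, encoding="UTF-8", errors="strict")
print(out.dtype)   # |S4 -- itemsize equals the longest encoded string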

5075 

5076def _unconvert_string_array( 

5077 data: np.ndarray, nan_rep, encoding: str, errors: str 

5078) -> np.ndarray: 

5079 """ 

5080 Inverse of _convert_string_array. 

5081 

5082 Parameters 

5083 ---------- 

5084 data : np.ndarray[fixed-length-string] 

5085 nan_rep : the storage repr of NaN 

5086 encoding : str 

5087 errors : str 

5088 Handler for encoding errors. 

5089 

5090 Returns 

5091 ------- 

5092 np.ndarray[object] 

5093 Decoded data. 

5094 """ 

5095 shape = data.shape 

5096 data = np.asarray(data.ravel(), dtype=object) 

5097 

5098 if len(data): 

5099 

5100 itemsize = libwriters.max_len_string_array(ensure_object(data)) 

5101 dtype = f"U{itemsize}" 

5102 

5103 if isinstance(data[0], bytes): 

5104 data = Series(data).str.decode(encoding, errors=errors)._values 

5105 else: 

5106 data = data.astype(dtype, copy=False).astype(object, copy=False) 

5107 

5108 if nan_rep is None: 

5109 nan_rep = "nan" 

5110 

5111 libwriters.string_array_replace_from_nan_rep(data, nan_rep) 

5112 return data.reshape(shape) 

5113 

5114 

5115def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str): 

5116 assert isinstance(val_kind, str), type(val_kind) 

5117 if _need_convert(val_kind): 

5118 conv = _get_converter(val_kind, encoding, errors) 

5119 values = conv(values) 

5120 return values 

5121 

5122 

5123def _get_converter(kind: str, encoding: str, errors: str): 

5124 if kind == "datetime64": 

5125 return lambda x: np.asarray(x, dtype="M8[ns]") 

5126 elif kind == "string": 

5127 return lambda x: _unconvert_string_array( 

5128 x, nan_rep=None, encoding=encoding, errors=errors 

5129 ) 

5130 else: # pragma: no cover 

5131 raise ValueError(f"invalid kind {kind}") 

5132 

5133 

5134def _need_convert(kind: str) -> bool: 

5135 if kind in ("datetime64", "string"): 

5136 return True 

5137 return False 

5138 

5139 

5140def _maybe_adjust_name(name: str, version: Sequence[int]) -> str: 

5141 """ 

5142 Prior to 0.10.1, values blocks named like values_block_0 were stored 

5143 under the name values_0; adjust the given name if necessary. 

5144 

5145 Parameters 

5146 ---------- 

5147 name : str 

5148 version : Tuple[int, int, int] 

5149 

5150 Returns 

5151 ------- 

5152 str 

5153 """ 

5154 if isinstance(version, str) or len(version) < 3: 

5155 raise ValueError("Version is incorrect, expected sequence of 3 integers.") 

5156 

5157 if version[0] == 0 and version[1] <= 10 and version[2] == 0: 

5158 m = re.search(r"values_block_(\d+)", name) 

5159 if m: 

5160 grp = m.groups()[0] 

5161 name = f"values_{grp}" 

5162 return name 

5163 
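# Illustrative sketch (not part of pytables.py): legacy (<= 0.10.0) files used
# "values_N" instead of "values_block_N", so the name is rewritten for them.
print(_maybe_adjust_name("values_block_0", (0, 10, 0)))   # values_0
print(_maybe_adjust_name("values_block_0", (0, 15, 2)))   # values_block_0 (unchanged)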

5164 

5165def _dtype_to_kind(dtype_str: str) -> str: 

5166 """ 

5167 Find the "kind" string describing the given dtype name. 

5168 """ 

5169 dtype_str = _ensure_decoded(dtype_str) 

5170 

5171 if dtype_str.startswith("string") or dtype_str.startswith("bytes"): 

5172 kind = "string" 

5173 elif dtype_str.startswith("float"): 

5174 kind = "float" 

5175 elif dtype_str.startswith("complex"): 

5176 kind = "complex" 

5177 elif dtype_str.startswith("int") or dtype_str.startswith("uint"): 

5178 kind = "integer" 

5179 elif dtype_str.startswith("datetime64"): 

5180 kind = "datetime64" 

5181 elif dtype_str.startswith("timedelta"): 

5182 kind = "timedelta64" 

5183 elif dtype_str.startswith("bool"): 

5184 kind = "bool" 

5185 elif dtype_str.startswith("category"): 

5186 kind = "category" 

5187 elif dtype_str.startswith("period"): 

5188 # We store the `freq` attr so we can restore from integers 

5189 kind = "integer" 

5190 elif dtype_str == "object": 

5191 kind = "object" 

5192 else: 

5193 raise ValueError(f"cannot interpret dtype of [{dtype_str}]") 

5194 

5195 return kind 

5196 
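# Illustrative sketch (not part of pytables.py): the dtype-name -> kind mapping
# used when reading columns back from a table.
for dtype_str in ["int64", "float32", "datetime64[ns]", "bool", "object"]:
    print(dtype_str, "->", _dtype_to_kind(dtype_str))
# int64 -> integer, float32 -> float, datetime64[ns] -> datetime64,
# bool -> bool, object -> object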

5197 

5198def _get_data_and_dtype_name(data: ArrayLike): 

5199 """ 

5200 Convert the passed data into a storable form and a dtype string. 

5201 """ 

5202 if isinstance(data, Categorical): 

5203 data = data.codes 

5204 

5205 # For datetime64tz we need to drop the TZ in tests TODO: why? 

5206 dtype_name = data.dtype.name.split("[")[0] 

5207 

5208 if data.dtype.kind in ["m", "M"]: 

5209 data = np.asarray(data.view("i8")) 

5210 # TODO: we used to reshape for the dt64tz case, but no longer 

5211 # doing that doesn't seem to break anything. why? 

5212 

5213 elif isinstance(data, PeriodIndex): 

5214 data = data.asi8 

5215 

5216 data = np.asarray(data) 

5217 return data, dtype_name 

5218 

5219 

5220class Selection: 

5221 """ 

5222 Carries out a selection operation on a tables.Table object. 

5223 

5224 Parameters 

5225 ---------- 

5226 table : a Table object 

5227 where : list of Terms (or convertible to) 

5228 start, stop: indices to start and/or stop selection 

5229 

5230 """ 

5231 

5232 def __init__( 

5233 self, 

5234 table: Table, 

5235 where=None, 

5236 start: int | None = None, 

5237 stop: int | None = None, 

5238 ) -> None: 

5239 self.table = table 

5240 self.where = where 

5241 self.start = start 

5242 self.stop = stop 

5243 self.condition = None 

5244 self.filter = None 

5245 self.terms = None 

5246 self.coordinates = None 

5247 

5248 if is_list_like(where): 

5249 

5250 # see if we have a passed coordinate like 

5251 with suppress(ValueError): 

5252 inferred = lib.infer_dtype(where, skipna=False) 

5253 if inferred == "integer" or inferred == "boolean": 

5254 where = np.asarray(where) 

5255 if where.dtype == np.bool_: 

5256 start, stop = self.start, self.stop 

5257 if start is None: 

5258 start = 0 

5259 if stop is None: 

5260 stop = self.table.nrows 

5261 self.coordinates = np.arange(start, stop)[where] 

5262 elif issubclass(where.dtype.type, np.integer): 

5263 if (self.start is not None and (where < self.start).any()) or ( 

5264 self.stop is not None and (where >= self.stop).any() 

5265 ): 

5266 raise ValueError( 

5267 "where must have index locations >= start and < stop" 

5268 ) 

5269 self.coordinates = where 

5270 

5271 if self.coordinates is None: 

5272 

5273 self.terms = self.generate(where) 

5274 

5275 # create the numexpr & the filter 

5276 if self.terms is not None: 

5277 self.condition, self.filter = self.terms.evaluate() 

5278 

5279 def generate(self, where): 

5280 """where can be a : dict,list,tuple,string""" 

5281 if where is None: 

5282 return None 

5283 

5284 q = self.table.queryables() 

5285 try: 

5286 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) 

5287 except NameError as err: 

5288 # raise a nice message, suggesting that the user should use 

5289 # data_columns 

5290 qkeys = ",".join(q.keys()) 

5291 msg = dedent( 

5292 f"""\ 

5293 The passed where expression: {where} 

5294 contains an invalid variable reference 

5295 all of the variable references must be a reference to 

5296 an axis (e.g. 'index' or 'columns'), or a data_column 

5297 The currently defined references are: {qkeys} 

5298 """ 

5299 ) 

5300 raise ValueError(msg) from err 

5301 
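# Illustrative sketch (not part of pytables.py): where expressions may only
# reference an axis or a data_column; anything else is rejected with the
# message built above. Hypothetical file and column names.
import pandas as pd

df = pd.DataFrame({"A": range(5), "B": range(5)})
with pd.HDFStore("example.h5") as store:
    store.append("df", df, data_columns=["A"])
    store.select("df", where="A > 2")      # fine: "A" is a data_column
    # store.select("df", where="B > 2")    # ValueError: "B" is not a data_column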

5302 def select(self): 

5303 """ 

5304 generate the selection 

5305 """ 

5306 if self.condition is not None: 

5307 return self.table.table.read_where( 

5308 self.condition.format(), start=self.start, stop=self.stop 

5309 ) 

5310 elif self.coordinates is not None: 

5311 return self.table.table.read_coordinates(self.coordinates) 

5312 return self.table.table.read(start=self.start, stop=self.stop) 

5313 

5314 def select_coords(self): 

5315 """ 

5316 generate the selection 

5317 """ 

5318 start, stop = self.start, self.stop 

5319 nrows = self.table.nrows 

5320 if start is None: 

5321 start = 0 

5322 elif start < 0: 

5323 start += nrows 

5324 if stop is None: 

5325 stop = nrows 

5326 elif stop < 0: 

5327 stop += nrows 

5328 

5329 if self.condition is not None: 

5330 return self.table.table.get_where_list( 

5331 self.condition.format(), start=start, stop=stop, sort=True 

5332 ) 

5333 elif self.coordinates is not None: 

5334 return self.coordinates 

5335 

5336 return np.arange(start, stop)