Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/pytables.py: 14%
2279 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2High level interface to PyTables for reading and writing pandas data structures
3to disk
4"""
5from __future__ import annotations
7from contextlib import suppress
8import copy
9from datetime import (
10 date,
11 tzinfo,
12)
13import itertools
14import os
15import re
16from textwrap import dedent
17from typing import (
18 TYPE_CHECKING,
19 Any,
20 Callable,
21 Final,
22 Hashable,
23 Iterator,
24 Literal,
25 Sequence,
26 cast,
27 overload,
28)
29import warnings
31import numpy as np
33from pandas._config import (
34 config,
35 get_option,
36)
38from pandas._libs import (
39 lib,
40 writers as libwriters,
41)
42from pandas._libs.tslibs import timezones
43from pandas._typing import (
44 AnyArrayLike,
45 ArrayLike,
46 DtypeArg,
47 FilePath,
48 Shape,
49 npt,
50)
51from pandas.compat._optional import import_optional_dependency
52from pandas.compat.pickle_compat import patch_pickle
53from pandas.errors import (
54 AttributeConflictWarning,
55 ClosedFileError,
56 IncompatibilityWarning,
57 PerformanceWarning,
58 PossibleDataLossError,
59)
60from pandas.util._decorators import cache_readonly
61from pandas.util._exceptions import find_stack_level
63from pandas.core.dtypes.common import (
64 ensure_object,
65 is_bool_dtype,
66 is_categorical_dtype,
67 is_complex_dtype,
68 is_datetime64_dtype,
69 is_datetime64tz_dtype,
70 is_extension_array_dtype,
71 is_list_like,
72 is_string_dtype,
73 is_timedelta64_dtype,
74 needs_i8_conversion,
75)
76from pandas.core.dtypes.missing import array_equivalent
78from pandas import (
79 DataFrame,
80 DatetimeIndex,
81 Index,
82 MultiIndex,
83 PeriodIndex,
84 Series,
85 TimedeltaIndex,
86 concat,
87 isna,
88)
89from pandas.core.api import Int64Index
90from pandas.core.arrays import (
91 Categorical,
92 DatetimeArray,
93 PeriodArray,
94)
95import pandas.core.common as com
96from pandas.core.computation.pytables import (
97 PyTablesExpr,
98 maybe_expression,
99)
100from pandas.core.construction import extract_array
101from pandas.core.indexes.api import ensure_index
102from pandas.core.internals import (
103 ArrayManager,
104 BlockManager,
105)
107from pandas.io.common import stringify_path
108from pandas.io.formats.printing import (
109 adjoin,
110 pprint_thing,
111)
113if TYPE_CHECKING: 113 ↛ 114line 113 didn't jump to line 114, because the condition on line 113 was never true
114 from tables import (
115 Col,
116 File,
117 Node,
118 )
120 from pandas.core.internals import Block
123# versioning attribute
124_version = "0.15.2"
126# encoding
127_default_encoding = "UTF-8"
130def _ensure_decoded(s):
131 """if we have bytes, decode them to unicode"""
132 if isinstance(s, np.bytes_):
133 s = s.decode("UTF-8")
134 return s
137def _ensure_encoding(encoding):
138 # set the encoding if we need
139 if encoding is None:
140 encoding = _default_encoding
142 return encoding
145def _ensure_str(name):
146 """
147 Ensure that an index / column name is a str (python 3); otherwise they
148 may be np.string dtype. Non-string dtypes are passed through unchanged.
150 https://github.com/pandas-dev/pandas/issues/13492
151 """
152 if isinstance(name, str):
153 name = str(name)
154 return name
157Term = PyTablesExpr
160def _ensure_term(where, scope_level: int):
161 """
162 Ensure that the where is a Term or a list of Term.
164 This makes sure that we are capturing the scope of variables that are
165 passed create the terms here with a frame_level=2 (we are 2 levels down)
166 """
167 # only consider list/tuple here as an ndarray is automatically a coordinate
168 # list
169 level = scope_level + 1
170 if isinstance(where, (list, tuple)):
171 where = [
172 Term(term, scope_level=level + 1) if maybe_expression(term) else term
173 for term in where
174 if term is not None
175 ]
176 elif maybe_expression(where):
177 where = Term(where, scope_level=level)
178 return where if where is None or len(where) else None
181incompatibility_doc: Final = """
182where criteria is being ignored as this version [%s] is too old (or
183not-defined), read the file in and write it out to a new file to upgrade (with
184the copy_to method)
185"""
187attribute_conflict_doc: Final = """
188the [%s] attribute of the existing index is [%s] which conflicts with the new
189[%s], resetting the attribute to None
190"""
192performance_doc: Final = """
193your performance may suffer as PyTables will pickle object types that it cannot
194map directly to c-types [inferred_type->%s,key->%s] [items->%s]
195"""
197# formats
198_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
200# axes map
201_AXES_MAP = {DataFrame: [0]}
203# register our configuration options
204dropna_doc: Final = """
205: boolean
206 drop ALL nan rows when appending to a table
207"""
208format_doc: Final = """
209: format
210 default format writing format, if None, then
211 put will default to 'fixed' and append will default to 'table'
212"""
214with config.config_prefix("io.hdf"):
215 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
216 config.register_option(
217 "default_format",
218 None,
219 format_doc,
220 validator=config.is_one_of_factory(["fixed", "table", None]),
221 )
223# oh the troubles to reduce import time
224_table_mod = None
225_table_file_open_policy_is_strict = False
228def _tables():
229 global _table_mod
230 global _table_file_open_policy_is_strict
231 if _table_mod is None:
232 import tables
234 _table_mod = tables
236 # set the file open policy
237 # return the file open policy; this changes as of pytables 3.1
238 # depending on the HDF5 version
239 with suppress(AttributeError):
240 _table_file_open_policy_is_strict = (
241 tables.file._FILE_OPEN_POLICY == "strict"
242 )
244 return _table_mod
247# interface to/from ###
250def to_hdf(
251 path_or_buf: FilePath | HDFStore,
252 key: str,
253 value: DataFrame | Series,
254 mode: str = "a",
255 complevel: int | None = None,
256 complib: str | None = None,
257 append: bool = False,
258 format: str | None = None,
259 index: bool = True,
260 min_itemsize: int | dict[str, int] | None = None,
261 nan_rep=None,
262 dropna: bool | None = None,
263 data_columns: Literal[True] | list[str] | None = None,
264 errors: str = "strict",
265 encoding: str = "UTF-8",
266) -> None:
267 """store this object, close it if we opened it"""
268 if append:
269 f = lambda store: store.append(
270 key,
271 value,
272 format=format,
273 index=index,
274 min_itemsize=min_itemsize,
275 nan_rep=nan_rep,
276 dropna=dropna,
277 data_columns=data_columns,
278 errors=errors,
279 encoding=encoding,
280 )
281 else:
282 # NB: dropna is not passed to `put`
283 f = lambda store: store.put(
284 key,
285 value,
286 format=format,
287 index=index,
288 min_itemsize=min_itemsize,
289 nan_rep=nan_rep,
290 data_columns=data_columns,
291 errors=errors,
292 encoding=encoding,
293 dropna=dropna,
294 )
296 path_or_buf = stringify_path(path_or_buf)
297 if isinstance(path_or_buf, str):
298 with HDFStore(
299 path_or_buf, mode=mode, complevel=complevel, complib=complib
300 ) as store:
301 f(store)
302 else:
303 f(path_or_buf)
306def read_hdf(
307 path_or_buf: FilePath | HDFStore,
308 key=None,
309 mode: str = "r",
310 errors: str = "strict",
311 where: str | list | None = None,
312 start: int | None = None,
313 stop: int | None = None,
314 columns: list[str] | None = None,
315 iterator: bool = False,
316 chunksize: int | None = None,
317 **kwargs,
318):
319 """
320 Read from the store, close it if we opened it.
322 Retrieve pandas object stored in file, optionally based on where
323 criteria.
325 .. warning::
327 Pandas uses PyTables for reading and writing HDF5 files, which allows
328 serializing object-dtype data with pickle when using the "fixed" format.
329 Loading pickled data received from untrusted sources can be unsafe.
331 See: https://docs.python.org/3/library/pickle.html for more.
333 Parameters
334 ----------
335 path_or_buf : str, path object, pandas.HDFStore
336 Any valid string path is acceptable. Only supports the local file system,
337 remote URLs and file-like objects are not supported.
339 If you want to pass in a path object, pandas accepts any
340 ``os.PathLike``.
342 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
344 key : object, optional
345 The group identifier in the store. Can be omitted if the HDF file
346 contains a single pandas object.
347 mode : {'r', 'r+', 'a'}, default 'r'
348 Mode to use when opening the file. Ignored if path_or_buf is a
349 :class:`pandas.HDFStore`. Default is 'r'.
350 errors : str, default 'strict'
351 Specifies how encoding and decoding errors are to be handled.
352 See the errors argument for :func:`open` for a full list
353 of options.
354 where : list, optional
355 A list of Term (or convertible) objects.
356 start : int, optional
357 Row number to start selection.
358 stop : int, optional
359 Row number to stop selection.
360 columns : list, optional
361 A list of columns names to return.
362 iterator : bool, optional
363 Return an iterator object.
364 chunksize : int, optional
365 Number of rows to include in an iteration when using an iterator.
366 **kwargs
367 Additional keyword arguments passed to HDFStore.
369 Returns
370 -------
371 item : object
372 The selected object. Return type depends on the object stored.
374 See Also
375 --------
376 DataFrame.to_hdf : Write a HDF file from a DataFrame.
377 HDFStore : Low-level access to HDF files.
379 Examples
380 --------
381 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
382 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
383 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
384 """
385 if mode not in ["r", "r+", "a"]:
386 raise ValueError(
387 f"mode {mode} is not allowed while performing a read. "
388 f"Allowed modes are r, r+ and a."
389 )
390 # grab the scope
391 if where is not None:
392 where = _ensure_term(where, scope_level=1)
394 if isinstance(path_or_buf, HDFStore):
395 if not path_or_buf.is_open:
396 raise OSError("The HDFStore must be open for reading.")
398 store = path_or_buf
399 auto_close = False
400 else:
401 path_or_buf = stringify_path(path_or_buf)
402 if not isinstance(path_or_buf, str):
403 raise NotImplementedError(
404 "Support for generic buffers has not been implemented."
405 )
406 try:
407 exists = os.path.exists(path_or_buf)
409 # if filepath is too long
410 except (TypeError, ValueError):
411 exists = False
413 if not exists:
414 raise FileNotFoundError(f"File {path_or_buf} does not exist")
416 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
417 # can't auto open/close if we are using an iterator
418 # so delegate to the iterator
419 auto_close = True
421 try:
422 if key is None:
423 groups = store.groups()
424 if len(groups) == 0:
425 raise ValueError(
426 "Dataset(s) incompatible with Pandas data types, "
427 "not table, or no datasets found in HDF5 file."
428 )
429 candidate_only_group = groups[0]
431 # For the HDF file to have only one dataset, all other groups
432 # should then be metadata groups for that candidate group. (This
433 # assumes that the groups() method enumerates parent groups
434 # before their children.)
435 for group_to_check in groups[1:]:
436 if not _is_metadata_of(group_to_check, candidate_only_group):
437 raise ValueError(
438 "key must be provided when HDF5 "
439 "file contains multiple datasets."
440 )
441 key = candidate_only_group._v_pathname
442 return store.select(
443 key,
444 where=where,
445 start=start,
446 stop=stop,
447 columns=columns,
448 iterator=iterator,
449 chunksize=chunksize,
450 auto_close=auto_close,
451 )
452 except (ValueError, TypeError, KeyError):
453 if not isinstance(path_or_buf, HDFStore):
454 # if there is an error, close the store if we opened it.
455 with suppress(AttributeError):
456 store.close()
458 raise
461def _is_metadata_of(group: Node, parent_group: Node) -> bool:
462 """Check if a given group is a metadata group for a given parent_group."""
463 if group._v_depth <= parent_group._v_depth:
464 return False
466 current = group
467 while current._v_depth > 1:
468 parent = current._v_parent
469 if parent == parent_group and current._v_name == "meta":
470 return True
471 current = current._v_parent
472 return False
475class HDFStore:
476 """
477 Dict-like IO interface for storing pandas objects in PyTables.
479 Either Fixed or Table format.
481 .. warning::
483 Pandas uses PyTables for reading and writing HDF5 files, which allows
484 serializing object-dtype data with pickle when using the "fixed" format.
485 Loading pickled data received from untrusted sources can be unsafe.
487 See: https://docs.python.org/3/library/pickle.html for more.
489 Parameters
490 ----------
491 path : str
492 File path to HDF5 file.
493 mode : {'a', 'w', 'r', 'r+'}, default 'a'
495 ``'r'``
496 Read-only; no data can be modified.
497 ``'w'``
498 Write; a new file is created (an existing file with the same
499 name would be deleted).
500 ``'a'``
501 Append; an existing file is opened for reading and writing,
502 and if the file does not exist it is created.
503 ``'r+'``
504 It is similar to ``'a'``, but the file must already exist.
505 complevel : int, 0-9, default None
506 Specifies a compression level for data.
507 A value of 0 or None disables compression.
508 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
509 Specifies the compression library to be used.
510 As of v0.20.2 these additional compressors for Blosc are supported
511 (default if no compressor specified: 'blosc:blosclz'):
512 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
513 'blosc:zlib', 'blosc:zstd'}.
514 Specifying a compression library which is not available issues
515 a ValueError.
516 fletcher32 : bool, default False
517 If applying compression use the fletcher32 checksum.
518 **kwargs
519 These parameters will be passed to the PyTables open_file method.
521 Examples
522 --------
523 >>> bar = pd.DataFrame(np.random.randn(10, 4))
524 >>> store = pd.HDFStore('test.h5')
525 >>> store['foo'] = bar # write to HDF5
526 >>> bar = store['foo'] # retrieve
527 >>> store.close()
529 **Create or load HDF5 file in-memory**
531 When passing the `driver` option to the PyTables open_file method through
532 **kwargs, the HDF5 file is loaded or created in-memory and will only be
533 written when closed:
535 >>> bar = pd.DataFrame(np.random.randn(10, 4))
536 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
537 >>> store['foo'] = bar
538 >>> store.close() # only now, data is written to disk
539 """
541 _handle: File | None
542 _mode: str
543 _complevel: int
544 _fletcher32: bool
546 def __init__(
547 self,
548 path,
549 mode: str = "a",
550 complevel: int | None = None,
551 complib=None,
552 fletcher32: bool = False,
553 **kwargs,
554 ) -> None:
556 if "format" in kwargs:
557 raise ValueError("format is not a defined argument for HDFStore")
559 tables = import_optional_dependency("tables")
561 if complib is not None and complib not in tables.filters.all_complibs:
562 raise ValueError(
563 f"complib only supports {tables.filters.all_complibs} compression."
564 )
566 if complib is None and complevel is not None:
567 complib = tables.filters.default_complib
569 self._path = stringify_path(path)
570 if mode is None:
571 mode = "a"
572 self._mode = mode
573 self._handle = None
574 self._complevel = complevel if complevel else 0
575 self._complib = complib
576 self._fletcher32 = fletcher32
577 self._filters = None
578 self.open(mode=mode, **kwargs)
580 def __fspath__(self) -> str:
581 return self._path
583 @property
584 def root(self):
585 """return the root node"""
586 self._check_if_open()
587 assert self._handle is not None # for mypy
588 return self._handle.root
590 @property
591 def filename(self) -> str:
592 return self._path
594 def __getitem__(self, key: str):
595 return self.get(key)
597 def __setitem__(self, key: str, value) -> None:
598 self.put(key, value)
600 def __delitem__(self, key: str) -> None:
601 return self.remove(key)
603 def __getattr__(self, name: str):
604 """allow attribute access to get stores"""
605 try:
606 return self.get(name)
607 except (KeyError, ClosedFileError):
608 pass
609 raise AttributeError(
610 f"'{type(self).__name__}' object has no attribute '{name}'"
611 )
613 def __contains__(self, key: str) -> bool:
614 """
615 check for existence of this key
616 can match the exact pathname or the pathnm w/o the leading '/'
617 """
618 node = self.get_node(key)
619 if node is not None:
620 name = node._v_pathname
621 if name == key or name[1:] == key:
622 return True
623 return False
625 def __len__(self) -> int:
626 return len(self.groups())
628 def __repr__(self) -> str:
629 pstr = pprint_thing(self._path)
630 return f"{type(self)}\nFile path: {pstr}\n"
632 def __enter__(self) -> HDFStore:
633 return self
635 def __exit__(self, exc_type, exc_value, traceback) -> None:
636 self.close()
638 def keys(self, include: str = "pandas") -> list[str]:
639 """
640 Return a list of keys corresponding to objects stored in HDFStore.
642 Parameters
643 ----------
645 include : str, default 'pandas'
646 When kind equals 'pandas' return pandas objects.
647 When kind equals 'native' return native HDF5 Table objects.
649 .. versionadded:: 1.1.0
651 Returns
652 -------
653 list
654 List of ABSOLUTE path-names (e.g. have the leading '/').
656 Raises
657 ------
658 raises ValueError if kind has an illegal value
659 """
660 if include == "pandas":
661 return [n._v_pathname for n in self.groups()]
663 elif include == "native":
664 assert self._handle is not None # mypy
665 return [
666 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
667 ]
668 raise ValueError(
669 f"`include` should be either 'pandas' or 'native' but is '{include}'"
670 )
672 def __iter__(self) -> Iterator[str]:
673 return iter(self.keys())
675 def items(self) -> Iterator[tuple[str, list]]:
676 """
677 iterate on key->group
678 """
679 for g in self.groups():
680 yield g._v_pathname, g
682 def iteritems(self):
683 """
684 iterate on key->group
685 """
686 warnings.warn(
687 "iteritems is deprecated and will be removed in a future version. "
688 "Use .items instead.",
689 FutureWarning,
690 stacklevel=find_stack_level(),
691 )
692 yield from self.items()
694 def open(self, mode: str = "a", **kwargs) -> None:
695 """
696 Open the file in the specified mode
698 Parameters
699 ----------
700 mode : {'a', 'w', 'r', 'r+'}, default 'a'
701 See HDFStore docstring or tables.open_file for info about modes
702 **kwargs
703 These parameters will be passed to the PyTables open_file method.
704 """
705 tables = _tables()
707 if self._mode != mode:
708 # if we are changing a write mode to read, ok
709 if self._mode in ["a", "w"] and mode in ["r", "r+"]:
710 pass
711 elif mode in ["w"]:
712 # this would truncate, raise here
713 if self.is_open:
714 raise PossibleDataLossError(
715 f"Re-opening the file [{self._path}] with mode [{self._mode}] "
716 "will delete the current file!"
717 )
719 self._mode = mode
721 # close and reopen the handle
722 if self.is_open:
723 self.close()
725 if self._complevel and self._complevel > 0:
726 self._filters = _tables().Filters(
727 self._complevel, self._complib, fletcher32=self._fletcher32
728 )
730 if _table_file_open_policy_is_strict and self.is_open:
731 msg = (
732 "Cannot open HDF5 file, which is already opened, "
733 "even in read-only mode."
734 )
735 raise ValueError(msg)
737 self._handle = tables.open_file(self._path, self._mode, **kwargs)
739 def close(self) -> None:
740 """
741 Close the PyTables file handle
742 """
743 if self._handle is not None:
744 self._handle.close()
745 self._handle = None
747 @property
748 def is_open(self) -> bool:
749 """
750 return a boolean indicating whether the file is open
751 """
752 if self._handle is None:
753 return False
754 return bool(self._handle.isopen)
756 def flush(self, fsync: bool = False) -> None:
757 """
758 Force all buffered modifications to be written to disk.
760 Parameters
761 ----------
762 fsync : bool (default False)
763 call ``os.fsync()`` on the file handle to force writing to disk.
765 Notes
766 -----
767 Without ``fsync=True``, flushing may not guarantee that the OS writes
768 to disk. With fsync, the operation will block until the OS claims the
769 file has been written; however, other caching layers may still
770 interfere.
771 """
772 if self._handle is not None:
773 self._handle.flush()
774 if fsync:
775 with suppress(OSError):
776 os.fsync(self._handle.fileno())
778 def get(self, key: str):
779 """
780 Retrieve pandas object stored in file.
782 Parameters
783 ----------
784 key : str
786 Returns
787 -------
788 object
789 Same type as object stored in file.
790 """
791 with patch_pickle():
792 # GH#31167 Without this patch, pickle doesn't know how to unpickle
793 # old DateOffset objects now that they are cdef classes.
794 group = self.get_node(key)
795 if group is None:
796 raise KeyError(f"No object named {key} in the file")
797 return self._read_group(group)
799 def select(
800 self,
801 key: str,
802 where=None,
803 start=None,
804 stop=None,
805 columns=None,
806 iterator=False,
807 chunksize=None,
808 auto_close: bool = False,
809 ):
810 """
811 Retrieve pandas object stored in file, optionally based on where criteria.
813 .. warning::
815 Pandas uses PyTables for reading and writing HDF5 files, which allows
816 serializing object-dtype data with pickle when using the "fixed" format.
817 Loading pickled data received from untrusted sources can be unsafe.
819 See: https://docs.python.org/3/library/pickle.html for more.
821 Parameters
822 ----------
823 key : str
824 Object being retrieved from file.
825 where : list or None
826 List of Term (or convertible) objects, optional.
827 start : int or None
828 Row number to start selection.
829 stop : int, default None
830 Row number to stop selection.
831 columns : list or None
832 A list of columns that if not None, will limit the return columns.
833 iterator : bool or False
834 Returns an iterator.
835 chunksize : int or None
836 Number or rows to include in iteration, return an iterator.
837 auto_close : bool or False
838 Should automatically close the store when finished.
840 Returns
841 -------
842 object
843 Retrieved object from file.
844 """
845 group = self.get_node(key)
846 if group is None:
847 raise KeyError(f"No object named {key} in the file")
849 # create the storer and axes
850 where = _ensure_term(where, scope_level=1)
851 s = self._create_storer(group)
852 s.infer_axes()
854 # function to call on iteration
855 def func(_start, _stop, _where):
856 return s.read(start=_start, stop=_stop, where=_where, columns=columns)
858 # create the iterator
859 it = TableIterator(
860 self,
861 s,
862 func,
863 where=where,
864 nrows=s.nrows,
865 start=start,
866 stop=stop,
867 iterator=iterator,
868 chunksize=chunksize,
869 auto_close=auto_close,
870 )
872 return it.get_result()
874 def select_as_coordinates(
875 self,
876 key: str,
877 where=None,
878 start: int | None = None,
879 stop: int | None = None,
880 ):
881 """
882 return the selection as an Index
884 .. warning::
886 Pandas uses PyTables for reading and writing HDF5 files, which allows
887 serializing object-dtype data with pickle when using the "fixed" format.
888 Loading pickled data received from untrusted sources can be unsafe.
890 See: https://docs.python.org/3/library/pickle.html for more.
893 Parameters
894 ----------
895 key : str
896 where : list of Term (or convertible) objects, optional
897 start : integer (defaults to None), row number to start selection
898 stop : integer (defaults to None), row number to stop selection
899 """
900 where = _ensure_term(where, scope_level=1)
901 tbl = self.get_storer(key)
902 if not isinstance(tbl, Table):
903 raise TypeError("can only read_coordinates with a table")
904 return tbl.read_coordinates(where=where, start=start, stop=stop)
906 def select_column(
907 self,
908 key: str,
909 column: str,
910 start: int | None = None,
911 stop: int | None = None,
912 ):
913 """
914 return a single column from the table. This is generally only useful to
915 select an indexable
917 .. warning::
919 Pandas uses PyTables for reading and writing HDF5 files, which allows
920 serializing object-dtype data with pickle when using the "fixed" format.
921 Loading pickled data received from untrusted sources can be unsafe.
923 See: https://docs.python.org/3/library/pickle.html for more.
925 Parameters
926 ----------
927 key : str
928 column : str
929 The column of interest.
930 start : int or None, default None
931 stop : int or None, default None
933 Raises
934 ------
935 raises KeyError if the column is not found (or key is not a valid
936 store)
937 raises ValueError if the column can not be extracted individually (it
938 is part of a data block)
940 """
941 tbl = self.get_storer(key)
942 if not isinstance(tbl, Table):
943 raise TypeError("can only read_column with a table")
944 return tbl.read_column(column=column, start=start, stop=stop)
946 def select_as_multiple(
947 self,
948 keys,
949 where=None,
950 selector=None,
951 columns=None,
952 start=None,
953 stop=None,
954 iterator=False,
955 chunksize=None,
956 auto_close: bool = False,
957 ):
958 """
959 Retrieve pandas objects from multiple tables.
961 .. warning::
963 Pandas uses PyTables for reading and writing HDF5 files, which allows
964 serializing object-dtype data with pickle when using the "fixed" format.
965 Loading pickled data received from untrusted sources can be unsafe.
967 See: https://docs.python.org/3/library/pickle.html for more.
969 Parameters
970 ----------
971 keys : a list of the tables
972 selector : the table to apply the where criteria (defaults to keys[0]
973 if not supplied)
974 columns : the columns I want back
975 start : integer (defaults to None), row number to start selection
976 stop : integer (defaults to None), row number to stop selection
977 iterator : bool, return an iterator, default False
978 chunksize : nrows to include in iteration, return an iterator
979 auto_close : bool, default False
980 Should automatically close the store when finished.
982 Raises
983 ------
984 raises KeyError if keys or selector is not found or keys is empty
985 raises TypeError if keys is not a list or tuple
986 raises ValueError if the tables are not ALL THE SAME DIMENSIONS
987 """
988 # default to single select
989 where = _ensure_term(where, scope_level=1)
990 if isinstance(keys, (list, tuple)) and len(keys) == 1:
991 keys = keys[0]
992 if isinstance(keys, str):
993 return self.select(
994 key=keys,
995 where=where,
996 columns=columns,
997 start=start,
998 stop=stop,
999 iterator=iterator,
1000 chunksize=chunksize,
1001 auto_close=auto_close,
1002 )
1004 if not isinstance(keys, (list, tuple)):
1005 raise TypeError("keys must be a list/tuple")
1007 if not len(keys):
1008 raise ValueError("keys must have a non-zero length")
1010 if selector is None:
1011 selector = keys[0]
1013 # collect the tables
1014 tbls = [self.get_storer(k) for k in keys]
1015 s = self.get_storer(selector)
1017 # validate rows
1018 nrows = None
1019 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
1020 if t is None:
1021 raise KeyError(f"Invalid table [{k}]")
1022 if not t.is_table:
1023 raise TypeError(
1024 f"object [{t.pathname}] is not a table, and cannot be used in all "
1025 "select as multiple"
1026 )
1028 if nrows is None:
1029 nrows = t.nrows
1030 elif t.nrows != nrows:
1031 raise ValueError("all tables must have exactly the same nrows!")
1033 # The isinstance checks here are redundant with the check above,
1034 # but necessary for mypy; see GH#29757
1035 _tbls = [x for x in tbls if isinstance(x, Table)]
1037 # axis is the concentration axes
1038 axis = list({t.non_index_axes[0][0] for t in _tbls})[0]
1040 def func(_start, _stop, _where):
1042 # retrieve the objs, _where is always passed as a set of
1043 # coordinates here
1044 objs = [
1045 t.read(where=_where, columns=columns, start=_start, stop=_stop)
1046 for t in tbls
1047 ]
1049 # concat and return
1050 return concat(objs, axis=axis, verify_integrity=False)._consolidate()
1052 # create the iterator
1053 it = TableIterator(
1054 self,
1055 s,
1056 func,
1057 where=where,
1058 nrows=nrows,
1059 start=start,
1060 stop=stop,
1061 iterator=iterator,
1062 chunksize=chunksize,
1063 auto_close=auto_close,
1064 )
1066 return it.get_result(coordinates=True)
1068 def put(
1069 self,
1070 key: str,
1071 value: DataFrame | Series,
1072 format=None,
1073 index=True,
1074 append=False,
1075 complib=None,
1076 complevel: int | None = None,
1077 min_itemsize: int | dict[str, int] | None = None,
1078 nan_rep=None,
1079 data_columns: Literal[True] | list[str] | None = None,
1080 encoding=None,
1081 errors: str = "strict",
1082 track_times: bool = True,
1083 dropna: bool = False,
1084 ) -> None:
1085 """
1086 Store object in HDFStore.
1088 Parameters
1089 ----------
1090 key : str
1091 value : {Series, DataFrame}
1092 format : 'fixed(f)|table(t)', default is 'fixed'
1093 Format to use when storing object in HDFStore. Value can be one of:
1095 ``'fixed'``
1096 Fixed format. Fast writing/reading. Not-appendable, nor searchable.
1097 ``'table'``
1098 Table format. Write as a PyTables Table structure which may perform
1099 worse but allow more flexible operations like searching / selecting
1100 subsets of the data.
1101 index : bool, default True
1102 Write DataFrame index as a column.
1103 append : bool, default False
1104 This will force Table format, append the input data to the existing.
1105 data_columns : list of columns or True, default None
1106 List of columns to create as data columns, or True to use all columns.
1107 See `here
1108 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1109 encoding : str, default None
1110 Provide an encoding for strings.
1111 track_times : bool, default True
1112 Parameter is propagated to 'create_table' method of 'PyTables'.
1113 If set to False it enables to have the same h5 files (same hashes)
1114 independent on creation time.
1115 dropna : bool, default False, optional
1116 Remove missing values.
1118 .. versionadded:: 1.1.0
1119 """
1120 if format is None:
1121 format = get_option("io.hdf.default_format") or "fixed"
1122 format = self._validate_format(format)
1123 self._write_to_group(
1124 key,
1125 value,
1126 format=format,
1127 index=index,
1128 append=append,
1129 complib=complib,
1130 complevel=complevel,
1131 min_itemsize=min_itemsize,
1132 nan_rep=nan_rep,
1133 data_columns=data_columns,
1134 encoding=encoding,
1135 errors=errors,
1136 track_times=track_times,
1137 dropna=dropna,
1138 )
1140 def remove(self, key: str, where=None, start=None, stop=None) -> None:
1141 """
1142 Remove pandas object partially by specifying the where condition
1144 Parameters
1145 ----------
1146 key : str
1147 Node to remove or delete rows from
1148 where : list of Term (or convertible) objects, optional
1149 start : integer (defaults to None), row number to start selection
1150 stop : integer (defaults to None), row number to stop selection
1152 Returns
1153 -------
1154 number of rows removed (or None if not a Table)
1156 Raises
1157 ------
1158 raises KeyError if key is not a valid store
1160 """
1161 where = _ensure_term(where, scope_level=1)
1162 try:
1163 s = self.get_storer(key)
1164 except KeyError:
1165 # the key is not a valid store, re-raising KeyError
1166 raise
1167 except AssertionError:
1168 # surface any assertion errors for e.g. debugging
1169 raise
1170 except Exception as err:
1171 # In tests we get here with ClosedFileError, TypeError, and
1172 # _table_mod.NoSuchNodeError. TODO: Catch only these?
1174 if where is not None:
1175 raise ValueError(
1176 "trying to remove a node with a non-None where clause!"
1177 ) from err
1179 # we are actually trying to remove a node (with children)
1180 node = self.get_node(key)
1181 if node is not None:
1182 node._f_remove(recursive=True)
1183 return None
1185 # remove the node
1186 if com.all_none(where, start, stop):
1187 s.group._f_remove(recursive=True)
1189 # delete from the table
1190 else:
1191 if not s.is_table:
1192 raise ValueError(
1193 "can only remove with where on objects written as tables"
1194 )
1195 return s.delete(where=where, start=start, stop=stop)
1197 def append(
1198 self,
1199 key: str,
1200 value: DataFrame | Series,
1201 format=None,
1202 axes=None,
1203 index=True,
1204 append=True,
1205 complib=None,
1206 complevel: int | None = None,
1207 columns=None,
1208 min_itemsize: int | dict[str, int] | None = None,
1209 nan_rep=None,
1210 chunksize=None,
1211 expectedrows=None,
1212 dropna: bool | None = None,
1213 data_columns: Literal[True] | list[str] | None = None,
1214 encoding=None,
1215 errors: str = "strict",
1216 ) -> None:
1217 """
1218 Append to Table in file.
1220 Node must already exist and be Table format.
1222 Parameters
1223 ----------
1224 key : str
1225 value : {Series, DataFrame}
1226 format : 'table' is the default
1227 Format to use when storing object in HDFStore. Value can be one of:
1229 ``'table'``
1230 Table format. Write as a PyTables Table structure which may perform
1231 worse but allow more flexible operations like searching / selecting
1232 subsets of the data.
1233 index : bool, default True
1234 Write DataFrame index as a column.
1235 append : bool, default True
1236 Append the input data to the existing.
1237 data_columns : list of columns, or True, default None
1238 List of columns to create as indexed data columns for on-disk
1239 queries, or True to use all columns. By default only the axes
1240 of the object are indexed. See `here
1241 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1242 min_itemsize : dict of columns that specify minimum str sizes
1243 nan_rep : str to use as str nan representation
1244 chunksize : size to chunk the writing
1245 expectedrows : expected TOTAL row size of this table
1246 encoding : default None, provide an encoding for str
1247 dropna : bool, default False, optional
1248 Do not write an ALL nan row to the store settable
1249 by the option 'io.hdf.dropna_table'.
1251 Notes
1252 -----
1253 Does *not* check if data being appended overlaps with existing
1254 data in the table, so be careful
1255 """
1256 if columns is not None:
1257 raise TypeError(
1258 "columns is not a supported keyword in append, try data_columns"
1259 )
1261 if dropna is None:
1262 dropna = get_option("io.hdf.dropna_table")
1263 if format is None:
1264 format = get_option("io.hdf.default_format") or "table"
1265 format = self._validate_format(format)
1266 self._write_to_group(
1267 key,
1268 value,
1269 format=format,
1270 axes=axes,
1271 index=index,
1272 append=append,
1273 complib=complib,
1274 complevel=complevel,
1275 min_itemsize=min_itemsize,
1276 nan_rep=nan_rep,
1277 chunksize=chunksize,
1278 expectedrows=expectedrows,
1279 dropna=dropna,
1280 data_columns=data_columns,
1281 encoding=encoding,
1282 errors=errors,
1283 )
1285 def append_to_multiple(
1286 self,
1287 d: dict,
1288 value,
1289 selector,
1290 data_columns=None,
1291 axes=None,
1292 dropna=False,
1293 **kwargs,
1294 ) -> None:
1295 """
1296 Append to multiple tables
1298 Parameters
1299 ----------
1300 d : a dict of table_name to table_columns, None is acceptable as the
1301 values of one node (this will get all the remaining columns)
1302 value : a pandas object
1303 selector : a string that designates the indexable table; all of its
1304 columns will be designed as data_columns, unless data_columns is
1305 passed, in which case these are used
1306 data_columns : list of columns to create as data columns, or True to
1307 use all columns
1308 dropna : if evaluates to True, drop rows from all tables if any single
1309 row in each table has all NaN. Default False.
1311 Notes
1312 -----
1313 axes parameter is currently not accepted
1315 """
1316 if axes is not None:
1317 raise TypeError(
1318 "axes is currently not accepted as a parameter to append_to_multiple; "
1319 "you can create the tables independently instead"
1320 )
1322 if not isinstance(d, dict):
1323 raise ValueError(
1324 "append_to_multiple must have a dictionary specified as the "
1325 "way to split the value"
1326 )
1328 if selector not in d:
1329 raise ValueError(
1330 "append_to_multiple requires a selector that is in passed dict"
1331 )
1333 # figure out the splitting axis (the non_index_axis)
1334 axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
1336 # figure out how to split the value
1337 remain_key = None
1338 remain_values: list = []
1339 for k, v in d.items():
1340 if v is None:
1341 if remain_key is not None:
1342 raise ValueError(
1343 "append_to_multiple can only have one value in d that is None"
1344 )
1345 remain_key = k
1346 else:
1347 remain_values.extend(v)
1348 if remain_key is not None:
1349 ordered = value.axes[axis]
1350 ordd = ordered.difference(Index(remain_values))
1351 ordd = sorted(ordered.get_indexer(ordd))
1352 d[remain_key] = ordered.take(ordd)
1354 # data_columns
1355 if data_columns is None:
1356 data_columns = d[selector]
1358 # ensure rows are synchronized across the tables
1359 if dropna:
1360 idxs = (value[cols].dropna(how="all").index for cols in d.values())
1361 valid_index = next(idxs)
1362 for index in idxs:
1363 valid_index = valid_index.intersection(index)
1364 value = value.loc[valid_index]
1366 min_itemsize = kwargs.pop("min_itemsize", None)
1368 # append
1369 for k, v in d.items():
1370 dc = data_columns if k == selector else None
1372 # compute the val
1373 val = value.reindex(v, axis=axis)
1375 filtered = (
1376 {key: value for (key, value) in min_itemsize.items() if key in v}
1377 if min_itemsize is not None
1378 else None
1379 )
1380 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
1382 def create_table_index(
1383 self,
1384 key: str,
1385 columns=None,
1386 optlevel: int | None = None,
1387 kind: str | None = None,
1388 ) -> None:
1389 """
1390 Create a pytables index on the table.
1392 Parameters
1393 ----------
1394 key : str
1395 columns : None, bool, or listlike[str]
1396 Indicate which columns to create an index on.
1398 * False : Do not create any indexes.
1399 * True : Create indexes on all columns.
1400 * None : Create indexes on all columns.
1401 * listlike : Create indexes on the given columns.
1403 optlevel : int or None, default None
1404 Optimization level, if None, pytables defaults to 6.
1405 kind : str or None, default None
1406 Kind of index, if None, pytables defaults to "medium".
1408 Raises
1409 ------
1410 TypeError: raises if the node is not a table
1411 """
1412 # version requirements
1413 _tables()
1414 s = self.get_storer(key)
1415 if s is None:
1416 return
1418 if not isinstance(s, Table):
1419 raise TypeError("cannot create table index on a Fixed format store")
1420 s.create_index(columns=columns, optlevel=optlevel, kind=kind)
1422 def groups(self) -> list:
1423 """
1424 Return a list of all the top-level nodes.
1426 Each node returned is not a pandas storage object.
1428 Returns
1429 -------
1430 list
1431 List of objects.
1432 """
1433 _tables()
1434 self._check_if_open()
1435 assert self._handle is not None # for mypy
1436 assert _table_mod is not None # for mypy
1437 return [
1438 g
1439 for g in self._handle.walk_groups()
1440 if (
1441 not isinstance(g, _table_mod.link.Link)
1442 and (
1443 getattr(g._v_attrs, "pandas_type", None)
1444 or getattr(g, "table", None)
1445 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
1446 )
1447 )
1448 ]
1450 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
1451 """
1452 Walk the pytables group hierarchy for pandas objects.
1454 This generator will yield the group path, subgroups and pandas object
1455 names for each group.
1457 Any non-pandas PyTables objects that are not a group will be ignored.
1459 The `where` group itself is listed first (preorder), then each of its
1460 child groups (following an alphanumerical order) is also traversed,
1461 following the same procedure.
1463 Parameters
1464 ----------
1465 where : str, default "/"
1466 Group where to start walking.
1468 Yields
1469 ------
1470 path : str
1471 Full path to a group (without trailing '/').
1472 groups : list
1473 Names (strings) of the groups contained in `path`.
1474 leaves : list
1475 Names (strings) of the pandas objects contained in `path`.
1476 """
1477 _tables()
1478 self._check_if_open()
1479 assert self._handle is not None # for mypy
1480 assert _table_mod is not None # for mypy
1482 for g in self._handle.walk_groups(where):
1483 if getattr(g._v_attrs, "pandas_type", None) is not None:
1484 continue
1486 groups = []
1487 leaves = []
1488 for child in g._v_children.values():
1489 pandas_type = getattr(child._v_attrs, "pandas_type", None)
1490 if pandas_type is None:
1491 if isinstance(child, _table_mod.group.Group):
1492 groups.append(child._v_name)
1493 else:
1494 leaves.append(child._v_name)
1496 yield (g._v_pathname.rstrip("/"), groups, leaves)
1498 def get_node(self, key: str) -> Node | None:
1499 """return the node with the key or None if it does not exist"""
1500 self._check_if_open()
1501 if not key.startswith("/"):
1502 key = "/" + key
1504 assert self._handle is not None
1505 assert _table_mod is not None # for mypy
1506 try:
1507 node = self._handle.get_node(self.root, key)
1508 except _table_mod.exceptions.NoSuchNodeError:
1509 return None
1511 assert isinstance(node, _table_mod.Node), type(node)
1512 return node
1514 def get_storer(self, key: str) -> GenericFixed | Table:
1515 """return the storer object for a key, raise if not in the file"""
1516 group = self.get_node(key)
1517 if group is None:
1518 raise KeyError(f"No object named {key} in the file")
1520 s = self._create_storer(group)
1521 s.infer_axes()
1522 return s
1524 def copy(
1525 self,
1526 file,
1527 mode="w",
1528 propindexes: bool = True,
1529 keys=None,
1530 complib=None,
1531 complevel: int | None = None,
1532 fletcher32: bool = False,
1533 overwrite=True,
1534 ) -> HDFStore:
1535 """
1536 Copy the existing store to a new file, updating in place.
1538 Parameters
1539 ----------
1540 propindexes : bool, default True
1541 Restore indexes in copied file.
1542 keys : list, optional
1543 List of keys to include in the copy (defaults to all).
1544 overwrite : bool, default True
1545 Whether to overwrite (remove and replace) existing nodes in the new store.
1546 mode, complib, complevel, fletcher32 same as in HDFStore.__init__
1548 Returns
1549 -------
1550 open file handle of the new store
1551 """
1552 new_store = HDFStore(
1553 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1554 )
1555 if keys is None:
1556 keys = list(self.keys())
1557 if not isinstance(keys, (tuple, list)):
1558 keys = [keys]
1559 for k in keys:
1560 s = self.get_storer(k)
1561 if s is not None:
1563 if k in new_store:
1564 if overwrite:
1565 new_store.remove(k)
1567 data = self.select(k)
1568 if isinstance(s, Table):
1570 index: bool | list[str] = False
1571 if propindexes:
1572 index = [a.name for a in s.axes if a.is_indexed]
1573 new_store.append(
1574 k,
1575 data,
1576 index=index,
1577 data_columns=getattr(s, "data_columns", None),
1578 encoding=s.encoding,
1579 )
1580 else:
1581 new_store.put(k, data, encoding=s.encoding)
1583 return new_store
1585 def info(self) -> str:
1586 """
1587 Print detailed information on the store.
1589 Returns
1590 -------
1591 str
1592 """
1593 path = pprint_thing(self._path)
1594 output = f"{type(self)}\nFile path: {path}\n"
1596 if self.is_open:
1597 lkeys = sorted(self.keys())
1598 if len(lkeys):
1599 keys = []
1600 values = []
1602 for k in lkeys:
1603 try:
1604 s = self.get_storer(k)
1605 if s is not None:
1606 keys.append(pprint_thing(s.pathname or k))
1607 values.append(pprint_thing(s or "invalid_HDFStore node"))
1608 except AssertionError:
1609 # surface any assertion errors for e.g. debugging
1610 raise
1611 except Exception as detail:
1612 keys.append(k)
1613 dstr = pprint_thing(detail)
1614 values.append(f"[invalid_HDFStore node: {dstr}]")
1616 output += adjoin(12, keys, values)
1617 else:
1618 output += "Empty"
1619 else:
1620 output += "File is CLOSED"
1622 return output
1624 # ------------------------------------------------------------------------
1625 # private methods
1627 def _check_if_open(self):
1628 if not self.is_open:
1629 raise ClosedFileError(f"{self._path} file is not open!")
1631 def _validate_format(self, format: str) -> str:
1632 """validate / deprecate formats"""
1633 # validate
1634 try:
1635 format = _FORMAT_MAP[format.lower()]
1636 except KeyError as err:
1637 raise TypeError(f"invalid HDFStore format specified [{format}]") from err
1639 return format
1641 def _create_storer(
1642 self,
1643 group,
1644 format=None,
1645 value: DataFrame | Series | None = None,
1646 encoding: str = "UTF-8",
1647 errors: str = "strict",
1648 ) -> GenericFixed | Table:
1649 """return a suitable class to operate"""
1650 cls: type[GenericFixed] | type[Table]
1652 if value is not None and not isinstance(value, (Series, DataFrame)):
1653 raise TypeError("value must be None, Series, or DataFrame")
1655 def error(t):
1656 # return instead of raising so mypy can tell where we are raising
1657 return TypeError(
1658 f"cannot properly create the storer for: [{t}] [group->"
1659 f"{group},value->{type(value)},format->{format}"
1660 )
1662 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1663 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1665 # infer the pt from the passed value
1666 if pt is None:
1667 if value is None:
1668 _tables()
1669 assert _table_mod is not None # for mypy
1670 if getattr(group, "table", None) or isinstance(
1671 group, _table_mod.table.Table
1672 ):
1673 pt = "frame_table"
1674 tt = "generic_table"
1675 else:
1676 raise TypeError(
1677 "cannot create a storer if the object is not existing "
1678 "nor a value are passed"
1679 )
1680 else:
1681 if isinstance(value, Series):
1682 pt = "series"
1683 else:
1684 pt = "frame"
1686 # we are actually a table
1687 if format == "table":
1688 pt += "_table"
1690 # a storer node
1691 if "table" not in pt:
1692 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
1693 try:
1694 cls = _STORER_MAP[pt]
1695 except KeyError as err:
1696 raise error("_STORER_MAP") from err
1697 return cls(self, group, encoding=encoding, errors=errors)
1699 # existing node (and must be a table)
1700 if tt is None:
1701 # if we are a writer, determine the tt
1702 if value is not None:
1703 if pt == "series_table":
1704 index = getattr(value, "index", None)
1705 if index is not None:
1706 if index.nlevels == 1:
1707 tt = "appendable_series"
1708 elif index.nlevels > 1:
1709 tt = "appendable_multiseries"
1710 elif pt == "frame_table":
1711 index = getattr(value, "index", None)
1712 if index is not None:
1713 if index.nlevels == 1:
1714 tt = "appendable_frame"
1715 elif index.nlevels > 1:
1716 tt = "appendable_multiframe"
1718 _TABLE_MAP = {
1719 "generic_table": GenericTable,
1720 "appendable_series": AppendableSeriesTable,
1721 "appendable_multiseries": AppendableMultiSeriesTable,
1722 "appendable_frame": AppendableFrameTable,
1723 "appendable_multiframe": AppendableMultiFrameTable,
1724 "worm": WORMTable,
1725 }
1726 try:
1727 cls = _TABLE_MAP[tt]
1728 except KeyError as err:
1729 raise error("_TABLE_MAP") from err
1731 return cls(self, group, encoding=encoding, errors=errors)
1733 def _write_to_group(
1734 self,
1735 key: str,
1736 value: DataFrame | Series,
1737 format,
1738 axes=None,
1739 index=True,
1740 append=False,
1741 complib=None,
1742 complevel: int | None = None,
1743 fletcher32=None,
1744 min_itemsize: int | dict[str, int] | None = None,
1745 chunksize=None,
1746 expectedrows=None,
1747 dropna=False,
1748 nan_rep=None,
1749 data_columns=None,
1750 encoding=None,
1751 errors: str = "strict",
1752 track_times: bool = True,
1753 ) -> None:
1754 # we don't want to store a table node at all if our object is 0-len
1755 # as there are not dtypes
1756 if getattr(value, "empty", None) and (format == "table" or append):
1757 return
1759 group = self._identify_group(key, append)
1761 s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
1762 if append:
1763 # raise if we are trying to append to a Fixed format,
1764 # or a table that exists (and we are putting)
1765 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
1766 raise ValueError("Can only append to Tables")
1767 if not s.is_exists:
1768 s.set_object_info()
1769 else:
1770 s.set_object_info()
1772 if not s.is_table and complib:
1773 raise ValueError("Compression not supported on Fixed format stores")
1775 # write the object
1776 s.write(
1777 obj=value,
1778 axes=axes,
1779 append=append,
1780 complib=complib,
1781 complevel=complevel,
1782 fletcher32=fletcher32,
1783 min_itemsize=min_itemsize,
1784 chunksize=chunksize,
1785 expectedrows=expectedrows,
1786 dropna=dropna,
1787 nan_rep=nan_rep,
1788 data_columns=data_columns,
1789 track_times=track_times,
1790 )
1792 if isinstance(s, Table) and index:
1793 s.create_index(columns=index)
1795 def _read_group(self, group: Node):
1796 s = self._create_storer(group)
1797 s.infer_axes()
1798 return s.read()
1800 def _identify_group(self, key: str, append: bool) -> Node:
1801 """Identify HDF5 group based on key, delete/create group if needed."""
1802 group = self.get_node(key)
1804 # we make this assertion for mypy; the get_node call will already
1805 # have raised if this is incorrect
1806 assert self._handle is not None
1808 # remove the node if we are not appending
1809 if group is not None and not append:
1810 self._handle.remove_node(group, recursive=True)
1811 group = None
1813 if group is None:
1814 group = self._create_nodes_and_group(key)
1816 return group
1818 def _create_nodes_and_group(self, key: str) -> Node:
1819 """Create nodes from key and return group name."""
1820 # assertion for mypy
1821 assert self._handle is not None
1823 paths = key.split("/")
1824 # recursively create the groups
1825 path = "/"
1826 for p in paths:
1827 if not len(p):
1828 continue
1829 new_path = path
1830 if not path.endswith("/"):
1831 new_path += "/"
1832 new_path += p
1833 group = self.get_node(new_path)
1834 if group is None:
1835 group = self._handle.create_group(path, p)
1836 path = new_path
1837 return group
1840class TableIterator:
1841 """
1842 Define the iteration interface on a table
1844 Parameters
1845 ----------
1846 store : HDFStore
1847 s : the referred storer
1848 func : the function to execute the query
1849 where : the where of the query
1850 nrows : the rows to iterate on
1851 start : the passed start value (default is None)
1852 stop : the passed stop value (default is None)
1853 iterator : bool, default False
1854 Whether to use the default iterator.
1855 chunksize : the passed chunking value (default is 100000)
1856 auto_close : bool, default False
1857 Whether to automatically close the store at the end of iteration.
1858 """
1860 chunksize: int | None
1861 store: HDFStore
1862 s: GenericFixed | Table
1864 def __init__(
1865 self,
1866 store: HDFStore,
1867 s: GenericFixed | Table,
1868 func,
1869 where,
1870 nrows,
1871 start=None,
1872 stop=None,
1873 iterator: bool = False,
1874 chunksize: int | None = None,
1875 auto_close: bool = False,
1876 ) -> None:
1877 self.store = store
1878 self.s = s
1879 self.func = func
1880 self.where = where
1882 # set start/stop if they are not set if we are a table
1883 if self.s.is_table:
1884 if nrows is None:
1885 nrows = 0
1886 if start is None:
1887 start = 0
1888 if stop is None:
1889 stop = nrows
1890 stop = min(nrows, stop)
1892 self.nrows = nrows
1893 self.start = start
1894 self.stop = stop
1896 self.coordinates = None
1897 if iterator or chunksize is not None:
1898 if chunksize is None:
1899 chunksize = 100000
1900 self.chunksize = int(chunksize)
1901 else:
1902 self.chunksize = None
1904 self.auto_close = auto_close
1906 def __iter__(self):
1907 # iterate
1908 current = self.start
1909 if self.coordinates is None:
1910 raise ValueError("Cannot iterate until get_result is called.")
1911 while current < self.stop:
1912 stop = min(current + self.chunksize, self.stop)
1913 value = self.func(None, None, self.coordinates[current:stop])
1914 current = stop
1915 if value is None or not len(value):
1916 continue
1918 yield value
1920 self.close()
1922 def close(self) -> None:
1923 if self.auto_close:
1924 self.store.close()
1926 def get_result(self, coordinates: bool = False):
1927 # return the actual iterator
1928 if self.chunksize is not None:
1929 if not isinstance(self.s, Table):
1930 raise TypeError("can only use an iterator or chunksize on a table")
1932 self.coordinates = self.s.read_coordinates(where=self.where)
1934 return self
1936 # if specified read via coordinates (necessary for multiple selections
1937 if coordinates:
1938 if not isinstance(self.s, Table):
1939 raise TypeError("can only read_coordinates on a table")
1940 where = self.s.read_coordinates(
1941 where=self.where, start=self.start, stop=self.stop
1942 )
1943 else:
1944 where = self.where
1946 # directly return the result
1947 results = self.func(self.start, self.stop, where)
1948 self.close()
1949 return results
1952class IndexCol:
1953 """
1954 an index column description class
1956 Parameters
1957 ----------
1958 axis : axis which I reference
1959 values : the ndarray like converted values
1960 kind : a string description of this type
1961 typ : the pytables type
1962 pos : the position in the pytables
1964 """
1966 is_an_indexable: bool = True
1967 is_data_indexable: bool = True
1968 _info_fields = ["freq", "tz", "index_name"]
1970 name: str
1971 cname: str
1973 def __init__(
1974 self,
1975 name: str,
1976 values=None,
1977 kind=None,
1978 typ=None,
1979 cname: str | None = None,
1980 axis=None,
1981 pos=None,
1982 freq=None,
1983 tz=None,
1984 index_name=None,
1985 ordered=None,
1986 table=None,
1987 meta=None,
1988 metadata=None,
1989 ) -> None:
1991 if not isinstance(name, str):
1992 raise ValueError("`name` must be a str.")
1994 self.values = values
1995 self.kind = kind
1996 self.typ = typ
1997 self.name = name
1998 self.cname = cname or name
1999 self.axis = axis
2000 self.pos = pos
2001 self.freq = freq
2002 self.tz = tz
2003 self.index_name = index_name
2004 self.ordered = ordered
2005 self.table = table
2006 self.meta = meta
2007 self.metadata = metadata
2009 if pos is not None:
2010 self.set_pos(pos)
2012 # These are ensured as long as the passed arguments match the
2013 # constructor annotations.
2014 assert isinstance(self.name, str)
2015 assert isinstance(self.cname, str)
2017 @property
2018 def itemsize(self) -> int:
2019 # Assumes self.typ has already been initialized
2020 return self.typ.itemsize
2022 @property
2023 def kind_attr(self) -> str:
2024 return f"{self.name}_kind"
2026 def set_pos(self, pos: int) -> None:
2027 """set the position of this column in the Table"""
2028 self.pos = pos
2029 if pos is not None and self.typ is not None:
2030 self.typ._v_pos = pos
2032 def __repr__(self) -> str:
2033 temp = tuple(
2034 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
2035 )
2036 return ",".join(
2037 [
2038 f"{key}->{value}"
2039 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
2040 ]
2041 )
2043 def __eq__(self, other: Any) -> bool:
2044 """compare 2 col items"""
2045 return all(
2046 getattr(self, a, None) == getattr(other, a, None)
2047 for a in ["name", "cname", "axis", "pos"]
2048 )
2050 def __ne__(self, other) -> bool:
2051 return not self.__eq__(other)
2053 @property
2054 def is_indexed(self) -> bool:
2055 """return whether I am an indexed column"""
2056 if not hasattr(self.table, "cols"):
2057 # e.g. if infer hasn't been called yet, self.table will be None.
2058 return False
2059 return getattr(self.table.cols, self.cname).is_indexed
2061 def convert(
2062 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2063 ) -> tuple[np.ndarray, np.ndarray] | tuple[DatetimeIndex, DatetimeIndex]:
2064 """
2065 Convert the data from this selection to the appropriate pandas type.
2066 """
2067 assert isinstance(values, np.ndarray), type(values)
2069 # values is a recarray
2070 if values.dtype.fields is not None:
2071 values = values[self.cname]
2073 val_kind = _ensure_decoded(self.kind)
2074 values = _maybe_convert(values, val_kind, encoding, errors)
2076 kwargs = {}
2077 kwargs["name"] = _ensure_decoded(self.index_name)
2079 if self.freq is not None:
2080 kwargs["freq"] = _ensure_decoded(self.freq)
2082 factory: type[Index] | type[DatetimeIndex] = Index
2083 if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype):
2084 factory = DatetimeIndex
2085 elif values.dtype == "i8" and "freq" in kwargs:
2086 # PeriodIndex data is stored as i8
2087 # error: Incompatible types in assignment (expression has type
2088 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
2089 # "Union[Type[Index], Type[DatetimeIndex]]")
2090 factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment]
2091 ordinal=x, **kwds
2092 )
2094 # making an Index instance could throw a number of different errors
2095 try:
2096 new_pd_index = factory(values, **kwargs)
2097 except ValueError:
2098 # if the output freq is different that what we recorded,
2099 # it should be None (see also 'doc example part 2')
2100 if "freq" in kwargs:
2101 kwargs["freq"] = None
2102 new_pd_index = factory(values, **kwargs)
2103 final_pd_index = _set_tz(new_pd_index, self.tz)
2104 return final_pd_index, final_pd_index
2106 def take_data(self):
2107 """return the values"""
2108 return self.values
2110 @property
2111 def attrs(self):
2112 return self.table._v_attrs
2114 @property
2115 def description(self):
2116 return self.table.description
2118 @property
2119 def col(self):
2120 """return my current col description"""
2121 return getattr(self.description, self.cname, None)
2123 @property
2124 def cvalues(self):
2125 """return my cython values"""
2126 return self.values
2128 def __iter__(self):
2129 return iter(self.values)
2131 def maybe_set_size(self, min_itemsize=None) -> None:
2132 """
2133 maybe set a string col itemsize:
2134 min_itemsize can be an integer or a dict with this columns name
2135 with an integer size
2136 """
2137 if _ensure_decoded(self.kind) == "string":
2138 if isinstance(min_itemsize, dict):
2139 min_itemsize = min_itemsize.get(self.name)
2141 if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2142 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
2144 def validate_names(self) -> None:
2145 pass
2147 def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
2148 self.table = handler.table
2149 self.validate_col()
2150 self.validate_attr(append)
2151 self.validate_metadata(handler)
2152 self.write_metadata(handler)
2153 self.set_attr()
2155 def validate_col(self, itemsize=None):
2156 """validate this column: return the compared against itemsize"""
2157 # validate this column for string truncation (or reset to the max size)
2158 if _ensure_decoded(self.kind) == "string":
2159 c = self.col
2160 if c is not None:
2161 if itemsize is None:
2162 itemsize = self.itemsize
2163 if c.itemsize < itemsize:
2164 raise ValueError(
2165 f"Trying to store a string with len [{itemsize}] in "
2166 f"[{self.cname}] column but\nthis column has a limit of "
2167 f"[{c.itemsize}]!\nConsider using min_itemsize to "
2168 "preset the sizes on these columns"
2169 )
2170 return c.itemsize
2172 return None
2174 def validate_attr(self, append: bool) -> None:
2175 # check for backwards incompatibility
2176 if append:
2177 existing_kind = getattr(self.attrs, self.kind_attr, None)
2178 if existing_kind is not None and existing_kind != self.kind:
2179 raise TypeError(
2180 f"incompatible kind in col [{existing_kind} - {self.kind}]"
2181 )
2183 def update_info(self, info) -> None:
2184 """
2185 set/update the info for this indexable with the key/value
2186 if there is a conflict raise/warn as needed
2187 """
2188 for key in self._info_fields:
2190 value = getattr(self, key, None)
2191 idx = info.setdefault(self.name, {})
2193 existing_value = idx.get(key)
2194 if key in idx and value is not None and existing_value != value:
2195 # frequency/name just warn
2196 if key in ["freq", "index_name"]:
2197 ws = attribute_conflict_doc % (key, existing_value, value)
2198 warnings.warn(
2199 ws, AttributeConflictWarning, stacklevel=find_stack_level()
2200 )
2202 # reset
2203 idx[key] = None
2204 setattr(self, key, None)
2206 else:
2207 raise ValueError(
2208 f"invalid info for [{self.name}] for [{key}], "
2209 f"existing_value [{existing_value}] conflicts with "
2210 f"new value [{value}]"
2211 )
2212 else:
2213 if value is not None or existing_value is not None:
2214 idx[key] = value
2216 def set_info(self, info) -> None:
2217 """set my state from the passed info"""
2218 idx = info.get(self.name)
2219 if idx is not None:
2220 self.__dict__.update(idx)
2222 def set_attr(self) -> None:
2223 """set the kind for this column"""
2224 setattr(self.attrs, self.kind_attr, self.kind)
2226 def validate_metadata(self, handler: AppendableTable) -> None:
2227 """validate that kind=category does not change the categories"""
2228 if self.meta == "category":
2229 new_metadata = self.metadata
2230 cur_metadata = handler.read_metadata(self.cname)
2231 if (
2232 new_metadata is not None
2233 and cur_metadata is not None
2234 and not array_equivalent(new_metadata, cur_metadata)
2235 ):
2236 raise ValueError(
2237 "cannot append a categorical with "
2238 "different categories to the existing"
2239 )
2241 def write_metadata(self, handler: AppendableTable) -> None:
2242 """set the meta data"""
2243 if self.metadata is not None:
2244 handler.write_metadata(self.cname, self.metadata)
2247class GenericIndexCol(IndexCol):
2248 """an index which is not represented in the data of the table"""
2250 @property
2251 def is_indexed(self) -> bool:
2252 return False
2254 # error: Return type "Tuple[Int64Index, Int64Index]" of "convert"
2255 # incompatible with return type "Union[Tuple[ndarray[Any, Any],
2256 # ndarray[Any, Any]], Tuple[DatetimeIndex, DatetimeIndex]]" in
2257 # supertype "IndexCol"
2258 def convert( # type: ignore[override]
2259 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2260 ) -> tuple[Int64Index, Int64Index]:
2261 """
2262 Convert the data from this selection to the appropriate pandas type.
2264 Parameters
2265 ----------
2266 values : np.ndarray
2267 nan_rep : str
2268 encoding : str
2269 errors : str
2270 """
2271 assert isinstance(values, np.ndarray), type(values)
2273 index = Int64Index(np.arange(len(values)))
2274 return index, index
2276 def set_attr(self) -> None:
2277 pass
2280class DataCol(IndexCol):
2281 """
2282 a data holding column, by definition this is not indexable
2284 Parameters
2285 ----------
2286 data : the actual data
2287 cname : the column name in the table to hold the data (typically
2288 values)
2289 meta : a string description of the metadata
2290 metadata : the actual metadata
2291 """
2293 is_an_indexable = False
2294 is_data_indexable = False
2295 _info_fields = ["tz", "ordered"]
2297 def __init__(
2298 self,
2299 name: str,
2300 values=None,
2301 kind=None,
2302 typ=None,
2303 cname=None,
2304 pos=None,
2305 tz=None,
2306 ordered=None,
2307 table=None,
2308 meta=None,
2309 metadata=None,
2310 dtype: DtypeArg | None = None,
2311 data=None,
2312 ) -> None:
2313 super().__init__(
2314 name=name,
2315 values=values,
2316 kind=kind,
2317 typ=typ,
2318 pos=pos,
2319 cname=cname,
2320 tz=tz,
2321 ordered=ordered,
2322 table=table,
2323 meta=meta,
2324 metadata=metadata,
2325 )
2326 self.dtype = dtype
2327 self.data = data
2329 @property
2330 def dtype_attr(self) -> str:
2331 return f"{self.name}_dtype"
2333 @property
2334 def meta_attr(self) -> str:
2335 return f"{self.name}_meta"
2337 def __repr__(self) -> str:
2338 temp = tuple(
2339 map(
2340 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2341 )
2342 )
2343 return ",".join(
2344 [
2345 f"{key}->{value}"
2346 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2347 ]
2348 )
2350 def __eq__(self, other: Any) -> bool:
2351 """compare 2 col items"""
2352 return all(
2353 getattr(self, a, None) == getattr(other, a, None)
2354 for a in ["name", "cname", "dtype", "pos"]
2355 )
2357 def set_data(self, data: ArrayLike) -> None:
2358 assert data is not None
2359 assert self.dtype is None
2361 data, dtype_name = _get_data_and_dtype_name(data)
2363 self.data = data
2364 self.dtype = dtype_name
2365 self.kind = _dtype_to_kind(dtype_name)
2367 def take_data(self):
2368 """return the data"""
2369 return self.data
2371 @classmethod
2372 def _get_atom(cls, values: ArrayLike) -> Col:
2373 """
2374 Get an appropriately typed and shaped pytables.Col object for values.
2375 """
2376 dtype = values.dtype
2377 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
2378 # attribute "itemsize"
2379 itemsize = dtype.itemsize # type: ignore[union-attr]
2381 shape = values.shape
2382 if values.ndim == 1:
2383 # EA, use block shape pretending it is 2D
2384 # TODO(EA2D): not necessary with 2D EAs
2385 shape = (1, values.size)
2387 if isinstance(values, Categorical):
2388 codes = values.codes
2389 atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2390 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
2391 atom = cls.get_atom_datetime64(shape)
2392 elif is_timedelta64_dtype(dtype):
2393 atom = cls.get_atom_timedelta64(shape)
2394 elif is_complex_dtype(dtype):
2395 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2396 elif is_string_dtype(dtype):
2397 atom = cls.get_atom_string(shape, itemsize)
2398 else:
2399 atom = cls.get_atom_data(shape, kind=dtype.name)
2401 return atom
2403 @classmethod
2404 def get_atom_string(cls, shape, itemsize):
2405 return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2407 @classmethod
2408 def get_atom_coltype(cls, kind: str) -> type[Col]:
2409 """return the PyTables column class for this column"""
2410 if kind.startswith("uint"):
2411 k4 = kind[4:]
2412 col_name = f"UInt{k4}Col"
2413 elif kind.startswith("period"):
2414 # we store as integer
2415 col_name = "Int64Col"
2416 else:
2417 kcap = kind.capitalize()
2418 col_name = f"{kcap}Col"
2420 return getattr(_tables(), col_name)
2422 @classmethod
2423 def get_atom_data(cls, shape, kind: str) -> Col:
2424 return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2426 @classmethod
2427 def get_atom_datetime64(cls, shape):
2428 return _tables().Int64Col(shape=shape[0])
2430 @classmethod
2431 def get_atom_timedelta64(cls, shape):
2432 return _tables().Int64Col(shape=shape[0])
2434 @property
2435 def shape(self):
2436 return getattr(self.data, "shape", None)
2438 @property
2439 def cvalues(self):
2440 """return my cython values"""
2441 return self.data
2443 def validate_attr(self, append) -> None:
2444 """validate that we have the same order as the existing & same dtype"""
2445 if append:
2446 existing_fields = getattr(self.attrs, self.kind_attr, None)
2447 if existing_fields is not None and existing_fields != list(self.values):
2448 raise ValueError("appended items do not match existing items in table!")
2450 existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2451 if existing_dtype is not None and existing_dtype != self.dtype:
2452 raise ValueError(
2453 "appended items dtype do not match existing items dtype in table!"
2454 )
2456 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2457 """
2458 Convert the data from this selection to the appropriate pandas type.
2460 Parameters
2461 ----------
2462 values : np.ndarray
2463 nan_rep :
2464 encoding : str
2465 errors : str
2467 Returns
2468 -------
2469 index : listlike to become an Index
2470 data : ndarraylike to become a column
2471 """
2472 assert isinstance(values, np.ndarray), type(values)
2474 # values is a recarray
2475 if values.dtype.fields is not None:
2476 values = values[self.cname]
2478 assert self.typ is not None
2479 if self.dtype is None:
2480 # Note: in tests we never have timedelta64 or datetime64,
2481 # so the _get_data_and_dtype_name may be unnecessary
2482 converted, dtype_name = _get_data_and_dtype_name(values)
2483 kind = _dtype_to_kind(dtype_name)
2484 else:
2485 converted = values
2486 dtype_name = self.dtype
2487 kind = self.kind
2489 assert isinstance(converted, np.ndarray) # for mypy
2491 # use the meta if needed
2492 meta = _ensure_decoded(self.meta)
2493 metadata = self.metadata
2494 ordered = self.ordered
2495 tz = self.tz
2497 assert dtype_name is not None
2498 # convert to the correct dtype
2499 dtype = _ensure_decoded(dtype_name)
2501 # reverse converts
2502 if dtype == "datetime64":
2503 # recreate with tz if indicated
2504 converted = _set_tz(converted, tz, coerce=True)
2506 elif dtype == "timedelta64":
2507 converted = np.asarray(converted, dtype="m8[ns]")
2508 elif dtype == "date":
2509 try:
2510 converted = np.asarray(
2511 [date.fromordinal(v) for v in converted], dtype=object
2512 )
2513 except ValueError:
2514 converted = np.asarray(
2515 [date.fromtimestamp(v) for v in converted], dtype=object
2516 )
2518 elif meta == "category":
2519 # we have a categorical
2520 categories = metadata
2521 codes = converted.ravel()
2523 # if we have stored a NaN in the categories
2524 # then strip it; in theory we could have BOTH
2525 # -1s in the codes and nulls :<
2526 if categories is None:
2527 # Handle case of NaN-only categorical columns in which case
2528 # the categories are an empty array; when this is stored,
2529 # pytables cannot write a zero-len array, so on readback
2530 # the categories would be None and `read_hdf()` would fail.
2531 categories = Index([], dtype=np.float64)
2532 else:
2533 mask = isna(categories)
2534 if mask.any():
2535 categories = categories[~mask]
2536 codes[codes != -1] -= mask.astype(int).cumsum()._values
2538 converted = Categorical.from_codes(
2539 codes, categories=categories, ordered=ordered
2540 )
2542 else:
2544 try:
2545 converted = converted.astype(dtype, copy=False)
2546 except TypeError:
2547 converted = converted.astype("O", copy=False)
2549 # convert nans / decode
2550 if _ensure_decoded(kind) == "string":
2551 converted = _unconvert_string_array(
2552 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2553 )
2555 return self.values, converted
2557 def set_attr(self) -> None:
2558 """set the data for this column"""
2559 setattr(self.attrs, self.kind_attr, self.values)
2560 setattr(self.attrs, self.meta_attr, self.meta)
2561 assert self.dtype is not None
2562 setattr(self.attrs, self.dtype_attr, self.dtype)
2565class DataIndexableCol(DataCol):
2566 """represent a data column that can be indexed"""
2568 is_data_indexable = True
2570 def validate_names(self) -> None:
2571 if not Index(self.values).is_object():
2572 # TODO: should the message here be more specifically non-str?
2573 raise ValueError("cannot have non-object label DataIndexableCol")
2575 @classmethod
2576 def get_atom_string(cls, shape, itemsize):
2577 return _tables().StringCol(itemsize=itemsize)
2579 @classmethod
2580 def get_atom_data(cls, shape, kind: str) -> Col:
2581 return cls.get_atom_coltype(kind=kind)()
2583 @classmethod
2584 def get_atom_datetime64(cls, shape):
2585 return _tables().Int64Col()
2587 @classmethod
2588 def get_atom_timedelta64(cls, shape):
2589 return _tables().Int64Col()
2592class GenericDataIndexableCol(DataIndexableCol):
2593 """represent a generic pytables data column"""
2595 pass
2598class Fixed:
2599 """
2600 represent an object in my store
2601 facilitate read/write of various types of objects
2602 this is an abstract base class
2604 Parameters
2605 ----------
2606 parent : HDFStore
2607 group : Node
2608 The group node where the table resides.
2609 """
2611 pandas_kind: str
2612 format_type: str = "fixed" # GH#30962 needed by dask
2613 obj_type: type[DataFrame | Series]
2614 ndim: int
2615 encoding: str
2616 parent: HDFStore
2617 group: Node
2618 errors: str
2619 is_table: bool = False
2621 def __init__(
2622 self,
2623 parent: HDFStore,
2624 group: Node,
2625 encoding: str = "UTF-8",
2626 errors: str = "strict",
2627 ) -> None:
2628 assert isinstance(parent, HDFStore), type(parent)
2629 assert _table_mod is not None # needed for mypy
2630 assert isinstance(group, _table_mod.Node), type(group)
2631 self.parent = parent
2632 self.group = group
2633 self.encoding = _ensure_encoding(encoding)
2634 self.errors = errors
2636 @property
2637 def is_old_version(self) -> bool:
2638 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2640 @property
2641 def version(self) -> tuple[int, int, int]:
2642 """compute and set our version"""
2643 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2644 try:
2645 version = tuple(int(x) for x in version.split("."))
2646 if len(version) == 2:
2647 version = version + (0,)
2648 except AttributeError:
2649 version = (0, 0, 0)
2650 return version
2652 @property
2653 def pandas_type(self):
2654 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2656 def __repr__(self) -> str:
2657 """return a pretty representation of myself"""
2658 self.infer_axes()
2659 s = self.shape
2660 if s is not None:
2661 if isinstance(s, (list, tuple)):
2662 jshape = ",".join([pprint_thing(x) for x in s])
2663 s = f"[{jshape}]"
2664 return f"{self.pandas_type:12.12} (shape->{s})"
2665 return self.pandas_type
2667 def set_object_info(self) -> None:
2668 """set my pandas type & version"""
2669 self.attrs.pandas_type = str(self.pandas_kind)
2670 self.attrs.pandas_version = str(_version)
2672 def copy(self) -> Fixed:
2673 new_self = copy.copy(self)
2674 return new_self
2676 @property
2677 def shape(self):
2678 return self.nrows
2680 @property
2681 def pathname(self):
2682 return self.group._v_pathname
2684 @property
2685 def _handle(self):
2686 return self.parent._handle
2688 @property
2689 def _filters(self):
2690 return self.parent._filters
2692 @property
2693 def _complevel(self) -> int:
2694 return self.parent._complevel
2696 @property
2697 def _fletcher32(self) -> bool:
2698 return self.parent._fletcher32
2700 @property
2701 def attrs(self):
2702 return self.group._v_attrs
2704 def set_attrs(self) -> None:
2705 """set our object attributes"""
2706 pass
2708 def get_attrs(self) -> None:
2709 """get our object attributes"""
2710 pass
2712 @property
2713 def storable(self):
2714 """return my storable"""
2715 return self.group
2717 @property
2718 def is_exists(self) -> bool:
2719 return False
2721 @property
2722 def nrows(self):
2723 return getattr(self.storable, "nrows", None)
2725 def validate(self, other) -> Literal[True] | None:
2726 """validate against an existing storable"""
2727 if other is None:
2728 return None
2729 return True
2731 def validate_version(self, where=None) -> None:
2732 """are we trying to operate on an old version?"""
2733 pass
2735 def infer_axes(self) -> bool:
2736 """
2737 infer the axes of my storer
2738 return a boolean indicating if we have a valid storer or not
2739 """
2740 s = self.storable
2741 if s is None:
2742 return False
2743 self.get_attrs()
2744 return True
2746 def read(
2747 self,
2748 where=None,
2749 columns=None,
2750 start: int | None = None,
2751 stop: int | None = None,
2752 ):
2753 raise NotImplementedError(
2754 "cannot read on an abstract storer: subclasses should implement"
2755 )
2757 def write(self, **kwargs):
2758 raise NotImplementedError(
2759 "cannot write on an abstract storer: subclasses should implement"
2760 )
2762 def delete(
2763 self, where=None, start: int | None = None, stop: int | None = None
2764 ) -> None:
2765 """
2766 support fully deleting the node in its entirety (only) - where
2767 specification must be None
2768 """
2769 if com.all_none(where, start, stop):
2770 self._handle.remove_node(self.group, recursive=True)
2771 return None
2773 raise TypeError("cannot delete on an abstract storer")
2776class GenericFixed(Fixed):
2777 """a generified fixed version"""
2779 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2780 _reverse_index_map = {v: k for k, v in _index_type_map.items()}
2781 attributes: list[str] = []
2783 # indexer helpers
2784 def _class_to_alias(self, cls) -> str:
2785 return self._index_type_map.get(cls, "")
2787 def _alias_to_class(self, alias):
2788 if isinstance(alias, type): # pragma: no cover
2789 # compat: for a short period of time master stored types
2790 return alias
2791 return self._reverse_index_map.get(alias, Index)
2793 def _get_index_factory(self, attrs):
2794 index_class = self._alias_to_class(
2795 _ensure_decoded(getattr(attrs, "index_class", ""))
2796 )
2798 factory: Callable
2800 if index_class == DatetimeIndex:
2802 def f(values, freq=None, tz=None):
2803 # data are already in UTC, localize and convert if tz present
2804 dta = DatetimeArray._simple_new(values.values, freq=freq)
2805 result = DatetimeIndex._simple_new(dta, name=None)
2806 if tz is not None:
2807 result = result.tz_localize("UTC").tz_convert(tz)
2808 return result
2810 factory = f
2811 elif index_class == PeriodIndex:
2813 def f(values, freq=None, tz=None):
2814 parr = PeriodArray._simple_new(values, freq=freq)
2815 return PeriodIndex._simple_new(parr, name=None)
2817 factory = f
2818 else:
2819 factory = index_class
2821 kwargs = {}
2822 if "freq" in attrs:
2823 kwargs["freq"] = attrs["freq"]
2824 if index_class is Index:
2825 # DTI/PI would be gotten by _alias_to_class
2826 factory = TimedeltaIndex
2828 if "tz" in attrs:
2829 if isinstance(attrs["tz"], bytes):
2830 # created by python2
2831 kwargs["tz"] = attrs["tz"].decode("utf-8")
2832 else:
2833 # created by python3
2834 kwargs["tz"] = attrs["tz"]
2835 assert index_class is DatetimeIndex # just checking
2837 return factory, kwargs
2839 def validate_read(self, columns, where) -> None:
2840 """
2841 raise if any keywords are passed which are not-None
2842 """
2843 if columns is not None:
2844 raise TypeError(
2845 "cannot pass a column specification when reading "
2846 "a Fixed format store. this store must be selected in its entirety"
2847 )
2848 if where is not None:
2849 raise TypeError(
2850 "cannot pass a where specification when reading "
2851 "from a Fixed format store. this store must be selected in its entirety"
2852 )
2854 @property
2855 def is_exists(self) -> bool:
2856 return True
2858 def set_attrs(self) -> None:
2859 """set our object attributes"""
2860 self.attrs.encoding = self.encoding
2861 self.attrs.errors = self.errors
2863 def get_attrs(self) -> None:
2864 """retrieve our attributes"""
2865 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2866 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2867 for n in self.attributes:
2868 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2870 # error: Signature of "write" incompatible with supertype "Fixed"
2871 def write(self, obj, **kwargs) -> None: # type: ignore[override]
2872 self.set_attrs()
2874 def read_array(self, key: str, start: int | None = None, stop: int | None = None):
2875 """read an array for the specified node (off of group"""
2876 import tables
2878 node = getattr(self.group, key)
2879 attrs = node._v_attrs
2881 transposed = getattr(attrs, "transposed", False)
2883 if isinstance(node, tables.VLArray):
2884 ret = node[0][start:stop]
2885 else:
2886 dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2887 shape = getattr(attrs, "shape", None)
2889 if shape is not None:
2890 # length 0 axis
2891 ret = np.empty(shape, dtype=dtype)
2892 else:
2893 ret = node[start:stop]
2895 if dtype == "datetime64":
2896 # reconstruct a timezone if indicated
2897 tz = getattr(attrs, "tz", None)
2898 ret = _set_tz(ret, tz, coerce=True)
2900 elif dtype == "timedelta64":
2901 ret = np.asarray(ret, dtype="m8[ns]")
2903 if transposed:
2904 return ret.T
2905 else:
2906 return ret
2908 def read_index(
2909 self, key: str, start: int | None = None, stop: int | None = None
2910 ) -> Index:
2911 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2913 if variety == "multi":
2914 return self.read_multi_index(key, start=start, stop=stop)
2915 elif variety == "regular":
2916 node = getattr(self.group, key)
2917 index = self.read_index_node(node, start=start, stop=stop)
2918 return index
2919 else: # pragma: no cover
2920 raise TypeError(f"unrecognized index variety: {variety}")
2922 def write_index(self, key: str, index: Index) -> None:
2923 if isinstance(index, MultiIndex):
2924 setattr(self.attrs, f"{key}_variety", "multi")
2925 self.write_multi_index(key, index)
2926 else:
2927 setattr(self.attrs, f"{key}_variety", "regular")
2928 converted = _convert_index("index", index, self.encoding, self.errors)
2930 self.write_array(key, converted.values)
2932 node = getattr(self.group, key)
2933 node._v_attrs.kind = converted.kind
2934 node._v_attrs.name = index.name
2936 if isinstance(index, (DatetimeIndex, PeriodIndex)):
2937 node._v_attrs.index_class = self._class_to_alias(type(index))
2939 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
2940 node._v_attrs.freq = index.freq
2942 if isinstance(index, DatetimeIndex) and index.tz is not None:
2943 node._v_attrs.tz = _get_tz(index.tz)
2945 def write_multi_index(self, key: str, index: MultiIndex) -> None:
2946 setattr(self.attrs, f"{key}_nlevels", index.nlevels)
2948 for i, (lev, level_codes, name) in enumerate(
2949 zip(index.levels, index.codes, index.names)
2950 ):
2951 # write the level
2952 if is_extension_array_dtype(lev):
2953 raise NotImplementedError(
2954 "Saving a MultiIndex with an extension dtype is not supported."
2955 )
2956 level_key = f"{key}_level{i}"
2957 conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
2958 self.write_array(level_key, conv_level.values)
2959 node = getattr(self.group, level_key)
2960 node._v_attrs.kind = conv_level.kind
2961 node._v_attrs.name = name
2963 # write the name
2964 setattr(node._v_attrs, f"{key}_name{name}", name)
2966 # write the labels
2967 label_key = f"{key}_label{i}"
2968 self.write_array(label_key, level_codes)
2970 def read_multi_index(
2971 self, key: str, start: int | None = None, stop: int | None = None
2972 ) -> MultiIndex:
2973 nlevels = getattr(self.attrs, f"{key}_nlevels")
2975 levels = []
2976 codes = []
2977 names: list[Hashable] = []
2978 for i in range(nlevels):
2979 level_key = f"{key}_level{i}"
2980 node = getattr(self.group, level_key)
2981 lev = self.read_index_node(node, start=start, stop=stop)
2982 levels.append(lev)
2983 names.append(lev.name)
2985 label_key = f"{key}_label{i}"
2986 level_codes = self.read_array(label_key, start=start, stop=stop)
2987 codes.append(level_codes)
2989 return MultiIndex(
2990 levels=levels, codes=codes, names=names, verify_integrity=True
2991 )
2993 def read_index_node(
2994 self, node: Node, start: int | None = None, stop: int | None = None
2995 ) -> Index:
2996 data = node[start:stop]
2997 # If the index was an empty array write_array_empty() will
2998 # have written a sentinel. Here we replace it with the original.
2999 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
3000 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
3001 kind = _ensure_decoded(node._v_attrs.kind)
3002 name = None
3004 if "name" in node._v_attrs:
3005 name = _ensure_str(node._v_attrs.name)
3006 name = _ensure_decoded(name)
3008 attrs = node._v_attrs
3009 factory, kwargs = self._get_index_factory(attrs)
3011 if kind == "date":
3012 index = factory(
3013 _unconvert_index(
3014 data, kind, encoding=self.encoding, errors=self.errors
3015 ),
3016 dtype=object,
3017 **kwargs,
3018 )
3019 else:
3020 index = factory(
3021 _unconvert_index(
3022 data, kind, encoding=self.encoding, errors=self.errors
3023 ),
3024 **kwargs,
3025 )
3027 index.name = name
3029 return index
3031 def write_array_empty(self, key: str, value: ArrayLike) -> None:
3032 """write a 0-len array"""
3033 # ugly hack for length 0 axes
3034 arr = np.empty((1,) * value.ndim)
3035 self._handle.create_array(self.group, key, arr)
3036 node = getattr(self.group, key)
3037 node._v_attrs.value_type = str(value.dtype)
3038 node._v_attrs.shape = value.shape
3040 def write_array(
3041 self, key: str, obj: AnyArrayLike, items: Index | None = None
3042 ) -> None:
3043 # TODO: we only have a few tests that get here, the only EA
3044 # that gets passed is DatetimeArray, and we never have
3045 # both self._filters and EA
3047 value = extract_array(obj, extract_numpy=True)
3049 if key in self.group:
3050 self._handle.remove_node(self.group, key)
3052 # Transform needed to interface with pytables row/col notation
3053 empty_array = value.size == 0
3054 transposed = False
3056 if is_categorical_dtype(value.dtype):
3057 raise NotImplementedError(
3058 "Cannot store a category dtype in a HDF5 dataset that uses format="
3059 '"fixed". Use format="table".'
3060 )
3061 if not empty_array:
3062 if hasattr(value, "T"):
3063 # ExtensionArrays (1d) may not have transpose.
3064 value = value.T
3065 transposed = True
3067 atom = None
3068 if self._filters is not None:
3069 with suppress(ValueError):
3070 # get the atom for this datatype
3071 atom = _tables().Atom.from_dtype(value.dtype)
3073 if atom is not None:
3074 # We only get here if self._filters is non-None and
3075 # the Atom.from_dtype call succeeded
3077 # create an empty chunked array and fill it from value
3078 if not empty_array:
3079 ca = self._handle.create_carray(
3080 self.group, key, atom, value.shape, filters=self._filters
3081 )
3082 ca[:] = value
3084 else:
3085 self.write_array_empty(key, value)
3087 elif value.dtype.type == np.object_:
3088 # infer the type, warn if we have a non-string type here (for
3089 # performance)
3090 inferred_type = lib.infer_dtype(value, skipna=False)
3091 if empty_array:
3092 pass
3093 elif inferred_type == "string":
3094 pass
3095 else:
3096 ws = performance_doc % (inferred_type, key, items)
3097 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3099 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3100 vlarr.append(value)
3102 elif is_datetime64_dtype(value.dtype):
3103 self._handle.create_array(self.group, key, value.view("i8"))
3104 getattr(self.group, key)._v_attrs.value_type = "datetime64"
3105 elif is_datetime64tz_dtype(value.dtype):
3106 # store as UTC
3107 # with a zone
3109 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3110 # attribute "asi8"
3111 self._handle.create_array(
3112 self.group, key, value.asi8 # type: ignore[union-attr]
3113 )
3115 node = getattr(self.group, key)
3116 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3117 # attribute "tz"
3118 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
3119 node._v_attrs.value_type = "datetime64"
3120 elif is_timedelta64_dtype(value.dtype):
3121 self._handle.create_array(self.group, key, value.view("i8"))
3122 getattr(self.group, key)._v_attrs.value_type = "timedelta64"
3123 elif empty_array:
3124 self.write_array_empty(key, value)
3125 else:
3126 self._handle.create_array(self.group, key, value)
3128 getattr(self.group, key)._v_attrs.transposed = transposed
3131class SeriesFixed(GenericFixed):
3132 pandas_kind = "series"
3133 attributes = ["name"]
3135 name: Hashable
3137 @property
3138 def shape(self):
3139 try:
3140 return (len(self.group.values),)
3141 except (TypeError, AttributeError):
3142 return None
3144 def read(
3145 self,
3146 where=None,
3147 columns=None,
3148 start: int | None = None,
3149 stop: int | None = None,
3150 ) -> Series:
3151 self.validate_read(columns, where)
3152 index = self.read_index("index", start=start, stop=stop)
3153 values = self.read_array("values", start=start, stop=stop)
3154 return Series(values, index=index, name=self.name)
3156 # error: Signature of "write" incompatible with supertype "Fixed"
3157 def write(self, obj, **kwargs) -> None: # type: ignore[override]
3158 super().write(obj, **kwargs)
3159 self.write_index("index", obj.index)
3160 self.write_array("values", obj)
3161 self.attrs.name = obj.name
3164class BlockManagerFixed(GenericFixed):
3165 attributes = ["ndim", "nblocks"]
3167 nblocks: int
3169 @property
3170 def shape(self) -> Shape | None:
3171 try:
3172 ndim = self.ndim
3174 # items
3175 items = 0
3176 for i in range(self.nblocks):
3177 node = getattr(self.group, f"block{i}_items")
3178 shape = getattr(node, "shape", None)
3179 if shape is not None:
3180 items += shape[0]
3182 # data shape
3183 node = self.group.block0_values
3184 shape = getattr(node, "shape", None)
3185 if shape is not None:
3186 shape = list(shape[0 : (ndim - 1)])
3187 else:
3188 shape = []
3190 shape.append(items)
3192 return shape
3193 except AttributeError:
3194 return None
3196 def read(
3197 self,
3198 where=None,
3199 columns=None,
3200 start: int | None = None,
3201 stop: int | None = None,
3202 ) -> DataFrame:
3203 # start, stop applied to rows, so 0th axis only
3204 self.validate_read(columns, where)
3205 select_axis = self.obj_type()._get_block_manager_axis(0)
3207 axes = []
3208 for i in range(self.ndim):
3210 _start, _stop = (start, stop) if i == select_axis else (None, None)
3211 ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3212 axes.append(ax)
3214 items = axes[0]
3215 dfs = []
3217 for i in range(self.nblocks):
3219 blk_items = self.read_index(f"block{i}_items")
3220 values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3222 columns = items[items.get_indexer(blk_items)]
3223 df = DataFrame(values.T, columns=columns, index=axes[1])
3224 dfs.append(df)
3226 if len(dfs) > 0:
3227 out = concat(dfs, axis=1)
3228 out = out.reindex(columns=items, copy=False)
3229 return out
3231 return DataFrame(columns=axes[0], index=axes[1])
3233 # error: Signature of "write" incompatible with supertype "Fixed"
3234 def write(self, obj, **kwargs) -> None: # type: ignore[override]
3235 super().write(obj, **kwargs)
3237 # TODO(ArrayManager) HDFStore relies on accessing the blocks
3238 if isinstance(obj._mgr, ArrayManager):
3239 obj = obj._as_manager("block")
3241 data = obj._mgr
3242 if not data.is_consolidated():
3243 data = data.consolidate()
3245 self.attrs.ndim = data.ndim
3246 for i, ax in enumerate(data.axes):
3247 if i == 0 and (not ax.is_unique):
3248 raise ValueError("Columns index has to be unique for fixed format")
3249 self.write_index(f"axis{i}", ax)
3251 # Supporting mixed-type DataFrame objects...nontrivial
3252 self.attrs.nblocks = len(data.blocks)
3253 for i, blk in enumerate(data.blocks):
3254 # I have no idea why, but writing values before items fixed #2299
3255 blk_items = data.items.take(blk.mgr_locs)
3256 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3257 self.write_index(f"block{i}_items", blk_items)
3260class FrameFixed(BlockManagerFixed):
3261 pandas_kind = "frame"
3262 obj_type = DataFrame
3265class Table(Fixed):
3266 """
3267 represent a table:
3268 facilitate read/write of various types of tables
3270 Attrs in Table Node
3271 -------------------
3272 These are attributes that are store in the main table node, they are
3273 necessary to recreate these tables when read back in.
3275 index_axes : a list of tuples of the (original indexing axis and
3276 index column)
3277 non_index_axes: a list of tuples of the (original index axis and
3278 columns on a non-indexing axis)
3279 values_axes : a list of the columns which comprise the data of this
3280 table
3281 data_columns : a list of the columns that we are allowing indexing
3282 (these become single columns in values_axes)
3283 nan_rep : the string to use for nan representations for string
3284 objects
3285 levels : the names of levels
3286 metadata : the names of the metadata columns
3287 """
3289 pandas_kind = "wide_table"
3290 format_type: str = "table" # GH#30962 needed by dask
3291 table_type: str
3292 levels: int | list[Hashable] = 1
3293 is_table = True
3295 index_axes: list[IndexCol]
3296 non_index_axes: list[tuple[int, Any]]
3297 values_axes: list[DataCol]
3298 data_columns: list
3299 metadata: list
3300 info: dict
3302 def __init__(
3303 self,
3304 parent: HDFStore,
3305 group: Node,
3306 encoding=None,
3307 errors: str = "strict",
3308 index_axes=None,
3309 non_index_axes=None,
3310 values_axes=None,
3311 data_columns=None,
3312 info=None,
3313 nan_rep=None,
3314 ) -> None:
3315 super().__init__(parent, group, encoding=encoding, errors=errors)
3316 self.index_axes = index_axes or []
3317 self.non_index_axes = non_index_axes or []
3318 self.values_axes = values_axes or []
3319 self.data_columns = data_columns or []
3320 self.info = info or {}
3321 self.nan_rep = nan_rep
3323 @property
3324 def table_type_short(self) -> str:
3325 return self.table_type.split("_")[0]
3327 def __repr__(self) -> str:
3328 """return a pretty representation of myself"""
3329 self.infer_axes()
3330 jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3331 dc = f",dc->[{jdc}]"
3333 ver = ""
3334 if self.is_old_version:
3335 jver = ".".join([str(x) for x in self.version])
3336 ver = f"[{jver}]"
3338 jindex_axes = ",".join([a.name for a in self.index_axes])
3339 return (
3340 f"{self.pandas_type:12.12}{ver} "
3341 f"(typ->{self.table_type_short},nrows->{self.nrows},"
3342 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3343 )
3345 def __getitem__(self, c: str):
3346 """return the axis for c"""
3347 for a in self.axes:
3348 if c == a.name:
3349 return a
3350 return None
3352 def validate(self, other) -> None:
3353 """validate against an existing table"""
3354 if other is None:
3355 return
3357 if other.table_type != self.table_type:
3358 raise TypeError(
3359 "incompatible table_type with existing "
3360 f"[{other.table_type} - {self.table_type}]"
3361 )
3363 for c in ["index_axes", "non_index_axes", "values_axes"]:
3364 sv = getattr(self, c, None)
3365 ov = getattr(other, c, None)
3366 if sv != ov:
3368 # show the error for the specific axes
3369 # Argument 1 to "enumerate" has incompatible type
3370 # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
3371 for i, sax in enumerate(sv): # type: ignore[arg-type]
3372 # Value of type "Optional[Any]" is not indexable [index]
3373 oax = ov[i] # type: ignore[index]
3374 if sax != oax:
3375 raise ValueError(
3376 f"invalid combination of [{c}] on appending data "
3377 f"[{sax}] vs current table [{oax}]"
3378 )
3380 # should never get here
3381 raise Exception(
3382 f"invalid combination of [{c}] on appending data [{sv}] vs "
3383 f"current table [{ov}]"
3384 )
3386 @property
3387 def is_multi_index(self) -> bool:
3388 """the levels attribute is 1 or a list in the case of a multi-index"""
3389 return isinstance(self.levels, list)
3391 def validate_multiindex(
3392 self, obj: DataFrame | Series
3393 ) -> tuple[DataFrame, list[Hashable]]:
3394 """
3395 validate that we can store the multi-index; reset and return the
3396 new object
3397 """
3398 levels = com.fill_missing_names(obj.index.names)
3399 try:
3400 reset_obj = obj.reset_index()
3401 except ValueError as err:
3402 raise ValueError(
3403 "duplicate names/columns in the multi-index when storing as a table"
3404 ) from err
3405 assert isinstance(reset_obj, DataFrame) # for mypy
3406 return reset_obj, levels
3408 @property
3409 def nrows_expected(self) -> int:
3410 """based on our axes, compute the expected nrows"""
3411 return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3413 @property
3414 def is_exists(self) -> bool:
3415 """has this table been created"""
3416 return "table" in self.group
3418 @property
3419 def storable(self):
3420 return getattr(self.group, "table", None)
3422 @property
3423 def table(self):
3424 """return the table group (this is my storable)"""
3425 return self.storable
3427 @property
3428 def dtype(self):
3429 return self.table.dtype
3431 @property
3432 def description(self):
3433 return self.table.description
3435 @property
3436 def axes(self):
3437 return itertools.chain(self.index_axes, self.values_axes)
3439 @property
3440 def ncols(self) -> int:
3441 """the number of total columns in the values axes"""
3442 return sum(len(a.values) for a in self.values_axes)
3444 @property
3445 def is_transposed(self) -> bool:
3446 return False
3448 @property
3449 def data_orientation(self) -> tuple[int, ...]:
3450 """return a tuple of my permutated axes, non_indexable at the front"""
3451 return tuple(
3452 itertools.chain(
3453 [int(a[0]) for a in self.non_index_axes],
3454 [int(a.axis) for a in self.index_axes],
3455 )
3456 )
3458 def queryables(self) -> dict[str, Any]:
3459 """return a dict of the kinds allowable columns for this object"""
3460 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3461 axis_names = {0: "index", 1: "columns"}
3463 # compute the values_axes queryables
3464 d1 = [(a.cname, a) for a in self.index_axes]
3465 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3466 d3 = [
3467 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3468 ]
3470 # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" and
3471 # "List[Tuple[str, None]]")
3472 return dict(d1 + d2 + d3) # type: ignore[operator]
3474 def index_cols(self):
3475 """return a list of my index cols"""
3476 # Note: each `i.cname` below is assured to be a str.
3477 return [(i.axis, i.cname) for i in self.index_axes]
3479 def values_cols(self) -> list[str]:
3480 """return a list of my values cols"""
3481 return [i.cname for i in self.values_axes]
3483 def _get_metadata_path(self, key: str) -> str:
3484 """return the metadata pathname for this key"""
3485 group = self.group._v_pathname
3486 return f"{group}/meta/{key}/meta"
3488 def write_metadata(self, key: str, values: np.ndarray) -> None:
3489 """
3490 Write out a metadata array to the key as a fixed-format Series.
3492 Parameters
3493 ----------
3494 key : str
3495 values : ndarray
3496 """
3497 self.parent.put(
3498 self._get_metadata_path(key),
3499 Series(values),
3500 format="table",
3501 encoding=self.encoding,
3502 errors=self.errors,
3503 nan_rep=self.nan_rep,
3504 )
3506 def read_metadata(self, key: str):
3507 """return the meta data array for this key"""
3508 if getattr(getattr(self.group, "meta", None), key, None) is not None:
3509 return self.parent.select(self._get_metadata_path(key))
3510 return None
3512 def set_attrs(self) -> None:
3513 """set our table type & indexables"""
3514 self.attrs.table_type = str(self.table_type)
3515 self.attrs.index_cols = self.index_cols()
3516 self.attrs.values_cols = self.values_cols()
3517 self.attrs.non_index_axes = self.non_index_axes
3518 self.attrs.data_columns = self.data_columns
3519 self.attrs.nan_rep = self.nan_rep
3520 self.attrs.encoding = self.encoding
3521 self.attrs.errors = self.errors
3522 self.attrs.levels = self.levels
3523 self.attrs.info = self.info
3525 def get_attrs(self) -> None:
3526 """retrieve our attributes"""
3527 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3528 self.data_columns = getattr(self.attrs, "data_columns", None) or []
3529 self.info = getattr(self.attrs, "info", None) or {}
3530 self.nan_rep = getattr(self.attrs, "nan_rep", None)
3531 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3532 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3533 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
3534 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3535 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3537 def validate_version(self, where=None) -> None:
3538 """are we trying to operate on an old version?"""
3539 if where is not None:
3540 if self.is_old_version:
3541 ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3542 warnings.warn(
3543 ws,
3544 IncompatibilityWarning,
3545 stacklevel=find_stack_level(),
3546 )
3548 def validate_min_itemsize(self, min_itemsize) -> None:
3549 """
3550 validate the min_itemsize doesn't contain items that are not in the
3551 axes this needs data_columns to be defined
3552 """
3553 if min_itemsize is None:
3554 return
3555 if not isinstance(min_itemsize, dict):
3556 return
3558 q = self.queryables()
3559 for k in min_itemsize:
3561 # ok, apply generally
3562 if k == "values":
3563 continue
3564 if k not in q:
3565 raise ValueError(
3566 f"min_itemsize has the key [{k}] which is not an axis or "
3567 "data_column"
3568 )
3570 @cache_readonly
3571 def indexables(self):
3572 """create/cache the indexables if they don't exist"""
3573 _indexables = []
3575 desc = self.description
3576 table_attrs = self.table.attrs
3578 # Note: each of the `name` kwargs below are str, ensured
3579 # by the definition in index_cols.
3580 # index columns
3581 for i, (axis, name) in enumerate(self.attrs.index_cols):
3582 atom = getattr(desc, name)
3583 md = self.read_metadata(name)
3584 meta = "category" if md is not None else None
3586 kind_attr = f"{name}_kind"
3587 kind = getattr(table_attrs, kind_attr, None)
3589 index_col = IndexCol(
3590 name=name,
3591 axis=axis,
3592 pos=i,
3593 kind=kind,
3594 typ=atom,
3595 table=self.table,
3596 meta=meta,
3597 metadata=md,
3598 )
3599 _indexables.append(index_col)
3601 # values columns
3602 dc = set(self.data_columns)
3603 base_pos = len(_indexables)
3605 def f(i, c):
3606 assert isinstance(c, str)
3607 klass = DataCol
3608 if c in dc:
3609 klass = DataIndexableCol
3611 atom = getattr(desc, c)
3612 adj_name = _maybe_adjust_name(c, self.version)
3614 # TODO: why kind_attr here?
3615 values = getattr(table_attrs, f"{adj_name}_kind", None)
3616 dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3617 # Argument 1 to "_dtype_to_kind" has incompatible type
3618 # "Optional[Any]"; expected "str" [arg-type]
3619 kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
3621 md = self.read_metadata(c)
3622 # TODO: figure out why these two versions of `meta` dont always match.
3623 # meta = "category" if md is not None else None
3624 meta = getattr(table_attrs, f"{adj_name}_meta", None)
3626 obj = klass(
3627 name=adj_name,
3628 cname=c,
3629 values=values,
3630 kind=kind,
3631 pos=base_pos + i,
3632 typ=atom,
3633 table=self.table,
3634 meta=meta,
3635 metadata=md,
3636 dtype=dtype,
3637 )
3638 return obj
3640 # Note: the definition of `values_cols` ensures that each
3641 # `c` below is a str.
3642 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3644 return _indexables
3646 def create_index(
3647 self, columns=None, optlevel=None, kind: str | None = None
3648 ) -> None:
3649 """
3650 Create a pytables index on the specified columns.
3652 Parameters
3653 ----------
3654 columns : None, bool, or listlike[str]
3655 Indicate which columns to create an index on.
3657 * False : Do not create any indexes.
3658 * True : Create indexes on all columns.
3659 * None : Create indexes on all columns.
3660 * listlike : Create indexes on the given columns.
3662 optlevel : int or None, default None
3663 Optimization level, if None, pytables defaults to 6.
3664 kind : str or None, default None
3665 Kind of index, if None, pytables defaults to "medium".
3667 Raises
3668 ------
3669 TypeError if trying to create an index on a complex-type column.
3671 Notes
3672 -----
3673 Cannot index Time64Col or ComplexCol.
3674 Pytables must be >= 3.0.
3675 """
3676 if not self.infer_axes():
3677 return
3678 if columns is False:
3679 return
3681 # index all indexables and data_columns
3682 if columns is None or columns is True:
3683 columns = [a.cname for a in self.axes if a.is_data_indexable]
3684 if not isinstance(columns, (tuple, list)):
3685 columns = [columns]
3687 kw = {}
3688 if optlevel is not None:
3689 kw["optlevel"] = optlevel
3690 if kind is not None:
3691 kw["kind"] = kind
3693 table = self.table
3694 for c in columns:
3695 v = getattr(table.cols, c, None)
3696 if v is not None:
3697 # remove the index if the kind/optlevel have changed
3698 if v.is_indexed:
3699 index = v.index
3700 cur_optlevel = index.optlevel
3701 cur_kind = index.kind
3703 if kind is not None and cur_kind != kind:
3704 v.remove_index()
3705 else:
3706 kw["kind"] = cur_kind
3708 if optlevel is not None and cur_optlevel != optlevel:
3709 v.remove_index()
3710 else:
3711 kw["optlevel"] = cur_optlevel
3713 # create the index
3714 if not v.is_indexed:
3715 if v.type.startswith("complex"):
3716 raise TypeError(
3717 "Columns containing complex values can be stored but "
3718 "cannot be indexed when using table format. Either use "
3719 "fixed format, set index=False, or do not include "
3720 "the columns containing complex values to "
3721 "data_columns when initializing the table."
3722 )
3723 v.create_index(**kw)
3724 elif c in self.non_index_axes[0][1]:
3725 # GH 28156
3726 raise AttributeError(
3727 f"column {c} is not a data_column.\n"
3728 f"In order to read column {c} you must reload the dataframe \n"
3729 f"into HDFStore and include {c} with the data_columns argument."
3730 )
3732 def _read_axes(
3733 self, where, start: int | None = None, stop: int | None = None
3734 ) -> list[tuple[ArrayLike, ArrayLike]]:
3735 """
3736 Create the axes sniffed from the table.
3738 Parameters
3739 ----------
3740 where : ???
3741 start : int or None, default None
3742 stop : int or None, default None
3744 Returns
3745 -------
3746 List[Tuple[index_values, column_values]]
3747 """
3748 # create the selection
3749 selection = Selection(self, where=where, start=start, stop=stop)
3750 values = selection.select()
3752 results = []
3753 # convert the data
3754 for a in self.axes:
3755 a.set_info(self.info)
3756 res = a.convert(
3757 values,
3758 nan_rep=self.nan_rep,
3759 encoding=self.encoding,
3760 errors=self.errors,
3761 )
3762 results.append(res)
3764 return results
3766 @classmethod
3767 def get_object(cls, obj, transposed: bool):
3768 """return the data for this obj"""
3769 return obj
3771 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3772 """
3773 take the input data_columns and min_itemize and create a data
3774 columns spec
3775 """
3776 if not len(non_index_axes):
3777 return []
3779 axis, axis_labels = non_index_axes[0]
3780 info = self.info.get(axis, {})
3781 if info.get("type") == "MultiIndex" and data_columns:
3782 raise ValueError(
3783 f"cannot use a multi-index on axis [{axis}] with "
3784 f"data_columns {data_columns}"
3785 )
3787 # evaluate the passed data_columns, True == use all columns
3788 # take only valid axis labels
3789 if data_columns is True:
3790 data_columns = list(axis_labels)
3791 elif data_columns is None:
3792 data_columns = []
3794 # if min_itemsize is a dict, add the keys (exclude 'values')
3795 if isinstance(min_itemsize, dict):
3796 existing_data_columns = set(data_columns)
3797 data_columns = list(data_columns) # ensure we do not modify
3798 data_columns.extend(
3799 [
3800 k
3801 for k in min_itemsize.keys()
3802 if k != "values" and k not in existing_data_columns
3803 ]
3804 )
3806 # return valid columns in the order of our axis
3807 return [c for c in data_columns if c in axis_labels]
3809 def _create_axes(
3810 self,
3811 axes,
3812 obj: DataFrame,
3813 validate: bool = True,
3814 nan_rep=None,
3815 data_columns=None,
3816 min_itemsize=None,
3817 ):
3818 """
3819 Create and return the axes.
3821 Parameters
3822 ----------
3823 axes: list or None
3824 The names or numbers of the axes to create.
3825 obj : DataFrame
3826 The object to create axes on.
3827 validate: bool, default True
3828 Whether to validate the obj against an existing object already written.
3829 nan_rep :
3830 A value to use for string column nan_rep.
3831 data_columns : List[str], True, or None, default None
3832 Specify the columns that we want to create to allow indexing on.
3834 * True : Use all available columns.
3835 * None : Use no columns.
3836 * List[str] : Use the specified columns.
3838 min_itemsize: Dict[str, int] or None, default None
3839 The min itemsize for a column in bytes.
3840 """
3841 if not isinstance(obj, DataFrame):
3842 group = self.group._v_name
3843 raise TypeError(
3844 f"cannot properly create the storer for: [group->{group},"
3845 f"value->{type(obj)}]"
3846 )
3848 # set the default axes if needed
3849 if axes is None:
3850 axes = [0]
3852 # map axes to numbers
3853 axes = [obj._get_axis_number(a) for a in axes]
3855 # do we have an existing table (if so, use its axes & data_columns)
3856 if self.infer_axes():
3857 table_exists = True
3858 axes = [a.axis for a in self.index_axes]
3859 data_columns = list(self.data_columns)
3860 nan_rep = self.nan_rep
3861 # TODO: do we always have validate=True here?
3862 else:
3863 table_exists = False
3865 new_info = self.info
3867 assert self.ndim == 2 # with next check, we must have len(axes) == 1
3868 # currently support on ndim-1 axes
3869 if len(axes) != self.ndim - 1:
3870 raise ValueError(
3871 "currently only support ndim-1 indexers in an AppendableTable"
3872 )
3874 # create according to the new data
3875 new_non_index_axes: list = []
3877 # nan_representation
3878 if nan_rep is None:
3879 nan_rep = "nan"
3881 # We construct the non-index-axis first, since that alters new_info
3882 idx = [x for x in [0, 1] if x not in axes][0]
3884 a = obj.axes[idx]
3885 # we might be able to change the axes on the appending data if necessary
3886 append_axis = list(a)
3887 if table_exists:
3888 indexer = len(new_non_index_axes) # i.e. 0
3889 exist_axis = self.non_index_axes[indexer][1]
3890 if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
3892 # ahah! -> reindex
3893 if array_equivalent(
3894 np.array(sorted(append_axis)), np.array(sorted(exist_axis))
3895 ):
3896 append_axis = exist_axis
3898 # the non_index_axes info
3899 info = new_info.setdefault(idx, {})
3900 info["names"] = list(a.names)
3901 info["type"] = type(a).__name__
3903 new_non_index_axes.append((idx, append_axis))
3905 # Now we can construct our new index axis
3906 idx = axes[0]
3907 a = obj.axes[idx]
3908 axis_name = obj._get_axis_name(idx)
3909 new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3910 new_index.axis = idx
3912 # Because we are always 2D, there is only one new_index, so
3913 # we know it will have pos=0
3914 new_index.set_pos(0)
3915 new_index.update_info(new_info)
3916 new_index.maybe_set_size(min_itemsize) # check for column conflicts
3918 new_index_axes = [new_index]
3919 j = len(new_index_axes) # i.e. 1
3920 assert j == 1
3922 # reindex by our non_index_axes & compute data_columns
3923 assert len(new_non_index_axes) == 1
3924 for a in new_non_index_axes:
3925 obj = _reindex_axis(obj, a[0], a[1])
3927 transposed = new_index.axis == 1
3929 # figure out data_columns and get out blocks
3930 data_columns = self.validate_data_columns(
3931 data_columns, min_itemsize, new_non_index_axes
3932 )
3934 frame = self.get_object(obj, transposed)._consolidate()
3936 blocks, blk_items = self._get_blocks_and_items(
3937 frame, table_exists, new_non_index_axes, self.values_axes, data_columns
3938 )
3940 # add my values
3941 vaxes = []
3942 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
3944 # shape of the data column are the indexable axes
3945 klass = DataCol
3946 name = None
3948 # we have a data_column
3949 if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
3950 klass = DataIndexableCol
3951 name = b_items[0]
3952 if not (name is None or isinstance(name, str)):
3953 # TODO: should the message here be more specifically non-str?
3954 raise ValueError("cannot have non-object label DataIndexableCol")
3956 # make sure that we match up the existing columns
3957 # if we have an existing table
3958 existing_col: DataCol | None
3960 if table_exists and validate:
3961 try:
3962 existing_col = self.values_axes[i]
3963 except (IndexError, KeyError) as err:
3964 raise ValueError(
3965 f"Incompatible appended table [{blocks}]"
3966 f"with existing table [{self.values_axes}]"
3967 ) from err
3968 else:
3969 existing_col = None
3971 new_name = name or f"values_block_{i}"
3972 data_converted = _maybe_convert_for_string_atom(
3973 new_name,
3974 blk.values,
3975 existing_col=existing_col,
3976 min_itemsize=min_itemsize,
3977 nan_rep=nan_rep,
3978 encoding=self.encoding,
3979 errors=self.errors,
3980 columns=b_items,
3981 )
3982 adj_name = _maybe_adjust_name(new_name, self.version)
3984 typ = klass._get_atom(data_converted)
3985 kind = _dtype_to_kind(data_converted.dtype.name)
3986 tz = None
3987 if getattr(data_converted, "tz", None) is not None:
3988 tz = _get_tz(data_converted.tz)
3990 meta = metadata = ordered = None
3991 if is_categorical_dtype(data_converted.dtype):
3992 ordered = data_converted.ordered
3993 meta = "category"
3994 metadata = np.array(data_converted.categories, copy=False).ravel()
3996 data, dtype_name = _get_data_and_dtype_name(data_converted)
3998 col = klass(
3999 name=adj_name,
4000 cname=new_name,
4001 values=list(b_items),
4002 typ=typ,
4003 pos=j,
4004 kind=kind,
4005 tz=tz,
4006 ordered=ordered,
4007 meta=meta,
4008 metadata=metadata,
4009 dtype=dtype_name,
4010 data=data,
4011 )
4012 col.update_info(new_info)
4014 vaxes.append(col)
4016 j += 1
4018 dcs = [col.name for col in vaxes if col.is_data_indexable]
4020 new_table = type(self)(
4021 parent=self.parent,
4022 group=self.group,
4023 encoding=self.encoding,
4024 errors=self.errors,
4025 index_axes=new_index_axes,
4026 non_index_axes=new_non_index_axes,
4027 values_axes=vaxes,
4028 data_columns=dcs,
4029 info=new_info,
4030 nan_rep=nan_rep,
4031 )
4032 if hasattr(self, "levels"):
4033 # TODO: get this into constructor, only for appropriate subclass
4034 new_table.levels = self.levels
4036 new_table.validate_min_itemsize(min_itemsize)
4038 if validate and table_exists:
4039 new_table.validate(self)
4041 return new_table
4043 @staticmethod
4044 def _get_blocks_and_items(
4045 frame: DataFrame,
4046 table_exists: bool,
4047 new_non_index_axes,
4048 values_axes,
4049 data_columns,
4050 ):
4051 # Helper to clarify non-state-altering parts of _create_axes
4053 # TODO(ArrayManager) HDFStore relies on accessing the blocks
4054 if isinstance(frame._mgr, ArrayManager):
4055 frame = frame._as_manager("block")
4057 def get_blk_items(mgr):
4058 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
4060 mgr = frame._mgr
4061 mgr = cast(BlockManager, mgr)
4062 blocks: list[Block] = list(mgr.blocks)
4063 blk_items: list[Index] = get_blk_items(mgr)
4065 if len(data_columns):
4066 axis, axis_labels = new_non_index_axes[0]
4067 new_labels = Index(axis_labels).difference(Index(data_columns))
4068 mgr = frame.reindex(new_labels, axis=axis)._mgr
4070 # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no
4071 # attribute "blocks"
4072 blocks = list(mgr.blocks) # type: ignore[union-attr]
4073 blk_items = get_blk_items(mgr)
4074 for c in data_columns:
4075 mgr = frame.reindex([c], axis=axis)._mgr
4076 # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has
4077 # no attribute "blocks"
4078 blocks.extend(mgr.blocks) # type: ignore[union-attr]
4079 blk_items.extend(get_blk_items(mgr))
4081 # reorder the blocks in the same order as the existing table if we can
4082 if table_exists:
4083 by_items = {
4084 tuple(b_items.tolist()): (b, b_items)
4085 for b, b_items in zip(blocks, blk_items)
4086 }
4087 new_blocks: list[Block] = []
4088 new_blk_items = []
4089 for ea in values_axes:
4090 items = tuple(ea.values)
4091 try:
4092 b, b_items = by_items.pop(items)
4093 new_blocks.append(b)
4094 new_blk_items.append(b_items)
4095 except (IndexError, KeyError) as err:
4096 jitems = ",".join([pprint_thing(item) for item in items])
4097 raise ValueError(
4098 f"cannot match existing table structure for [{jitems}] "
4099 "on appending data"
4100 ) from err
4101 blocks = new_blocks
4102 blk_items = new_blk_items
4104 return blocks, blk_items
4106 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
4107 """process axes filters"""
4108 # make a copy to avoid side effects
4109 if columns is not None:
4110 columns = list(columns)
4112 # make sure to include levels if we have them
4113 if columns is not None and self.is_multi_index:
4114 assert isinstance(self.levels, list) # assured by is_multi_index
4115 for n in self.levels:
4116 if n not in columns:
4117 columns.insert(0, n)
4119 # reorder by any non_index_axes & limit to the select columns
4120 for axis, labels in self.non_index_axes:
4121 obj = _reindex_axis(obj, axis, labels, columns)
4123 # apply the selection filters (but keep in the same order)
4124 if selection.filter is not None:
4125 for field, op, filt in selection.filter.format():
4127 def process_filter(field, filt):
4129 for axis_name in obj._AXIS_ORDERS:
4130 axis_number = obj._get_axis_number(axis_name)
4131 axis_values = obj._get_axis(axis_name)
4132 assert axis_number is not None
4134 # see if the field is the name of an axis
4135 if field == axis_name:
4137 # if we have a multi-index, then need to include
4138 # the levels
4139 if self.is_multi_index:
4140 filt = filt.union(Index(self.levels))
4142 takers = op(axis_values, filt)
4143 return obj.loc(axis=axis_number)[takers]
4145 # this might be the name of a file IN an axis
4146 elif field in axis_values:
4148 # we need to filter on this dimension
4149 values = ensure_index(getattr(obj, field).values)
4150 filt = ensure_index(filt)
4152 # hack until we support reversed dim flags
4153 if isinstance(obj, DataFrame):
4154 axis_number = 1 - axis_number
4155 takers = op(values, filt)
4156 return obj.loc(axis=axis_number)[takers]
4158 raise ValueError(f"cannot find the field [{field}] for filtering!")
4160 obj = process_filter(field, filt)
4162 return obj
4164 def create_description(
4165 self,
4166 complib,
4167 complevel: int | None,
4168 fletcher32: bool,
4169 expectedrows: int | None,
4170 ) -> dict[str, Any]:
4171 """create the description of the table from the axes & values"""
4172 # provided expected rows if its passed
4173 if expectedrows is None:
4174 expectedrows = max(self.nrows_expected, 10000)
4176 d = {"name": "table", "expectedrows": expectedrows}
4178 # description from the axes & values
4179 d["description"] = {a.cname: a.typ for a in self.axes}
4181 if complib:
4182 if complevel is None:
4183 complevel = self._complevel or 9
4184 filters = _tables().Filters(
4185 complevel=complevel,
4186 complib=complib,
4187 fletcher32=fletcher32 or self._fletcher32,
4188 )
4189 d["filters"] = filters
4190 elif self._filters is not None:
4191 d["filters"] = self._filters
4193 return d
4195 def read_coordinates(
4196 self, where=None, start: int | None = None, stop: int | None = None
4197 ):
4198 """
4199 select coordinates (row numbers) from a table; return the
4200 coordinates object
4201 """
4202 # validate the version
4203 self.validate_version(where)
4205 # infer the data kind
4206 if not self.infer_axes():
4207 return False
4209 # create the selection
4210 selection = Selection(self, where=where, start=start, stop=stop)
4211 coords = selection.select_coords()
4212 if selection.filter is not None:
4213 for field, op, filt in selection.filter.format():
4214 data = self.read_column(
4215 field, start=coords.min(), stop=coords.max() + 1
4216 )
4217 coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4219 return Index(coords)
4221 def read_column(
4222 self,
4223 column: str,
4224 where=None,
4225 start: int | None = None,
4226 stop: int | None = None,
4227 ):
4228 """
4229 return a single column from the table, generally only indexables
4230 are interesting
4231 """
4232 # validate the version
4233 self.validate_version()
4235 # infer the data kind
4236 if not self.infer_axes():
4237 return False
4239 if where is not None:
4240 raise TypeError("read_column does not currently accept a where clause")
4242 # find the axes
4243 for a in self.axes:
4244 if column == a.name:
4245 if not a.is_data_indexable:
4246 raise ValueError(
4247 f"column [{column}] can not be extracted individually; "
4248 "it is not data indexable"
4249 )
4251 # column must be an indexable or a data column
4252 c = getattr(self.table.cols, column)
4253 a.set_info(self.info)
4254 col_values = a.convert(
4255 c[start:stop],
4256 nan_rep=self.nan_rep,
4257 encoding=self.encoding,
4258 errors=self.errors,
4259 )
4260 return Series(_set_tz(col_values[1], a.tz), name=column)
4262 raise KeyError(f"column [{column}] not found in the table")
4265class WORMTable(Table):
4266 """
4267 a write-once read-many table: this format DOES NOT ALLOW appending to a
4268 table. writing is a one-time operation the data are stored in a format
4269 that allows for searching the data on disk
4270 """
4272 table_type = "worm"
4274 def read(
4275 self,
4276 where=None,
4277 columns=None,
4278 start: int | None = None,
4279 stop: int | None = None,
4280 ):
4281 """
4282 read the indices and the indexing array, calculate offset rows and return
4283 """
4284 raise NotImplementedError("WORMTable needs to implement read")
4286 def write(self, **kwargs) -> None:
4287 """
4288 write in a format that we can search later on (but cannot append
4289 to): write out the indices and the values using _write_array
4290 (e.g. a CArray) create an indexing table so that we can search
4291 """
4292 raise NotImplementedError("WORMTable needs to implement write")
4295class AppendableTable(Table):
4296 """support the new appendable table formats"""
4298 table_type = "appendable"
4300 # error: Signature of "write" incompatible with supertype "Fixed"
4301 def write( # type: ignore[override]
4302 self,
4303 obj,
4304 axes=None,
4305 append: bool = False,
4306 complib=None,
4307 complevel=None,
4308 fletcher32=None,
4309 min_itemsize=None,
4310 chunksize=None,
4311 expectedrows=None,
4312 dropna: bool = False,
4313 nan_rep=None,
4314 data_columns=None,
4315 track_times=True,
4316 ) -> None:
4317 if not append and self.is_exists:
4318 self._handle.remove_node(self.group, "table")
4320 # create the axes
4321 table = self._create_axes(
4322 axes=axes,
4323 obj=obj,
4324 validate=append,
4325 min_itemsize=min_itemsize,
4326 nan_rep=nan_rep,
4327 data_columns=data_columns,
4328 )
4330 for a in table.axes:
4331 a.validate_names()
4333 if not table.is_exists:
4335 # create the table
4336 options = table.create_description(
4337 complib=complib,
4338 complevel=complevel,
4339 fletcher32=fletcher32,
4340 expectedrows=expectedrows,
4341 )
4343 # set the table attributes
4344 table.set_attrs()
4346 options["track_times"] = track_times
4348 # create the table
4349 table._handle.create_table(table.group, **options)
4351 # update my info
4352 table.attrs.info = table.info
4354 # validate the axes and set the kinds
4355 for a in table.axes:
4356 a.validate_and_set(table, append)
4358 # add the rows
4359 table.write_data(chunksize, dropna=dropna)
4361 def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
4362 """
4363 we form the data into a 2-d including indexes,values,mask write chunk-by-chunk
4364 """
4365 names = self.dtype.names
4366 nrows = self.nrows_expected
4368 # if dropna==True, then drop ALL nan rows
4369 masks = []
4370 if dropna:
4371 for a in self.values_axes:
4372 # figure the mask: only do if we can successfully process this
4373 # column, otherwise ignore the mask
4374 mask = isna(a.data).all(axis=0)
4375 if isinstance(mask, np.ndarray):
4376 masks.append(mask.astype("u1", copy=False))
4378 # consolidate masks
4379 if len(masks):
4380 mask = masks[0]
4381 for m in masks[1:]:
4382 mask = mask & m
4383 mask = mask.ravel()
4384 else:
4385 mask = None
4387 # broadcast the indexes if needed
4388 indexes = [a.cvalues for a in self.index_axes]
4389 nindexes = len(indexes)
4390 assert nindexes == 1, nindexes # ensures we dont need to broadcast
4392 # transpose the values so first dimension is last
4393 # reshape the values if needed
4394 values = [a.take_data() for a in self.values_axes]
4395 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4396 bvalues = []
4397 for i, v in enumerate(values):
4398 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4399 bvalues.append(values[i].reshape(new_shape))
4401 # write the chunks
4402 if chunksize is None:
4403 chunksize = 100000
4405 rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4406 chunks = nrows // chunksize + 1
4407 for i in range(chunks):
4408 start_i = i * chunksize
4409 end_i = min((i + 1) * chunksize, nrows)
4410 if start_i >= end_i:
4411 break
4413 self.write_data_chunk(
4414 rows,
4415 indexes=[a[start_i:end_i] for a in indexes],
4416 mask=mask[start_i:end_i] if mask is not None else None,
4417 values=[v[start_i:end_i] for v in bvalues],
4418 )
4420 def write_data_chunk(
4421 self,
4422 rows: np.ndarray,
4423 indexes: list[np.ndarray],
4424 mask: npt.NDArray[np.bool_] | None,
4425 values: list[np.ndarray],
4426 ) -> None:
4427 """
4428 Parameters
4429 ----------
4430 rows : an empty memory space where we are putting the chunk
4431 indexes : an array of the indexes
4432 mask : an array of the masks
4433 values : an array of the values
4434 """
4435 # 0 len
4436 for v in values:
4437 if not np.prod(v.shape):
4438 return
4440 nrows = indexes[0].shape[0]
4441 if nrows != len(rows):
4442 rows = np.empty(nrows, dtype=self.dtype)
4443 names = self.dtype.names
4444 nindexes = len(indexes)
4446 # indexes
4447 for i, idx in enumerate(indexes):
4448 rows[names[i]] = idx
4450 # values
4451 for i, v in enumerate(values):
4452 rows[names[i + nindexes]] = v
4454 # mask
4455 if mask is not None:
4456 m = ~mask.ravel().astype(bool, copy=False)
4457 if not m.all():
4458 rows = rows[m]
4460 if len(rows):
4461 self.table.append(rows)
4462 self.table.flush()
4464 def delete(self, where=None, start: int | None = None, stop: int | None = None):
4466 # delete all rows (and return the nrows)
4467 if where is None or not len(where):
4468 if start is None and stop is None:
4469 nrows = self.nrows
4470 self._handle.remove_node(self.group, recursive=True)
4471 else:
4472 # pytables<3.0 would remove a single row with stop=None
4473 if stop is None:
4474 stop = self.nrows
4475 nrows = self.table.remove_rows(start=start, stop=stop)
4476 self.table.flush()
4477 return nrows
4479 # infer the data kind
4480 if not self.infer_axes():
4481 return None
4483 # create the selection
4484 table = self.table
4485 selection = Selection(self, where, start=start, stop=stop)
4486 values = selection.select_coords()
4488 # delete the rows in reverse order
4489 sorted_series = Series(values).sort_values()
4490 ln = len(sorted_series)
4492 if ln:
4494 # construct groups of consecutive rows
4495 diff = sorted_series.diff()
4496 groups = list(diff[diff > 1].index)
4498 # 1 group
4499 if not len(groups):
4500 groups = [0]
4502 # final element
4503 if groups[-1] != ln:
4504 groups.append(ln)
4506 # initial element
4507 if groups[0] != 0:
4508 groups.insert(0, 0)
4510 # we must remove in reverse order!
4511 pg = groups.pop()
4512 for g in reversed(groups):
4513 rows = sorted_series.take(range(g, pg))
4514 table.remove_rows(
4515 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4516 )
4517 pg = g
4519 self.table.flush()
4521 # return the number of rows removed
4522 return ln
4525class AppendableFrameTable(AppendableTable):
4526 """support the new appendable table formats"""
4528 pandas_kind = "frame_table"
4529 table_type = "appendable_frame"
4530 ndim = 2
4531 obj_type: type[DataFrame | Series] = DataFrame
4533 @property
4534 def is_transposed(self) -> bool:
4535 return self.index_axes[0].axis == 1
4537 @classmethod
4538 def get_object(cls, obj, transposed: bool):
4539 """these are written transposed"""
4540 if transposed:
4541 obj = obj.T
4542 return obj
4544 def read(
4545 self,
4546 where=None,
4547 columns=None,
4548 start: int | None = None,
4549 stop: int | None = None,
4550 ):
4552 # validate the version
4553 self.validate_version(where)
4555 # infer the data kind
4556 if not self.infer_axes():
4557 return None
4559 result = self._read_axes(where=where, start=start, stop=stop)
4561 info = (
4562 self.info.get(self.non_index_axes[0][0], {})
4563 if len(self.non_index_axes)
4564 else {}
4565 )
4567 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4568 assert len(inds) == 1
4569 ind = inds[0]
4571 index = result[ind][0]
4573 frames = []
4574 for i, a in enumerate(self.axes):
4575 if a not in self.values_axes:
4576 continue
4577 index_vals, cvalues = result[i]
4579 # we could have a multi-index constructor here
4580 # ensure_index doesn't recognized our list-of-tuples here
4581 if info.get("type") != "MultiIndex":
4582 cols = Index(index_vals)
4583 else:
4584 cols = MultiIndex.from_tuples(index_vals)
4586 names = info.get("names")
4587 if names is not None:
4588 cols.set_names(names, inplace=True)
4590 if self.is_transposed:
4591 values = cvalues
4592 index_ = cols
4593 cols_ = Index(index, name=getattr(index, "name", None))
4594 else:
4595 values = cvalues.T
4596 index_ = Index(index, name=getattr(index, "name", None))
4597 cols_ = cols
4599 # if we have a DataIndexableCol, its shape will only be 1 dim
4600 if values.ndim == 1 and isinstance(values, np.ndarray):
4601 values = values.reshape((1, values.shape[0]))
4603 if isinstance(values, np.ndarray):
4604 df = DataFrame(values.T, columns=cols_, index=index_)
4605 elif isinstance(values, Index):
4606 df = DataFrame(values, columns=cols_, index=index_)
4607 else:
4608 # Categorical
4609 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
4610 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4611 frames.append(df)
4613 if len(frames) == 1:
4614 df = frames[0]
4615 else:
4616 df = concat(frames, axis=1)
4618 selection = Selection(self, where=where, start=start, stop=stop)
4619 # apply the selection filters & axis orderings
4620 df = self.process_axes(df, selection=selection, columns=columns)
4622 return df
4625class AppendableSeriesTable(AppendableFrameTable):
4626 """support the new appendable table formats"""
4628 pandas_kind = "series_table"
4629 table_type = "appendable_series"
4630 ndim = 2
4631 obj_type = Series
4633 @property
4634 def is_transposed(self) -> bool:
4635 return False
4637 @classmethod
4638 def get_object(cls, obj, transposed: bool):
4639 return obj
4641 def write(self, obj, data_columns=None, **kwargs):
4642 """we are going to write this as a frame table"""
4643 if not isinstance(obj, DataFrame):
4644 name = obj.name or "values"
4645 obj = obj.to_frame(name)
4646 return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4648 def read(
4649 self,
4650 where=None,
4651 columns=None,
4652 start: int | None = None,
4653 stop: int | None = None,
4654 ) -> Series:
4656 is_multi_index = self.is_multi_index
4657 if columns is not None and is_multi_index:
4658 assert isinstance(self.levels, list) # needed for mypy
4659 for n in self.levels:
4660 if n not in columns:
4661 columns.insert(0, n)
4662 s = super().read(where=where, columns=columns, start=start, stop=stop)
4663 if is_multi_index:
4664 s.set_index(self.levels, inplace=True)
4666 s = s.iloc[:, 0]
4668 # remove the default name
4669 if s.name == "values":
4670 s.name = None
4671 return s
4674class AppendableMultiSeriesTable(AppendableSeriesTable):
4675 """support the new appendable table formats"""
4677 pandas_kind = "series_table"
4678 table_type = "appendable_multiseries"
4680 def write(self, obj, **kwargs):
4681 """we are going to write this as a frame table"""
4682 name = obj.name or "values"
4683 newobj, self.levels = self.validate_multiindex(obj)
4684 assert isinstance(self.levels, list) # for mypy
4685 cols = list(self.levels)
4686 cols.append(name)
4687 newobj.columns = Index(cols)
4688 return super().write(obj=newobj, **kwargs)
4691class GenericTable(AppendableFrameTable):
4692 """a table that read/writes the generic pytables table format"""
4694 pandas_kind = "frame_table"
4695 table_type = "generic_table"
4696 ndim = 2
4697 obj_type = DataFrame
4698 levels: list[Hashable]
4700 @property
4701 def pandas_type(self) -> str:
4702 return self.pandas_kind
4704 @property
4705 def storable(self):
4706 return getattr(self.group, "table", None) or self.group
4708 def get_attrs(self) -> None:
4709 """retrieve our attributes"""
4710 self.non_index_axes = []
4711 self.nan_rep = None
4712 self.levels = []
4714 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4715 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4716 self.data_columns = [a.name for a in self.values_axes]
4718 @cache_readonly
4719 def indexables(self):
4720 """create the indexables from the table description"""
4721 d = self.description
4723 # TODO: can we get a typ for this? AFAICT it is the only place
4724 # where we aren't passing one
4725 # the index columns is just a simple index
4726 md = self.read_metadata("index")
4727 meta = "category" if md is not None else None
4728 index_col = GenericIndexCol(
4729 name="index", axis=0, table=self.table, meta=meta, metadata=md
4730 )
4732 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
4734 for i, n in enumerate(d._v_names):
4735 assert isinstance(n, str)
4737 atom = getattr(d, n)
4738 md = self.read_metadata(n)
4739 meta = "category" if md is not None else None
4740 dc = GenericDataIndexableCol(
4741 name=n,
4742 pos=i,
4743 values=[n],
4744 typ=atom,
4745 table=self.table,
4746 meta=meta,
4747 metadata=md,
4748 )
4749 _indexables.append(dc)
4751 return _indexables
4753 def write(self, **kwargs):
4754 raise NotImplementedError("cannot write on an generic table")
4757class AppendableMultiFrameTable(AppendableFrameTable):
4758 """a frame with a multi-index"""
4760 table_type = "appendable_multiframe"
4761 obj_type = DataFrame
4762 ndim = 2
4763 _re_levels = re.compile(r"^level_\d+$")
4765 @property
4766 def table_type_short(self) -> str:
4767 return "appendable_multi"
4769 def write(self, obj, data_columns=None, **kwargs):
4770 if data_columns is None:
4771 data_columns = []
4772 elif data_columns is True:
4773 data_columns = obj.columns.tolist()
4774 obj, self.levels = self.validate_multiindex(obj)
4775 assert isinstance(self.levels, list) # for mypy
4776 for n in self.levels:
4777 if n not in data_columns:
4778 data_columns.insert(0, n)
4779 return super().write(obj=obj, data_columns=data_columns, **kwargs)
4781 def read(
4782 self,
4783 where=None,
4784 columns=None,
4785 start: int | None = None,
4786 stop: int | None = None,
4787 ):
4789 df = super().read(where=where, columns=columns, start=start, stop=stop)
4790 df = df.set_index(self.levels)
4792 # remove names for 'level_%d'
4793 df.index = df.index.set_names(
4794 [None if self._re_levels.search(name) else name for name in df.index.names]
4795 )
4797 return df
4800def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame:
4801 ax = obj._get_axis(axis)
4802 labels = ensure_index(labels)
4804 # try not to reindex even if other is provided
4805 # if it equals our current index
4806 if other is not None:
4807 other = ensure_index(other)
4808 if (other is None or labels.equals(other)) and labels.equals(ax):
4809 return obj
4811 labels = ensure_index(labels.unique())
4812 if other is not None:
4813 labels = ensure_index(other.unique()).intersection(labels, sort=False)
4814 if not labels.equals(ax):
4815 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
4816 slicer[axis] = labels
4817 obj = obj.loc[tuple(slicer)]
4818 return obj
4821# tz to/from coercion
4824def _get_tz(tz: tzinfo) -> str | tzinfo:
4825 """for a tz-aware type, return an encoded zone"""
4826 zone = timezones.get_timezone(tz)
4827 return zone
4830@overload
4831def _set_tz(
4832 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
4833) -> DatetimeIndex:
4834 ...
4837@overload
4838def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
4839 ...
4842def _set_tz(
4843 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
4844) -> np.ndarray | DatetimeIndex:
4845 """
4846 coerce the values to a DatetimeIndex if tz is set
4847 preserve the input shape if possible
4849 Parameters
4850 ----------
4851 values : ndarray or Index
4852 tz : str or tzinfo
4853 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
4854 """
4855 if isinstance(values, DatetimeIndex):
4856 # If values is tzaware, the tz gets dropped in the values.ravel()
4857 # call below (which returns an ndarray). So we are only non-lossy
4858 # if `tz` matches `values.tz`.
4859 assert values.tz is None or values.tz == tz
4861 if tz is not None:
4862 if isinstance(values, DatetimeIndex):
4863 name = values.name
4864 values = values.asi8
4865 else:
4866 name = None
4867 values = values.ravel()
4869 tz = _ensure_decoded(tz)
4870 values = DatetimeIndex(values, name=name)
4871 values = values.tz_localize("UTC").tz_convert(tz)
4872 elif coerce:
4873 values = np.asarray(values, dtype="M8[ns]")
4875 # error: Incompatible return value type (got "Union[ndarray, Index]",
4876 # expected "Union[ndarray, DatetimeIndex]")
4877 return values # type: ignore[return-value]
4880def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4881 assert isinstance(name, str)
4883 index_name = index.name
4884 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
4885 # expected "Union[ExtensionArray, ndarray]"
4886 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
4887 kind = _dtype_to_kind(dtype_name)
4888 atom = DataIndexableCol._get_atom(converted)
4890 if (
4891 isinstance(index, Int64Index)
4892 or needs_i8_conversion(index.dtype)
4893 or is_bool_dtype(index.dtype)
4894 ):
4895 # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4896 # in which case "kind" is "integer", "integer", "datetime64",
4897 # "timedelta64", and "integer", respectively.
4898 return IndexCol(
4899 name,
4900 values=converted,
4901 kind=kind,
4902 typ=atom,
4903 freq=getattr(index, "freq", None),
4904 tz=getattr(index, "tz", None),
4905 index_name=index_name,
4906 )
4908 if isinstance(index, MultiIndex):
4909 raise TypeError("MultiIndex not supported here!")
4911 inferred_type = lib.infer_dtype(index, skipna=False)
4912 # we won't get inferred_type of "datetime64" or "timedelta64" as these
4913 # would go through the DatetimeIndex/TimedeltaIndex paths above
4915 values = np.asarray(index)
4917 if inferred_type == "date":
4918 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
4919 return IndexCol(
4920 name, converted, "date", _tables().Time32Col(), index_name=index_name
4921 )
4922 elif inferred_type == "string":
4924 converted = _convert_string_array(values, encoding, errors)
4925 itemsize = converted.dtype.itemsize
4926 return IndexCol(
4927 name,
4928 converted,
4929 "string",
4930 _tables().StringCol(itemsize),
4931 index_name=index_name,
4932 )
4934 elif inferred_type in ["integer", "floating"]:
4935 return IndexCol(
4936 name, values=converted, kind=kind, typ=atom, index_name=index_name
4937 )
4938 else:
4939 assert isinstance(converted, np.ndarray) and converted.dtype == object
4940 assert kind == "object", kind
4941 atom = _tables().ObjectAtom()
4942 return IndexCol(name, converted, kind, atom, index_name=index_name)
4945def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
4946 index: Index | np.ndarray
4948 if kind == "datetime64":
4949 index = DatetimeIndex(data)
4950 elif kind == "timedelta64":
4951 index = TimedeltaIndex(data)
4952 elif kind == "date":
4953 try:
4954 index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
4955 except (ValueError):
4956 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
4957 elif kind in ("integer", "float", "bool"):
4958 index = np.asarray(data)
4959 elif kind in ("string"):
4960 index = _unconvert_string_array(
4961 data, nan_rep=None, encoding=encoding, errors=errors
4962 )
4963 elif kind == "object":
4964 index = np.asarray(data[0])
4965 else: # pragma: no cover
4966 raise ValueError(f"unrecognized index type {kind}")
4967 return index
4970def _maybe_convert_for_string_atom(
4971 name: str,
4972 bvalues: ArrayLike,
4973 existing_col,
4974 min_itemsize,
4975 nan_rep,
4976 encoding,
4977 errors,
4978 columns: list[str],
4979):
4981 if bvalues.dtype != object:
4982 return bvalues
4984 bvalues = cast(np.ndarray, bvalues)
4986 dtype_name = bvalues.dtype.name
4987 inferred_type = lib.infer_dtype(bvalues, skipna=False)
4989 if inferred_type == "date":
4990 raise TypeError("[date] is not implemented as a table column")
4991 elif inferred_type == "datetime":
4992 # after GH#8260
4993 # this only would be hit for a multi-timezone dtype which is an error
4994 raise TypeError(
4995 "too many timezones in this block, create separate data columns"
4996 )
4998 elif not (inferred_type == "string" or dtype_name == "object"):
4999 return bvalues
5001 mask = isna(bvalues)
5002 data = bvalues.copy()
5003 data[mask] = nan_rep
5005 # see if we have a valid string type
5006 inferred_type = lib.infer_dtype(data, skipna=False)
5007 if inferred_type != "string":
5009 # we cannot serialize this data, so report an exception on a column
5010 # by column basis
5012 # expected behaviour:
5013 # search block for a non-string object column by column
5014 for i in range(data.shape[0]):
5015 col = data[i]
5016 inferred_type = lib.infer_dtype(col, skipna=False)
5017 if inferred_type != "string":
5018 error_column_label = columns[i] if len(columns) > i else f"No.{i}"
5019 raise TypeError(
5020 f"Cannot serialize the column [{error_column_label}]\n"
5021 f"because its data contents are not [string] but "
5022 f"[{inferred_type}] object dtype"
5023 )
5025 # itemsize is the maximum length of a string (along any dimension)
5027 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
5028 itemsize = data_converted.itemsize
5030 # specified min_itemsize?
5031 if isinstance(min_itemsize, dict):
5032 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
5033 itemsize = max(min_itemsize or 0, itemsize)
5035 # check for column in the values conflicts
5036 if existing_col is not None:
5037 eci = existing_col.validate_col(itemsize)
5038 if eci is not None and eci > itemsize:
5039 itemsize = eci
5041 data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
5042 return data_converted
5045def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
5046 """
5047 Take a string-like that is object dtype and coerce to a fixed size string type.
5049 Parameters
5050 ----------
5051 data : np.ndarray[object]
5052 encoding : str
5053 errors : str
5054 Handler for encoding errors.
5056 Returns
5057 -------
5058 np.ndarray[fixed-length-string]
5059 """
5060 # encode if needed
5061 if len(data):
5062 data = (
5063 Series(data.ravel())
5064 .str.encode(encoding, errors)
5065 ._values.reshape(data.shape)
5066 )
5068 # create the sized dtype
5069 ensured = ensure_object(data.ravel())
5070 itemsize = max(1, libwriters.max_len_string_array(ensured))
5072 data = np.asarray(data, dtype=f"S{itemsize}")
5073 return data
5076def _unconvert_string_array(
5077 data: np.ndarray, nan_rep, encoding: str, errors: str
5078) -> np.ndarray:
5079 """
5080 Inverse of _convert_string_array.
5082 Parameters
5083 ----------
5084 data : np.ndarray[fixed-length-string]
5085 nan_rep : the storage repr of NaN
5086 encoding : str
5087 errors : str
5088 Handler for encoding errors.
5090 Returns
5091 -------
5092 np.ndarray[object]
5093 Decoded data.
5094 """
5095 shape = data.shape
5096 data = np.asarray(data.ravel(), dtype=object)
5098 if len(data):
5100 itemsize = libwriters.max_len_string_array(ensure_object(data))
5101 dtype = f"U{itemsize}"
5103 if isinstance(data[0], bytes):
5104 data = Series(data).str.decode(encoding, errors=errors)._values
5105 else:
5106 data = data.astype(dtype, copy=False).astype(object, copy=False)
5108 if nan_rep is None:
5109 nan_rep = "nan"
5111 libwriters.string_array_replace_from_nan_rep(data, nan_rep)
5112 return data.reshape(shape)
5115def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
5116 assert isinstance(val_kind, str), type(val_kind)
5117 if _need_convert(val_kind):
5118 conv = _get_converter(val_kind, encoding, errors)
5119 values = conv(values)
5120 return values
5123def _get_converter(kind: str, encoding: str, errors: str):
5124 if kind == "datetime64":
5125 return lambda x: np.asarray(x, dtype="M8[ns]")
5126 elif kind == "string":
5127 return lambda x: _unconvert_string_array(
5128 x, nan_rep=None, encoding=encoding, errors=errors
5129 )
5130 else: # pragma: no cover
5131 raise ValueError(f"invalid kind {kind}")
5134def _need_convert(kind: str) -> bool:
5135 if kind in ("datetime64", "string"):
5136 return True
5137 return False
5140def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
5141 """
5142 Prior to 0.10.1, we named values blocks like: values_block_0 an the
5143 name values_0, adjust the given name if necessary.
5145 Parameters
5146 ----------
5147 name : str
5148 version : Tuple[int, int, int]
5150 Returns
5151 -------
5152 str
5153 """
5154 if isinstance(version, str) or len(version) < 3:
5155 raise ValueError("Version is incorrect, expected sequence of 3 integers.")
5157 if version[0] == 0 and version[1] <= 10 and version[2] == 0:
5158 m = re.search(r"values_block_(\d+)", name)
5159 if m:
5160 grp = m.groups()[0]
5161 name = f"values_{grp}"
5162 return name
5165def _dtype_to_kind(dtype_str: str) -> str:
5166 """
5167 Find the "kind" string describing the given dtype name.
5168 """
5169 dtype_str = _ensure_decoded(dtype_str)
5171 if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
5172 kind = "string"
5173 elif dtype_str.startswith("float"):
5174 kind = "float"
5175 elif dtype_str.startswith("complex"):
5176 kind = "complex"
5177 elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
5178 kind = "integer"
5179 elif dtype_str.startswith("datetime64"):
5180 kind = "datetime64"
5181 elif dtype_str.startswith("timedelta"):
5182 kind = "timedelta64"
5183 elif dtype_str.startswith("bool"):
5184 kind = "bool"
5185 elif dtype_str.startswith("category"):
5186 kind = "category"
5187 elif dtype_str.startswith("period"):
5188 # We store the `freq` attr so we can restore from integers
5189 kind = "integer"
5190 elif dtype_str == "object":
5191 kind = "object"
5192 else:
5193 raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
5195 return kind
5198def _get_data_and_dtype_name(data: ArrayLike):
5199 """
5200 Convert the passed data into a storable form and a dtype string.
5201 """
5202 if isinstance(data, Categorical):
5203 data = data.codes
5205 # For datetime64tz we need to drop the TZ in tests TODO: why?
5206 dtype_name = data.dtype.name.split("[")[0]
5208 if data.dtype.kind in ["m", "M"]:
5209 data = np.asarray(data.view("i8"))
5210 # TODO: we used to reshape for the dt64tz case, but no longer
5211 # doing that doesn't seem to break anything. why?
5213 elif isinstance(data, PeriodIndex):
5214 data = data.asi8
5216 data = np.asarray(data)
5217 return data, dtype_name
5220class Selection:
5221 """
5222 Carries out a selection operation on a tables.Table object.
5224 Parameters
5225 ----------
5226 table : a Table object
5227 where : list of Terms (or convertible to)
5228 start, stop: indices to start and/or stop selection
5230 """
5232 def __init__(
5233 self,
5234 table: Table,
5235 where=None,
5236 start: int | None = None,
5237 stop: int | None = None,
5238 ) -> None:
5239 self.table = table
5240 self.where = where
5241 self.start = start
5242 self.stop = stop
5243 self.condition = None
5244 self.filter = None
5245 self.terms = None
5246 self.coordinates = None
5248 if is_list_like(where):
5250 # see if we have a passed coordinate like
5251 with suppress(ValueError):
5252 inferred = lib.infer_dtype(where, skipna=False)
5253 if inferred == "integer" or inferred == "boolean":
5254 where = np.asarray(where)
5255 if where.dtype == np.bool_:
5256 start, stop = self.start, self.stop
5257 if start is None:
5258 start = 0
5259 if stop is None:
5260 stop = self.table.nrows
5261 self.coordinates = np.arange(start, stop)[where]
5262 elif issubclass(where.dtype.type, np.integer):
5263 if (self.start is not None and (where < self.start).any()) or (
5264 self.stop is not None and (where >= self.stop).any()
5265 ):
5266 raise ValueError(
5267 "where must have index locations >= start and < stop"
5268 )
5269 self.coordinates = where
5271 if self.coordinates is None:
5273 self.terms = self.generate(where)
5275 # create the numexpr & the filter
5276 if self.terms is not None:
5277 self.condition, self.filter = self.terms.evaluate()
5279 def generate(self, where):
5280 """where can be a : dict,list,tuple,string"""
5281 if where is None:
5282 return None
5284 q = self.table.queryables()
5285 try:
5286 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5287 except NameError as err:
5288 # raise a nice message, suggesting that the user should use
5289 # data_columns
5290 qkeys = ",".join(q.keys())
5291 msg = dedent(
5292 f"""\
5293 The passed where expression: {where}
5294 contains an invalid variable reference
5295 all of the variable references must be a reference to
5296 an axis (e.g. 'index' or 'columns'), or a data_column
5297 The currently defined references are: {qkeys}
5298 """
5299 )
5300 raise ValueError(msg) from err
5302 def select(self):
5303 """
5304 generate the selection
5305 """
5306 if self.condition is not None:
5307 return self.table.table.read_where(
5308 self.condition.format(), start=self.start, stop=self.stop
5309 )
5310 elif self.coordinates is not None:
5311 return self.table.table.read_coordinates(self.coordinates)
5312 return self.table.table.read(start=self.start, stop=self.stop)
5314 def select_coords(self):
5315 """
5316 generate the selection
5317 """
5318 start, stop = self.start, self.stop
5319 nrows = self.table.nrows
5320 if start is None:
5321 start = 0
5322 elif start < 0:
5323 start += nrows
5324 if stop is None:
5325 stop = nrows
5326 elif stop < 0:
5327 stop += nrows
5329 if self.condition is not None:
5330 return self.table.table.get_where_list(
5331 self.condition.format(), start=start, stop=stop, sort=True
5332 )
5333 elif self.coordinates is not None:
5334 return self.coordinates
5336 return np.arange(start, stop)