# Extracted from a coverage.py (v6.4.4) HTML report generated 2023-07-17 14:22 -0600:
# pandas/io/orc.py — 21% of 41 statements covered. Report navigation chrome removed.
""" orc compat """
from __future__ import annotations

import io
from types import ModuleType
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
)

from pandas._typing import (
    FilePath,
    ReadBuffer,
    WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_interval_dtype,
    is_period_dtype,
    is_unsigned_integer_dtype,
)

from pandas.io.common import get_handle

if TYPE_CHECKING:
    from pandas import DataFrame
def read_orc(
    path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs
) -> DataFrame:
    """
    Read an ORC file (or buffer) into a DataFrame.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.
    """
    # pyarrow.orc demands a newer pyarrow than the parquet reader does, so the
    # optional-dependency check happens here rather than at module import.
    orc = import_optional_dependency("pyarrow.orc")

    with get_handle(path, "rb", is_text=False) as handles:
        # ORCFile wraps the open binary handle; .read() yields a pyarrow Table.
        table = orc.ORCFile(handles.handle).read(columns=columns, **kwargs)
        return table.to_pandas()
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    if index is None:
        # "infer" semantics: persist the index only when it carries a name.
        index = df.index.names[0] is not None
    engine_kwargs = {} if engine_kwargs is None else engine_kwargs

    # Reject dtypes pyarrow cannot serialize yet.
    # In Pyarrow 9.0.0 this check will no longer be needed.
    unsupported_checks = (
        is_categorical_dtype,
        is_interval_dtype,
        is_period_dtype,
        is_unsigned_integer_dtype,
    )
    for dtype in df.dtypes:
        if any(check(dtype) for check in unsupported_checks):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            )

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    # Rebind the engine name to the actual pyarrow module (>= 7.0.0 required).
    engine = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    # No path means the caller wants the serialized bytes back directly.
    return_bytes = path is None
    if return_bytes:
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            table = engine.Table.from_pandas(df, preserve_index=index)
            orc.write_table(table, handles.handle, **engine_kwargs)
        except TypeError as err:
            # pyarrow surfaces unsupported dtypes as TypeError; normalize it.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from err

    if not return_bytes:
        return None
    assert isinstance(path, io.BytesIO)  # For mypy
    return path.getvalue()