# Extracted from a coverage.py (v6.4.4) HTML report generated 2023-07-17 14:22 -0600:
# pandas/io/orc.py — 21% of 41 statements covered. Report navigation chrome removed.
""" orc compat """
from __future__ import annotations

import io
from types import ModuleType
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
)

from pandas._typing import (
    FilePath,
    ReadBuffer,
    WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_interval_dtype,
    is_period_dtype,
    is_unsigned_integer_dtype,
)

from pandas.io.common import get_handle

if TYPE_CHECKING:
    from pandas import DataFrame
def read_orc(
    path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs
) -> DataFrame:
    """
    Read an ORC file (or buffer) into a DataFrame.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.
    """
    # pyarrow.orc demands a newer pyarrow than the parquet reader does, so the
    # optional-dependency check happens here rather than at module import.
    orc = import_optional_dependency("pyarrow.orc")

    with get_handle(path, "rb", is_text=False) as handles:
        # ORCFile wraps the open binary handle; .read() yields a pyarrow Table.
        table = orc.ORCFile(handles.handle).read(columns=columns, **kwargs)
        return table.to_pandas()
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    if index is None:
        # "infer" semantics: persist the index only when it carries a name.
        index = df.index.names[0] is not None
    engine_kwargs = {} if engine_kwargs is None else engine_kwargs

    # Reject dtypes pyarrow cannot serialize yet.
    # In Pyarrow 9.0.0 this check will no longer be needed.
    unsupported_checks = (
        is_categorical_dtype,
        is_interval_dtype,
        is_period_dtype,
        is_unsigned_integer_dtype,
    )
    for dtype in df.dtypes:
        if any(check(dtype) for check in unsupported_checks):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            )

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    # Rebind the engine name to the actual pyarrow module (>= 7.0.0 required).
    engine = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    # No path means the caller wants the serialized bytes back directly.
    return_bytes = path is None
    if return_bytes:
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            table = engine.Table.from_pandas(df, preserve_index=index)
            orc.write_table(table, handles.handle, **engine_kwargs)
        except TypeError as err:
            # pyarrow surfaces unsupported dtypes as TypeError; normalize it.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from err

    if not return_bytes:
        return None
    assert isinstance(path, io.BytesIO)  # For mypy
    return path.getvalue()