Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/orc.py: 21%

41 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

""" orc compat """
from __future__ import annotations

import io
from types import ModuleType
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
)

from pandas._typing import (
    FilePath,
    ReadBuffer,
    WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_interval_dtype,
    is_period_dtype,
    is_unsigned_integer_dtype,
)

from pandas.io.common import get_handle


if TYPE_CHECKING:  # coverage: condition is never true at runtime, body not executed
    from pandas import DataFrame



def read_orc(
    path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.
    """
    # we require a newer version of pyarrow than we support for parquet

    orc = import_optional_dependency("pyarrow.orc")

    with get_handle(path, "rb", is_text=False) as handles:
        orc_file = orc.ORCFile(handles.handle)
        return orc_file.read(columns=columns, **kwargs).to_pandas()

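# --- Editor's usage sketch (not part of pandas/io/orc.py) ---------------------
# A minimal example of the public entry point that wraps read_orc above.
# Assumes pyarrow with ORC support is installed; the file name and column
# names below are hypothetical.
#
# >>> import pandas as pd
# >>> df = pd.read_orc("example.orc", columns=["id", "value"])
# >>> df.head()
#
# A file-like object opened in binary mode also works, as described in the
# ``path`` parameter above:
#
# >>> with open("example.orc", "rb") as f:
# ...     df = pd.read_orc(f)
# -------------------------------------------------------------------------------

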

def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if the dtype of one or more columns is category, unsigned integer,
        interval, period or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer``, the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        The dtype of one or more columns is category, unsigned integer,
        interval, period or sparse.
    ValueError
        ``engine`` is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError
    # In Pyarrow 9.0.0 this check will no longer be needed
    for dtype in df.dtypes:
        if (
            is_categorical_dtype(dtype)
            or is_interval_dtype(dtype)
            or is_period_dtype(dtype)
            or is_unsigned_integer_dtype(dtype)
        ):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            )

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    engine = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    was_none = path is None
    if was_none:
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if was_none:
        assert isinstance(path, io.BytesIO)  # For mypy
        return path.getvalue()
    return None
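

# --- Editor's usage sketch (not part of pandas/io/orc.py) ---------------------
# How to_orc behaves with and without a path, as described in the docstring
# above. Assumes pyarrow >= 7.0.0 is installed; the DataFrame contents and
# file name are hypothetical.
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
#
# With path=None a bytes object is returned:
#
# >>> raw = df.to_orc()          # DataFrame.to_orc dispatches to to_orc(df, path=None)
# >>> isinstance(raw, bytes)
# True
#
# Writing to a file path (or any object with a binary write()) returns None:
#
# >>> df.to_orc("example.orc")
#
# Unsupported dtypes (category, unsigned integer, interval, period, sparse)
# fail before anything is written:
#
# >>> df.astype({"a": "uint8"}).to_orc()   # raises NotImplementedError
# -------------------------------------------------------------------------------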