Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/excel/_pyxlsb.py: 24%
56 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1# pyright: reportMissingImports=false
2from __future__ import annotations
4from pandas._typing import (
5 FilePath,
6 ReadBuffer,
7 Scalar,
8 StorageOptions,
9)
10from pandas.compat._optional import import_optional_dependency
11from pandas.util._decorators import doc
13from pandas.core.shared_docs import _shared_docs
15from pandas.io.excel._base import BaseExcelReader
class PyxlsbReader(BaseExcelReader):
    """Excel reader that delegates workbook parsing to the ``pyxlsb`` package."""

    @doc(storage_options=_shared_docs["storage_options"])
    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        storage_options: StorageOptions = None,
    ) -> None:
        """
        Reader using pyxlsb engine.

        Parameters
        ----------
        filepath_or_buffer : str, path object, or Workbook
            Object to be parsed.
        {storage_options}
        """
        import_optional_dependency("pyxlsb")
        # This will call load_workbook on the filepath or buffer
        # And set the result to the book-attribute
        super().__init__(filepath_or_buffer, storage_options=storage_options)

    @property
    def _workbook_class(self):
        """Return the ``pyxlsb.Workbook`` class (imported lazily)."""
        from pyxlsb import Workbook

        return Workbook

    def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
        """Open ``filepath_or_buffer`` with ``pyxlsb.open_workbook`` and return the result."""
        from pyxlsb import open_workbook

        # TODO: hack in buffer capability
        # This might need some modifications to the Pyxlsb library
        # Actual work for opening it is in xlsbpackage.py, line 20-ish

        return open_workbook(filepath_or_buffer)

    @property
    def sheet_names(self) -> list[str]:
        """Return the ``sheets`` attribute of the loaded workbook."""
        return self.book.sheets

    def get_sheet_by_name(self, name: str):
        """Validate ``name`` and return the matching sheet from the workbook."""
        self.raise_if_bad_sheet_by_name(name)
        return self.book.get_sheet(name)

    def get_sheet_by_index(self, index: int):
        """Validate the zero-based ``index`` and return the matching sheet."""
        self.raise_if_bad_sheet_by_index(index)
        # pyxlsb sheets are indexed from 1 onwards
        # There's a fix for this in the source, but the pypi package doesn't have it
        return self.book.get_sheet(index + 1)

    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
        """
        Convert a single pyxlsb cell into a scalar.

        Parameters
        ----------
        cell : object
            Cell whose ``v`` attribute holds the raw value.
        convert_float : bool
            If True, floats with an integral value are returned as ``int``.

        Returns
        -------
        Scalar
            ``""`` for an empty cell, otherwise the (possibly converted) value.
        """
        # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
        # This means that there is no way to read datetime types from an xlsb file yet
        value = cell.v
        if value is None:
            return ""  # Prevents non-named columns from not showing up as Unnamed: i
        if isinstance(value, float) and convert_float:
            # is_integer() is False for nan and +/-inf, so int() is only ever
            # applied to values it can represent (int(nan) / int(inf) raise).
            if value.is_integer():
                return int(value)
            return float(value)

        return value

    def get_sheet_data(
        self,
        sheet,
        convert_float: bool,
        file_rows_needed: int | None = None,
    ) -> list[list[Scalar]]:
        """
        Read a sheet into a rectangular list of rows.

        Parameters
        ----------
        sheet : object
            Sheet exposing ``rows(sparse=True)``.
        convert_float : bool
            Passed through to ``_convert_cell`` for every cell.
        file_rows_needed : int or None, default None
            If given, stop reading once at least this many rows were collected.

        Returns
        -------
        list of list of Scalar
            Rows padded with ``""`` so that all have the same length.
        """
        data: list[list[Scalar]] = []
        previous_row_number = -1
        # When sparse=True the rows can have different lengths and empty rows are
        # not returned. The cells are namedtuples of row, col, value (r, c, v).
        for row in sheet.rows(sparse=True):
            if not row:
                # Defensive: a row without cells carries no row number to read.
                continue
            row_number = row[0].r
            converted_row = [self._convert_cell(cell, convert_float) for cell in row]
            while converted_row and converted_row[-1] == "":
                # trim trailing empty elements
                converted_row.pop()
            if converted_row:
                # Re-insert the rows skipped by sparse iteration; each gets its
                # own list object so no two rows alias each other.
                skipped = row_number - previous_row_number - 1
                data.extend([] for _ in range(skipped))
                data.append(converted_row)
                previous_row_number = row_number
            if file_rows_needed is not None and len(data) >= file_rows_needed:
                break
        if data:
            # extend rows to max_width
            max_width = max(len(data_row) for data_row in data)
            if min(len(data_row) for data_row in data) < max_width:
                empty_cell: list[Scalar] = [""]
                data = [
                    data_row + (max_width - len(data_row)) * empty_cell
                    for data_row in data
                ]
        return data