Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/excel/_base.py: 20%
466 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3import abc
4import datetime
5from functools import partial
6from io import BytesIO
7import os
8from textwrap import fill
9from typing import (
10 IO,
11 Any,
12 Callable,
13 Hashable,
14 Iterable,
15 List,
16 Literal,
17 Mapping,
18 Sequence,
19 Union,
20 cast,
21 overload,
22)
23import warnings
24import zipfile
26from pandas._config import config
28from pandas._libs.parsers import STR_NA_VALUES
29from pandas._typing import (
30 DtypeArg,
31 FilePath,
32 IntStrT,
33 ReadBuffer,
34 StorageOptions,
35 WriteExcelBuffer,
36)
37from pandas.compat._optional import (
38 get_version,
39 import_optional_dependency,
40)
41from pandas.errors import EmptyDataError
42from pandas.util._decorators import (
43 Appender,
44 deprecate_kwarg,
45 deprecate_nonkeyword_arguments,
46 doc,
47)
48from pandas.util._exceptions import find_stack_level
50from pandas.core.dtypes.common import (
51 is_bool,
52 is_float,
53 is_integer,
54 is_list_like,
55)
57from pandas.core.frame import DataFrame
58from pandas.core.shared_docs import _shared_docs
59from pandas.util.version import Version
61from pandas.io.common import (
62 IOHandles,
63 get_handle,
64 stringify_path,
65 validate_header_arg,
66)
67from pandas.io.excel._util import (
68 fill_mi_header,
69 get_default_engine,
70 get_writer,
71 maybe_convert_usecols,
72 pop_header_name,
73)
74from pandas.io.parsers import TextParser
75from pandas.io.parsers.readers import validate_integer
77_read_excel_doc = (
78 """
79Read an Excel file into a pandas DataFrame.
81Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
82read from a local filesystem or URL. Supports an option to read
83a single sheet or a list of sheets.
85Parameters
86----------
87io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
88 Any valid string path is acceptable. The string could be a URL. Valid
89 URL schemes include http, ftp, s3, and file. For file URLs, a host is
90 expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
92 If you want to pass in a path object, pandas accepts any ``os.PathLike``.
94 By file-like object, we refer to objects with a ``read()`` method,
95 such as a file handle (e.g. via builtin ``open`` function)
96 or ``StringIO``.
97sheet_name : str, int, list, or None, default 0
98 Strings are used for sheet names. Integers are used in zero-indexed
99 sheet positions (chart sheets do not count as a sheet position).
100 Lists of strings/integers are used to request multiple sheets.
101 Specify None to get all worksheets.
103 Available cases:
105 * Defaults to ``0``: 1st sheet as a `DataFrame`
106 * ``1``: 2nd sheet as a `DataFrame`
107 * ``"Sheet1"``: Load sheet with name "Sheet1"
108 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
109 as a dict of `DataFrame`
110 * None: All worksheets.
112header : int, list of int, default 0
113 Row (0-indexed) to use for the column labels of the parsed
114 DataFrame. If a list of integers is passed those row positions will
115 be combined into a ``MultiIndex``. Use None if there is no header.
116names : array-like, default None
117 List of column names to use. If file contains no header row,
118 then you should explicitly pass header=None.
119index_col : int, list of int, default None
120 Column (0-indexed) to use as the row labels of the DataFrame.
121 Pass None if there is no such column. If a list is passed,
122 those columns will be combined into a ``MultiIndex``. If a
123 subset of data is selected with ``usecols``, index_col
124 is based on the subset.
126 Missing values will be forward filled to allow roundtripping with
127 ``to_excel`` for ``merged_cells=True``. To avoid forward filling the
128 missing values use ``set_index`` after reading the data instead of
129 ``index_col``.
130usecols : str, list-like, or callable, default None
131 * If None, then parse all columns.
132 * If str, then indicates comma separated list of Excel column letters
133 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
134 both sides.
135 * If list of int, then indicates list of column numbers to be parsed
136 (0-indexed).
137 * If list of string, then indicates list of column names to be parsed.
138 * If callable, then evaluate each column name against it and parse the
139 column if the callable returns ``True``.
141 Returns a subset of the columns according to behavior above.
142squeeze : bool, default False
143 If the parsed data only contains one column then return a Series.
145 .. deprecated:: 1.4.0
146 Append ``.squeeze("columns")`` to the call to ``read_excel`` to squeeze
147 the data.
148dtype : Type name or dict of column -> type, default None
149 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
150 Use `object` to preserve data as stored in Excel and not interpret dtype.
151 If converters are specified, they will be applied INSTEAD
152 of dtype conversion.
153engine : str, default None
154 If io is not a buffer or path, this must be set to identify io.
155 Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
156 Engine compatibility :
158 - "xlrd" supports old-style Excel files (.xls).
159 - "openpyxl" supports newer Excel file formats.
160 - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
161 - "pyxlsb" supports Binary Excel files.
163 .. versionchanged:: 1.2.0
164 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
165 now only supports old-style ``.xls`` files.
166 When ``engine=None``, the following logic will be
167 used to determine the engine:
169 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
170 then `odf <https://pypi.org/project/odfpy/>`_ will be used.
171 - Otherwise if ``path_or_buffer`` is an xls format,
172 ``xlrd`` will be used.
173 - Otherwise if ``path_or_buffer`` is in xlsb format,
174 ``pyxlsb`` will be used.
176 .. versionadded:: 1.3.0
177 - Otherwise ``openpyxl`` will be used.
179 .. versionchanged:: 1.3.0
181converters : dict, default None
182 Dict of functions for converting values in certain columns. Keys can
183 either be integers or column labels, values are functions that take one
184 input argument, the Excel cell content, and return the transformed
185 content.
186true_values : list, default None
187 Values to consider as True.
188false_values : list, default None
189 Values to consider as False.
190skiprows : list-like, int, or callable, optional
191 Line numbers to skip (0-indexed) or number of lines to skip (int) at the
192 start of the file. If callable, the callable function will be evaluated
193 against the row indices, returning True if the row should be skipped and
194 False otherwise. An example of a valid callable argument would be ``lambda
195 x: x in [0, 2]``.
196nrows : int, default None
197 Number of rows to parse.
198na_values : scalar, str, list-like, or dict, default None
199 Additional strings to recognize as NA/NaN. If dict passed, specific
200 per-column NA values. By default the following values are interpreted
201 as NaN: '"""
202 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
203 + """'.
204keep_default_na : bool, default True
205 Whether or not to include the default NaN values when parsing the data.
206 Depending on whether `na_values` is passed in, the behavior is as follows:
208 * If `keep_default_na` is True, and `na_values` are specified, `na_values`
209 is appended to the default NaN values used for parsing.
210 * If `keep_default_na` is True, and `na_values` are not specified, only
211 the default NaN values are used for parsing.
212 * If `keep_default_na` is False, and `na_values` are specified, only
213 the NaN values specified `na_values` are used for parsing.
214 * If `keep_default_na` is False, and `na_values` are not specified, no
215 strings will be parsed as NaN.
217 Note that if `na_filter` is passed in as False, the `keep_default_na` and
218 `na_values` parameters will be ignored.
219na_filter : bool, default True
220 Detect missing value markers (empty strings and the value of na_values). In
221 data without any NAs, passing na_filter=False can improve the performance
222 of reading a large file.
223verbose : bool, default False
224 Indicate number of NA values placed in non-numeric columns.
225parse_dates : bool, list-like, or dict, default False
226 The behavior is as follows:
228 * bool. If True -> try parsing the index.
229 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
230 each as a separate date column.
231 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
232 a single date column.
233 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
234 result 'foo'
236 If a column or index contains an unparsable date, the entire column or
237 index will be returned unaltered as an object data type. If you don`t want to
238 parse some cells as date just change their type in Excel to "Text".
239 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
241 Note: A fast-path exists for iso8601-formatted dates.
242date_parser : function, optional
243 Function to use for converting a sequence of string columns to an array of
244 datetime instances. The default uses ``dateutil.parser.parser`` to do the
245 conversion. Pandas will try to call `date_parser` in three different ways,
246 advancing to the next if an exception occurs: 1) Pass one or more arrays
247 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
248 string values from the columns defined by `parse_dates` into a single array
249 and pass that; and 3) call `date_parser` once for each row using one or
250 more strings (corresponding to the columns defined by `parse_dates`) as
251 arguments.
252thousands : str, default None
253 Thousands separator for parsing string columns to numeric. Note that
254 this parameter is only necessary for columns stored as TEXT in Excel,
255 any numeric columns will automatically be parsed, regardless of display
256 format.
257decimal : str, default '.'
258 Character to recognize as decimal point for parsing string columns to numeric.
259 Note that this parameter is only necessary for columns stored as TEXT in Excel,
260 any numeric columns will automatically be parsed, regardless of display
261 format.(e.g. use ',' for European data).
263 .. versionadded:: 1.4.0
265comment : str, default None
266 Comments out remainder of line. Pass a character or characters to this
267 argument to indicate comments in the input file. Any data between the
268 comment string and the end of the current line is ignored.
269skipfooter : int, default 0
270 Rows at the end to skip (0-indexed).
271convert_float : bool, default True
272 Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
273 data will be read in as floats: Excel stores all numbers as floats
274 internally.
276 .. deprecated:: 1.3.0
277 convert_float will be removed in a future version
279mangle_dupe_cols : bool, default True
280 Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
281 'X'...'X'. Passing in False will cause data to be overwritten if there
282 are duplicate names in the columns.
284 .. deprecated:: 1.5.0
285 Not implemented, and a new argument to specify the pattern for the
286 names of duplicated columns will be added instead
288{storage_options}
290 .. versionadded:: 1.2.0
292Returns
293-------
294DataFrame or dict of DataFrames
295 DataFrame from the passed in Excel file. See notes in sheet_name
296 argument for more information on when a dict of DataFrames is returned.
298See Also
299--------
300DataFrame.to_excel : Write DataFrame to an Excel file.
301DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
302read_csv : Read a comma-separated values (csv) file into DataFrame.
303read_fwf : Read a table of fixed-width formatted lines into DataFrame.
305Examples
306--------
307The file can be read using the file name as string or an open file object:
309>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
310 Name Value
3110 string1 1
3121 string2 2
3132 #Comment 3
315>>> pd.read_excel(open('tmp.xlsx', 'rb'),
316... sheet_name='Sheet3') # doctest: +SKIP
317 Unnamed: 0 Name Value
3180 0 string1 1
3191 1 string2 2
3202 2 #Comment 3
322Index and header can be specified via the `index_col` and `header` arguments
324>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
325 0 1 2
3260 NaN Name Value
3271 0.0 string1 1
3282 1.0 string2 2
3293 2.0 #Comment 3
331Column types are inferred but can be explicitly specified
333>>> pd.read_excel('tmp.xlsx', index_col=0,
334... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP
335 Name Value
3360 string1 1.0
3371 string2 2.0
3382 #Comment 3.0
340True, False, and NA values, and thousands separators have defaults,
341but can be explicitly specified, too. Supply the values you would like
342as strings or lists of strings!
344>>> pd.read_excel('tmp.xlsx', index_col=0,
345... na_values=['string1', 'string2']) # doctest: +SKIP
346 Name Value
3470 NaN 1
3481 NaN 2
3492 #Comment 3
351Comment lines in the excel input file can be skipped using the `comment` kwarg
353>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
354 Name Value
3550 string1 1.0
3561 string2 2.0
3572 None NaN
358"""
359)
362@overload
363def read_excel(
364 io,
365 # sheet name is str or int -> DataFrame
366 sheet_name: str | int = ...,
367 header: int | Sequence[int] | None = ...,
368 names: list[str] | None = ...,
369 index_col: int | Sequence[int] | None = ...,
370 usecols: int
371 | str
372 | Sequence[int]
373 | Sequence[str]
374 | Callable[[str], bool]
375 | None = ...,
376 squeeze: bool | None = ...,
377 dtype: DtypeArg | None = ...,
378 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
379 converters: dict[str, Callable] | dict[int, Callable] | None = ...,
380 true_values: Iterable[Hashable] | None = ...,
381 false_values: Iterable[Hashable] | None = ...,
382 skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
383 nrows: int | None = ...,
384 na_values=...,
385 keep_default_na: bool = ...,
386 na_filter: bool = ...,
387 verbose: bool = ...,
388 parse_dates: list | dict | bool = ...,
389 date_parser: Callable | None = ...,
390 thousands: str | None = ...,
391 decimal: str = ...,
392 comment: str | None = ...,
393 skipfooter: int = ...,
394 convert_float: bool | None = ...,
395 mangle_dupe_cols: bool = ...,
396 storage_options: StorageOptions = ...,
397) -> DataFrame:
398 ...
401@overload
402def read_excel(
403 io,
404 # sheet name is list or None -> dict[IntStrT, DataFrame]
405 sheet_name: list[IntStrT] | None,
406 header: int | Sequence[int] | None = ...,
407 names: list[str] | None = ...,
408 index_col: int | Sequence[int] | None = ...,
409 usecols: int
410 | str
411 | Sequence[int]
412 | Sequence[str]
413 | Callable[[str], bool]
414 | None = ...,
415 squeeze: bool | None = ...,
416 dtype: DtypeArg | None = ...,
417 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
418 converters: dict[str, Callable] | dict[int, Callable] | None = ...,
419 true_values: Iterable[Hashable] | None = ...,
420 false_values: Iterable[Hashable] | None = ...,
421 skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
422 nrows: int | None = ...,
423 na_values=...,
424 keep_default_na: bool = ...,
425 na_filter: bool = ...,
426 verbose: bool = ...,
427 parse_dates: list | dict | bool = ...,
428 date_parser: Callable | None = ...,
429 thousands: str | None = ...,
430 decimal: str = ...,
431 comment: str | None = ...,
432 skipfooter: int = ...,
433 convert_float: bool | None = ...,
434 mangle_dupe_cols: bool = ...,
435 storage_options: StorageOptions = ...,
436) -> dict[IntStrT, DataFrame]:
437 ...
440@doc(storage_options=_shared_docs["storage_options"])
441@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
442@deprecate_nonkeyword_arguments(allowed_args=["io", "sheet_name"], version="2.0")
443@Appender(_read_excel_doc)
444def read_excel(
445 io,
446 sheet_name: str | int | list[IntStrT] | None = 0,
447 header: int | Sequence[int] | None = 0,
448 names: list[str] | None = None,
449 index_col: int | Sequence[int] | None = None,
450 usecols: int
451 | str
452 | Sequence[int]
453 | Sequence[str]
454 | Callable[[str], bool]
455 | None = None,
456 squeeze: bool | None = None,
457 dtype: DtypeArg | None = None,
458 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None,
459 converters: dict[str, Callable] | dict[int, Callable] | None = None,
460 true_values: Iterable[Hashable] | None = None,
461 false_values: Iterable[Hashable] | None = None,
462 skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
463 nrows: int | None = None,
464 na_values=None,
465 keep_default_na: bool = True,
466 na_filter: bool = True,
467 verbose: bool = False,
468 parse_dates: list | dict | bool = False,
469 date_parser: Callable | None = None,
470 thousands: str | None = None,
471 decimal: str = ".",
472 comment: str | None = None,
473 skipfooter: int = 0,
474 convert_float: bool | None = None,
475 mangle_dupe_cols: bool = True,
476 storage_options: StorageOptions = None,
477) -> DataFrame | dict[IntStrT, DataFrame]:
479 should_close = False
480 if not isinstance(io, ExcelFile):
481 should_close = True
482 io = ExcelFile(io, storage_options=storage_options, engine=engine)
483 elif engine and engine != io.engine:
484 raise ValueError(
485 "Engine should not be specified when passing "
486 "an ExcelFile - ExcelFile already has the engine set"
487 )
489 try:
490 data = io.parse(
491 sheet_name=sheet_name,
492 header=header,
493 names=names,
494 index_col=index_col,
495 usecols=usecols,
496 squeeze=squeeze,
497 dtype=dtype,
498 converters=converters,
499 true_values=true_values,
500 false_values=false_values,
501 skiprows=skiprows,
502 nrows=nrows,
503 na_values=na_values,
504 keep_default_na=keep_default_na,
505 na_filter=na_filter,
506 verbose=verbose,
507 parse_dates=parse_dates,
508 date_parser=date_parser,
509 thousands=thousands,
510 decimal=decimal,
511 comment=comment,
512 skipfooter=skipfooter,
513 convert_float=convert_float,
514 mangle_dupe_cols=mangle_dupe_cols,
515 )
516 finally:
517 # make sure to close opened file handles
518 if should_close:
519 io.close()
520 return data
523class BaseExcelReader(metaclass=abc.ABCMeta):
524 def __init__(
525 self, filepath_or_buffer, storage_options: StorageOptions = None
526 ) -> None:
527 # First argument can also be bytes, so create a buffer
528 if isinstance(filepath_or_buffer, bytes):
529 filepath_or_buffer = BytesIO(filepath_or_buffer)
531 self.handles = IOHandles(
532 handle=filepath_or_buffer, compression={"method": None}
533 )
534 if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
535 self.handles = get_handle(
536 filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
537 )
539 if isinstance(self.handles.handle, self._workbook_class):
540 self.book = self.handles.handle
541 elif hasattr(self.handles.handle, "read"):
542 # N.B. xlrd.Book has a read attribute too
543 self.handles.handle.seek(0)
544 try:
545 self.book = self.load_workbook(self.handles.handle)
546 except Exception:
547 self.close()
548 raise
549 else:
550 raise ValueError(
551 "Must explicitly set engine if not passing in buffer or path for io."
552 )
554 @property
555 @abc.abstractmethod
556 def _workbook_class(self):
557 pass
559 @abc.abstractmethod
560 def load_workbook(self, filepath_or_buffer):
561 pass
563 def close(self) -> None:
564 if hasattr(self, "book"):
565 if hasattr(self.book, "close"):
566 # pyxlsb: opens a TemporaryFile
567 # openpyxl: https://stackoverflow.com/questions/31416842/
568 # openpyxl-does-not-close-excel-workbook-in-read-only-mode
569 self.book.close()
570 elif hasattr(self.book, "release_resources"):
571 # xlrd
572 # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548
573 self.book.release_resources()
574 self.handles.close()
576 @property
577 @abc.abstractmethod
578 def sheet_names(self) -> list[str]:
579 pass
581 @abc.abstractmethod
582 def get_sheet_by_name(self, name: str):
583 pass
585 @abc.abstractmethod
586 def get_sheet_by_index(self, index: int):
587 pass
589 @abc.abstractmethod
590 def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None):
591 pass
593 def raise_if_bad_sheet_by_index(self, index: int) -> None:
594 n_sheets = len(self.sheet_names)
595 if index >= n_sheets:
596 raise ValueError(
597 f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
598 )
600 def raise_if_bad_sheet_by_name(self, name: str) -> None:
601 if name not in self.sheet_names:
602 raise ValueError(f"Worksheet named '{name}' not found")
604 def _check_skiprows_func(
605 self,
606 skiprows: Callable,
607 rows_to_use: int,
608 ) -> int:
609 """
610 Determine how many file rows are required to obtain `nrows` data
611 rows when `skiprows` is a function.
613 Parameters
614 ----------
615 skiprows : function
616 The function passed to read_excel by the user.
617 rows_to_use : int
618 The number of rows that will be needed for the header and
619 the data.
621 Returns
622 -------
623 int
624 """
625 i = 0
626 rows_used_so_far = 0
627 while rows_used_so_far < rows_to_use:
628 if not skiprows(i):
629 rows_used_so_far += 1
630 i += 1
631 return i
633 def _calc_rows(
634 self,
635 header: int | Sequence[int] | None,
636 index_col: int | Sequence[int] | None,
637 skiprows: Sequence[int] | int | Callable[[int], object] | None,
638 nrows: int | None,
639 ) -> int | None:
640 """
641 If nrows specified, find the number of rows needed from the
642 file, otherwise return None.
645 Parameters
646 ----------
647 header : int, list of int, or None
648 See read_excel docstring.
649 index_col : int, list of int, or None
650 See read_excel docstring.
651 skiprows : list-like, int, callable, or None
652 See read_excel docstring.
653 nrows : int or None
654 See read_excel docstring.
656 Returns
657 -------
658 int or None
659 """
660 if nrows is None:
661 return None
662 if header is None:
663 header_rows = 1
664 elif is_integer(header):
665 header = cast(int, header)
666 header_rows = 1 + header
667 else:
668 header = cast(Sequence, header)
669 header_rows = 1 + header[-1]
670 # If there is a MultiIndex header and an index then there is also
671 # a row containing just the index name(s)
672 if is_list_like(header) and index_col is not None:
673 header = cast(Sequence, header)
674 if len(header) > 1:
675 header_rows += 1
676 if skiprows is None:
677 return header_rows + nrows
678 if is_integer(skiprows):
679 skiprows = cast(int, skiprows)
680 return header_rows + nrows + skiprows
681 if is_list_like(skiprows):
683 def f(skiprows: Sequence, x: int) -> bool:
684 return x in skiprows
686 skiprows = cast(Sequence, skiprows)
687 return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
688 if callable(skiprows):
689 return self._check_skiprows_func(
690 skiprows,
691 header_rows + nrows,
692 )
693 # else unexpected skiprows type: read_excel will not optimize
694 # the number of rows read from file
695 return None
697 def parse(
698 self,
699 sheet_name: str | int | list[int] | list[str] | None = 0,
700 header: int | Sequence[int] | None = 0,
701 names=None,
702 index_col: int | Sequence[int] | None = None,
703 usecols=None,
704 squeeze: bool | None = None,
705 dtype: DtypeArg | None = None,
706 true_values: Iterable[Hashable] | None = None,
707 false_values: Iterable[Hashable] | None = None,
708 skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
709 nrows: int | None = None,
710 na_values=None,
711 verbose: bool = False,
712 parse_dates: list | dict | bool = False,
713 date_parser: Callable | None = None,
714 thousands: str | None = None,
715 decimal: str = ".",
716 comment: str | None = None,
717 skipfooter: int = 0,
718 convert_float: bool | None = None,
719 mangle_dupe_cols: bool = True,
720 **kwds,
721 ):
723 if convert_float is None:
724 convert_float = True
725 else:
726 warnings.warn(
727 "convert_float is deprecated and will be removed in a future version.",
728 FutureWarning,
729 stacklevel=find_stack_level(),
730 )
732 validate_header_arg(header)
733 validate_integer("nrows", nrows)
735 ret_dict = False
737 # Keep sheetname to maintain backwards compatibility.
738 sheets: list[int] | list[str]
739 if isinstance(sheet_name, list):
740 sheets = sheet_name
741 ret_dict = True
742 elif sheet_name is None:
743 sheets = self.sheet_names
744 ret_dict = True
745 elif isinstance(sheet_name, str):
746 sheets = [sheet_name]
747 else:
748 sheets = [sheet_name]
750 # handle same-type duplicates.
751 sheets = cast(Union[List[int], List[str]], list(dict.fromkeys(sheets).keys()))
753 output = {}
755 for asheetname in sheets:
756 if verbose:
757 print(f"Reading sheet {asheetname}")
759 if isinstance(asheetname, str):
760 sheet = self.get_sheet_by_name(asheetname)
761 else: # assume an integer if not a string
762 sheet = self.get_sheet_by_index(asheetname)
764 file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
765 data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
766 if hasattr(sheet, "close"):
767 # pyxlsb opens two TemporaryFiles
768 sheet.close()
769 usecols = maybe_convert_usecols(usecols)
771 if not data:
772 output[asheetname] = DataFrame()
773 continue
775 is_list_header = False
776 is_len_one_list_header = False
777 if is_list_like(header):
778 assert isinstance(header, Sequence)
779 is_list_header = True
780 if len(header) == 1:
781 is_len_one_list_header = True
783 if is_len_one_list_header:
784 header = cast(Sequence[int], header)[0]
786 # forward fill and pull out names for MultiIndex column
787 header_names = None
788 if header is not None and is_list_like(header):
789 assert isinstance(header, Sequence)
791 header_names = []
792 control_row = [True] * len(data[0])
794 for row in header:
795 if is_integer(skiprows):
796 assert isinstance(skiprows, int)
797 row += skiprows
799 if row > len(data) - 1:
800 raise ValueError(
801 f"header index {row} exceeds maximum index "
802 f"{len(data) - 1} of data.",
803 )
805 data[row], control_row = fill_mi_header(data[row], control_row)
807 if index_col is not None:
808 header_name, _ = pop_header_name(data[row], index_col)
809 header_names.append(header_name)
811 # If there is a MultiIndex header and an index then there is also
812 # a row containing just the index name(s)
813 has_index_names = False
814 if is_list_header and not is_len_one_list_header and index_col is not None:
816 index_col_list: Sequence[int]
817 if isinstance(index_col, int):
818 index_col_list = [index_col]
819 else:
820 assert isinstance(index_col, Sequence)
821 index_col_list = index_col
823 # We have to handle mi without names. If any of the entries in the data
824 # columns are not empty, this is a regular row
825 assert isinstance(header, Sequence)
826 if len(header) < len(data):
827 potential_index_names = data[len(header)]
828 potential_data = [
829 x
830 for i, x in enumerate(potential_index_names)
831 if not control_row[i] and i not in index_col_list
832 ]
833 has_index_names = all(x == "" or x is None for x in potential_data)
835 if is_list_like(index_col):
836 # Forward fill values for MultiIndex index.
837 if header is None:
838 offset = 0
839 elif isinstance(header, int):
840 offset = 1 + header
841 else:
842 offset = 1 + max(header)
844 # GH34673: if MultiIndex names present and not defined in the header,
845 # offset needs to be incremented so that forward filling starts
846 # from the first MI value instead of the name
847 if has_index_names:
848 offset += 1
850 # Check if we have an empty dataset
851 # before trying to collect data.
852 if offset < len(data):
853 assert isinstance(index_col, Sequence)
855 for col in index_col:
856 last = data[offset][col]
858 for row in range(offset + 1, len(data)):
859 if data[row][col] == "" or data[row][col] is None:
860 data[row][col] = last
861 else:
862 last = data[row][col]
864 # GH 12292 : error when read one empty column from excel file
865 try:
866 parser = TextParser(
867 data,
868 names=names,
869 header=header,
870 index_col=index_col,
871 has_index_names=has_index_names,
872 squeeze=squeeze,
873 dtype=dtype,
874 true_values=true_values,
875 false_values=false_values,
876 skiprows=skiprows,
877 nrows=nrows,
878 na_values=na_values,
879 skip_blank_lines=False, # GH 39808
880 parse_dates=parse_dates,
881 date_parser=date_parser,
882 thousands=thousands,
883 decimal=decimal,
884 comment=comment,
885 skipfooter=skipfooter,
886 usecols=usecols,
887 mangle_dupe_cols=mangle_dupe_cols,
888 **kwds,
889 )
891 output[asheetname] = parser.read(nrows=nrows)
893 if not squeeze or isinstance(output[asheetname], DataFrame):
894 if header_names:
895 output[asheetname].columns = output[
896 asheetname
897 ].columns.set_names(header_names)
899 except EmptyDataError:
900 # No Data, return an empty DataFrame
901 output[asheetname] = DataFrame()
903 if ret_dict:
904 return output
905 else:
906 return output[asheetname]
909@doc(storage_options=_shared_docs["storage_options"])
910class ExcelWriter(metaclass=abc.ABCMeta):
911 """
912 Class for writing DataFrame objects into excel sheets.
914 Default is to use:
916 * `xlwt <https://pypi.org/project/xlwt/>`__ for xls files
917 * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter
918 is installed otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__
919 * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files
921 See ``DataFrame.to_excel`` for typical usage.
923 The writer should be used as a context manager. Otherwise, call `close()` to save
924 and close any opened file handles.
926 Parameters
927 ----------
928 path : str or typing.BinaryIO
929 Path to xls or xlsx or ods file.
930 engine : str (optional)
931 Engine to use for writing. If None, defaults to
932 ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
933 argument.
935 .. deprecated:: 1.2.0
937 As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer
938 maintained, the ``xlwt`` engine will be removed in a future
939 version of pandas.
941 date_format : str, default None
942 Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
943 datetime_format : str, default None
944 Format string for datetime objects written into Excel files.
945 (e.g. 'YYYY-MM-DD HH:MM:SS').
946 mode : {{'w', 'a'}}, default 'w'
947 File mode to use (write or append). Append does not work with fsspec URLs.
948 {storage_options}
950 .. versionadded:: 1.2.0
952 if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error'
953 How to behave when trying to write to a sheet that already
954 exists (append mode only).
956 * error: raise a ValueError.
957 * new: Create a new sheet, with a name determined by the engine.
958 * replace: Delete the contents of the sheet before writing to it.
959 * overlay: Write contents to the existing sheet without removing the old
960 contents.
962 .. versionadded:: 1.3.0
964 .. versionchanged:: 1.4.0
966 Added ``overlay`` option
968 engine_kwargs : dict, optional
969 Keyword arguments to be passed into the engine. These will be passed to
970 the following functions of the respective engines:
972 * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)``
973 * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)``
974 * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)``
975 * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)``
977 .. versionadded:: 1.3.0
978 **kwargs : dict, optional
979 Keyword arguments to be passed into the engine.
981 .. deprecated:: 1.3.0
983 Use engine_kwargs instead.
985 Notes
986 -----
987 For compatibility with CSV writers, ExcelWriter serializes lists
988 and dicts to strings before writing.
990 Examples
991 --------
992 Default usage:
994 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
995 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
996 ... df.to_excel(writer) # doctest: +SKIP
998 To write to separate sheets in a single file:
1000 >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) # doctest: +SKIP
1001 >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
1002 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
1003 ... df1.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP
1004 ... df2.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP
1006 You can set the date format or datetime format:
1008 >>> from datetime import date, datetime # doctest: +SKIP
1009 >>> df = pd.DataFrame(
1010 ... [
1011 ... [date(2014, 1, 31), date(1999, 9, 24)],
1012 ... [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)],
1013 ... ],
1014 ... index=["Date", "Datetime"],
1015 ... columns=["X", "Y"],
1016 ... ) # doctest: +SKIP
1017 >>> with pd.ExcelWriter(
1018 ... "path_to_file.xlsx",
1019 ... date_format="YYYY-MM-DD",
1020 ... datetime_format="YYYY-MM-DD HH:MM:SS"
1021 ... ) as writer:
1022 ... df.to_excel(writer) # doctest: +SKIP
1024 You can also append to an existing Excel file:
1026 >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer:
1027 ... df.to_excel(writer, sheet_name="Sheet3") # doctest: +SKIP
1029 Here, the `if_sheet_exists` parameter can be set to replace a sheet if it
1030 already exists:
1032 >>> with ExcelWriter(
1033 ... "path_to_file.xlsx",
1034 ... mode="a",
1035 ... engine="openpyxl",
1036 ... if_sheet_exists="replace",
1037 ... ) as writer:
1038 ... df.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP
1040 You can also write multiple DataFrames to a single sheet. Note that the
1041 ``if_sheet_exists`` parameter needs to be set to ``overlay``:
1043 >>> with ExcelWriter("path_to_file.xlsx",
1044 ... mode="a",
1045 ... engine="openpyxl",
1046 ... if_sheet_exists="overlay",
1047 ... ) as writer:
1048 ... df1.to_excel(writer, sheet_name="Sheet1")
1049 ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) # doctest: +SKIP
1051 You can store Excel file in RAM:
1053 >>> import io
1054 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
1055 >>> buffer = io.BytesIO()
1056 >>> with pd.ExcelWriter(buffer) as writer:
1057 ... df.to_excel(writer)
1059 You can pack Excel file into zip archive:
1061 >>> import zipfile # doctest: +SKIP
1062 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
1063 >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf:
1064 ... with zf.open("filename.xlsx", "w") as buffer:
1065 ... with pd.ExcelWriter(buffer) as writer:
1066 ... df.to_excel(writer) # doctest: +SKIP
1068 You can specify additional arguments to the underlying engine:
1070 >>> with pd.ExcelWriter(
1071 ... "path_to_file.xlsx",
1072 ... engine="xlsxwriter",
1073 ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}}
1074 ... ) as writer:
1075 ... df.to_excel(writer) # doctest: +SKIP
1077 In append mode, ``engine_kwargs`` are passed through to
1078 openpyxl's ``load_workbook``:
1080 >>> with pd.ExcelWriter(
1081 ... "path_to_file.xlsx",
1082 ... engine="openpyxl",
1083 ... mode="a",
1084 ... engine_kwargs={{"keep_vba": True}}
1085 ... ) as writer:
1086 ... df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP
1087 """
1089 # Defining an ExcelWriter implementation (see abstract methods for more...)
1091 # - Mandatory
1092 # - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
1093 # --> called to write additional DataFrames to disk
1094 # - ``_supported_extensions`` (tuple of supported extensions), used to
1095 # check that engine supports the given extension.
1096 # - ``_engine`` - string that gives the engine name. Necessary to
1097 # instantiate class directly and bypass ``ExcelWriterMeta`` engine
1098 # lookup.
1099 # - ``save(self)`` --> called to save file to disk
1100 # - Mostly mandatory (i.e. should at least exist)
1101 # - book, cur_sheet, path
1103 # - Optional:
1104 # - ``__init__(self, path, engine=None, **kwargs)`` --> always called
1105 # with path as first argument.
1107 # You also need to register the class with ``register_writer()``.
1108 # Technically, ExcelWriter implementations don't need to subclass
1109 # ExcelWriter.
1111 _engine: str
1112 _supported_extensions: tuple[str, ...]
1114 def __new__(
1115 cls: type[ExcelWriter],
1116 path: FilePath | WriteExcelBuffer | ExcelWriter,
1117 engine: str | None = None,
1118 date_format: str | None = None,
1119 datetime_format: str | None = None,
1120 mode: str = "w",
1121 storage_options: StorageOptions = None,
1122 if_sheet_exists: Literal["error", "new", "replace", "overlay"] | None = None,
1123 engine_kwargs: dict | None = None,
1124 **kwargs,
1125 ) -> ExcelWriter:
1126 if kwargs:
1127 if engine_kwargs is not None:
1128 raise ValueError("Cannot use both engine_kwargs and **kwargs")
1129 warnings.warn(
1130 "Use of **kwargs is deprecated, use engine_kwargs instead.",
1131 FutureWarning,
1132 stacklevel=find_stack_level(),
1133 )
1135 # only switch class if generic(ExcelWriter)
1136 if cls is ExcelWriter:
1137 if engine is None or (isinstance(engine, str) and engine == "auto"):
1138 if isinstance(path, str):
1139 ext = os.path.splitext(path)[-1][1:]
1140 else:
1141 ext = "xlsx"
1143 try:
1144 engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
1145 if engine == "auto":
1146 engine = get_default_engine(ext, mode="writer")
1147 except KeyError as err:
1148 raise ValueError(f"No engine for filetype: '{ext}'") from err
1150 if engine == "xlwt":
1151 xls_config_engine = config.get_option(
1152 "io.excel.xls.writer", silent=True
1153 )
1154 # Don't warn a 2nd time if user has changed the default engine for xls
1155 if xls_config_engine != "xlwt":
1156 warnings.warn(
1157 "As the xlwt package is no longer maintained, the xlwt "
1158 "engine will be removed in a future version of pandas. "
1159 "This is the only engine in pandas that supports writing "
1160 "in the xls format. Install openpyxl and write to an xlsx "
1161 "file instead. You can set the option io.excel.xls.writer "
1162 "to 'xlwt' to silence this warning. While this option is "
1163 "deprecated and will also raise a warning, it can "
1164 "be globally set and the warning suppressed.",
1165 FutureWarning,
1166 stacklevel=find_stack_level(),
1167 )
1169 # for mypy
1170 assert engine is not None
1171 cls = get_writer(engine)
1173 return object.__new__(cls)
1175 # declare external properties you can count on
1176 _path = None
1178 @property
1179 def supported_extensions(self) -> tuple[str, ...]:
1180 """Extensions that writer engine supports."""
1181 return self._supported_extensions
1183 @property
1184 def engine(self) -> str:
1185 """Name of engine."""
1186 return self._engine
1188 @property
1189 @abc.abstractmethod
1190 def sheets(self) -> dict[str, Any]:
1191 """Mapping of sheet names to sheet objects."""
1192 pass
1194 # mypy doesn't handle abstract setters prior to 0.981
1195 # https://github.com/python/mypy/issues/4165
1196 @property # type: ignore[misc]
1197 @abc.abstractmethod
1198 def book(self):
1199 """
1200 Book instance. Class type will depend on the engine used.
1202 This attribute can be used to access engine-specific features.
1203 """
1204 pass
1206 # mypy doesn't handle abstract setters prior to 0.981
1207 # https://github.com/python/mypy/issues/4165
1208 @book.setter # type: ignore[misc]
1209 @abc.abstractmethod
1210 def book(self, other) -> None:
1211 """
1212 Set book instance. Class type will depend on the engine used.
1213 """
1214 pass
1216 def write_cells(
1217 self,
1218 cells,
1219 sheet_name: str | None = None,
1220 startrow: int = 0,
1221 startcol: int = 0,
1222 freeze_panes: tuple[int, int] | None = None,
1223 ) -> None:
1224 """
1225 Write given formatted cells into Excel an excel sheet
1227 .. deprecated:: 1.5.0
1229 Parameters
1230 ----------
1231 cells : generator
1232 cell of formatted data to save to Excel sheet
1233 sheet_name : str, default None
1234 Name of Excel sheet, if None, then use self.cur_sheet
1235 startrow : upper left cell row to dump data frame
1236 startcol : upper left cell column to dump data frame
1237 freeze_panes: int tuple of length 2
1238 contains the bottom-most row and right-most column to freeze
1239 """
1240 self._deprecate("write_cells")
1241 return self._write_cells(cells, sheet_name, startrow, startcol, freeze_panes)
1243 @abc.abstractmethod
1244 def _write_cells(
1245 self,
1246 cells,
1247 sheet_name: str | None = None,
1248 startrow: int = 0,
1249 startcol: int = 0,
1250 freeze_panes: tuple[int, int] | None = None,
1251 ) -> None:
1252 """
1253 Write given formatted cells into Excel an excel sheet
1255 Parameters
1256 ----------
1257 cells : generator
1258 cell of formatted data to save to Excel sheet
1259 sheet_name : str, default None
1260 Name of Excel sheet, if None, then use self.cur_sheet
1261 startrow : upper left cell row to dump data frame
1262 startcol : upper left cell column to dump data frame
1263 freeze_panes: int tuple of length 2
1264 contains the bottom-most row and right-most column to freeze
1265 """
1266 pass
1268 def save(self) -> None:
1269 """
1270 Save workbook to disk.
1272 .. deprecated:: 1.5.0
1273 """
1274 self._deprecate("save")
1275 return self._save()
1277 @abc.abstractmethod
1278 def _save(self) -> None:
1279 """
1280 Save workbook to disk.
1281 """
1282 pass
1284 def __init__(
1285 self,
1286 path: FilePath | WriteExcelBuffer | ExcelWriter,
1287 engine: str | None = None,
1288 date_format: str | None = None,
1289 datetime_format: str | None = None,
1290 mode: str = "w",
1291 storage_options: StorageOptions = None,
1292 if_sheet_exists: str | None = None,
1293 engine_kwargs: dict[str, Any] | None = None,
1294 **kwargs,
1295 ) -> None:
1296 # validate that this engine can handle the extension
1297 if isinstance(path, str):
1298 ext = os.path.splitext(path)[-1]
1299 self.check_extension(ext)
1301 # use mode to open the file
1302 if "b" not in mode:
1303 mode += "b"
1304 # use "a" for the user to append data to excel but internally use "r+" to let
1305 # the excel backend first read the existing file and then write any data to it
1306 mode = mode.replace("a", "r+")
1308 # cast ExcelWriter to avoid adding 'if self.handles is not None'
1309 self._handles = IOHandles(
1310 cast(IO[bytes], path), compression={"compression": None}
1311 )
1312 if not isinstance(path, ExcelWriter):
1313 self._handles = get_handle(
1314 path, mode, storage_options=storage_options, is_text=False
1315 )
1316 self._cur_sheet = None
1318 if date_format is None:
1319 self._date_format = "YYYY-MM-DD"
1320 else:
1321 self._date_format = date_format
1322 if datetime_format is None:
1323 self._datetime_format = "YYYY-MM-DD HH:MM:SS"
1324 else:
1325 self._datetime_format = datetime_format
1327 self._mode = mode
1329 if if_sheet_exists not in (None, "error", "new", "replace", "overlay"):
1330 raise ValueError(
1331 f"'{if_sheet_exists}' is not valid for if_sheet_exists. "
1332 "Valid options are 'error', 'new', 'replace' and 'overlay'."
1333 )
1334 if if_sheet_exists and "r+" not in mode:
1335 raise ValueError("if_sheet_exists is only valid in append mode (mode='a')")
1336 if if_sheet_exists is None:
1337 if_sheet_exists = "error"
1338 self._if_sheet_exists = if_sheet_exists
1340 def _deprecate(self, attr: str):
1341 """
1342 Deprecate attribute or method for ExcelWriter.
1343 """
1344 warnings.warn(
1345 f"{attr} is not part of the public API, usage can give unexpected "
1346 "results and will be removed in a future version",
1347 FutureWarning,
1348 stacklevel=find_stack_level(),
1349 )
1351 def _deprecate_set_book(self) -> None:
1352 """
1353 Deprecate setting the book attribute - GH#48780.
1354 """
1355 warnings.warn(
1356 "Setting the `book` attribute is not part of the public API, "
1357 "usage can give unexpected or corrupted results and will be "
1358 "removed in a future version",
1359 FutureWarning,
1360 stacklevel=find_stack_level(),
1361 )
1363 @property
1364 def date_format(self) -> str:
1365 """
1366 Format string for dates written into Excel files (e.g. ‘YYYY-MM-DD’).
1367 """
1368 return self._date_format
1370 @property
1371 def datetime_format(self) -> str:
1372 """
1373 Format string for dates written into Excel files (e.g. ‘YYYY-MM-DD’).
1374 """
1375 return self._datetime_format
1377 @property
1378 def if_sheet_exists(self) -> str:
1379 """
1380 How to behave when writing to a sheet that already exists in append mode.
1381 """
1382 return self._if_sheet_exists
1384 @property
1385 def cur_sheet(self):
1386 """
1387 Current sheet for writing.
1389 .. deprecated:: 1.5.0
1390 """
1391 self._deprecate("cur_sheet")
1392 return self._cur_sheet
1394 @property
1395 def handles(self) -> IOHandles[bytes]:
1396 """
1397 Handles to Excel sheets.
1399 .. deprecated:: 1.5.0
1400 """
1401 self._deprecate("handles")
1402 return self._handles
1404 @property
1405 def path(self):
1406 """
1407 Path to Excel file.
1409 .. deprecated:: 1.5.0
1410 """
1411 self._deprecate("path")
1412 return self._path
1414 def __fspath__(self) -> str:
1415 return getattr(self._handles.handle, "name", "")
1417 def _get_sheet_name(self, sheet_name: str | None) -> str:
1418 if sheet_name is None:
1419 sheet_name = self._cur_sheet
1420 if sheet_name is None: # pragma: no cover
1421 raise ValueError("Must pass explicit sheet_name or set _cur_sheet property")
1422 return sheet_name
1424 def _value_with_fmt(self, val) -> tuple[object, str | None]:
1425 """
1426 Convert numpy types to Python types for the Excel writers.
1428 Parameters
1429 ----------
1430 val : object
1431 Value to be written into cells
1433 Returns
1434 -------
1435 Tuple with the first element being the converted value and the second
1436 being an optional format
1437 """
1438 fmt = None
1440 if is_integer(val):
1441 val = int(val)
1442 elif is_float(val):
1443 val = float(val)
1444 elif is_bool(val):
1445 val = bool(val)
1446 elif isinstance(val, datetime.datetime):
1447 fmt = self._datetime_format
1448 elif isinstance(val, datetime.date):
1449 fmt = self._date_format
1450 elif isinstance(val, datetime.timedelta):
1451 val = val.total_seconds() / 86400
1452 fmt = "0"
1453 else:
1454 val = str(val)
1456 return val, fmt
1458 @classmethod
1459 def check_extension(cls, ext: str) -> Literal[True]:
1460 """
1461 checks that path's extension against the Writer's supported
1462 extensions. If it isn't supported, raises UnsupportedFiletypeError.
1463 """
1464 if ext.startswith("."):
1465 ext = ext[1:]
1466 if not any(ext in extension for extension in cls._supported_extensions):
1467 raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
1468 else:
1469 return True
1471 # Allow use as a contextmanager
1472 def __enter__(self) -> ExcelWriter:
1473 return self
1475 def __exit__(self, exc_type, exc_value, traceback) -> None:
1476 self.close()
1478 def close(self) -> None:
1479 """synonym for save, to make it more file-like"""
1480 self._save()
1481 self._handles.close()
1484XLS_SIGNATURES = (
1485 b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2
1486 b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3
1487 b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4
1488 b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary
1489)
1490ZIP_SIGNATURE = b"PK\x03\x04"
1491PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,)))
1494@doc(storage_options=_shared_docs["storage_options"])
1495def inspect_excel_format(
1496 content_or_path: FilePath | ReadBuffer[bytes],
1497 storage_options: StorageOptions = None,
1498) -> str | None:
1499 """
1500 Inspect the path or content of an excel file and get its format.
1502 Adopted from xlrd: https://github.com/python-excel/xlrd.
1504 Parameters
1505 ----------
1506 content_or_path : str or file-like object
1507 Path to file or content of file to inspect. May be a URL.
1508 {storage_options}
1510 Returns
1511 -------
1512 str or None
1513 Format of file if it can be determined.
1515 Raises
1516 ------
1517 ValueError
1518 If resulting stream is empty.
1519 BadZipFile
1520 If resulting stream does not have an XLS signature and is not a valid zipfile.
1521 """
1522 if isinstance(content_or_path, bytes):
1523 content_or_path = BytesIO(content_or_path)
1525 with get_handle(
1526 content_or_path, "rb", storage_options=storage_options, is_text=False
1527 ) as handle:
1528 stream = handle.handle
1529 stream.seek(0)
1530 buf = stream.read(PEEK_SIZE)
1531 if buf is None:
1532 raise ValueError("stream is empty")
1533 else:
1534 assert isinstance(buf, bytes)
1535 peek = buf
1536 stream.seek(0)
1538 if any(peek.startswith(sig) for sig in XLS_SIGNATURES):
1539 return "xls"
1540 elif not peek.startswith(ZIP_SIGNATURE):
1541 return None
1543 with zipfile.ZipFile(stream) as zf:
1544 # Workaround for some third party files that use forward slashes and
1545 # lower case names.
1546 component_names = [
1547 name.replace("\\", "/").lower() for name in zf.namelist()
1548 ]
1550 if "xl/workbook.xml" in component_names:
1551 return "xlsx"
1552 if "xl/workbook.bin" in component_names:
1553 return "xlsb"
1554 if "content.xml" in component_names:
1555 return "ods"
1556 return "zip"
1559class ExcelFile:
1560 """
1561 Class for parsing tabular excel sheets into DataFrame objects.
1563 See read_excel for more documentation.
1565 Parameters
1566 ----------
1567 path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
1568 a file-like object, xlrd workbook or openpyxl workbook.
1569 If a string or path object, expected to be a path to a
1570 .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
1571 engine : str, default None
1572 If io is not a buffer or path, this must be set to identify io.
1573 Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
1574 Engine compatibility :
1576 - ``xlrd`` supports old-style Excel files (.xls).
1577 - ``openpyxl`` supports newer Excel file formats.
1578 - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
1579 - ``pyxlsb`` supports Binary Excel files.
1581 .. versionchanged:: 1.2.0
1583 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
1584 now only supports old-style ``.xls`` files.
1585 When ``engine=None``, the following logic will be
1586 used to determine the engine:
1588 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
1589 then `odf <https://pypi.org/project/odfpy/>`_ will be used.
1590 - Otherwise if ``path_or_buffer`` is an xls format,
1591 ``xlrd`` will be used.
1592 - Otherwise if ``path_or_buffer`` is in xlsb format,
1593 `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.
1595 .. versionadded:: 1.3.0
1596 - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
1597 then ``openpyxl`` will be used.
1598 - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
1599 - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
1600 This case will raise a ``ValueError`` in a future version of pandas.
1602 .. warning::
1604 Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
1605 This is not supported, switch to using ``openpyxl`` instead.
1606 """
1608 from pandas.io.excel._odfreader import ODFReader
1609 from pandas.io.excel._openpyxl import OpenpyxlReader
1610 from pandas.io.excel._pyxlsb import PyxlsbReader
1611 from pandas.io.excel._xlrd import XlrdReader
1613 _engines: Mapping[str, Any] = {
1614 "xlrd": XlrdReader,
1615 "openpyxl": OpenpyxlReader,
1616 "odf": ODFReader,
1617 "pyxlsb": PyxlsbReader,
1618 }
1620 def __init__(
1621 self,
1622 path_or_buffer,
1623 engine: str | None = None,
1624 storage_options: StorageOptions = None,
1625 ) -> None:
1626 if engine is not None and engine not in self._engines:
1627 raise ValueError(f"Unknown engine: {engine}")
1629 # First argument can also be bytes, so create a buffer
1630 if isinstance(path_or_buffer, bytes):
1631 path_or_buffer = BytesIO(path_or_buffer)
1633 # Could be a str, ExcelFile, Book, etc.
1634 self.io = path_or_buffer
1635 # Always a string
1636 self._io = stringify_path(path_or_buffer)
1638 # Determine xlrd version if installed
1639 if import_optional_dependency("xlrd", errors="ignore") is None:
1640 xlrd_version = None
1641 else:
1642 import xlrd
1644 xlrd_version = Version(get_version(xlrd))
1646 ext = None
1647 if engine is None:
1648 # Only determine ext if it is needed
1649 if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
1650 ext = "xls"
1651 else:
1652 ext = inspect_excel_format(
1653 content_or_path=path_or_buffer, storage_options=storage_options
1654 )
1655 if ext is None:
1656 raise ValueError(
1657 "Excel file format cannot be determined, you must specify "
1658 "an engine manually."
1659 )
1661 engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
1662 if engine == "auto":
1663 engine = get_default_engine(ext, mode="reader")
1665 if engine == "xlrd" and xlrd_version is not None:
1666 if ext is None:
1667 # Need ext to determine ext in order to raise/warn
1668 if isinstance(path_or_buffer, xlrd.Book):
1669 ext = "xls"
1670 else:
1671 ext = inspect_excel_format(
1672 path_or_buffer, storage_options=storage_options
1673 )
1675 # Pass through if ext is None, otherwise check if ext valid for xlrd
1676 if ext and ext != "xls" and xlrd_version >= Version("2"):
1677 raise ValueError(
1678 f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
1679 f"only the xls format is supported. Install openpyxl instead."
1680 )
1681 elif ext and ext != "xls":
1682 stacklevel = find_stack_level()
1683 warnings.warn(
1684 f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
1685 f"only the xls format is supported. Install "
1686 f"openpyxl instead.",
1687 FutureWarning,
1688 stacklevel=stacklevel,
1689 )
1691 assert engine is not None
1692 self.engine = engine
1693 self.storage_options = storage_options
1695 self._reader = self._engines[engine](self._io, storage_options=storage_options)
1697 def __fspath__(self):
1698 return self._io
1700 def parse(
1701 self,
1702 sheet_name: str | int | list[int] | list[str] | None = 0,
1703 header: int | Sequence[int] | None = 0,
1704 names=None,
1705 index_col: int | Sequence[int] | None = None,
1706 usecols=None,
1707 squeeze: bool | None = None,
1708 converters=None,
1709 true_values: Iterable[Hashable] | None = None,
1710 false_values: Iterable[Hashable] | None = None,
1711 skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
1712 nrows: int | None = None,
1713 na_values=None,
1714 parse_dates: list | dict | bool = False,
1715 date_parser: Callable | None = None,
1716 thousands: str | None = None,
1717 comment: str | None = None,
1718 skipfooter: int = 0,
1719 convert_float: bool | None = None,
1720 mangle_dupe_cols: bool = True,
1721 **kwds,
1722 ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
1723 """
1724 Parse specified sheet(s) into a DataFrame.
1726 Equivalent to read_excel(ExcelFile, ...) See the read_excel
1727 docstring for more info on accepted parameters.
1729 Returns
1730 -------
1731 DataFrame or dict of DataFrames
1732 DataFrame from the passed in Excel file.
1733 """
1734 return self._reader.parse(
1735 sheet_name=sheet_name,
1736 header=header,
1737 names=names,
1738 index_col=index_col,
1739 usecols=usecols,
1740 squeeze=squeeze,
1741 converters=converters,
1742 true_values=true_values,
1743 false_values=false_values,
1744 skiprows=skiprows,
1745 nrows=nrows,
1746 na_values=na_values,
1747 parse_dates=parse_dates,
1748 date_parser=date_parser,
1749 thousands=thousands,
1750 comment=comment,
1751 skipfooter=skipfooter,
1752 convert_float=convert_float,
1753 mangle_dupe_cols=mangle_dupe_cols,
1754 **kwds,
1755 )
1757 @property
1758 def book(self):
1759 return self._reader.book
1761 @property
1762 def sheet_names(self):
1763 return self._reader.sheet_names
1765 def close(self) -> None:
1766 """close io if necessary"""
1767 self._reader.close()
1769 def __enter__(self) -> ExcelFile:
1770 return self
1772 def __exit__(self, exc_type, exc_value, traceback) -> None:
1773 self.close()
1775 def __del__(self) -> None:
1776 # Ensure we don't leak file descriptors, but put in try/except in case
1777 # attributes are already deleted
1778 try:
1779 self.close()
1780 except AttributeError:
1781 pass