Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/excel/_base.py: 20%

466 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3import abc 

4import datetime 

5from functools import partial 

6from io import BytesIO 

7import os 

8from textwrap import fill 

9from typing import ( 

10 IO, 

11 Any, 

12 Callable, 

13 Hashable, 

14 Iterable, 

15 List, 

16 Literal, 

17 Mapping, 

18 Sequence, 

19 Union, 

20 cast, 

21 overload, 

22) 

23import warnings 

24import zipfile 

25 

26from pandas._config import config 

27 

28from pandas._libs.parsers import STR_NA_VALUES 

29from pandas._typing import ( 

30 DtypeArg, 

31 FilePath, 

32 IntStrT, 

33 ReadBuffer, 

34 StorageOptions, 

35 WriteExcelBuffer, 

36) 

37from pandas.compat._optional import ( 

38 get_version, 

39 import_optional_dependency, 

40) 

41from pandas.errors import EmptyDataError 

42from pandas.util._decorators import ( 

43 Appender, 

44 deprecate_kwarg, 

45 deprecate_nonkeyword_arguments, 

46 doc, 

47) 

48from pandas.util._exceptions import find_stack_level 

49 

50from pandas.core.dtypes.common import ( 

51 is_bool, 

52 is_float, 

53 is_integer, 

54 is_list_like, 

55) 

56 

57from pandas.core.frame import DataFrame 

58from pandas.core.shared_docs import _shared_docs 

59from pandas.util.version import Version 

60 

61from pandas.io.common import ( 

62 IOHandles, 

63 get_handle, 

64 stringify_path, 

65 validate_header_arg, 

66) 

67from pandas.io.excel._util import ( 

68 fill_mi_header, 

69 get_default_engine, 

70 get_writer, 

71 maybe_convert_usecols, 

72 pop_header_name, 

73) 

74from pandas.io.parsers import TextParser 

75from pandas.io.parsers.readers import validate_integer 

76 

77_read_excel_doc = ( 

78 """ 

79Read an Excel file into a pandas DataFrame. 

80 

81Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions 

82read from a local filesystem or URL. Supports an option to read 

83a single sheet or a list of sheets. 

84 

85Parameters 

86---------- 

87io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object 

88 Any valid string path is acceptable. The string could be a URL. Valid 

89 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

90 expected. A local file could be: ``file://localhost/path/to/table.xlsx``. 

91 

92 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

93 

94 By file-like object, we refer to objects with a ``read()`` method, 

95 such as a file handle (e.g. via builtin ``open`` function) 

96 or ``StringIO``. 

97sheet_name : str, int, list, or None, default 0 

98 Strings are used for sheet names. Integers are used in zero-indexed 

99 sheet positions (chart sheets do not count as a sheet position). 

100 Lists of strings/integers are used to request multiple sheets. 

101 Specify None to get all worksheets. 

102 

103 Available cases: 

104 

105 * Defaults to ``0``: 1st sheet as a `DataFrame` 

106 * ``1``: 2nd sheet as a `DataFrame` 

107 * ``"Sheet1"``: Load sheet with name "Sheet1" 

108 * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" 

109 as a dict of `DataFrame` 

110 * None: All worksheets. 

111 

112header : int, list of int, default 0 

113 Row (0-indexed) to use for the column labels of the parsed 

114 DataFrame. If a list of integers is passed those row positions will 

115 be combined into a ``MultiIndex``. Use None if there is no header. 

116names : array-like, default None 

117 List of column names to use. If file contains no header row, 

118 then you should explicitly pass header=None. 

119index_col : int, list of int, default None 

120 Column (0-indexed) to use as the row labels of the DataFrame. 

121 Pass None if there is no such column. If a list is passed, 

122 those columns will be combined into a ``MultiIndex``. If a 

123 subset of data is selected with ``usecols``, index_col 

124 is based on the subset. 

125 

126 Missing values will be forward filled to allow roundtripping with 

127 ``to_excel`` for ``merged_cells=True``. To avoid forward filling the 

128 missing values use ``set_index`` after reading the data instead of 

129 ``index_col``. 

130usecols : str, list-like, or callable, default None 

131 * If None, then parse all columns. 

132 * If str, then indicates comma separated list of Excel column letters 

133 and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of 

134 both sides. 

135 * If list of int, then indicates list of column numbers to be parsed 

136 (0-indexed). 

137 * If list of string, then indicates list of column names to be parsed. 

138 * If callable, then evaluate each column name against it and parse the 

139 column if the callable returns ``True``. 

140 

141 Returns a subset of the columns according to behavior above. 

142squeeze : bool, default False 

143 If the parsed data only contains one column then return a Series. 

144 

145 .. deprecated:: 1.4.0 

146 Append ``.squeeze("columns")`` to the call to ``read_excel`` to squeeze 

147 the data. 

148dtype : Type name or dict of column -> type, default None 

149 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} 

150 Use `object` to preserve data as stored in Excel and not interpret dtype. 

151 If converters are specified, they will be applied INSTEAD 

152 of dtype conversion. 

153engine : str, default None 

154 If io is not a buffer or path, this must be set to identify io. 

155 Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". 

156 Engine compatibility : 

157 

158 - "xlrd" supports old-style Excel files (.xls). 

159 - "openpyxl" supports newer Excel file formats. 

160 - "odf" supports OpenDocument file formats (.odf, .ods, .odt). 

161 - "pyxlsb" supports Binary Excel files. 

162 

163 .. versionchanged:: 1.2.0 

164 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ 

165 now only supports old-style ``.xls`` files. 

166 When ``engine=None``, the following logic will be 

167 used to determine the engine: 

168 

169 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), 

170 then `odf <https://pypi.org/project/odfpy/>`_ will be used. 

171 - Otherwise if ``path_or_buffer`` is an xls format, 

172 ``xlrd`` will be used. 

173 - Otherwise if ``path_or_buffer`` is in xlsb format, 

174 ``pyxlsb`` will be used. 

175 

176 .. versionadded:: 1.3.0 

177 - Otherwise ``openpyxl`` will be used. 

178 

179 .. versionchanged:: 1.3.0 

180 

181converters : dict, default None 

182 Dict of functions for converting values in certain columns. Keys can 

183 either be integers or column labels, values are functions that take one 

184 input argument, the Excel cell content, and return the transformed 

185 content. 

186true_values : list, default None 

187 Values to consider as True. 

188false_values : list, default None 

189 Values to consider as False. 

190skiprows : list-like, int, or callable, optional 

191 Line numbers to skip (0-indexed) or number of lines to skip (int) at the 

192 start of the file. If callable, the callable function will be evaluated 

193 against the row indices, returning True if the row should be skipped and 

194 False otherwise. An example of a valid callable argument would be ``lambda 

195 x: x in [0, 2]``. 

196nrows : int, default None 

197 Number of rows to parse. 

198na_values : scalar, str, list-like, or dict, default None 

199 Additional strings to recognize as NA/NaN. If dict passed, specific 

200 per-column NA values. By default the following values are interpreted 

201 as NaN: '""" 

202 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") 

203 + """'. 

204keep_default_na : bool, default True 

205 Whether or not to include the default NaN values when parsing the data. 

206 Depending on whether `na_values` is passed in, the behavior is as follows: 

207 

208 * If `keep_default_na` is True, and `na_values` are specified, `na_values` 

209 is appended to the default NaN values used for parsing. 

210 * If `keep_default_na` is True, and `na_values` are not specified, only 

211 the default NaN values are used for parsing. 

212 * If `keep_default_na` is False, and `na_values` are specified, only 

213 the NaN values specified `na_values` are used for parsing. 

214 * If `keep_default_na` is False, and `na_values` are not specified, no 

215 strings will be parsed as NaN. 

216 

217 Note that if `na_filter` is passed in as False, the `keep_default_na` and 

218 `na_values` parameters will be ignored. 

219na_filter : bool, default True 

220 Detect missing value markers (empty strings and the value of na_values). In 

221 data without any NAs, passing na_filter=False can improve the performance 

222 of reading a large file. 

223verbose : bool, default False 

224 Indicate number of NA values placed in non-numeric columns. 

225parse_dates : bool, list-like, or dict, default False 

226 The behavior is as follows: 

227 

228 * bool. If True -> try parsing the index. 

229 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 

230 each as a separate date column. 

231 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 

232 a single date column. 

233 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call 

234 result 'foo' 

235 

236 If a column or index contains an unparsable date, the entire column or 

237 index will be returned unaltered as an object data type. If you don't want to 

238 parse some cells as date, just change their type in Excel to "Text". 

239 For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. 

240 

241 Note: A fast-path exists for iso8601-formatted dates. 

242date_parser : function, optional 

243 Function to use for converting a sequence of string columns to an array of 

244 datetime instances. The default uses ``dateutil.parser.parser`` to do the 

245 conversion. Pandas will try to call `date_parser` in three different ways, 

246 advancing to the next if an exception occurs: 1) Pass one or more arrays 

247 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the 

248 string values from the columns defined by `parse_dates` into a single array 

249 and pass that; and 3) call `date_parser` once for each row using one or 

250 more strings (corresponding to the columns defined by `parse_dates`) as 

251 arguments. 

252thousands : str, default None 

253 Thousands separator for parsing string columns to numeric. Note that 

254 this parameter is only necessary for columns stored as TEXT in Excel; 

255 any numeric columns will automatically be parsed, regardless of display 

256 format. 

257decimal : str, default '.' 

258 Character to recognize as decimal point for parsing string columns to numeric. 

259 Note that this parameter is only necessary for columns stored as TEXT in Excel; 

260 any numeric columns will automatically be parsed, regardless of display 

261 format (e.g. use ',' for European data). 

262 

263 .. versionadded:: 1.4.0 

264 

265comment : str, default None 

266 Comments out remainder of line. Pass a character or characters to this 

267 argument to indicate comments in the input file. Any data between the 

268 comment string and the end of the current line is ignored. 

269skipfooter : int, default 0 

270 Rows at the end to skip (0-indexed). 

271convert_float : bool, default True 

272 Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric 

273 data will be read in as floats: Excel stores all numbers as floats 

274 internally. 

275 

276 .. deprecated:: 1.3.0 

277 convert_float will be removed in a future version 

278 

279mangle_dupe_cols : bool, default True 

280 Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 

281 'X'...'X'. Passing in False will cause data to be overwritten if there 

282 are duplicate names in the columns. 

283 

284 .. deprecated:: 1.5.0 

285 Not implemented, and a new argument to specify the pattern for the 

286 names of duplicated columns will be added instead 

287 

288{storage_options} 

289 

290 .. versionadded:: 1.2.0 

291 

292Returns 

293------- 

294DataFrame or dict of DataFrames 

295 DataFrame from the passed in Excel file. See notes in sheet_name 

296 argument for more information on when a dict of DataFrames is returned. 

297 

298See Also 

299-------- 

300DataFrame.to_excel : Write DataFrame to an Excel file. 

301DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. 

302read_csv : Read a comma-separated values (csv) file into DataFrame. 

303read_fwf : Read a table of fixed-width formatted lines into DataFrame. 

304 

305Examples 

306-------- 

307The file can be read using the file name as string or an open file object: 

308 

309>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP 

310 Name Value 

3110 string1 1 

3121 string2 2 

3132 #Comment 3 

314 

315>>> pd.read_excel(open('tmp.xlsx', 'rb'), 

316... sheet_name='Sheet3') # doctest: +SKIP 

317 Unnamed: 0 Name Value 

3180 0 string1 1 

3191 1 string2 2 

3202 2 #Comment 3 

321 

322Index and header can be specified via the `index_col` and `header` arguments 

323 

324>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP 

325 0 1 2 

3260 NaN Name Value 

3271 0.0 string1 1 

3282 1.0 string2 2 

3293 2.0 #Comment 3 

330 

331Column types are inferred but can be explicitly specified 

332 

333>>> pd.read_excel('tmp.xlsx', index_col=0, 

334... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP 

335 Name Value 

3360 string1 1.0 

3371 string2 2.0 

3382 #Comment 3.0 

339 

340True, False, and NA values, and thousands separators have defaults, 

341but can be explicitly specified, too. Supply the values you would like 

342as strings or lists of strings! 

343 

344>>> pd.read_excel('tmp.xlsx', index_col=0, 

345... na_values=['string1', 'string2']) # doctest: +SKIP 

346 Name Value 

3470 NaN 1 

3481 NaN 2 

3492 #Comment 3 

350 

351Comment lines in the excel input file can be skipped using the `comment` kwarg 

352 

353>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP 

354 Name Value 

3550 string1 1.0 

3561 string2 2.0 

3572 None NaN 

358""" 

359) 

360 

361 

362@overload 

363def read_excel( 

364 io, 

365 # sheet name is str or int -> DataFrame 

366 sheet_name: str | int = ..., 

367 header: int | Sequence[int] | None = ..., 

368 names: list[str] | None = ..., 

369 index_col: int | Sequence[int] | None = ..., 

370 usecols: int 

371 | str 

372 | Sequence[int] 

373 | Sequence[str] 

374 | Callable[[str], bool] 

375 | None = ..., 

376 squeeze: bool | None = ..., 

377 dtype: DtypeArg | None = ..., 

378 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., 

379 converters: dict[str, Callable] | dict[int, Callable] | None = ..., 

380 true_values: Iterable[Hashable] | None = ..., 

381 false_values: Iterable[Hashable] | None = ..., 

382 skiprows: Sequence[int] | int | Callable[[int], object] | None = ..., 

383 nrows: int | None = ..., 

384 na_values=..., 

385 keep_default_na: bool = ..., 

386 na_filter: bool = ..., 

387 verbose: bool = ..., 

388 parse_dates: list | dict | bool = ..., 

389 date_parser: Callable | None = ..., 

390 thousands: str | None = ..., 

391 decimal: str = ..., 

392 comment: str | None = ..., 

393 skipfooter: int = ..., 

394 convert_float: bool | None = ..., 

395 mangle_dupe_cols: bool = ..., 

396 storage_options: StorageOptions = ..., 

397) -> DataFrame: 

398 ... 

399 

400 

401@overload 

402def read_excel( 

403 io, 

404 # sheet name is list or None -> dict[IntStrT, DataFrame] 

405 sheet_name: list[IntStrT] | None, 

406 header: int | Sequence[int] | None = ..., 

407 names: list[str] | None = ..., 

408 index_col: int | Sequence[int] | None = ..., 

409 usecols: int 

410 | str 

411 | Sequence[int] 

412 | Sequence[str] 

413 | Callable[[str], bool] 

414 | None = ..., 

415 squeeze: bool | None = ..., 

416 dtype: DtypeArg | None = ..., 

417 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., 

418 converters: dict[str, Callable] | dict[int, Callable] | None = ..., 

419 true_values: Iterable[Hashable] | None = ..., 

420 false_values: Iterable[Hashable] | None = ..., 

421 skiprows: Sequence[int] | int | Callable[[int], object] | None = ..., 

422 nrows: int | None = ..., 

423 na_values=..., 

424 keep_default_na: bool = ..., 

425 na_filter: bool = ..., 

426 verbose: bool = ..., 

427 parse_dates: list | dict | bool = ..., 

428 date_parser: Callable | None = ..., 

429 thousands: str | None = ..., 

430 decimal: str = ..., 

431 comment: str | None = ..., 

432 skipfooter: int = ..., 

433 convert_float: bool | None = ..., 

434 mangle_dupe_cols: bool = ..., 

435 storage_options: StorageOptions = ..., 

436) -> dict[IntStrT, DataFrame]: 

437 ... 

438 

439 

440@doc(storage_options=_shared_docs["storage_options"]) 

441@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None) 

442@deprecate_nonkeyword_arguments(allowed_args=["io", "sheet_name"], version="2.0") 

443@Appender(_read_excel_doc) 

444def read_excel( 

445 io, 

446 sheet_name: str | int | list[IntStrT] | None = 0, 

447 header: int | Sequence[int] | None = 0, 

448 names: list[str] | None = None, 

449 index_col: int | Sequence[int] | None = None, 

450 usecols: int 

451 | str 

452 | Sequence[int] 

453 | Sequence[str] 

454 | Callable[[str], bool] 

455 | None = None, 

456 squeeze: bool | None = None, 

457 dtype: DtypeArg | None = None, 

458 engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None, 

459 converters: dict[str, Callable] | dict[int, Callable] | None = None, 

460 true_values: Iterable[Hashable] | None = None, 

461 false_values: Iterable[Hashable] | None = None, 

462 skiprows: Sequence[int] | int | Callable[[int], object] | None = None, 

463 nrows: int | None = None, 

464 na_values=None, 

465 keep_default_na: bool = True, 

466 na_filter: bool = True, 

467 verbose: bool = False, 

468 parse_dates: list | dict | bool = False, 

469 date_parser: Callable | None = None, 

470 thousands: str | None = None, 

471 decimal: str = ".", 

472 comment: str | None = None, 

473 skipfooter: int = 0, 

474 convert_float: bool | None = None, 

475 mangle_dupe_cols: bool = True, 

476 storage_options: StorageOptions = None, 

477) -> DataFrame | dict[IntStrT, DataFrame]: 

478 

479 should_close = False 

480 if not isinstance(io, ExcelFile): 

481 should_close = True 

482 io = ExcelFile(io, storage_options=storage_options, engine=engine) 

483 elif engine and engine != io.engine: 

484 raise ValueError( 

485 "Engine should not be specified when passing " 

486 "an ExcelFile - ExcelFile already has the engine set" 

487 ) 

488 

489 try: 

490 data = io.parse( 

491 sheet_name=sheet_name, 

492 header=header, 

493 names=names, 

494 index_col=index_col, 

495 usecols=usecols, 

496 squeeze=squeeze, 

497 dtype=dtype, 

498 converters=converters, 

499 true_values=true_values, 

500 false_values=false_values, 

501 skiprows=skiprows, 

502 nrows=nrows, 

503 na_values=na_values, 

504 keep_default_na=keep_default_na, 

505 na_filter=na_filter, 

506 verbose=verbose, 

507 parse_dates=parse_dates, 

508 date_parser=date_parser, 

509 thousands=thousands, 

510 decimal=decimal, 

511 comment=comment, 

512 skipfooter=skipfooter, 

513 convert_float=convert_float, 

514 mangle_dupe_cols=mangle_dupe_cols, 

515 ) 

516 finally: 

517 # make sure to close opened file handles 

518 if should_close: 

519 io.close() 

520 return data 
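
def _read_excel_roundtrip_sketch() -> None:
    # A minimal, self-contained usage sketch of read_excel above (hypothetical
    # helper, not part of the pandas API), assuming an xlsx engine such as
    # openpyxl is installed: sheet_name given as an int or str returns a single
    # DataFrame, while sheet_name=None returns a dict keyed by worksheet name.
    import pandas as pd

    buf = BytesIO()
    with pd.ExcelWriter(buf) as writer:
        pd.DataFrame({"x": [1]}).to_excel(writer, sheet_name="One", index=False)
        pd.DataFrame({"x": [2]}).to_excel(writer, sheet_name="Two", index=False)

    assert isinstance(pd.read_excel(buf, sheet_name="One"), pd.DataFrame)
    assert sorted(pd.read_excel(buf, sheet_name=None)) == ["One", "Two"]
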

521 

522 

523class BaseExcelReader(metaclass=abc.ABCMeta): 

524 def __init__( 

525 self, filepath_or_buffer, storage_options: StorageOptions = None 

526 ) -> None: 

527 # First argument can also be bytes, so create a buffer 

528 if isinstance(filepath_or_buffer, bytes): 

529 filepath_or_buffer = BytesIO(filepath_or_buffer) 

530 

531 self.handles = IOHandles( 

532 handle=filepath_or_buffer, compression={"method": None} 

533 ) 

534 if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): 

535 self.handles = get_handle( 

536 filepath_or_buffer, "rb", storage_options=storage_options, is_text=False 

537 ) 

538 

539 if isinstance(self.handles.handle, self._workbook_class): 

540 self.book = self.handles.handle 

541 elif hasattr(self.handles.handle, "read"): 

542 # N.B. xlrd.Book has a read attribute too 

543 self.handles.handle.seek(0) 

544 try: 

545 self.book = self.load_workbook(self.handles.handle) 

546 except Exception: 

547 self.close() 

548 raise 

549 else: 

550 raise ValueError( 

551 "Must explicitly set engine if not passing in buffer or path for io." 

552 ) 

553 

554 @property 

555 @abc.abstractmethod 

556 def _workbook_class(self): 

557 pass 

558 

559 @abc.abstractmethod 

560 def load_workbook(self, filepath_or_buffer): 

561 pass 

562 

563 def close(self) -> None: 

564 if hasattr(self, "book"): 

565 if hasattr(self.book, "close"): 

566 # pyxlsb: opens a TemporaryFile 

567 # openpyxl: https://stackoverflow.com/questions/31416842/ 

568 # openpyxl-does-not-close-excel-workbook-in-read-only-mode 

569 self.book.close() 

570 elif hasattr(self.book, "release_resources"): 

571 # xlrd 

572 # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548 

573 self.book.release_resources() 

574 self.handles.close() 

575 

576 @property 

577 @abc.abstractmethod 

578 def sheet_names(self) -> list[str]: 

579 pass 

580 

581 @abc.abstractmethod 

582 def get_sheet_by_name(self, name: str): 

583 pass 

584 

585 @abc.abstractmethod 

586 def get_sheet_by_index(self, index: int): 

587 pass 

588 

589 @abc.abstractmethod 

590 def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None): 

591 pass 

592 

593 def raise_if_bad_sheet_by_index(self, index: int) -> None: 

594 n_sheets = len(self.sheet_names) 

595 if index >= n_sheets: 

596 raise ValueError( 

597 f"Worksheet index {index} is invalid, {n_sheets} worksheets found" 

598 ) 

599 

600 def raise_if_bad_sheet_by_name(self, name: str) -> None: 

601 if name not in self.sheet_names: 

602 raise ValueError(f"Worksheet named '{name}' not found") 

603 

604 def _check_skiprows_func( 

605 self, 

606 skiprows: Callable, 

607 rows_to_use: int, 

608 ) -> int: 

609 """ 

610 Determine how many file rows are required to obtain `nrows` data 

611 rows when `skiprows` is a function. 

612 

613 Parameters 

614 ---------- 

615 skiprows : function 

616 The function passed to read_excel by the user. 

617 rows_to_use : int 

618 The number of rows that will be needed for the header and 

619 the data. 

620 

621 Returns 

622 ------- 

623 int 

624 """ 

625 i = 0 

626 rows_used_so_far = 0 

627 while rows_used_so_far < rows_to_use: 

628 if not skiprows(i): 

629 rows_used_so_far += 1 

630 i += 1 

631 return i 

632 

633 def _calc_rows( 

634 self, 

635 header: int | Sequence[int] | None, 

636 index_col: int | Sequence[int] | None, 

637 skiprows: Sequence[int] | int | Callable[[int], object] | None, 

638 nrows: int | None, 

639 ) -> int | None: 

640 """ 

641 If nrows specified, find the number of rows needed from the 

642 file, otherwise return None. 

643 

644 

645 Parameters 

646 ---------- 

647 header : int, list of int, or None 

648 See read_excel docstring. 

649 index_col : int, list of int, or None 

650 See read_excel docstring. 

651 skiprows : list-like, int, callable, or None 

652 See read_excel docstring. 

653 nrows : int or None 

654 See read_excel docstring. 

655 

656 Returns 

657 ------- 

658 int or None 

659 """ 

660 if nrows is None: 

661 return None 

662 if header is None: 

663 header_rows = 1 

664 elif is_integer(header): 

665 header = cast(int, header) 

666 header_rows = 1 + header 

667 else: 

668 header = cast(Sequence, header) 

669 header_rows = 1 + header[-1] 

670 # If there is a MultiIndex header and an index then there is also 

671 # a row containing just the index name(s) 

672 if is_list_like(header) and index_col is not None: 

673 header = cast(Sequence, header) 

674 if len(header) > 1: 

675 header_rows += 1 

676 if skiprows is None: 

677 return header_rows + nrows 

678 if is_integer(skiprows): 

679 skiprows = cast(int, skiprows) 

680 return header_rows + nrows + skiprows 

681 if is_list_like(skiprows): 

682 

683 def f(skiprows: Sequence, x: int) -> bool: 

684 return x in skiprows 

685 

686 skiprows = cast(Sequence, skiprows) 

687 return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows) 

688 if callable(skiprows): 

689 return self._check_skiprows_func( 

690 skiprows, 

691 header_rows + nrows, 

692 ) 

693 # else unexpected skiprows type: read_excel will not optimize 

694 # the number of rows read from file 

695 return None 

696 

697 def parse( 

698 self, 

699 sheet_name: str | int | list[int] | list[str] | None = 0, 

700 header: int | Sequence[int] | None = 0, 

701 names=None, 

702 index_col: int | Sequence[int] | None = None, 

703 usecols=None, 

704 squeeze: bool | None = None, 

705 dtype: DtypeArg | None = None, 

706 true_values: Iterable[Hashable] | None = None, 

707 false_values: Iterable[Hashable] | None = None, 

708 skiprows: Sequence[int] | int | Callable[[int], object] | None = None, 

709 nrows: int | None = None, 

710 na_values=None, 

711 verbose: bool = False, 

712 parse_dates: list | dict | bool = False, 

713 date_parser: Callable | None = None, 

714 thousands: str | None = None, 

715 decimal: str = ".", 

716 comment: str | None = None, 

717 skipfooter: int = 0, 

718 convert_float: bool | None = None, 

719 mangle_dupe_cols: bool = True, 

720 **kwds, 

721 ): 

722 

723 if convert_float is None: 

724 convert_float = True 

725 else: 

726 warnings.warn( 

727 "convert_float is deprecated and will be removed in a future version.", 

728 FutureWarning, 

729 stacklevel=find_stack_level(), 

730 ) 

731 

732 validate_header_arg(header) 

733 validate_integer("nrows", nrows) 

734 

735 ret_dict = False 

736 

737 # Keep sheetname to maintain backwards compatibility. 

738 sheets: list[int] | list[str] 

739 if isinstance(sheet_name, list): 

740 sheets = sheet_name 

741 ret_dict = True 

742 elif sheet_name is None: 

743 sheets = self.sheet_names 

744 ret_dict = True 

745 elif isinstance(sheet_name, str): 

746 sheets = [sheet_name] 

747 else: 

748 sheets = [sheet_name] 

749 

750 # handle same-type duplicates. 

751 sheets = cast(Union[List[int], List[str]], list(dict.fromkeys(sheets).keys())) 

752 

753 output = {} 

754 

755 for asheetname in sheets: 

756 if verbose: 

757 print(f"Reading sheet {asheetname}") 

758 

759 if isinstance(asheetname, str): 

760 sheet = self.get_sheet_by_name(asheetname) 

761 else: # assume an integer if not a string 

762 sheet = self.get_sheet_by_index(asheetname) 

763 

764 file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows) 

765 data = self.get_sheet_data(sheet, convert_float, file_rows_needed) 

766 if hasattr(sheet, "close"): 

767 # pyxlsb opens two TemporaryFiles 

768 sheet.close() 

769 usecols = maybe_convert_usecols(usecols) 

770 

771 if not data: 

772 output[asheetname] = DataFrame() 

773 continue 

774 

775 is_list_header = False 

776 is_len_one_list_header = False 

777 if is_list_like(header): 

778 assert isinstance(header, Sequence) 

779 is_list_header = True 

780 if len(header) == 1: 

781 is_len_one_list_header = True 

782 

783 if is_len_one_list_header: 

784 header = cast(Sequence[int], header)[0] 

785 

786 # forward fill and pull out names for MultiIndex column 

787 header_names = None 

788 if header is not None and is_list_like(header): 

789 assert isinstance(header, Sequence) 

790 

791 header_names = [] 

792 control_row = [True] * len(data[0]) 

793 

794 for row in header: 

795 if is_integer(skiprows): 

796 assert isinstance(skiprows, int) 

797 row += skiprows 

798 

799 if row > len(data) - 1: 

800 raise ValueError( 

801 f"header index {row} exceeds maximum index " 

802 f"{len(data) - 1} of data.", 

803 ) 

804 

805 data[row], control_row = fill_mi_header(data[row], control_row) 

806 

807 if index_col is not None: 

808 header_name, _ = pop_header_name(data[row], index_col) 

809 header_names.append(header_name) 

810 

811 # If there is a MultiIndex header and an index then there is also 

812 # a row containing just the index name(s) 

813 has_index_names = False 

814 if is_list_header and not is_len_one_list_header and index_col is not None: 

815 

816 index_col_list: Sequence[int] 

817 if isinstance(index_col, int): 

818 index_col_list = [index_col] 

819 else: 

820 assert isinstance(index_col, Sequence) 

821 index_col_list = index_col 

822 

823 # We have to handle mi without names. If any of the entries in the data 

824 # columns are not empty, this is a regular row 

825 assert isinstance(header, Sequence) 

826 if len(header) < len(data): 

827 potential_index_names = data[len(header)] 

828 potential_data = [ 

829 x 

830 for i, x in enumerate(potential_index_names) 

831 if not control_row[i] and i not in index_col_list 

832 ] 

833 has_index_names = all(x == "" or x is None for x in potential_data) 

834 

835 if is_list_like(index_col): 

836 # Forward fill values for MultiIndex index. 

837 if header is None: 

838 offset = 0 

839 elif isinstance(header, int): 

840 offset = 1 + header 

841 else: 

842 offset = 1 + max(header) 

843 

844 # GH34673: if MultiIndex names present and not defined in the header, 

845 # offset needs to be incremented so that forward filling starts 

846 # from the first MI value instead of the name 

847 if has_index_names: 

848 offset += 1 

849 

850 # Check if we have an empty dataset 

851 # before trying to collect data. 

852 if offset < len(data): 

853 assert isinstance(index_col, Sequence) 

854 

855 for col in index_col: 

856 last = data[offset][col] 

857 

858 for row in range(offset + 1, len(data)): 

859 if data[row][col] == "" or data[row][col] is None: 

860 data[row][col] = last 

861 else: 

862 last = data[row][col] 

863 

864 # GH 12292 : error when read one empty column from excel file 

865 try: 

866 parser = TextParser( 

867 data, 

868 names=names, 

869 header=header, 

870 index_col=index_col, 

871 has_index_names=has_index_names, 

872 squeeze=squeeze, 

873 dtype=dtype, 

874 true_values=true_values, 

875 false_values=false_values, 

876 skiprows=skiprows, 

877 nrows=nrows, 

878 na_values=na_values, 

879 skip_blank_lines=False, # GH 39808 

880 parse_dates=parse_dates, 

881 date_parser=date_parser, 

882 thousands=thousands, 

883 decimal=decimal, 

884 comment=comment, 

885 skipfooter=skipfooter, 

886 usecols=usecols, 

887 mangle_dupe_cols=mangle_dupe_cols, 

888 **kwds, 

889 ) 

890 

891 output[asheetname] = parser.read(nrows=nrows) 

892 

893 if not squeeze or isinstance(output[asheetname], DataFrame): 

894 if header_names: 

895 output[asheetname].columns = output[ 

896 asheetname 

897 ].columns.set_names(header_names) 

898 

899 except EmptyDataError: 

900 # No Data, return an empty DataFrame 

901 output[asheetname] = DataFrame() 

902 

903 if ret_dict: 

904 return output 

905 else: 

906 return output[asheetname] 
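
def _row_count_arithmetic_sketch() -> None:
    # A worked example of the bookkeeping in BaseExcelReader._calc_rows and
    # _check_skiprows_func above (hypothetical helper, illustration only).
    #
    # With header=[0, 1], index_col=0, skiprows=2 and nrows=10: two header rows
    # plus the extra row holding the index name(s) give header_rows == 3, so
    # 3 + 10 + 2 == 15 file rows are needed.
    header = [0, 1]
    header_rows = 1 + header[-1]  # rows consumed by the MultiIndex header
    header_rows += 1  # row containing just the index name(s), since index_col is given
    assert header_rows + 10 + 2 == 15

    # With a callable skiprows such as ``lambda x: x in [0, 2]`` and three
    # header/data rows required, rows 0 and 2 are skipped, so five file rows
    # must be scanned before enough usable rows have been seen.
    def skip(x: int) -> bool:
        return x in [0, 2]

    i = rows_used = 0
    while rows_used < 3:
        if not skip(i):
            rows_used += 1
        i += 1
    assert i == 5
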

907 

908 

909@doc(storage_options=_shared_docs["storage_options"]) 

910class ExcelWriter(metaclass=abc.ABCMeta): 

911 """ 

912 Class for writing DataFrame objects into excel sheets. 

913 

914 Default is to use: 

915 

916 * `xlwt <https://pypi.org/project/xlwt/>`__ for xls files 

917 * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter 

918 is installed otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__ 

919 * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files 

920 

921 See ``DataFrame.to_excel`` for typical usage. 

922 

923 The writer should be used as a context manager. Otherwise, call `close()` to save 

924 and close any opened file handles. 

925 

926 Parameters 

927 ---------- 

928 path : str or typing.BinaryIO 

929 Path to xls or xlsx or ods file. 

930 engine : str (optional) 

931 Engine to use for writing. If None, defaults to 

932 ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword 

933 argument. 

934 

935 .. deprecated:: 1.2.0 

936 

937 As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer 

938 maintained, the ``xlwt`` engine will be removed in a future 

939 version of pandas. 

940 

941 date_format : str, default None 

942 Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). 

943 datetime_format : str, default None 

944 Format string for datetime objects written into Excel files. 

945 (e.g. 'YYYY-MM-DD HH:MM:SS'). 

946 mode : {{'w', 'a'}}, default 'w' 

947 File mode to use (write or append). Append does not work with fsspec URLs. 

948 {storage_options} 

949 

950 .. versionadded:: 1.2.0 

951 

952 if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error' 

953 How to behave when trying to write to a sheet that already 

954 exists (append mode only). 

955 

956 * error: raise a ValueError. 

957 * new: Create a new sheet, with a name determined by the engine. 

958 * replace: Delete the contents of the sheet before writing to it. 

959 * overlay: Write contents to the existing sheet without removing the old 

960 contents. 

961 

962 .. versionadded:: 1.3.0 

963 

964 .. versionchanged:: 1.4.0 

965 

966 Added ``overlay`` option 

967 

968 engine_kwargs : dict, optional 

969 Keyword arguments to be passed into the engine. These will be passed to 

970 the following functions of the respective engines: 

971 

972 * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)`` 

973 * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)`` 

974 * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)`` 

975 * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` 

976 

977 .. versionadded:: 1.3.0 

978 **kwargs : dict, optional 

979 Keyword arguments to be passed into the engine. 

980 

981 .. deprecated:: 1.3.0 

982 

983 Use engine_kwargs instead. 

984 

985 Notes 

986 ----- 

987 For compatibility with CSV writers, ExcelWriter serializes lists 

988 and dicts to strings before writing. 

989 

990 Examples 

991 -------- 

992 Default usage: 

993 

994 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP 

995 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer: 

996 ... df.to_excel(writer) # doctest: +SKIP 

997 

998 To write to separate sheets in a single file: 

999 

1000 >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) # doctest: +SKIP 

1001 >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP 

1002 >>> with pd.ExcelWriter("path_to_file.xlsx") as writer: 

1003 ... df1.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP 

1004 ... df2.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP 

1005 

1006 You can set the date format or datetime format: 

1007 

1008 >>> from datetime import date, datetime # doctest: +SKIP 

1009 >>> df = pd.DataFrame( 

1010 ... [ 

1011 ... [date(2014, 1, 31), date(1999, 9, 24)], 

1012 ... [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], 

1013 ... ], 

1014 ... index=["Date", "Datetime"], 

1015 ... columns=["X", "Y"], 

1016 ... ) # doctest: +SKIP 

1017 >>> with pd.ExcelWriter( 

1018 ... "path_to_file.xlsx", 

1019 ... date_format="YYYY-MM-DD", 

1020 ... datetime_format="YYYY-MM-DD HH:MM:SS" 

1021 ... ) as writer: 

1022 ... df.to_excel(writer) # doctest: +SKIP 

1023 

1024 You can also append to an existing Excel file: 

1025 

1026 >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer: 

1027 ... df.to_excel(writer, sheet_name="Sheet3") # doctest: +SKIP 

1028 

1029 Here, the `if_sheet_exists` parameter can be set to replace a sheet if it 

1030 already exists: 

1031 

1032 >>> with ExcelWriter( 

1033 ... "path_to_file.xlsx", 

1034 ... mode="a", 

1035 ... engine="openpyxl", 

1036 ... if_sheet_exists="replace", 

1037 ... ) as writer: 

1038 ... df.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP 

1039 

1040 You can also write multiple DataFrames to a single sheet. Note that the 

1041 ``if_sheet_exists`` parameter needs to be set to ``overlay``: 

1042 

1043 >>> with ExcelWriter("path_to_file.xlsx", 

1044 ... mode="a", 

1045 ... engine="openpyxl", 

1046 ... if_sheet_exists="overlay", 

1047 ... ) as writer: 

1048 ... df1.to_excel(writer, sheet_name="Sheet1") 

1049 ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) # doctest: +SKIP 

1050 

1051 You can store the Excel file in RAM: 

1052 

1053 >>> import io 

1054 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) 

1055 >>> buffer = io.BytesIO() 

1056 >>> with pd.ExcelWriter(buffer) as writer: 

1057 ... df.to_excel(writer) 

1058 

1059 You can pack the Excel file into a zip archive: 

1060 

1061 >>> import zipfile # doctest: +SKIP 

1062 >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP 

1063 >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf: 

1064 ... with zf.open("filename.xlsx", "w") as buffer: 

1065 ... with pd.ExcelWriter(buffer) as writer: 

1066 ... df.to_excel(writer) # doctest: +SKIP 

1067 

1068 You can specify additional arguments to the underlying engine: 

1069 

1070 >>> with pd.ExcelWriter( 

1071 ... "path_to_file.xlsx", 

1072 ... engine="xlsxwriter", 

1073 ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}} 

1074 ... ) as writer: 

1075 ... df.to_excel(writer) # doctest: +SKIP 

1076 

1077 In append mode, ``engine_kwargs`` are passed through to 

1078 openpyxl's ``load_workbook``: 

1079 

1080 >>> with pd.ExcelWriter( 

1081 ... "path_to_file.xlsx", 

1082 ... engine="openpyxl", 

1083 ... mode="a", 

1084 ... engine_kwargs={{"keep_vba": True}} 

1085 ... ) as writer: 

1086 ... df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP 

1087 """ 

1088 

1089 # Defining an ExcelWriter implementation (see abstract methods for more...) 

1090 

1091 # - Mandatory 

1092 # - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)`` 

1093 # --> called to write additional DataFrames to disk 

1094 # - ``_supported_extensions`` (tuple of supported extensions), used to 

1095 # check that engine supports the given extension. 

1096 # - ``_engine`` - string that gives the engine name. Necessary to 

1097 # instantiate class directly and bypass ``ExcelWriterMeta`` engine 

1098 # lookup. 

1099 # - ``save(self)`` --> called to save file to disk 

1100 # - Mostly mandatory (i.e. should at least exist) 

1101 # - book, cur_sheet, path 

1102 

1103 # - Optional: 

1104 # - ``__init__(self, path, engine=None, **kwargs)`` --> always called 

1105 # with path as first argument. 

1106 

1107 # You also need to register the class with ``register_writer()``. 

1108 # Technically, ExcelWriter implementations don't need to subclass 

1109 # ExcelWriter. 

1110 

1111 _engine: str 

1112 _supported_extensions: tuple[str, ...] 

1113 

1114 def __new__( 

1115 cls: type[ExcelWriter], 

1116 path: FilePath | WriteExcelBuffer | ExcelWriter, 

1117 engine: str | None = None, 

1118 date_format: str | None = None, 

1119 datetime_format: str | None = None, 

1120 mode: str = "w", 

1121 storage_options: StorageOptions = None, 

1122 if_sheet_exists: Literal["error", "new", "replace", "overlay"] | None = None, 

1123 engine_kwargs: dict | None = None, 

1124 **kwargs, 

1125 ) -> ExcelWriter: 

1126 if kwargs: 

1127 if engine_kwargs is not None: 

1128 raise ValueError("Cannot use both engine_kwargs and **kwargs") 

1129 warnings.warn( 

1130 "Use of **kwargs is deprecated, use engine_kwargs instead.", 

1131 FutureWarning, 

1132 stacklevel=find_stack_level(), 

1133 ) 

1134 

1135 # only switch class if generic(ExcelWriter) 

1136 if cls is ExcelWriter: 

1137 if engine is None or (isinstance(engine, str) and engine == "auto"): 

1138 if isinstance(path, str): 

1139 ext = os.path.splitext(path)[-1][1:] 

1140 else: 

1141 ext = "xlsx" 

1142 

1143 try: 

1144 engine = config.get_option(f"io.excel.{ext}.writer", silent=True) 

1145 if engine == "auto": 

1146 engine = get_default_engine(ext, mode="writer") 

1147 except KeyError as err: 

1148 raise ValueError(f"No engine for filetype: '{ext}'") from err 

1149 

1150 if engine == "xlwt": 

1151 xls_config_engine = config.get_option( 

1152 "io.excel.xls.writer", silent=True 

1153 ) 

1154 # Don't warn a 2nd time if user has changed the default engine for xls 

1155 if xls_config_engine != "xlwt": 

1156 warnings.warn( 

1157 "As the xlwt package is no longer maintained, the xlwt " 

1158 "engine will be removed in a future version of pandas. " 

1159 "This is the only engine in pandas that supports writing " 

1160 "in the xls format. Install openpyxl and write to an xlsx " 

1161 "file instead. You can set the option io.excel.xls.writer " 

1162 "to 'xlwt' to silence this warning. While this option is " 

1163 "deprecated and will also raise a warning, it can " 

1164 "be globally set and the warning suppressed.", 

1165 FutureWarning, 

1166 stacklevel=find_stack_level(), 

1167 ) 

1168 

1169 # for mypy 

1170 assert engine is not None 

1171 cls = get_writer(engine) 

1172 

1173 return object.__new__(cls) 

1174 

1175 # declare external properties you can count on 

1176 _path = None 

1177 

1178 @property 

1179 def supported_extensions(self) -> tuple[str, ...]: 

1180 """Extensions that writer engine supports.""" 

1181 return self._supported_extensions 

1182 

1183 @property 

1184 def engine(self) -> str: 

1185 """Name of engine.""" 

1186 return self._engine 

1187 

1188 @property 

1189 @abc.abstractmethod 

1190 def sheets(self) -> dict[str, Any]: 

1191 """Mapping of sheet names to sheet objects.""" 

1192 pass 

1193 

1194 # mypy doesn't handle abstract setters prior to 0.981 

1195 # https://github.com/python/mypy/issues/4165 

1196 @property # type: ignore[misc] 

1197 @abc.abstractmethod 

1198 def book(self): 

1199 """ 

1200 Book instance. Class type will depend on the engine used. 

1201 

1202 This attribute can be used to access engine-specific features. 

1203 """ 

1204 pass 

1205 

1206 # mypy doesn't handle abstract setters prior to 0.981 

1207 # https://github.com/python/mypy/issues/4165 

1208 @book.setter # type: ignore[misc] 

1209 @abc.abstractmethod 

1210 def book(self, other) -> None: 

1211 """ 

1212 Set book instance. Class type will depend on the engine used. 

1213 """ 

1214 pass 

1215 

1216 def write_cells( 

1217 self, 

1218 cells, 

1219 sheet_name: str | None = None, 

1220 startrow: int = 0, 

1221 startcol: int = 0, 

1222 freeze_panes: tuple[int, int] | None = None, 

1223 ) -> None: 

1224 """ 

1225 Write given formatted cells into an Excel sheet. 

1226 

1227 .. deprecated:: 1.5.0 

1228 

1229 Parameters 

1230 ---------- 

1231 cells : generator 

1232 cell of formatted data to save to Excel sheet 

1233 sheet_name : str, default None 

1234 Name of Excel sheet, if None, then use self.cur_sheet 

1235 startrow : upper left cell row to dump data frame 

1236 startcol : upper left cell column to dump data frame 

1237 freeze_panes: int tuple of length 2 

1238 contains the bottom-most row and right-most column to freeze 

1239 """ 

1240 self._deprecate("write_cells") 

1241 return self._write_cells(cells, sheet_name, startrow, startcol, freeze_panes) 

1242 

1243 @abc.abstractmethod 

1244 def _write_cells( 

1245 self, 

1246 cells, 

1247 sheet_name: str | None = None, 

1248 startrow: int = 0, 

1249 startcol: int = 0, 

1250 freeze_panes: tuple[int, int] | None = None, 

1251 ) -> None: 

1252 """ 

1253 Write given formatted cells into an Excel sheet. 

1254 

1255 Parameters 

1256 ---------- 

1257 cells : generator 

1258 cell of formatted data to save to Excel sheet 

1259 sheet_name : str, default None 

1260 Name of Excel sheet, if None, then use self.cur_sheet 

1261 startrow : upper left cell row to dump data frame 

1262 startcol : upper left cell column to dump data frame 

1263 freeze_panes: int tuple of length 2 

1264 contains the bottom-most row and right-most column to freeze 

1265 """ 

1266 pass 

1267 

1268 def save(self) -> None: 

1269 """ 

1270 Save workbook to disk. 

1271 

1272 .. deprecated:: 1.5.0 

1273 """ 

1274 self._deprecate("save") 

1275 return self._save() 

1276 

1277 @abc.abstractmethod 

1278 def _save(self) -> None: 

1279 """ 

1280 Save workbook to disk. 

1281 """ 

1282 pass 

1283 

1284 def __init__( 

1285 self, 

1286 path: FilePath | WriteExcelBuffer | ExcelWriter, 

1287 engine: str | None = None, 

1288 date_format: str | None = None, 

1289 datetime_format: str | None = None, 

1290 mode: str = "w", 

1291 storage_options: StorageOptions = None, 

1292 if_sheet_exists: str | None = None, 

1293 engine_kwargs: dict[str, Any] | None = None, 

1294 **kwargs, 

1295 ) -> None: 

1296 # validate that this engine can handle the extension 

1297 if isinstance(path, str): 

1298 ext = os.path.splitext(path)[-1] 

1299 self.check_extension(ext) 

1300 

1301 # use mode to open the file 

1302 if "b" not in mode: 

1303 mode += "b" 

1304 # use "a" for the user to append data to excel but internally use "r+" to let 

1305 # the excel backend first read the existing file and then write any data to it 

1306 mode = mode.replace("a", "r+") 

1307 

1308 # cast ExcelWriter to avoid adding 'if self.handles is not None' 

1309 self._handles = IOHandles( 

1310 cast(IO[bytes], path), compression={"compression": None} 

1311 ) 

1312 if not isinstance(path, ExcelWriter): 

1313 self._handles = get_handle( 

1314 path, mode, storage_options=storage_options, is_text=False 

1315 ) 

1316 self._cur_sheet = None 

1317 

1318 if date_format is None: 

1319 self._date_format = "YYYY-MM-DD" 

1320 else: 

1321 self._date_format = date_format 

1322 if datetime_format is None: 

1323 self._datetime_format = "YYYY-MM-DD HH:MM:SS" 

1324 else: 

1325 self._datetime_format = datetime_format 

1326 

1327 self._mode = mode 

1328 

1329 if if_sheet_exists not in (None, "error", "new", "replace", "overlay"): 

1330 raise ValueError( 

1331 f"'{if_sheet_exists}' is not valid for if_sheet_exists. " 

1332 "Valid options are 'error', 'new', 'replace' and 'overlay'." 

1333 ) 

1334 if if_sheet_exists and "r+" not in mode: 

1335 raise ValueError("if_sheet_exists is only valid in append mode (mode='a')") 

1336 if if_sheet_exists is None: 

1337 if_sheet_exists = "error" 

1338 self._if_sheet_exists = if_sheet_exists 

1339 

1340 def _deprecate(self, attr: str): 

1341 """ 

1342 Deprecate attribute or method for ExcelWriter. 

1343 """ 

1344 warnings.warn( 

1345 f"{attr} is not part of the public API, usage can give unexpected " 

1346 "results and will be removed in a future version", 

1347 FutureWarning, 

1348 stacklevel=find_stack_level(), 

1349 ) 

1350 

1351 def _deprecate_set_book(self) -> None: 

1352 """ 

1353 Deprecate setting the book attribute - GH#48780. 

1354 """ 

1355 warnings.warn( 

1356 "Setting the `book` attribute is not part of the public API, " 

1357 "usage can give unexpected or corrupted results and will be " 

1358 "removed in a future version", 

1359 FutureWarning, 

1360 stacklevel=find_stack_level(), 

1361 ) 

1362 

1363 @property 

1364 def date_format(self) -> str: 

1365 """ 

1366 Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). 

1367 """ 

1368 return self._date_format 

1369 

1370 @property 

1371 def datetime_format(self) -> str: 

1372 """ 

1373 Format string for datetime objects written into Excel files (e.g. 'YYYY-MM-DD HH:MM:SS'). 

1374 """ 

1375 return self._datetime_format 

1376 

1377 @property 

1378 def if_sheet_exists(self) -> str: 

1379 """ 

1380 How to behave when writing to a sheet that already exists in append mode. 

1381 """ 

1382 return self._if_sheet_exists 

1383 

1384 @property 

1385 def cur_sheet(self): 

1386 """ 

1387 Current sheet for writing. 

1388 

1389 .. deprecated:: 1.5.0 

1390 """ 

1391 self._deprecate("cur_sheet") 

1392 return self._cur_sheet 

1393 

1394 @property 

1395 def handles(self) -> IOHandles[bytes]: 

1396 """ 

1397 Handles to Excel sheets. 

1398 

1399 .. deprecated:: 1.5.0 

1400 """ 

1401 self._deprecate("handles") 

1402 return self._handles 

1403 

1404 @property 

1405 def path(self): 

1406 """ 

1407 Path to Excel file. 

1408 

1409 .. deprecated:: 1.5.0 

1410 """ 

1411 self._deprecate("path") 

1412 return self._path 

1413 

1414 def __fspath__(self) -> str: 

1415 return getattr(self._handles.handle, "name", "") 

1416 

1417 def _get_sheet_name(self, sheet_name: str | None) -> str: 

1418 if sheet_name is None: 

1419 sheet_name = self._cur_sheet 

1420 if sheet_name is None: # pragma: no cover 

1421 raise ValueError("Must pass explicit sheet_name or set _cur_sheet property") 

1422 return sheet_name 

1423 

1424 def _value_with_fmt(self, val) -> tuple[object, str | None]: 

1425 """ 

1426 Convert numpy types to Python types for the Excel writers. 

1427 

1428 Parameters 

1429 ---------- 

1430 val : object 

1431 Value to be written into cells 

1432 

1433 Returns 

1434 ------- 

1435 Tuple with the first element being the converted value and the second 

1436 being an optional format 

1437 """ 

1438 fmt = None 

1439 

1440 if is_integer(val): 

1441 val = int(val) 

1442 elif is_float(val): 

1443 val = float(val) 

1444 elif is_bool(val): 

1445 val = bool(val) 

1446 elif isinstance(val, datetime.datetime): 

1447 fmt = self._datetime_format 

1448 elif isinstance(val, datetime.date): 

1449 fmt = self._date_format 

1450 elif isinstance(val, datetime.timedelta): 

1451 val = val.total_seconds() / 86400 

1452 fmt = "0" 

1453 else: 

1454 val = str(val) 

1455 

1456 return val, fmt 

1457 

1458 @classmethod 

1459 def check_extension(cls, ext: str) -> Literal[True]: 

1460 """ 

1461 Check the path's extension against the writer's supported 

1462 extensions; raise a ValueError if it is not supported. 

1463 """ 

1464 if ext.startswith("."): 

1465 ext = ext[1:] 

1466 if not any(ext in extension for extension in cls._supported_extensions): 

1467 raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'") 

1468 else: 

1469 return True 

1470 

1471 # Allow use as a contextmanager 

1472 def __enter__(self) -> ExcelWriter: 

1473 return self 

1474 

1475 def __exit__(self, exc_type, exc_value, traceback) -> None: 

1476 self.close() 

1477 

1478 def close(self) -> None: 

1479 """synonym for save, to make it more file-like""" 

1480 self._save() 

1481 self._handles.close() 
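
def _writer_value_fmt_sketch() -> None:
    # Two quick checks of conversions performed by ExcelWriter above
    # (hypothetical helper, illustration only). _value_with_fmt stores a
    # timedelta as Excel's fractional-day serial number, so 36 hours maps to
    # 1.5 days written with the "0" number format:
    assert datetime.timedelta(hours=36).total_seconds() / 86400 == 1.5

    # __new__ resolves a default engine from the file extension by stripping
    # the leading dot before consulting the "io.excel.<ext>.writer" option:
    assert os.path.splitext("report.xlsx")[-1][1:] == "xlsx"
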

1482 

1483 

1484XLS_SIGNATURES = ( 

1485 b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2 

1486 b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3 

1487 b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4 

1488 b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary 

1489) 

1490ZIP_SIGNATURE = b"PK\x03\x04" 

1491PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,))) 
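
def _signature_peek_sketch() -> None:
    # A minimal sketch of the signature check performed by inspect_excel_format
    # below (hypothetical helper): the first PEEK_SIZE bytes decide whether a
    # stream looks like an old-style xls container or a zip-based format such
    # as xlsx/xlsm/xlsb/ods.
    peek = BytesIO(b"PK\x03\x04" + b"\x00" * 16).read(PEEK_SIZE)
    assert peek.startswith(ZIP_SIGNATURE)
    assert not any(peek.startswith(sig) for sig in XLS_SIGNATURES)
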

1492 

1493 

1494@doc(storage_options=_shared_docs["storage_options"]) 

1495def inspect_excel_format( 

1496 content_or_path: FilePath | ReadBuffer[bytes], 

1497 storage_options: StorageOptions = None, 

1498) -> str | None: 

1499 """ 

1500 Inspect the path or content of an excel file and get its format. 

1501 

1502 Adapted from xlrd: https://github.com/python-excel/xlrd. 

1503 

1504 Parameters 

1505 ---------- 

1506 content_or_path : str or file-like object 

1507 Path to file or content of file to inspect. May be a URL. 

1508 {storage_options} 

1509 

1510 Returns 

1511 ------- 

1512 str or None 

1513 Format of file if it can be determined. 

1514 

1515 Raises 

1516 ------ 

1517 ValueError 

1518 If resulting stream is empty. 

1519 BadZipFile 

1520 If resulting stream does not have an XLS signature and is not a valid zipfile. 

1521 """ 

1522 if isinstance(content_or_path, bytes): 

1523 content_or_path = BytesIO(content_or_path) 

1524 

1525 with get_handle( 

1526 content_or_path, "rb", storage_options=storage_options, is_text=False 

1527 ) as handle: 

1528 stream = handle.handle 

1529 stream.seek(0) 

1530 buf = stream.read(PEEK_SIZE) 

1531 if buf is None: 

1532 raise ValueError("stream is empty") 

1533 else: 

1534 assert isinstance(buf, bytes) 

1535 peek = buf 

1536 stream.seek(0) 

1537 

1538 if any(peek.startswith(sig) for sig in XLS_SIGNATURES): 

1539 return "xls" 

1540 elif not peek.startswith(ZIP_SIGNATURE): 

1541 return None 

1542 

1543 with zipfile.ZipFile(stream) as zf: 

1544 # Workaround for some third party files that use forward slashes and 

1545 # lower case names. 

1546 component_names = [ 

1547 name.replace("\\", "/").lower() for name in zf.namelist() 

1548 ] 

1549 

1550 if "xl/workbook.xml" in component_names: 

1551 return "xlsx" 

1552 if "xl/workbook.bin" in component_names: 

1553 return "xlsb" 

1554 if "content.xml" in component_names: 

1555 return "ods" 

1556 return "zip" 
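
def _inspect_excel_format_sketch() -> None:
    # A minimal usage sketch of inspect_excel_format above (hypothetical helper,
    # stdlib only): an in-memory zip archive that contains "xl/workbook.xml" is
    # classified as "xlsx".
    buf = BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("xl/workbook.xml", "<workbook/>")
    assert inspect_excel_format(buf.getvalue()) == "xlsx"
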

1557 

1558 

1559class ExcelFile: 

1560 """ 

1561 Class for parsing tabular excel sheets into DataFrame objects. 

1562 

1563 See read_excel for more documentation. 

1564 

1565 Parameters 

1566 ---------- 

1567 path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath), 

1568 a file-like object, xlrd workbook or openpyxl workbook. 

1569 If a string or path object, expected to be a path to a 

1570 .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. 

1571 engine : str, default None 

1572 If io is not a buffer or path, this must be set to identify io. 

1573 Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` 

1574 Engine compatibility : 

1575 

1576 - ``xlrd`` supports old-style Excel files (.xls). 

1577 - ``openpyxl`` supports newer Excel file formats. 

1578 - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). 

1579 - ``pyxlsb`` supports Binary Excel files. 

1580 

1581 .. versionchanged:: 1.2.0 

1582 

1583 The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ 

1584 now only supports old-style ``.xls`` files. 

1585 When ``engine=None``, the following logic will be 

1586 used to determine the engine: 

1587 

1588 - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), 

1589 then `odf <https://pypi.org/project/odfpy/>`_ will be used. 

1590 - Otherwise if ``path_or_buffer`` is an xls format, 

1591 ``xlrd`` will be used. 

1592 - Otherwise if ``path_or_buffer`` is in xlsb format, 

1593 `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used. 

1594 

1595 .. versionadded:: 1.3.0 

1596 - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, 

1597 then ``openpyxl`` will be used. 

1598 - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. 

1599 - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. 

1600 This case will raise a ``ValueError`` in a future version of pandas. 

1601 

1602 .. warning:: 

1603 

1604 Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. 

1605 This is not supported, switch to using ``openpyxl`` instead. 

1606 """ 

1607 

1608 from pandas.io.excel._odfreader import ODFReader 

1609 from pandas.io.excel._openpyxl import OpenpyxlReader 

1610 from pandas.io.excel._pyxlsb import PyxlsbReader 

1611 from pandas.io.excel._xlrd import XlrdReader 

1612 

1613 _engines: Mapping[str, Any] = { 

1614 "xlrd": XlrdReader, 

1615 "openpyxl": OpenpyxlReader, 

1616 "odf": ODFReader, 

1617 "pyxlsb": PyxlsbReader, 

1618 } 

1619 

1620 def __init__( 

1621 self, 

1622 path_or_buffer, 

1623 engine: str | None = None, 

1624 storage_options: StorageOptions = None, 

1625 ) -> None: 

1626 if engine is not None and engine not in self._engines: 

1627 raise ValueError(f"Unknown engine: {engine}") 

1628 

1629 # First argument can also be bytes, so create a buffer 

1630 if isinstance(path_or_buffer, bytes): 

1631 path_or_buffer = BytesIO(path_or_buffer) 

1632 

1633 # Could be a str, ExcelFile, Book, etc. 

1634 self.io = path_or_buffer 

1635 # Always a string 

1636 self._io = stringify_path(path_or_buffer) 

1637 

1638 # Determine xlrd version if installed 

1639 if import_optional_dependency("xlrd", errors="ignore") is None: 

1640 xlrd_version = None 

1641 else: 

1642 import xlrd 

1643 

1644 xlrd_version = Version(get_version(xlrd)) 

1645 

1646 ext = None 

1647 if engine is None: 

1648 # Only determine ext if it is needed 

1649 if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): 

1650 ext = "xls" 

1651 else: 

1652 ext = inspect_excel_format( 

1653 content_or_path=path_or_buffer, storage_options=storage_options 

1654 ) 

1655 if ext is None: 

1656 raise ValueError( 

1657 "Excel file format cannot be determined, you must specify " 

1658 "an engine manually." 

1659 ) 

1660 

1661 engine = config.get_option(f"io.excel.{ext}.reader", silent=True) 

1662 if engine == "auto": 

1663 engine = get_default_engine(ext, mode="reader") 

1664 

1665 if engine == "xlrd" and xlrd_version is not None: 

1666 if ext is None: 

1667 # Need to determine ext in order to raise/warn 

1668 if isinstance(path_or_buffer, xlrd.Book): 

1669 ext = "xls" 

1670 else: 

1671 ext = inspect_excel_format( 

1672 path_or_buffer, storage_options=storage_options 

1673 ) 

1674 

1675 # Pass through if ext is None, otherwise check if ext valid for xlrd 

1676 if ext and ext != "xls" and xlrd_version >= Version("2"): 

1677 raise ValueError( 

1678 f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " 

1679 f"only the xls format is supported. Install openpyxl instead." 

1680 ) 

1681 elif ext and ext != "xls": 

1682 stacklevel = find_stack_level() 

1683 warnings.warn( 

1684 f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " 

1685 f"only the xls format is supported. Install " 

1686 f"openpyxl instead.", 

1687 FutureWarning, 

1688 stacklevel=stacklevel, 

1689 ) 

1690 

1691 assert engine is not None 

1692 self.engine = engine 

1693 self.storage_options = storage_options 

1694 

1695 self._reader = self._engines[engine](self._io, storage_options=storage_options) 

1696 

1697 def __fspath__(self): 

1698 return self._io 

1699 

1700 def parse( 

1701 self, 

1702 sheet_name: str | int | list[int] | list[str] | None = 0, 

1703 header: int | Sequence[int] | None = 0, 

1704 names=None, 

1705 index_col: int | Sequence[int] | None = None, 

1706 usecols=None, 

1707 squeeze: bool | None = None, 

1708 converters=None, 

1709 true_values: Iterable[Hashable] | None = None, 

1710 false_values: Iterable[Hashable] | None = None, 

1711 skiprows: Sequence[int] | int | Callable[[int], object] | None = None, 

1712 nrows: int | None = None, 

1713 na_values=None, 

1714 parse_dates: list | dict | bool = False, 

1715 date_parser: Callable | None = None, 

1716 thousands: str | None = None, 

1717 comment: str | None = None, 

1718 skipfooter: int = 0, 

1719 convert_float: bool | None = None, 

1720 mangle_dupe_cols: bool = True, 

1721 **kwds, 

1722 ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: 

1723 """ 

1724 Parse specified sheet(s) into a DataFrame. 

1725 

1726 Equivalent to read_excel(ExcelFile, ...) See the read_excel 

1727 docstring for more info on accepted parameters. 

1728 

1729 Returns 

1730 ------- 

1731 DataFrame or dict of DataFrames 

1732 DataFrame from the passed in Excel file. 

1733 """ 

1734 return self._reader.parse( 

1735 sheet_name=sheet_name, 

1736 header=header, 

1737 names=names, 

1738 index_col=index_col, 

1739 usecols=usecols, 

1740 squeeze=squeeze, 

1741 converters=converters, 

1742 true_values=true_values, 

1743 false_values=false_values, 

1744 skiprows=skiprows, 

1745 nrows=nrows, 

1746 na_values=na_values, 

1747 parse_dates=parse_dates, 

1748 date_parser=date_parser, 

1749 thousands=thousands, 

1750 comment=comment, 

1751 skipfooter=skipfooter, 

1752 convert_float=convert_float, 

1753 mangle_dupe_cols=mangle_dupe_cols, 

1754 **kwds, 

1755 ) 

1756 

1757 @property 

1758 def book(self): 

1759 return self._reader.book 

1760 

1761 @property 

1762 def sheet_names(self): 

1763 return self._reader.sheet_names 

1764 

1765 def close(self) -> None: 

1766 """close io if necessary""" 

1767 self._reader.close() 

1768 

1769 def __enter__(self) -> ExcelFile: 

1770 return self 

1771 

1772 def __exit__(self, exc_type, exc_value, traceback) -> None: 

1773 self.close() 

1774 

1775 def __del__(self) -> None: 

1776 # Ensure we don't leak file descriptors, but put in try/except in case 

1777 # attributes are already deleted 

1778 try: 

1779 self.close() 

1780 except AttributeError: 

1781 pass
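
def _excel_file_usage_sketch() -> None:
    # A minimal usage sketch of ExcelFile above (hypothetical helper), assuming
    # openpyxl is installed: round-trip a small frame through an in-memory
    # buffer, then reopen it once and parse a sheet by name.
    import pandas as pd

    buf = BytesIO()
    pd.DataFrame({"a": [1, 2]}).to_excel(buf, sheet_name="Sheet1", index=False)

    with ExcelFile(buf) as xls:
        assert xls.sheet_names == ["Sheet1"]
        assert list(xls.parse("Sheet1")["a"]) == [1, 2]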