1""" 

2Module contains tools for processing files into DataFrames or other objects 

3""" 

4from __future__ import annotations 

5 

6from collections import abc 

7import csv 

8import sys 

9from textwrap import fill 

10from typing import ( 

11 IO, 

12 Any, 

13 Callable, 

14 Hashable, 

15 Literal, 

16 NamedTuple, 

17 Sequence, 

18 overload, 

19) 

20import warnings 

21 

22import numpy as np 

23 

24import pandas._libs.lib as lib 

25from pandas._libs.parsers import STR_NA_VALUES 

26from pandas._typing import ( 

27 CompressionOptions, 

28 CSVEngine, 

29 DtypeArg, 

30 FilePath, 

31 IndexLabel, 

32 ReadCsvBuffer, 

33 StorageOptions, 

34) 

35from pandas.errors import ( 

36 AbstractMethodError, 

37 ParserWarning, 

38) 

39from pandas.util._decorators import ( 

40 Appender, 

41 deprecate_kwarg, 

42 deprecate_nonkeyword_arguments, 

43) 

44from pandas.util._exceptions import find_stack_level 

45from pandas.util._validators import validate_bool_kwarg 

46 

47from pandas.core.dtypes.common import ( 

48 is_file_like, 

49 is_float, 

50 is_integer, 

51 is_list_like, 

52) 

53 

54from pandas.core.frame import DataFrame 

55from pandas.core.indexes.api import RangeIndex 

56from pandas.core.shared_docs import _shared_docs 

57 

58from pandas.io.common import ( 

59 IOHandles, 

60 get_handle, 

61 stringify_path, 

62 validate_header_arg, 

63) 

64from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper 

65from pandas.io.parsers.base_parser import ( 

66 ParserBase, 

67 is_index_col, 

68 parser_defaults, 

69) 

70from pandas.io.parsers.c_parser_wrapper import CParserWrapper 

71from pandas.io.parsers.python_parser import ( 

72 FixedWidthFieldParser, 

73 PythonParser, 

74) 

75 

_doc_read_csv_and_table = (
    r"""
{summary}

Also supports optionally iterating or breaking the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and automatically detect the separator by Python's builtin sniffer
    tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
    different from ``'\s+'`` will be interpreted as regular expressions and
    will also force the use of the Python parsing engine. Note that regex
    delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, None, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, optional, default ``None``
    Column(s) to use as the row labels of the ``DataFrame``, either given as
    string name or column index. If a sequence of int / str is given, a
    MultiIndex is used.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g. when you have a malformed file with delimiters at
    the end of each line.
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). If ``names`` are given, the document
    header row(s) are not taken into account. For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.

    .. deprecated:: 1.4.0
        Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze
        the data.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...

    .. deprecated:: 1.4.0
        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.

    .. deprecated:: 1.5.0
        Not implemented, and a new argument to specify the pattern for the
        names of duplicated columns will be added instead.
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.

    .. versionadded:: 1.5.0

        Support for defaultdict was added. Specify a defaultdict as input where
        the default determines the dtype of the columns which are not explicitly
        listed.
engine : {{'c', 'python', 'pyarrow'}}, optional
    Parser engine to use. The C and pyarrow engines are faster, while the python engine
    is currently more feature-complete. Multithreading is currently only supported by
    the pyarrow engine.

    .. versionadded:: 1.4.0

        The "pyarrow" engine was added as an *experimental* engine, and some features
        are unsupported, or may not work correctly, with this engine.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If True, use a cache of unique, converted dates to apply the datetime
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

    .. versionadded:: 0.25.0
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
{decompression_options}

    .. versionchanged:: 1.4.0 Zstandard support.

thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

    .. versionchanged:: 1.2

       When ``encoding`` is ``None``, ``errors="replace"`` is passed to
       ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
       This behavior was previously only the case for ``engine="python"``.

    .. versionchanged:: 1.3.0

       ``encoding_errors`` is a new argument. ``encoding`` no longer has an
       influence on how encoding errors are handled.

encoding_errors : str, optional, default "strict"
    How encoding errors are treated. `List of possible values
    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

    .. versionadded:: 1.3.0

dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
error_bad_lines : bool, optional, default ``None``
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.

    .. deprecated:: 1.3.0
       The ``on_bad_lines`` parameter should be used instead to specify behavior
       upon encountering a bad line.
warn_bad_lines : bool, optional, default ``None``
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.

    .. deprecated:: 1.3.0
       The ``on_bad_lines`` parameter should be used instead to specify behavior
       upon encountering a bad line.
on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error'
    Specifies what to do upon encountering a bad line (a line with too many fields).
    Allowed values are :

        - 'error', raise an Exception when a bad line is encountered.
        - 'warn', raise a warning when a bad line is encountered and skip that line.
        - 'skip', skip bad lines without raising or warning when they are encountered.

    .. versionadded:: 1.3.0

    .. versionadded:: 1.4.0

        - callable, function with signature
          ``(bad_line: list[str]) -> list[str] | None`` that will process a single
          bad line. ``bad_line`` is a list of strings split by the ``sep``.
          If the function returns ``None``, the bad line will be ignored.
          If the function returns a new list of strings with more elements than
          expected, a ``ParserWarning`` will be emitted while dropping extra elements.
          Only supported when ``engine="python"``.

delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless,
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or 'high' for the ordinary converter,
    'legacy' for the original lower precision pandas converter, and
    'round_trip' for the round-trip converter.

    .. versionchanged:: 1.2

{storage_options}

    .. versionadded:: 1.2

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)


_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}

_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "warn_bad_lines",
    "error_bad_lines",
    "on_bad_lines",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "decimal",
    "iterator",
    "dayfirst",
    "infer_datetime_format",
    "verbose",
    "skipinitialspace",
    "low_memory",
}
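# Illustration (comment only, not part of the original source): passing any of
# the options above together with engine="pyarrow" raises in
# _get_options_with_defaults below, e.g.
#
#     pd.read_csv("data.csv", engine="pyarrow", nrows=10)
#     ValueError: The 'nrows' option is not supported with the 'pyarrow' engine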

class _DeprecationConfig(NamedTuple):
    default_value: Any
    msg: str | None


_deprecated_defaults: dict[str, _DeprecationConfig] = {
    "error_bad_lines": _DeprecationConfig(None, "Use on_bad_lines in the future."),
    "warn_bad_lines": _DeprecationConfig(None, "Use on_bad_lines in the future."),
    "squeeze": _DeprecationConfig(
        None, 'Append .squeeze("columns") to the call to squeeze.'
    ),
    "prefix": _DeprecationConfig(
        None, "Use a list comprehension on the column names in the future."
    ),
}


@overload
def validate_integer(name, val: None, min_val=...) -> None:
    ...


@overload
def validate_integer(name, val: float, min_val=...) -> int:
    ...


@overload
def validate_integer(name, val: int | None, min_val=...) -> int | None:
    ...


def validate_integer(name, val: int | float | None, min_val=0) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
    """
    if val is None:
        return val

    msg = f"'{name:s}' must be an integer >={min_val:d}"
    if is_float(val):
        if int(val) != val:
            raise ValueError(msg)
        val = int(val)
    elif not (is_integer(val) and val >= min_val):
        raise ValueError(msg)

    return int(val)
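# Illustration (comment only; behavior read directly off the body above):
#
#     validate_integer("nrows", None)       # -> None (passes through)
#     validate_integer("nrows", 3.0)        # -> 3 (float cast without loss)
#     validate_integer("nrows", 3.5)        # -> ValueError
#     validate_integer("chunksize", 0, 1)   # -> ValueError (below min_val)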


def _validate_names(names: Sequence[Hashable] | None) -> None:
    """
    Raise ValueError if the `names` parameter contains duplicates or has an
    invalid data type.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Raises
    ------
    ValueError
        If names are not unique or are not ordered (e.g. set).
    """
    if names is not None:
        if len(names) != len(set(names)):
            raise ValueError("Duplicate names are not allowed.")
        if not (
            is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
        ):
            raise ValueError("Names should be an ordered collection.")
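# Illustration (comment only):
#
#     _validate_names(["a", "b"])   # ok
#     _validate_names(["a", "a"])   # ValueError: Duplicate names are not allowed.
#     _validate_names({"a", "b"})   # ValueError: Names should be an ordered
#                                   # collection.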


def _read(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
) -> DataFrame | TextFileReader:
    """Generic reader of line files."""
    # if we pass a date_parser and parse_dates=False, we should not parse the
    # dates GH#44366
    if kwds.get("parse_dates", None) is None:
        if kwds.get("date_parser", None) is None:
            kwds["parse_dates"] = False
        else:
            kwds["parse_dates"] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = kwds.get("chunksize", None)
    if kwds.get("engine") == "pyarrow":
        if iterator:
            raise ValueError(
                "The 'iterator' option is not supported with the 'pyarrow' engine"
            )

        if chunksize is not None:
            raise ValueError(
                "The 'chunksize' option is not supported with the 'pyarrow' engine"
            )
    else:
        chunksize = validate_integer("chunksize", chunksize, 1)

    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        return parser

    with parser:
        return parser.read(nrows)
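# Illustration (comment only, assuming pandas is importable as pd): _read
# returns the reader itself when chunked iteration is requested, and a fully
# materialized DataFrame otherwise.
#
#     pd.read_csv("data.csv", chunksize=1000)  # -> TextFileReader
#     pd.read_csv("data.csv")                  # -> DataFrame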


# iterator=True -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...


# default case -> DataFrame
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame | TextFileReader:
    ...


@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    squeeze: bool | None = None,
    prefix: str | lib.NoDefault = lib.no_default,
    mangle_dupe_cols: bool = True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates=None,
    infer_datetime_format: bool = False,
    keep_date_col: bool = False,
    date_parser=None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    error_bad_lines: bool | None = None,
    warn_bad_lines: bool | None = None,
    # TODO(2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions = None,
) -> DataFrame | TextFileReader:
    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": ","},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
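# Illustration (comment only, assuming pandas is importable as pd) of the
# overloads above: the static return type narrows with iterator/chunksize.
#
#     reader = pd.read_csv("data.csv", iterator=True)   # TextFileReader
#     chunk = reader.get_chunk(100)                     # DataFrame, <= 100 rows
#     reader.close()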


# iterator=True -> TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...


# default -> DataFrame
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame | TextFileReader:
    ...


@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    squeeze: bool | None = None,
    prefix: str | lib.NoDefault = lib.no_default,
    mangle_dupe_cols: bool = True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format: bool = False,
    keep_date_col: bool = False,
    date_parser=None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    error_bad_lines: bool | None = None,
    warn_bad_lines: bool | None = None,
    # TODO(2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions = None,
) -> DataFrame | TextFileReader:
    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": "\t"},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
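# Illustration (comment only): read_table is read_csv with a tab default
# separator, so these two calls are equivalent:
#
#     pd.read_table("data.tsv")
#     pd.read_csv("data.tsv", sep="\t")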


@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextFileReader
        A comma-separated values (csv) file is returned as a two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    elif colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    # for mypy
    assert colspecs is not None

    # GH#40830
    # Ensure length of `colspecs` matches length of `names`
    names = kwds.get("names")
    if names is not None:
        if len(names) != len(colspecs) and colspecs != "infer":
            # need to check len(index_col) as it might contain
            # unnamed indices, in which case its name is not required
            len_index = 0
            if kwds.get("index_col") is not None:
                index_col: Any = kwds.get("index_col")
                if index_col is not False:
                    if not is_list_like(index_col):
                        len_index = 1
                    else:
                        len_index = len(index_col)
            if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
                # If usecols is used colspec may be longer than names
                raise ValueError("Length of colspecs must match length of names")

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)
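# Illustration (comment only): contiguous widths are expanded into half-open
# colspecs by the loop above, e.g.
#
#     widths=[3, 5, 2]  ->  colspecs=[(0, 3), (3, 8), (8, 10)]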


class TextFileReader(abc.Iterator):
    """
    Passed dialect overrides any of the related parser options.
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            kwds = _merge_with_dialect_properties(dialect, kwds)

        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        self.squeeze = self.options.pop("squeeze", False)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        if self.handles is not None:
            self.handles.close()
        self._engine.close()
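    # Illustration (comment only): _read() drives this class as a context
    # manager (per the docstring note above), closing handles on exit, e.g.
    #
    #     with TextFileReader("data.csv", engine="python") as reader:
    #         df = reader.read()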

1449 def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: 

1450 kwds = self.orig_options 

1451 

1452 options = {} 

1453 default: object | None 

1454 

1455 for argname, default in parser_defaults.items(): 

1456 value = kwds.get(argname, default) 

1457 

1458 # see gh-12935 

1459 if ( 

1460 engine == "pyarrow" 

1461 and argname in _pyarrow_unsupported 

1462 and value != default 

1463 and value != getattr(value, "value", default) 

1464 ): 

1465 if ( 

1466 argname == "on_bad_lines" 

1467 and kwds.get("error_bad_lines") is not None 

1468 ): 

1469 argname = "error_bad_lines" 

1470 elif ( 

1471 argname == "on_bad_lines" and kwds.get("warn_bad_lines") is not None 

1472 ): 

1473 argname = "warn_bad_lines" 

1474 

1475 raise ValueError( 

1476 f"The {repr(argname)} option is not supported with the " 

1477 f"'pyarrow' engine" 

1478 ) 

1479 elif argname == "mangle_dupe_cols" and value is False: 

1480 # GH12935 

1481 raise ValueError("Setting mangle_dupe_cols=False is not supported yet") 

1482 else: 

1483 options[argname] = value 

1484 

1485 for argname, default in _c_parser_defaults.items(): 

1486 if argname in kwds: 

1487 value = kwds[argname] 

1488 

1489 if engine != "c" and value != default: 

1490 if "python" in engine and argname not in _python_unsupported: 

1491 pass 

1492 elif ( 

1493 value 

1494 == _deprecated_defaults.get( 

1495 argname, _DeprecationConfig(default, None) 

1496 ).default_value 

1497 ): 

1498 pass 

1499 else: 

1500 raise ValueError( 

1501 f"The {repr(argname)} option is not supported with the " 

1502 f"{repr(engine)} engine" 

1503 ) 

1504 else: 

1505 value = _deprecated_defaults.get( 

1506 argname, _DeprecationConfig(default, None) 

1507 ).default_value 

1508 options[argname] = value 

1509 

1510 if engine == "python-fwf": 

1511 for argname, default in _fwf_defaults.items(): 

1512 options[argname] = kwds.get(argname, default) 

1513 

1514 return options 

1515 

1516 def _check_file_or_buffer(self, f, engine: CSVEngine) -> None: 

1517 # see gh-16530 

1518 if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"): 

1519 # The C engine doesn't need the file-like to have the "__iter__" 

1520 # attribute. However, the Python engine needs "__iter__(...)" 

1521 # when iterating through such an object, meaning it 

1522 # needs to have that attribute 

1523 raise ValueError( 

1524 "The 'python' engine cannot iterate through this file buffer." 

1525 ) 

1526 

1527 def _clean_options( 

1528 self, options: dict[str, Any], engine: CSVEngine 

1529 ) -> tuple[dict[str, Any], CSVEngine]: 

1530 result = options.copy() 

1531 

1532 fallback_reason = None 

1533 

1534 # C engine not supported yet 

1535 if engine == "c": 

1536 if options["skipfooter"] > 0: 

1537 fallback_reason = "the 'c' engine does not support skipfooter" 

1538 engine = "python" 

1539 

1540 sep = options["delimiter"] 

1541 delim_whitespace = options["delim_whitespace"] 

1542 

1543 if sep is None and not delim_whitespace: 

1544 if engine in ("c", "pyarrow"): 

1545 fallback_reason = ( 

1546 f"the '{engine}' engine does not support " 

1547 "sep=None with delim_whitespace=False" 

1548 ) 

1549 engine = "python" 

1550 elif sep is not None and len(sep) > 1: 

1551 if engine == "c" and sep == r"\s+": 

1552 result["delim_whitespace"] = True 

1553 del result["delimiter"] 

1554 elif engine not in ("python", "python-fwf"): 

1555 # wait until a regex engine is integrated 

1556 fallback_reason = ( 

1557 f"the '{engine}' engine does not support " 

1558 "regex separators (separators > 1 char and " 

1559 r"different from '\s+' are interpreted as regex)" 

1560 ) 

1561 engine = "python" 

1562 elif delim_whitespace: 

1563 if "python" in engine: 

1564 result["delimiter"] = r"\s+" 

1565 elif sep is not None: 

1566 encodeable = True 

1567 encoding = sys.getfilesystemencoding() or "utf-8" 

1568 try: 

1569 if len(sep.encode(encoding)) > 1: 

1570 encodeable = False 

1571 except UnicodeDecodeError: 

1572 encodeable = False 

1573 if not encodeable and engine not in ("python", "python-fwf"): 

1574 fallback_reason = ( 

1575 f"the separator encoded in {encoding} " 

1576 f"is > 1 char long, and the '{engine}' engine " 

1577 "does not support such separators" 

1578 ) 

1579 engine = "python" 

1580 

1581 quotechar = options["quotechar"] 

1582 if quotechar is not None and isinstance(quotechar, (str, bytes)): 

1583 if ( 

1584 len(quotechar) == 1 

1585 and ord(quotechar) > 127 

1586 and engine not in ("python", "python-fwf") 

1587 ): 

1588 fallback_reason = ( 

1589 "ord(quotechar) > 127, meaning the " 

1590 "quotechar is larger than one byte, " 

1591 f"and the '{engine}' engine does not support such quotechars" 

1592 ) 

1593 engine = "python" 

1594 

1595 if fallback_reason and self._engine_specified: 

1596 raise ValueError(fallback_reason) 

1597 

1598 if engine == "c": 

1599 for arg in _c_unsupported: 

1600 del result[arg] 

1601 

1602 if "python" in engine: 

1603 for arg in _python_unsupported: 

1604 if fallback_reason and result[arg] != _c_parser_defaults[arg]: 

1605 raise ValueError( 

1606 "Falling back to the 'python' engine because " 

1607 f"{fallback_reason}, but this causes {repr(arg)} to be " 

1608 "ignored as it is not supported by the 'python' engine." 

1609 ) 

1610 del result[arg] 

1611 

1612 if fallback_reason: 

1613 warnings.warn( 

1614 ( 

1615 "Falling back to the 'python' engine because " 

1616 f"{fallback_reason}; you can avoid this warning by specifying " 

1617 "engine='python'." 

1618 ), 

1619 ParserWarning, 

1620 stacklevel=find_stack_level(), 

1621 ) 

1622 

1623 index_col = options["index_col"] 

1624 names = options["names"] 

1625 converters = options["converters"] 

1626 na_values = options["na_values"] 

1627 skiprows = options["skiprows"] 

1628 

1629 validate_header_arg(options["header"]) 

1630 

1631 for arg in _deprecated_defaults.keys(): 

1632 parser_default = _c_parser_defaults.get(arg, parser_defaults[arg]) 

1633 depr_default = _deprecated_defaults[arg] 

1634 if result.get(arg, depr_default) != depr_default.default_value: 

1635 msg = ( 

1636 f"The {arg} argument has been deprecated and will be " 

1637 f"removed in a future version. {depr_default.msg}\n\n" 

1638 ) 

1639 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) 

1640 else: 

1641 result[arg] = parser_default 

1642 

1643 if index_col is True: 

1644 raise ValueError("The value of index_col cannot be True") 

1645 if is_index_col(index_col): 

1646 if not isinstance(index_col, (list, tuple, np.ndarray)): 

1647 index_col = [index_col] 

1648 result["index_col"] = index_col 

1649 

1650 names = list(names) if names is not None else names 

1651 

1652 # type conversion-related 

1653 if converters is not None: 

1654 if not isinstance(converters, dict): 

1655 raise TypeError( 

1656 "Type converters must be a dict or subclass, " 

1657 f"input was a {type(converters).__name__}" 

1658 ) 

1659 else: 

1660 converters = {} 

1661 

1662 # Converting values to NA 

1663 keep_default_na = options["keep_default_na"] 

1664 na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) 

1665 

1666 # handle skiprows; this is handled internally by the 

1667 # c-engine, so it is only needed for the python and pyarrow parsers 

1668 if engine == "pyarrow": 

1669 if not is_integer(skiprows) and skiprows is not None: 

1670 # pyarrow expects skiprows to be passed as an integer 

1671 raise ValueError( 

1672 "skiprows argument must be an integer when using " 

1673 "engine='pyarrow'" 

1674 ) 

1675 else: 

1676 if is_integer(skiprows): 

1677 skiprows = list(range(skiprows)) 

1678 if skiprows is None: 

1679 skiprows = set() 

1680 elif not callable(skiprows): 

1681 skiprows = set(skiprows) 

1682 

1683 # put the cleaned values back into the options dict 

1684 result["names"] = names 

1685 result["converters"] = converters 

1686 result["na_values"] = na_values 

1687 result["na_fvalues"] = na_fvalues 

1688 result["skiprows"] = skiprows 

1689 # The default for squeeze is None so we can tell whether 

1690 # the user set it explicitly; we then map None to False to 

1691 # preserve previous behavior. 

1692 result["squeeze"] = False if options["squeeze"] is None else options["squeeze"] 

1693 

1694 return result, engine 

1695 
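# Illustrative sketch (editor's example, not part of this module): with
# sep=None and no engine specified, _clean_options falls back from the
# default 'c' engine to 'python' and emits a ParserWarning; csv.Sniffer
# then detects the separator. Sample data is invented.
#
#   >>> import warnings
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> with warnings.catch_warnings(record=True) as w:
#   ...     warnings.simplefilter("always")
#   ...     df = pd.read_csv(StringIO("a;b\n1;2"), sep=None)
#   >>> any("Falling back to the 'python' engine" in str(x.message) for x in w)
#   True
#   >>> list(df.columns)
#   ['a', 'b']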

1696 def __next__(self) -> DataFrame: 

1697 try: 

1698 return self.get_chunk() 

1699 except StopIteration: 

1700 self.close() 

1701 raise 

1702 

1703 def _make_engine( 

1704 self, 

1705 f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO, 

1706 engine: CSVEngine = "c", 

1707 ) -> ParserBase: 

1708 mapping: dict[str, type[ParserBase]] = { 

1709 "c": CParserWrapper, 

1710 "python": PythonParser, 

1711 "pyarrow": ArrowParserWrapper, 

1712 "python-fwf": FixedWidthFieldParser, 

1713 } 

1714 if engine not in mapping: 

1715 raise ValueError( 

1716 f"Unknown engine: {engine} (valid options are {mapping.keys()})" 

1717 ) 

1718 if not isinstance(f, list): 

1719 # open file here 

1720 is_text = True 

1721 mode = "r" 

1722 if engine == "pyarrow": 

1723 is_text = False 

1724 mode = "rb" 

1725 elif ( 

1726 engine == "c" 

1727 and self.options.get("encoding", "utf-8") == "utf-8" 

1728 and isinstance(stringify_path(f), str) 

1729 ): 

1730 # the c engine can decode utf-8 bytes itself; adding a TextIOWrapper 

1731 # makes the c-engine, especially with memory_map=True, far slower 

1732 is_text = False 

1733 if "b" not in mode: 

1734 mode += "b" 

1735 self.handles = get_handle( 

1736 f, 

1737 mode, 

1738 encoding=self.options.get("encoding", None), 

1739 compression=self.options.get("compression", None), 

1740 memory_map=self.options.get("memory_map", False), 

1741 is_text=is_text, 

1742 errors=self.options.get("encoding_errors", "strict"), 

1743 storage_options=self.options.get("storage_options", None), 

1744 ) 

1745 assert self.handles is not None 

1746 f = self.handles.handle 

1747 

1748 elif engine != "python": 

1749 msg = f"Invalid file path or buffer object type: {type(f)}" 

1750 raise ValueError(msg) 

1751 

1752 try: 

1753 return mapping[engine](f, **self.options) 

1754 except Exception: 

1755 if self.handles is not None: 

1756 self.handles.close() 

1757 raise 

1758 

1759 def _failover_to_python(self) -> None: 

1760 raise AbstractMethodError(self) 

1761 

1762 def read(self, nrows: int | None = None) -> DataFrame: 

1763 if self.engine == "pyarrow": 

1764 try: 

1765 # error: "ParserBase" has no attribute "read" 

1766 df = self._engine.read() # type: ignore[attr-defined] 

1767 except Exception: 

1768 self.close() 

1769 raise 

1770 else: 

1771 nrows = validate_integer("nrows", nrows) 

1772 try: 

1773 # error: "ParserBase" has no attribute "read" 

1774 ( 

1775 index, 

1776 columns, 

1777 col_dict, 

1778 ) = self._engine.read( # type: ignore[attr-defined] 

1779 nrows 

1780 ) 

1781 except Exception: 

1782 self.close() 

1783 raise 

1784 

1785 if index is None: 

1786 if col_dict: 

1787 # any column works here, since they all have the same length: 

1788 new_rows = len(next(iter(col_dict.values()))) 

1789 index = RangeIndex(self._currow, self._currow + new_rows) 

1790 else: 

1791 new_rows = 0 

1792 else: 

1793 new_rows = len(index) 

1794 

1795 df = DataFrame(col_dict, columns=columns, index=index) 

1796 

1797 self._currow += new_rows 

1798 

1799 if self.squeeze and len(df.columns) == 1: 

1800 return df.squeeze("columns").copy() 

1801 return df 

1802 

1803 def get_chunk(self, size: int | None = None) -> DataFrame: 

1804 if size is None: 

1805 size = self.chunksize 

1806 if self.nrows is not None: 

1807 if self._currow >= self.nrows: 

1808 raise StopIteration 

1809 size = min(size, self.nrows - self._currow) 

1810 return self.read(nrows=size) 

1811 
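# A minimal sketch (editor's example, not part of this module): read()
# and get_chunk() as exposed through the public iterator=True API. The
# sample data is invented.
#
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> reader = pd.read_csv(StringIO("a\n1\n2\n3\n4"), iterator=True)
#   >>> reader.get_chunk(2).shape   # first two rows
#   (2, 1)
#   >>> reader.read().shape         # the remainder
#   (2, 1)
#   >>> reader.close()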

1812 def __enter__(self) -> TextFileReader: 

1813 return self 

1814 

1815 def __exit__(self, exc_type, exc_value, traceback) -> None: 

1816 self.close() 

1817 
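# A minimal sketch (editor's example, not part of this module): the
# context-manager protocol above guarantees close() runs, so chunked
# reading is exception-safe. Sample data is invented.
#
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> with pd.read_csv(StringIO("a,b\n1,2\n3,4\n5,6"), chunksize=2) as reader:
#   ...     [chunk.shape for chunk in reader]
#   [(2, 2), (1, 2)]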

1818 

1819def TextParser(*args, **kwds) -> TextFileReader: 

1820 """ 

1821 Converts lists of lists/tuples into DataFrames with proper type inference 

1822 and optional conversions (e.g. string to datetime). Also enables iterating 

1823 lazily over chunks of large files. 

1824 

1825 Parameters 

1826 ---------- 

1827 data : file-like object or list 

1828 delimiter : separator character to use 

1829 dialect : str or csv.Dialect instance, optional 

1830 Ignored if delimiter is longer than 1 character 

1831 names : sequence, optional 

1832 header : int, default 0 

1833 Row to use to parse column labels. Defaults to the first row. Prior 

1834 rows will be discarded. 

1835 index_col : int or list, optional 

1836 Column or columns to use as the (possibly hierarchical) index 

1837 has_index_names : bool, default False 

1838 True if the cols defined in index_col have an index name and are 

1839 not in the header. 

1840 na_values : scalar, str, list-like, or dict, optional 

1841 Additional strings to recognize as NA/NaN. 

1842 keep_default_na : bool, default True 

1843 thousands : str, optional 

1844 Thousands separator 

1845 comment : str, optional 

1846 Comment out remainder of line 

1847 parse_dates : bool, default False 

1848 keep_date_col : bool, default False 

1849 date_parser : function, optional 

1850 skiprows : list of integers 

1851 Row numbers to skip 

1852 skipfooter : int 

1853 Number of lines at the bottom of the file to skip 

1854 converters : dict, optional 

1855 Dict of functions for converting values in certain columns. Keys can 

1856 either be integers or column labels, values are functions that take one 

1857 input argument, the cell (not column) content, and return the 

1858 transformed content. 

1859 encoding : str, optional 

1860 Encoding to use when reading and writing (e.g. 'utf-8') 

1861 squeeze : bool, default False 

1862 Return a Series if the parsed data only contains one column. 

1863 infer_datetime_format : bool, default False 

1864 If True and `parse_dates` is True for a column, try to infer the 

1865 datetime format based on the first datetime string. If the format 

1866 can be inferred, there often will be a large parsing speed-up. 

1867 float_precision : str, optional 

1868 Specifies which converter the C engine should use for floating-point 

1869 values. The options are `None` or `high` for the ordinary converter, 

1870 `legacy` for the original lower precision pandas converter, and 

1871 `round_trip` for the round-trip converter. 

1872 

1873 .. versionchanged:: 1.2 

1874 """ 

1875 kwds["engine"] = "python" 

1876 return TextFileReader(*args, **kwds) 

1877 
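# Illustrative sketch (editor's example, not part of this module):
# TextParser takes already-split rows and reuses the python engine's type
# inference. The rows below are invented.
#
#   >>> from pandas.io.parsers import TextParser
#   >>> rows = [["a", "b"], ["1", "2.5"], ["3", "4.5"]]
#   >>> with TextParser(rows, header=0) as parser:
#   ...     df = parser.read()
#   >>> df.dtypes.tolist()
#   [dtype('int64'), dtype('float64')]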

1878 

1879def _clean_na_values(na_values, keep_default_na=True): 

1880 na_fvalues: set | dict 

1881 if na_values is None: 

1882 if keep_default_na: 

1883 na_values = STR_NA_VALUES 

1884 else: 

1885 na_values = set() 

1886 na_fvalues = set() 

1887 elif isinstance(na_values, dict): 

1888 old_na_values = na_values.copy() 

1889 na_values = {} # Prevent aliasing. 

1890 

1891 # Convert the values in the na_values dictionary 

1892 # into array-likes for further use. This is also 

1893 # where we append the default NaN values, provided 

1894 # that `keep_default_na=True`. 

1895 for k, v in old_na_values.items(): 

1896 if not is_list_like(v): 

1897 v = [v] 

1898 

1899 if keep_default_na: 

1900 v = set(v) | STR_NA_VALUES 

1901 

1902 na_values[k] = v 

1903 na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} 

1904 else: 

1905 if not is_list_like(na_values): 

1906 na_values = [na_values] 

1907 na_values = _stringify_na_values(na_values) 

1908 if keep_default_na: 

1909 na_values = na_values | STR_NA_VALUES 

1910 

1911 na_fvalues = _floatify_na_values(na_values) 

1912 

1913 return na_values, na_fvalues 

1914 
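# Illustrative sketch (editor's example, not part of this module): through
# the public API, the dict branch above unions per-column na_values with
# the defaults (keep_default_na=True), while the defaults still apply to
# all other columns. Sample data is invented.
#
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> data = "a,b\nfoo,NA\nNA,bar"
#   >>> df = pd.read_csv(StringIO(data), na_values={"a": ["foo"]})
#   >>> df.isna().sum().tolist()    # 'foo' and 'NA' in a; 'NA' in b
#   [2, 1]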

1915 

1916def _floatify_na_values(na_values): 

1917 # create float versions of the na_values 

1918 result = set() 

1919 for v in na_values: 

1920 try: 

1921 v = float(v) 

1922 if not np.isnan(v): 

1923 result.add(v) 

1924 except (TypeError, ValueError, OverflowError): 

1925 pass 

1926 return result 

1927 

1928 

1929def _stringify_na_values(na_values): 

1930 """return a stringified and numeric for these values""" 

1931 result: list[str | float] = [] 

1932 for x in na_values: 

1933 result.append(str(x)) 

1934 result.append(x) 

1935 try: 

1936 v = float(x) 

1937 

1938 # the float is integral, e.g. 999.0: also register "999.0" and "999" 

1939 if v == int(v): 

1940 v = int(v) 

1941 result.append(f"{v}.0") 

1942 result.append(str(v)) 

1943 

1944 result.append(v) 

1945 except (TypeError, ValueError, OverflowError): 

1946 pass 

1947 try: 

1948 result.append(int(x)) 

1949 except (TypeError, ValueError, OverflowError): 

1950 pass 

1951 return set(result) 

1952 
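# Illustrative sketch (editor's example, not part of this module): what the
# two helpers above produce for an integral string, calling the private
# functions defined in this file directly.
#
#   >>> _stringify_na_values(["999"]) == {"999", "999.0", 999}
#   True
#   >>> _floatify_na_values(_stringify_na_values(["999"]))
#   {999.0}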

1953 

1954def _refine_defaults_read( 

1955 dialect: str | csv.Dialect | None, 

1956 delimiter: str | None | lib.NoDefault, 

1957 delim_whitespace: bool, 

1958 engine: CSVEngine | None, 

1959 sep: str | None | lib.NoDefault, 

1960 error_bad_lines: bool | None, 

1961 warn_bad_lines: bool | None, 

1962 on_bad_lines: str | Callable | None, 

1963 names: Sequence[Hashable] | None | lib.NoDefault, 

1964 prefix: str | None | lib.NoDefault, 

1965 defaults: dict[str, Any], 

1966): 

1967 """Validate/refine default values of input parameters of read_csv, read_table. 

1968 

1969 Parameters 

1970 ---------- 

1971 dialect : str or csv.Dialect 

1972 If provided, this parameter will override values (default or not) for the 

1973 following parameters: `delimiter`, `doublequote`, `escapechar`, 

1974 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to 

1975 override values, a ParserWarning will be issued. See csv.Dialect 

1976 documentation for more details. 

1977 delimiter : str or object 

1978 Alias for sep. 

1979 delim_whitespace : bool 

1980 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be 

1981 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option 

1982 is set to True, nothing should be passed in for the ``delimiter`` 

1983 parameter. 

1984 engine : {'c', 'python'} 

1985 Parser engine to use. The C engine is faster while the python engine is 

1986 currently more feature-complete. 

1987 sep : str or object 

1988 A delimiter provided by the user (str) or a sentinel value, i.e. 

1989 pandas._libs.lib.no_default. 

1990 error_bad_lines : bool or None 

1991 Whether to error on a bad line or not. 

1992 warn_bad_lines : bool or None 

1993 Whether to warn on a bad line or not. 

1994 on_bad_lines : str, callable or None 

1995 An option for handling bad lines or a sentinel value (None). 

1996 names : array-like, optional 

1997 List of column names to use. If the file contains a header row, 

1998 then you should explicitly pass ``header=0`` to override the column names. 

1999 Duplicates in this list are not allowed. 

2000 prefix : str, optional 

2001 Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... 

2002 defaults : dict 

2003 Default values of input parameters. 

2004 

2005 Returns 

2006 ------- 

2007 kwds : dict 

2008 Input parameters with correct values. 

2009 

2010 Raises 

2011 ------ 

2012 ValueError : 

2013 If a delimiter was specified with ``sep`` (or ``delimiter``) and 

2014 ``delim_whitespace=True``. 

2015 If on_bad_lines is specified (not ``None``) and ``error_bad_lines``/ 

2016 ``warn_bad_lines`` is True. 

2017 """ 

2018 # fix types for sep, delimiter to Union[str, Any] 

2019 delim_default = defaults["delimiter"] 

2020 kwds: dict[str, Any] = {} 

2021 # gh-23761 

2022 # 

2023 # When a dialect is passed, it overrides any of the overlapping 

2024 # parameters passed in directly. We don't want to warn if the 

2025 # default parameters were passed in (since it probably means 

2026 # that the user didn't pass them in explicitly in the first place). 

2027 # 

2028 # "delimiter" is the annoying corner case because we alias it to 

2029 # "sep" before doing comparison to the dialect values later on. 

2030 # Thus, we need a flag to indicate that we need to "override" 

2031 # the comparison to dialect values by checking if default values 

2032 # for BOTH "delimiter" and "sep" were provided. 

2033 if dialect is not None: 

2034 kwds["sep_override"] = delimiter is None and ( 

2035 sep is lib.no_default or sep == delim_default 

2036 ) 

2037 

2038 if delimiter and (sep is not lib.no_default): 

2039 raise ValueError("Specified a sep and a delimiter; you can only specify one.") 

2040 

2041 if ( 

2042 names is not None 

2043 and names is not lib.no_default 

2044 and prefix is not None 

2045 and prefix is not lib.no_default 

2046 ): 

2047 raise ValueError("Specified names and prefix; you can only specify one.") 

2048 

2049 kwds["names"] = None if names is lib.no_default else names 

2050 kwds["prefix"] = None if prefix is lib.no_default else prefix 

2051 

2052 # Alias sep -> delimiter. 

2053 if delimiter is None: 

2054 delimiter = sep 

2055 

2056 if delim_whitespace and (delimiter is not lib.no_default): 

2057 raise ValueError( 

2058 "Specified a delimiter with both sep and " 

2059 "delim_whitespace=True; you can only specify one." 

2060 ) 

2061 

2062 if delimiter == "\n": 

2063 raise ValueError( 

2064 r"Specified \n as separator or delimiter. This forces the python engine " 

2065 "which does not accept a line terminator. Hence it is not allowed to use " 

2066 "the line terminator as separator.", 

2067 ) 

2068 

2069 if delimiter is lib.no_default: 

2070 # assign default separator value 

2071 kwds["delimiter"] = delim_default 

2072 else: 

2073 kwds["delimiter"] = delimiter 

2074 

2075 if engine is not None: 

2076 kwds["engine_specified"] = True 

2077 else: 

2078 kwds["engine"] = "c" 

2079 kwds["engine_specified"] = False 

2080 

2081 # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines 

2082 # aren't specified at the same time. If so, raise. Otherwise, 

2083 # alias on_bad_lines to "error" if error/warn_bad_lines not set 

2084 # and on_bad_lines is not set. on_bad_lines is defaulted to None 

2085 # so we can tell if it is set (this is why this hack exists). 

2086 if on_bad_lines is not None: 

2087 if error_bad_lines is not None or warn_bad_lines is not None: 

2088 raise ValueError( 

2089 "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. " 

2090 "Please only set on_bad_lines." 

2091 ) 

2092 if on_bad_lines == "error": 

2093 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR 

2094 elif on_bad_lines == "warn": 

2095 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN 

2096 elif on_bad_lines == "skip": 

2097 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP 

2098 elif callable(on_bad_lines): 

2099 if engine != "python": 

2100 raise ValueError( 

2101 "on_bad_line can only be a callable function if engine='python'" 

2102 ) 

2103 kwds["on_bad_lines"] = on_bad_lines 

2104 else: 

2105 raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") 

2106 else: 

2107 if error_bad_lines is not None: 

2108 # Must validate as a bool, because other values (e.g. non-empty lists) evaluate to True 

2109 validate_bool_kwarg(error_bad_lines, "error_bad_lines") 

2110 if error_bad_lines: 

2111 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR 

2112 else: 

2113 if warn_bad_lines is not None: 

2114 # This is the case where error_bad_lines is False. 

2115 # We can only warn/skip if error_bad_lines is False; 

2116 # None doesn't work for backwards-compatibility reasons. 

2117 validate_bool_kwarg(warn_bad_lines, "warn_bad_lines") 

2118 if warn_bad_lines: 

2119 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN 

2120 else: 

2121 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP 

2122 else: 

2123 # Backwards compat: when only error_bad_lines=False is given, we warn 

2124 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN 

2125 else: 

2126 # Everything None -> Error 

2127 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR 

2128 

2129 return kwds 

2130 
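# Illustrative sketch (editor's example, not part of this module): the
# mutual-exclusion check above, exercised through read_csv with the
# deprecated error_bad_lines spelling. Sample data is invented.
#
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> pd.read_csv(StringIO("a,b\n1,2"), on_bad_lines="skip",
#   ...             error_bad_lines=False)
#   Traceback (most recent call last):
#       ...
#   ValueError: Both on_bad_lines and error_bad_lines/warn_bad_lines are set. Please only set on_bad_lines.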

2131 

2132def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None: 

2133 """ 

2134 Extract concrete csv dialect instance. 

2135 

2136 Returns 

2137 ------- 

2138 csv.Dialect or None 

2139 """ 

2140 if kwds.get("dialect") is None: 

2141 return None 

2142 

2143 dialect = kwds["dialect"] 

2144 if dialect in csv.list_dialects(): 

2145 dialect = csv.get_dialect(dialect) 

2146 

2147 _validate_dialect(dialect) 

2148 

2149 return dialect 

2150 
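# Illustrative sketch (editor's example, not part of this module): a dialect
# name registered with the stdlib csv module resolves through
# csv.get_dialect() above. The "semi" name is invented for this example.
#
#   >>> import csv
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> csv.register_dialect("semi", delimiter=";")
#   >>> pd.read_csv(StringIO("a;b\n1;2"), dialect="semi").shape
#   (1, 2)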

2151 

2152MANDATORY_DIALECT_ATTRS = ( 

2153 "delimiter", 

2154 "doublequote", 

2155 "escapechar", 

2156 "skipinitialspace", 

2157 "quotechar", 

2158 "quoting", 

2159) 

2160 

2161 

2162def _validate_dialect(dialect: csv.Dialect) -> None: 

2163 """ 

2164 Validate csv dialect instance. 

2165 

2166 Raises 

2167 ------ 

2168 ValueError 

2169 If incorrect dialect is provided. 

2170 """ 

2171 for param in MANDATORY_DIALECT_ATTRS: 

2172 if not hasattr(dialect, param): 

2173 raise ValueError(f"Invalid dialect {dialect} provided") 

2174 

2175 

2176def _merge_with_dialect_properties( 

2177 dialect: csv.Dialect, 

2178 defaults: dict[str, Any], 

2179) -> dict[str, Any]: 

2180 """ 

2181 Merge default kwargs in TextFileReader with dialect parameters. 

2182 

2183 Parameters 

2184 ---------- 

2185 dialect : csv.Dialect 

2186 Concrete csv dialect. See csv.Dialect documentation for more details. 

2187 defaults : dict 

2188 Keyword arguments passed to TextFileReader. 

2189 

2190 Returns 

2191 ------- 

2192 kwds : dict 

2193 Updated keyword arguments, merged with dialect parameters. 

2194 """ 

2195 kwds = defaults.copy() 

2196 

2197 for param in MANDATORY_DIALECT_ATTRS: 

2198 dialect_val = getattr(dialect, param) 

2199 

2200 parser_default = parser_defaults[param] 

2201 provided = kwds.get(param, parser_default) 

2202 

2203 # Messages for conflicting values between the dialect 

2204 # instance and the actual parameters provided. 

2205 conflict_msgs = [] 

2206 

2207 # Don't warn if the default parameter was passed in, 

2208 # even if it conflicts with the dialect (gh-23761). 

2209 if provided != parser_default and provided != dialect_val: 

2210 msg = ( 

2211 f"Conflicting values for '{param}': '{provided}' was " 

2212 f"provided, but the dialect specifies '{dialect_val}'. " 

2213 "Using the dialect-specified value." 

2214 ) 

2215 

2216 # Annoying corner case for not warning about 

2217 # conflicts between dialect and delimiter parameter. 

2218 # Refer to _refine_defaults_read, where "sep_override" is set, for more info. 

2219 if not (param == "delimiter" and kwds.pop("sep_override", False)): 

2220 conflict_msgs.append(msg) 

2221 

2222 if conflict_msgs: 

2223 warnings.warn( 

2224 "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level() 

2225 ) 

2226 kwds[param] = dialect_val 

2227 return kwds 

2228 
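# Illustrative sketch (editor's example, not part of this module): an
# explicitly passed quotechar that disagrees with the dialect is overridden
# and a ParserWarning is emitted, per the merge logic above. Reuses the
# invented "semi" dialect from the previous example.
#
#   >>> import csv, warnings
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> csv.register_dialect("semi", delimiter=";")
#   >>> with warnings.catch_warnings(record=True) as w:
#   ...     warnings.simplefilter("always")
#   ...     df = pd.read_csv(StringIO("a;b\n1;2"), dialect="semi", quotechar="'")
#   >>> any("Conflicting values for 'quotechar'" in str(x.message) for x in w)
#   True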

2229 

2230def _validate_skipfooter(kwds: dict[str, Any]) -> None: 

2231 """ 

2232 Check whether skipfooter is compatible with other kwargs in TextFileReader. 

2233 

2234 Parameters 

2235 ---------- 

2236 kwds : dict 

2237 Keyword arguments passed to TextFileReader. 

2238 

2239 Raises 

2240 ------ 

2241 ValueError 

2242 If skipfooter is not compatible with other parameters. 

2243 """ 

2244 if kwds.get("skipfooter"): 

2245 if kwds.get("iterator") or kwds.get("chunksize"): 

2246 raise ValueError("'skipfooter' not supported for iteration") 

2247 if kwds.get("nrows"): 

2248 raise ValueError("'skipfooter' not supported with 'nrows'")
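# Illustrative sketch (editor's example, not part of this module): the
# incompatibility checks above as seen from read_csv, assuming this
# validation is wired into the read path. Sample data is invented.
#
#   >>> from io import StringIO
#   >>> import pandas as pd
#   >>> pd.read_csv(StringIO("a\n1\n2"), skipfooter=1, chunksize=1)
#   Traceback (most recent call last):
#       ...
#   ValueError: 'skipfooter' not supported for iteration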