Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/parsers/readers.py: 12%
519 statements
« prev ^ index » next — coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Module contains tools for processing files into DataFrames or other objects
3"""
4from __future__ import annotations
6from collections import abc
7import csv
8import sys
9from textwrap import fill
10from typing import (
11 IO,
12 Any,
13 Callable,
14 Hashable,
15 Literal,
16 NamedTuple,
17 Sequence,
18 overload,
19)
20import warnings
22import numpy as np
24import pandas._libs.lib as lib
25from pandas._libs.parsers import STR_NA_VALUES
26from pandas._typing import (
27 CompressionOptions,
28 CSVEngine,
29 DtypeArg,
30 FilePath,
31 IndexLabel,
32 ReadCsvBuffer,
33 StorageOptions,
34)
35from pandas.errors import (
36 AbstractMethodError,
37 ParserWarning,
38)
39from pandas.util._decorators import (
40 Appender,
41 deprecate_kwarg,
42 deprecate_nonkeyword_arguments,
43)
44from pandas.util._exceptions import find_stack_level
45from pandas.util._validators import validate_bool_kwarg
47from pandas.core.dtypes.common import (
48 is_file_like,
49 is_float,
50 is_integer,
51 is_list_like,
52)
54from pandas.core.frame import DataFrame
55from pandas.core.indexes.api import RangeIndex
56from pandas.core.shared_docs import _shared_docs
58from pandas.io.common import (
59 IOHandles,
60 get_handle,
61 stringify_path,
62 validate_header_arg,
63)
64from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
65from pandas.io.parsers.base_parser import (
66 ParserBase,
67 is_index_col,
68 parser_defaults,
69)
70from pandas.io.parsers.c_parser_wrapper import CParserWrapper
71from pandas.io.parsers.python_parser import (
72 FixedWidthFieldParser,
73 PythonParser,
74)
# Shared parameter documentation for read_csv/read_table; the placeholders
# ({summary}, {func_name}, {_default_sep}, {storage_options},
# {decompression_options}) are filled per-function via .format(...) in the
# @Appender decorators below.
_doc_read_csv_and_table = (
    r"""
{summary}

Also supports optionally iterating or breaking of the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and automatically detect the separator by Python's builtin sniffer
    tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
    different from ``'\s+'`` will be interpreted as regular expressions and
    will also force the use of the Python parsing engine. Note that regex
    delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, None, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, optional, default ``None``
    Column(s) to use as the row labels of the ``DataFrame``, either given as
    string name or column index. If a sequence of int / str is given, a
    MultiIndex is used.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g. when you have a malformed file with delimiters at
    the end of each line.
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). If ``names`` are given, the document
    header row(s) are not taken into account. For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.

    .. deprecated:: 1.4.0
        Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze
        the data.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...

    .. deprecated:: 1.4.0
        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.

    .. deprecated:: 1.5.0
        Not implemented, and a new argument to specify the pattern for the
        names of duplicated columns will be added instead
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.

    .. versionadded:: 1.5.0

        Support for defaultdict was added. Specify a defaultdict as input where
        the default determines the dtype of the columns which are not explicitly
        listed.
engine : {{'c', 'python', 'pyarrow'}}, optional
    Parser engine to use. The C and pyarrow engines are faster, while the python engine
    is currently more feature-complete. Multithreading is currently only supported by
    the pyarrow engine.

    .. versionadded:: 1.4.0

        The "pyarrow" engine was added as an *experimental* engine, and some features
        are unsupported, or may not work correctly, with this engine.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If True, use a cache of unique, converted dates to apply the datetime
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

    .. versionadded:: 0.25.0
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
{decompression_options}

    .. versionchanged:: 1.4.0 Zstandard support.

thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

    .. versionchanged:: 1.2

       When ``encoding`` is ``None``, ``errors="replace"`` is passed to
       ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
       This behavior was previously only the case for ``engine="python"``.

    .. versionchanged:: 1.3.0

       ``encoding_errors`` is a new argument. ``encoding`` has no longer an
       influence on how encoding errors are handled.

encoding_errors : str, optional, default "strict"
    How encoding errors are treated. `List of possible values
    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

    .. versionadded:: 1.3.0

dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
error_bad_lines : bool, optional, default ``None``
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.

    .. deprecated:: 1.3.0
       The ``on_bad_lines`` parameter should be used instead to specify behavior upon
       encountering a bad line instead.
warn_bad_lines : bool, optional, default ``None``
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.

    .. deprecated:: 1.3.0
       The ``on_bad_lines`` parameter should be used instead to specify behavior upon
       encountering a bad line instead.
on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error'
    Specifies what to do upon encountering a bad line (a line with too many fields).
    Allowed values are :

        - 'error', raise an Exception when a bad line is encountered.
        - 'warn', raise a warning when a bad line is encountered and skip that line.
        - 'skip', skip bad lines without raising or warning when they are encountered.

    .. versionadded:: 1.3.0

    .. versionadded:: 1.4.0

        - callable, function with signature
          ``(bad_line: list[str]) -> list[str] | None`` that will process a single
          bad line. ``bad_line`` is a list of strings split by the ``sep``.
          If the function returns ``None``, the bad line will be ignored.
          If the function returns a new list of strings with more elements than
          expected, a ``ParserWarning`` will be emitted while dropping extra elements.
          Only supported when ``engine="python"``

delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless,
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or 'high' for the ordinary converter,
    'legacy' for the original lower precision pandas converter, and
    'round_trip' for the round-trip converter.

    .. versionchanged:: 1.2

{storage_options}

    .. versionadded:: 1.2

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)
# Default values for options that only the C parser understands.
_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}

# Defaults specific to read_fwf (fixed-width-field parsing).
_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

# Option names each engine does not support.
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "warn_bad_lines",
    "error_bad_lines",
    "on_bad_lines",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "decimal",
    "iterator",
    "dayfirst",
    "infer_datetime_format",
    "verbose",
    "skipinitialspace",
    "low_memory",
}
class _DeprecationConfig(NamedTuple):
    """Default value and optional message describing a deprecated keyword."""

    # Value used as the keyword's default while it is deprecated.
    default_value: Any
    # Human-readable migration hint; None when there is no extra guidance.
    msg: str | None
# Deprecated reader keywords mapped to their sentinel default plus a
# migration hint (see _DeprecationConfig).
_deprecated_defaults: dict[str, _DeprecationConfig] = {
    "error_bad_lines": _DeprecationConfig(None, "Use on_bad_lines in the future."),
    "warn_bad_lines": _DeprecationConfig(None, "Use on_bad_lines in the future."),
    "squeeze": _DeprecationConfig(
        None, 'Append .squeeze("columns") to the call to squeeze.'
    ),
    "prefix": _DeprecationConfig(
        None, "Use a list comprehension on the column names in the future."
    ),
}
# Typed overloads for validate_integer: None passes through as None, while
# int/float input is narrowed to an int return.
@overload
def validate_integer(name, val: None, min_val=...) -> None:
    ...


@overload
def validate_integer(name, val: float, min_val=...) -> int:
    ...


@overload
def validate_integer(name, val: int | None, min_val=...) -> int | None:
    ...
def validate_integer(name, val: int | float | None, min_val=0) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int, float or None
        The value to check; None is passed through unchanged.
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)

    Returns
    -------
    int or None
        ``val`` cast to int, or None if ``val`` was None.

    Raises
    ------
    ValueError
        If ``val`` is a non-integral float, not an integer, or below
        ``min_val``.
    """
    if val is None:
        return val

    msg = f"'{name:s}' must be an integer >={min_val:d}"
    if is_float(val):
        if int(val) != val:
            raise ValueError(msg)
        val = int(val)
    elif not is_integer(val):
        raise ValueError(msg)
    # BUG FIX: previously the min_val bound was only checked on the integer
    # branch, so an integral float below the minimum (e.g. chunksize=0.0
    # with min_val=1) slipped through. Enforce the documented bound for
    # both branches.
    if val < min_val:
        raise ValueError(msg)

    return int(val)
547def _validate_names(names: Sequence[Hashable] | None) -> None:
548 """
549 Raise ValueError if the `names` parameter contains duplicates or has an
550 invalid data type.
552 Parameters
553 ----------
554 names : array-like or None
555 An array containing a list of the names used for the output DataFrame.
557 Raises
558 ------
559 ValueError
560 If names are not unique or are not ordered (e.g. set).
561 """
562 if names is not None:
563 if len(names) != len(set(names)):
564 raise ValueError("Duplicate names are not allowed.")
565 if not (
566 is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
567 ):
568 raise ValueError("Names should be an ordered collection.")
571def _read(
572 filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
573) -> DataFrame | TextFileReader:
574 """Generic reader of line files."""
575 # if we pass a date_parser and parse_dates=False, we should not parse the
576 # dates GH#44366
577 if kwds.get("parse_dates", None) is None:
578 if kwds.get("date_parser", None) is None:
579 kwds["parse_dates"] = False
580 else:
581 kwds["parse_dates"] = True
583 # Extract some of the arguments (pass chunksize on).
584 iterator = kwds.get("iterator", False)
585 chunksize = kwds.get("chunksize", None)
586 if kwds.get("engine") == "pyarrow":
587 if iterator:
588 raise ValueError(
589 "The 'iterator' option is not supported with the 'pyarrow' engine"
590 )
592 if chunksize is not None:
593 raise ValueError(
594 "The 'chunksize' option is not supported with the 'pyarrow' engine"
595 )
596 else:
597 chunksize = validate_integer("chunksize", chunksize, 1)
599 nrows = kwds.get("nrows", None)
601 # Check for duplicates in names.
602 _validate_names(kwds.get("names", None))
604 # Create the parser.
605 parser = TextFileReader(filepath_or_buffer, **kwds)
607 if chunksize or iterator:
608 return parser
610 with parser:
611 return parser.read(nrows)
# Typing overloads for read_csv. The four stubs below differ only in the
# static types of `iterator` and `chunksize`, so type checkers can narrow
# the return type; the runtime behavior lives in the decorated read_csv
# implementation that follows.

# iterator=True -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...


# default case -> DataFrame
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame | TextFileReader:
    ...
@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    squeeze: bool | None = None,
    prefix: str | lib.NoDefault = lib.no_default,
    mangle_dupe_cols: bool = True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates=None,
    infer_datetime_format: bool = False,
    keep_date_col: bool = False,
    date_parser=None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    error_bad_lines: bool | None = None,
    warn_bad_lines: bool | None = None,
    # TODO(2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions = None,
) -> DataFrame | TextFileReader:
    # The public docstring is attached by the @Appender decorator above.
    #
    # Snapshot every parameter by name BEFORE any new local variable is
    # created, so the dict holds exactly the caller-supplied/default
    # arguments and nothing else.
    # locals() should never be modified
    kwds = locals().copy()
    # filepath_or_buffer is passed to _read positionally; ``sep`` is folded
    # into the refined "delimiter" option below rather than forwarded as-is.
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interacting/deprecated options (dialect vs sep/delimiter,
    # error_bad_lines/warn_bad_lines vs on_bad_lines, names/prefix) into the
    # final parser kwargs; for read_csv the delimiter defaults to ",".
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": ","},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
# iterator=True -> TextFileReader
# Typing-only overload: when the caller passes ``iterator=True`` explicitly,
# read_table always returns a TextFileReader. The body is never executed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...
# chunksize=int -> TextFileReader
# Typing-only overload: a non-None integer ``chunksize`` makes read_table
# return a TextFileReader for chunked iteration. The body is never executed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...
# default -> DataFrame
# Typing-only overload: with iterator=False and chunksize=None (the
# defaults), read_table returns a plain DataFrame. The body is never executed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame:
    ...
# Unions -> DataFrame | TextFileReader
# Typing-only overload: catch-all for iterator/chunksize values that are not
# statically known, yielding the union return type. The body is never executed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame | TextFileReader:
    ...
@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    squeeze: bool | None = None,
    prefix: str | lib.NoDefault = lib.no_default,
    mangle_dupe_cols: bool = True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format: bool = False,
    keep_date_col: bool = False,
    date_parser=None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    error_bad_lines: bool | None = None,
    warn_bad_lines: bool | None = None,
    # TODO(2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions = None,
) -> DataFrame | TextFileReader:
    # The public docstring is attached by the @Appender decorator above.
    #
    # Snapshot every parameter by name BEFORE any new local variable is
    # created, so the dict holds exactly the caller-supplied/default
    # arguments and nothing else.
    # locals() should never be modified
    kwds = locals().copy()
    # filepath_or_buffer is passed to _read positionally; ``sep`` is folded
    # into the refined "delimiter" option below rather than forwarded as-is.
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interacting/deprecated options (dialect vs sep/delimiter,
    # error_bad_lines/warn_bad_lines vs on_bad_lines, names/prefix) into the
    # final parser kwargs; for read_table the delimiter defaults to "\t".
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": "\t"},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a
        URL; valid URL schemes include http, ftp, s3, and file. For file URLs,
        a host is expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer'. optional
        A list of tuples giving the extents of the fixed-width fields of each
        line as half-open intervals (i.e., [from, to[ ). The string value
        'infer' instructs the parser to detect the column specifications from
        the first 100 rows of data that are not skipped via skiprows
        (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextFileReader
        A comma-separated values (csv) file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Argument validation: exactly one of 'colspecs'/'widths' may carry the
    # field layout (the default colspecs="infer" counts as "not given").
    if widths is not None:
        if colspecs not in (None, "infer"):
            raise ValueError("You must specify only one of 'widths' and 'colspecs'")
    elif colspecs is None:
        raise ValueError("Must specify either colspecs or widths")

    # Contiguous widths are translated into half-open (start, stop) spans.
    if widths is not None:
        spans = []
        start = 0
        for field_width in widths:
            spans.append((start, start + field_width))
            start += field_width
        colspecs = spans

    # for mypy
    assert colspecs is not None

    # GH#40830: when explicit column specs and names are both given, their
    # lengths must agree (allowing extra specs for unnamed index columns).
    names = kwds.get("names")
    if names is not None and colspecs != "infer" and len(names) != len(colspecs):
        # index_col may reference unnamed index columns whose names are not
        # required, so count them before comparing lengths.
        len_index = 0
        index_col: Any = kwds.get("index_col")
        if index_col is not None and index_col is not False:
            len_index = len(index_col) if is_list_like(index_col) else 1
        if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
            # If usecols is used colspec may be longer than names
            raise ValueError("Length of colspecs must match length of names")

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)
class TextFileReader(abc.Iterator):
    """
    Iterator over chunks of a parsed delimited/fixed-width file.

    Wraps one of the parser engines ("c", "python", "pyarrow",
    "python-fwf"), owning the open file handles and exposing ``read``,
    ``get_chunk``, iteration and context-manager semantics.

    Passed dialect overrides any of the related parser options.
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        # Remember whether the caller chose the engine explicitly: an
        # explicit choice turns engine-fallback conditions into hard errors
        # in _clean_options instead of silent fallback to "python".
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        # A csv.Dialect (or registered dialect name) overrides the related
        # individual parser options; pyarrow has no dialect support.
        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            kwds = _merge_with_dialect_properties(dialect, kwds)

        # header="infer": first row is the header unless explicit names were
        # given, in which case there is no header row.
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0  # number of rows already yielded (for RangeIndex/nrows)

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        # chunksize/nrows are consumed by this reader, not the engine.
        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        # _clean_options may swap the engine (fallback to "python").
        self.options, self.engine = self._clean_options(options, engine)

        self.squeeze = self.options.pop("squeeze", False)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        # Release our own file handles first, then the engine's resources.
        if self.handles is not None:
            self.handles.close()
        self._engine.close()

    def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
        """Merge user-supplied kwargs with the defaults for *engine*,
        rejecting options the chosen engine does not support."""
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if (
                engine == "pyarrow"
                and argname in _pyarrow_unsupported
                and value != default
                and value != getattr(value, "value", default)
            ):
                # Report the deprecated alias the user actually passed,
                # not the internal on_bad_lines name.
                if (
                    argname == "on_bad_lines"
                    and kwds.get("error_bad_lines") is not None
                ):
                    argname = "error_bad_lines"
                elif (
                    argname == "on_bad_lines" and kwds.get("warn_bad_lines") is not None
                ):
                    argname = "warn_bad_lines"

                raise ValueError(
                    f"The {repr(argname)} option is not supported with the "
                    f"'pyarrow' engine"
                )
            elif argname == "mangle_dupe_cols" and value is False:
                # GH12935
                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
            else:
                options[argname] = value

        # C-engine-only options: allowed on other engines only when they are
        # left at their (possibly deprecated) default value.
        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif (
                        value
                        == _deprecated_defaults.get(
                            argname, _DeprecationConfig(default, None)
                        ).default_value
                    ):
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = _deprecated_defaults.get(
                    argname, _DeprecationConfig(default, None)
                ).default_value
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
            # The C engine doesn't need the file-like to have the "__iter__"
            # attribute. However, the Python engine needs "__iter__(...)"
            # when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(
        self, options: dict[str, Any], engine: CSVEngine
    ) -> tuple[dict[str, Any], CSVEngine]:
        """Validate *options* against *engine*, falling back to the
        "python" engine when the requested one cannot honor them.

        Returns the cleaned options together with the (possibly changed)
        engine. If the user explicitly requested the engine, a fallback
        condition raises instead of falling back.
        """
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            # sep=None means "sniff the delimiter" -> python engine only.
            if engine in ("c", "pyarrow"):
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            # Multi-byte encoded separators are only handled by the
            # python engines.
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    f"is > 1 char long, and the '{engine}' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    f"and the '{engine}' engine does not support such quotechars"
                )
                engine = "python"

        # An explicitly requested engine must not be silently replaced.
        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                # A fallback must not silently drop a non-default option the
                # python engine cannot honor.
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=find_stack_level(),
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        # Warn for options whose defaults are deprecated; reset untouched
        # ones to the current (non-deprecated) parser default.
        for arg in _deprecated_defaults.keys():
            parser_default = _c_parser_defaults.get(arg, parser_defaults[arg])
            depr_default = _deprecated_defaults[arg]
            if result.get(arg, depr_default) != depr_default.default_value:
                msg = (
                    f"The {arg} argument has been deprecated and will be "
                    f"removed in a future version. {depr_default.msg}\n\n"
                )
                warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
            else:
                result[arg] = parser_default

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python and pyarrow parsers
        if engine == "pyarrow":
            if not is_integer(skiprows) and skiprows is not None:
                # pyarrow expects skiprows to be passed as an integer
                raise ValueError(
                    "skiprows argument must be an integer when using "
                    "engine='pyarrow'"
                )
        else:
            # Normalize skiprows to a set of row indices (or keep a callable).
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows
        # Default for squeeze is none since we need to check
        # if user sets it. We then set to False to preserve
        # previous behavior.
        result["squeeze"] = False if options["squeeze"] is None else options["squeeze"]

        return result, engine

    def __next__(self) -> DataFrame:
        # Close resources on exhaustion before propagating StopIteration.
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
        engine: CSVEngine = "c",
    ) -> ParserBase:
        """Open *f* if needed and instantiate the parser for *engine*."""
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "pyarrow": ArrowParserWrapper,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        if not isinstance(f, list):
            # open file here
            is_text = True
            mode = "r"
            if engine == "pyarrow":
                is_text = False
                mode = "rb"
            elif (
                engine == "c"
                and self.options.get("encoding", "utf-8") == "utf-8"
                and isinstance(stringify_path(f), str)
            ):
                # c engine can decode utf-8 bytes, adding TextIOWrapper makes
                # the c-engine especially for memory_map=True far slower
                is_text = False
                if "b" not in mode:
                    mode += "b"
            self.handles = get_handle(
                f,
                mode,
                encoding=self.options.get("encoding", None),
                compression=self.options.get("compression", None),
                memory_map=self.options.get("memory_map", False),
                is_text=is_text,
                errors=self.options.get("encoding_errors", "strict"),
                storage_options=self.options.get("storage_options", None),
            )
            assert self.handles is not None
            f = self.handles.handle

        elif engine != "python":
            # Only the python engine accepts a pre-parsed list of rows.
            msg = f"Invalid file path or buffer object type: {type(f)}"
            raise ValueError(msg)

        try:
            return mapping[engine](f, **self.options)
        except Exception:
            # Don't leak the handle opened above if engine construction fails.
            if self.handles is not None:
                self.handles.close()
            raise

    def _failover_to_python(self) -> None:
        # Not implemented on the base reader.
        raise AbstractMethodError(self)

    def read(self, nrows: int | None = None) -> DataFrame:
        """Read up to *nrows* rows (all remaining rows when None) and
        return them as a DataFrame (or squeezed Series, see below)."""
        if self.engine == "pyarrow":
            # pyarrow reads the whole input at once; nrows is not used here.
            try:
                # error: "ParserBase" has no attribute "read"
                df = self._engine.read()  # type: ignore[attr-defined]
            except Exception:
                self.close()
                raise
        else:
            nrows = validate_integer("nrows", nrows)
            try:
                # error: "ParserBase" has no attribute "read"
                (
                    index,
                    columns,
                    col_dict,
                ) = self._engine.read(  # type: ignore[attr-defined]
                    nrows
                )
            except Exception:
                self.close()
                raise

            if index is None:
                if col_dict:
                    # Any column is actually fine:
                    new_rows = len(next(iter(col_dict.values())))
                    index = RangeIndex(self._currow, self._currow + new_rows)
                else:
                    new_rows = 0
            else:
                new_rows = len(index)

            df = DataFrame(col_dict, columns=columns, index=index)

            self._currow += new_rows

        # Legacy squeeze option: a single-column result becomes a Series.
        if self.squeeze and len(df.columns) == 1:
            return df.squeeze("columns").copy()
        return df

    def get_chunk(self, size: int | None = None) -> DataFrame:
        """Read the next chunk (default ``self.chunksize`` rows), honoring
        an overall ``nrows`` limit across chunks."""
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self) -> TextFileReader:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()
def TextParser(*args, **kwds) -> TextFileReader:
    """
    Build a TextFileReader over in-memory rows or a file-like object.

    Converts lists of lists/tuples into DataFrames with proper type
    inference and optional (e.g. string to datetime) conversion, and
    enables lazy iteration over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, default
    header : int, default 0
        Row to use to parse column labels; rows before it are discarded.
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at the bottom of the file to skip
    converters : dict, optional
        Dict mapping column (integer position or label) to a function that
        takes one cell value (not a whole column) and returns the
        transformed value.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    squeeze : bool, default False
        Return a Series when only one column is present.
    infer_datetime_format : bool, default False
        If True and `parse_dates` is True for a column, try to infer the
        datetime format from the first datetime string; an inferable
        format often gives a large parsing speed-up.
    float_precision : str, optional
        Which converter the C engine should use for floating-point values:
        `None` or `high` for the ordinary converter, `legacy` for the
        original lower-precision pandas converter, and `round_trip` for
        the round-trip converter.

        .. versionchanged:: 1.2
    """
    # Always drive this path with the pure-python engine, overriding any
    # caller-supplied engine keyword.
    parser_kwds = dict(kwds, engine="python")
    return TextFileReader(*args, **parser_kwds)
1879def _clean_na_values(na_values, keep_default_na=True):
1880 na_fvalues: set | dict
1881 if na_values is None:
1882 if keep_default_na:
1883 na_values = STR_NA_VALUES
1884 else:
1885 na_values = set()
1886 na_fvalues = set()
1887 elif isinstance(na_values, dict):
1888 old_na_values = na_values.copy()
1889 na_values = {} # Prevent aliasing.
1891 # Convert the values in the na_values dictionary
1892 # into array-likes for further use. This is also
1893 # where we append the default NaN values, provided
1894 # that `keep_default_na=True`.
1895 for k, v in old_na_values.items():
1896 if not is_list_like(v):
1897 v = [v]
1899 if keep_default_na:
1900 v = set(v) | STR_NA_VALUES
1902 na_values[k] = v
1903 na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
1904 else:
1905 if not is_list_like(na_values):
1906 na_values = [na_values]
1907 na_values = _stringify_na_values(na_values)
1908 if keep_default_na:
1909 na_values = na_values | STR_NA_VALUES
1911 na_fvalues = _floatify_na_values(na_values)
1913 return na_values, na_fvalues
1916def _floatify_na_values(na_values):
1917 # create float versions of the na_values
1918 result = set()
1919 for v in na_values:
1920 try:
1921 v = float(v)
1922 if not np.isnan(v):
1923 result.add(v)
1924 except (TypeError, ValueError, OverflowError):
1925 pass
1926 return result
1929def _stringify_na_values(na_values):
1930 """return a stringified and numeric for these values"""
1931 result: list[str | float] = []
1932 for x in na_values:
1933 result.append(str(x))
1934 result.append(x)
1935 try:
1936 v = float(x)
1938 # we are like 999 here
1939 if v == int(v):
1940 v = int(v)
1941 result.append(f"{v}.0")
1942 result.append(str(v))
1944 result.append(v)
1945 except (TypeError, ValueError, OverflowError):
1946 pass
1947 try:
1948 result.append(int(x))
1949 except (TypeError, ValueError, OverflowError):
1950 pass
1951 return set(result)
1954def _refine_defaults_read(
1955 dialect: str | csv.Dialect | None,
1956 delimiter: str | None | lib.NoDefault,
1957 delim_whitespace: bool,
1958 engine: CSVEngine | None,
1959 sep: str | None | lib.NoDefault,
1960 error_bad_lines: bool | None,
1961 warn_bad_lines: bool | None,
1962 on_bad_lines: str | Callable | None,
1963 names: Sequence[Hashable] | None | lib.NoDefault,
1964 prefix: str | None | lib.NoDefault,
1965 defaults: dict[str, Any],
1966):
1967 """Validate/refine default values of input parameters of read_csv, read_table.
1969 Parameters
1970 ----------
1971 dialect : str or csv.Dialect
1972 If provided, this parameter will override values (default or not) for the
1973 following parameters: `delimiter`, `doublequote`, `escapechar`,
1974 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
1975 override values, a ParserWarning will be issued. See csv.Dialect
1976 documentation for more details.
1977 delimiter : str or object
1978 Alias for sep.
1979 delim_whitespace : bool
1980 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
1981 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
1982 is set to True, nothing should be passed in for the ``delimiter``
1983 parameter.
1984 engine : {{'c', 'python'}}
1985 Parser engine to use. The C engine is faster while the python engine is
1986 currently more feature-complete.
1987 sep : str or object
1988 A delimiter provided by the user (str) or a sentinel value, i.e.
1989 pandas._libs.lib.no_default.
1990 error_bad_lines : str or None
1991 Whether to error on a bad line or not.
1992 warn_bad_lines : str or None
1993 Whether to warn on a bad line or not.
1994 on_bad_lines : str, callable or None
1995 An option for handling bad lines or a sentinel value(None).
1996 names : array-like, optional
1997 List of column names to use. If the file contains a header row,
1998 then you should explicitly pass ``header=0`` to override the column names.
1999 Duplicates in this list are not allowed.
2000 prefix : str, optional
2001 Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
2002 defaults: dict
2003 Default values of input parameters.
2005 Returns
2006 -------
2007 kwds : dict
2008 Input parameters with correct values.
2010 Raises
2011 ------
2012 ValueError :
2013 If a delimiter was specified with ``sep`` (or ``delimiter``) and
2014 ``delim_whitespace=True``.
2015 If on_bad_lines is specified(not ``None``) and ``error_bad_lines``/
2016 ``warn_bad_lines`` is True.
2017 """
2018 # fix types for sep, delimiter to Union(str, Any)
2019 delim_default = defaults["delimiter"]
2020 kwds: dict[str, Any] = {}
2021 # gh-23761
2022 #
2023 # When a dialect is passed, it overrides any of the overlapping
2024 # parameters passed in directly. We don't want to warn if the
2025 # default parameters were passed in (since it probably means
2026 # that the user didn't pass them in explicitly in the first place).
2027 #
2028 # "delimiter" is the annoying corner case because we alias it to
2029 # "sep" before doing comparison to the dialect values later on.
2030 # Thus, we need a flag to indicate that we need to "override"
2031 # the comparison to dialect values by checking if default values
2032 # for BOTH "delimiter" and "sep" were provided.
2033 if dialect is not None:
2034 kwds["sep_override"] = delimiter is None and (
2035 sep is lib.no_default or sep == delim_default
2036 )
2038 if delimiter and (sep is not lib.no_default):
2039 raise ValueError("Specified a sep and a delimiter; you can only specify one.")
2041 if (
2042 names is not None
2043 and names is not lib.no_default
2044 and prefix is not None
2045 and prefix is not lib.no_default
2046 ):
2047 raise ValueError("Specified named and prefix; you can only specify one.")
2049 kwds["names"] = None if names is lib.no_default else names
2050 kwds["prefix"] = None if prefix is lib.no_default else prefix
2052 # Alias sep -> delimiter.
2053 if delimiter is None:
2054 delimiter = sep
2056 if delim_whitespace and (delimiter is not lib.no_default):
2057 raise ValueError(
2058 "Specified a delimiter with both sep and "
2059 "delim_whitespace=True; you can only specify one."
2060 )
2062 if delimiter == "\n":
2063 raise ValueError(
2064 r"Specified \n as separator or delimiter. This forces the python engine "
2065 "which does not accept a line terminator. Hence it is not allowed to use "
2066 "the line terminator as separator.",
2067 )
2069 if delimiter is lib.no_default:
2070 # assign default separator value
2071 kwds["delimiter"] = delim_default
2072 else:
2073 kwds["delimiter"] = delimiter
2075 if engine is not None:
2076 kwds["engine_specified"] = True
2077 else:
2078 kwds["engine"] = "c"
2079 kwds["engine_specified"] = False
2081 # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines
2082 # aren't specified at the same time. If so, raise. Otherwise,
2083 # alias on_bad_lines to "error" if error/warn_bad_lines not set
2084 # and on_bad_lines is not set. on_bad_lines is defaulted to None
2085 # so we can tell if it is set (this is why this hack exists).
2086 if on_bad_lines is not None:
2087 if error_bad_lines is not None or warn_bad_lines is not None:
2088 raise ValueError(
2089 "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. "
2090 "Please only set on_bad_lines."
2091 )
2092 if on_bad_lines == "error":
2093 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
2094 elif on_bad_lines == "warn":
2095 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
2096 elif on_bad_lines == "skip":
2097 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
2098 elif callable(on_bad_lines):
2099 if engine != "python":
2100 raise ValueError(
2101 "on_bad_line can only be a callable function if engine='python'"
2102 )
2103 kwds["on_bad_lines"] = on_bad_lines
2104 else:
2105 raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
2106 else:
2107 if error_bad_lines is not None:
2108 # Must check is_bool, because other stuff(e.g. non-empty lists) eval to true
2109 validate_bool_kwarg(error_bad_lines, "error_bad_lines")
2110 if error_bad_lines:
2111 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
2112 else:
2113 if warn_bad_lines is not None:
2114 # This is the case where error_bad_lines is False
2115 # We can only warn/skip if error_bad_lines is False
2116 # None doesn't work because backwards-compatibility reasons
2117 validate_bool_kwarg(warn_bad_lines, "warn_bad_lines")
2118 if warn_bad_lines:
2119 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
2120 else:
2121 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
2122 else:
2123 # Backwards compat, when only error_bad_lines = false, we warn
2124 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
2125 else:
2126 # Everything None -> Error
2127 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
2129 return kwds
2132def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
2133 """
2134 Extract concrete csv dialect instance.
2136 Returns
2137 -------
2138 csv.Dialect or None
2139 """
2140 if kwds.get("dialect") is None:
2141 return None
2143 dialect = kwds["dialect"]
2144 if dialect in csv.list_dialects():
2145 dialect = csv.get_dialect(dialect)
2147 _validate_dialect(dialect)
2149 return dialect
2152MANDATORY_DIALECT_ATTRS = (
2153 "delimiter",
2154 "doublequote",
2155 "escapechar",
2156 "skipinitialspace",
2157 "quotechar",
2158 "quoting",
2159)
2162def _validate_dialect(dialect: csv.Dialect) -> None:
2163 """
2164 Validate csv dialect instance.
2166 Raises
2167 ------
2168 ValueError
2169 If incorrect dialect is provided.
2170 """
2171 for param in MANDATORY_DIALECT_ATTRS:
2172 if not hasattr(dialect, param):
2173 raise ValueError(f"Invalid dialect {dialect} provided")
2176def _merge_with_dialect_properties(
2177 dialect: csv.Dialect,
2178 defaults: dict[str, Any],
2179) -> dict[str, Any]:
2180 """
2181 Merge default kwargs in TextFileReader with dialect parameters.
2183 Parameters
2184 ----------
2185 dialect : csv.Dialect
2186 Concrete csv dialect. See csv.Dialect documentation for more details.
2187 defaults : dict
2188 Keyword arguments passed to TextFileReader.
2190 Returns
2191 -------
2192 kwds : dict
2193 Updated keyword arguments, merged with dialect parameters.
2194 """
2195 kwds = defaults.copy()
2197 for param in MANDATORY_DIALECT_ATTRS:
2198 dialect_val = getattr(dialect, param)
2200 parser_default = parser_defaults[param]
2201 provided = kwds.get(param, parser_default)
2203 # Messages for conflicting values between the dialect
2204 # instance and the actual parameters provided.
2205 conflict_msgs = []
2207 # Don't warn if the default parameter was passed in,
2208 # even if it conflicts with the dialect (gh-23761).
2209 if provided != parser_default and provided != dialect_val:
2210 msg = (
2211 f"Conflicting values for '{param}': '{provided}' was "
2212 f"provided, but the dialect specifies '{dialect_val}'. "
2213 "Using the dialect-specified value."
2214 )
2216 # Annoying corner case for not warning about
2217 # conflicts between dialect and delimiter parameter.
2218 # Refer to the outer "_read_" function for more info.
2219 if not (param == "delimiter" and kwds.pop("sep_override", False)):
2220 conflict_msgs.append(msg)
2222 if conflict_msgs:
2223 warnings.warn(
2224 "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level()
2225 )
2226 kwds[param] = dialect_val
2227 return kwds
2230def _validate_skipfooter(kwds: dict[str, Any]) -> None:
2231 """
2232 Check whether skipfooter is compatible with other kwargs in TextFileReader.
2234 Parameters
2235 ----------
2236 kwds : dict
2237 Keyword arguments passed to TextFileReader.
2239 Raises
2240 ------
2241 ValueError
2242 If skipfooter is not compatible with other parameters.
2243 """
2244 if kwds.get("skipfooter"):
2245 if kwds.get("iterator") or kwds.get("chunksize"):
2246 raise ValueError("'skipfooter' not supported for iteration")
2247 if kwds.get("nrows"):
2248 raise ValueError("'skipfooter' not supported with 'nrows'")