Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/parsers/readers.py: 12%
519 statements
« prev ^ index » next — coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Module contains tools for processing files into DataFrames or other objects
3"""
4from __future__ import annotations
6from collections import abc
7import csv
8import sys
9from textwrap import fill
10from typing import (
11 IO,
12 Any,
13 Callable,
14 Hashable,
15 Literal,
16 NamedTuple,
17 Sequence,
18 overload,
19)
20import warnings
22import numpy as np
24import pandas._libs.lib as lib
25from pandas._libs.parsers import STR_NA_VALUES
26from pandas._typing import (
27 CompressionOptions,
28 CSVEngine,
29 DtypeArg,
30 FilePath,
31 IndexLabel,
32 ReadCsvBuffer,
33 StorageOptions,
34)
35from pandas.errors import (
36 AbstractMethodError,
37 ParserWarning,
38)
39from pandas.util._decorators import (
40 Appender,
41 deprecate_kwarg,
42 deprecate_nonkeyword_arguments,
43)
44from pandas.util._exceptions import find_stack_level
45from pandas.util._validators import validate_bool_kwarg
47from pandas.core.dtypes.common import (
48 is_file_like,
49 is_float,
50 is_integer,
51 is_list_like,
52)
54from pandas.core.frame import DataFrame
55from pandas.core.indexes.api import RangeIndex
56from pandas.core.shared_docs import _shared_docs
58from pandas.io.common import (
59 IOHandles,
60 get_handle,
61 stringify_path,
62 validate_header_arg,
63)
64from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
65from pandas.io.parsers.base_parser import (
66 ParserBase,
67 is_index_col,
68 parser_defaults,
69)
70from pandas.io.parsers.c_parser_wrapper import CParserWrapper
71from pandas.io.parsers.python_parser import (
72 FixedWidthFieldParser,
73 PythonParser,
74)
# Shared parameter documentation for read_csv/read_table; the placeholders
# ({summary}, {func_name}, {_default_sep}, {storage_options},
# {decompression_options}) are filled per-function via .format(...) in the
# @Appender decorators below.
_doc_read_csv_and_table = (
    r"""
{summary}

Also supports optionally iterating or breaking of the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and automatically detect the separator by Python's builtin sniffer
    tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
    different from ``'\s+'`` will be interpreted as regular expressions and
    will also force the use of the Python parsing engine. Note that regex
    delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, None, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, optional, default ``None``
    Column(s) to use as the row labels of the ``DataFrame``, either given as
    string name or column index. If a sequence of int / str is given, a
    MultiIndex is used.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g. when you have a malformed file with delimiters at
    the end of each line.
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). If ``names`` are given, the document
    header row(s) are not taken into account. For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.

    .. deprecated:: 1.4.0
        Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze
        the data.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...

    .. deprecated:: 1.4.0
        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.

    .. deprecated:: 1.5.0
        Not implemented, and a new argument to specify the pattern for the
        names of duplicated columns will be added instead
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.

    .. versionadded:: 1.5.0

        Support for defaultdict was added. Specify a defaultdict as input where
        the default determines the dtype of the columns which are not explicitly
        listed.
engine : {{'c', 'python', 'pyarrow'}}, optional
    Parser engine to use. The C and pyarrow engines are faster, while the python engine
    is currently more feature-complete. Multithreading is currently only supported by
    the pyarrow engine.

    .. versionadded:: 1.4.0

        The "pyarrow" engine was added as an *experimental* engine, and some features
        are unsupported, or may not work correctly, with this engine.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If True, use a cache of unique, converted dates to apply the datetime
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

    .. versionadded:: 0.25.0
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.

    .. versionchanged:: 1.2

       ``TextFileReader`` is a context manager.
{decompression_options}

    .. versionchanged:: 1.4.0 Zstandard support.

thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

    .. versionchanged:: 1.2

       When ``encoding`` is ``None``, ``errors="replace"`` is passed to
       ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
       This behavior was previously only the case for ``engine="python"``.

    .. versionchanged:: 1.3.0

       ``encoding_errors`` is a new argument. ``encoding`` has no longer an
       influence on how encoding errors are handled.

encoding_errors : str, optional, default "strict"
    How encoding errors are treated. `List of possible values
    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

    .. versionadded:: 1.3.0

dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
error_bad_lines : bool, optional, default ``None``
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.

    .. deprecated:: 1.3.0
       The ``on_bad_lines`` parameter should be used instead to specify behavior upon
       encountering a bad line instead.
warn_bad_lines : bool, optional, default ``None``
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.

    .. deprecated:: 1.3.0
       The ``on_bad_lines`` parameter should be used instead to specify behavior upon
       encountering a bad line instead.
on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error'
    Specifies what to do upon encountering a bad line (a line with too many fields).
    Allowed values are :

        - 'error', raise an Exception when a bad line is encountered.
        - 'warn', raise a warning when a bad line is encountered and skip that line.
        - 'skip', skip bad lines without raising or warning when they are encountered.

    .. versionadded:: 1.3.0

    .. versionadded:: 1.4.0

        - callable, function with signature
          ``(bad_line: list[str]) -> list[str] | None`` that will process a single
          bad line. ``bad_line`` is a list of strings split by the ``sep``.
          If the function returns ``None``, the bad line will be ignored.
          If the function returns a new list of strings with more elements than
          expected, a ``ParserWarning`` will be emitted while dropping extra elements.
          Only supported when ``engine="python"``

delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless,
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or 'high' for the ordinary converter,
    'legacy' for the original lower precision pandas converter, and
    'round_trip' for the round-trip converter.

    .. versionchanged:: 1.2

{storage_options}

    .. versionadded:: 1.2

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)
# Default values for options that only the C parser understands.
_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}

# Defaults specific to read_fwf (fixed-width-field parsing).
_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

# Option names each engine does not support.
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "warn_bad_lines",
    "error_bad_lines",
    "on_bad_lines",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "decimal",
    "iterator",
    "dayfirst",
    "infer_datetime_format",
    "verbose",
    "skipinitialspace",
    "low_memory",
}
class _DeprecationConfig(NamedTuple):
    """Default value and optional message describing a deprecated keyword."""

    # Value used as the keyword's default while it is deprecated.
    default_value: Any
    # Human-readable migration hint; None when there is no extra guidance.
    msg: str | None
# Deprecated reader keywords mapped to their sentinel default plus a
# migration hint (see _DeprecationConfig).
_deprecated_defaults: dict[str, _DeprecationConfig] = {
    "error_bad_lines": _DeprecationConfig(None, "Use on_bad_lines in the future."),
    "warn_bad_lines": _DeprecationConfig(None, "Use on_bad_lines in the future."),
    "squeeze": _DeprecationConfig(
        None, 'Append .squeeze("columns") to the call to squeeze.'
    ),
    "prefix": _DeprecationConfig(
        None, "Use a list comprehension on the column names in the future."
    ),
}
# Typed overloads for validate_integer: None passes through as None, while
# int/float input is narrowed to an int return.
@overload
def validate_integer(name, val: None, min_val=...) -> None:
    ...


@overload
def validate_integer(name, val: float, min_val=...) -> int:
    ...


@overload
def validate_integer(name, val: int | None, min_val=...) -> int | None:
    ...
def validate_integer(name, val: int | float | None, min_val=0) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int, float or None
        The value to check; None is passed through unchanged.
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)

    Returns
    -------
    int or None
        ``val`` cast to int, or None if ``val`` was None.

    Raises
    ------
    ValueError
        If ``val`` is a non-integral float, not an integer, or below
        ``min_val``.
    """
    if val is None:
        return val

    msg = f"'{name:s}' must be an integer >={min_val:d}"
    if is_float(val):
        if int(val) != val:
            raise ValueError(msg)
        val = int(val)
    elif not is_integer(val):
        raise ValueError(msg)
    # BUG FIX: previously the min_val bound was only checked on the integer
    # branch, so an integral float below the minimum (e.g. chunksize=0.0
    # with min_val=1) slipped through. Enforce the documented bound for
    # both branches.
    if val < min_val:
        raise ValueError(msg)

    return int(val)
547def _validate_names(names: Sequence[Hashable] | None) -> None:
548 """
549 Raise ValueError if the `names` parameter contains duplicates or has an
550 invalid data type.
552 Parameters
553 ----------
554 names : array-like or None
555 An array containing a list of the names used for the output DataFrame.
557 Raises
558 ------
559 ValueError
560 If names are not unique or are not ordered (e.g. set).
561 """
562 if names is not None:
563 if len(names) != len(set(names)):
564 raise ValueError("Duplicate names are not allowed.")
565 if not (
566 is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
567 ):
568 raise ValueError("Names should be an ordered collection.")
571def _read(
572 filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
573) -> DataFrame | TextFileReader:
574 """Generic reader of line files."""
575 # if we pass a date_parser and parse_dates=False, we should not parse the
576 # dates GH#44366
577 if kwds.get("parse_dates", None) is None:
578 if kwds.get("date_parser", None) is None:
579 kwds["parse_dates"] = False
580 else:
581 kwds["parse_dates"] = True
583 # Extract some of the arguments (pass chunksize on).
584 iterator = kwds.get("iterator", False)
585 chunksize = kwds.get("chunksize", None)
586 if kwds.get("engine") == "pyarrow":
587 if iterator:
588 raise ValueError(
589 "The 'iterator' option is not supported with the 'pyarrow' engine"
590 )
592 if chunksize is not None:
593 raise ValueError(
594 "The 'chunksize' option is not supported with the 'pyarrow' engine"
595 )
596 else:
597 chunksize = validate_integer("chunksize", chunksize, 1)
599 nrows = kwds.get("nrows", None)
601 # Check for duplicates in names.
602 _validate_names(kwds.get("names", None))
604 # Create the parser.
605 parser = TextFileReader(filepath_or_buffer, **kwds)
607 if chunksize or iterator:
608 return parser
610 with parser:
611 return parser.read(nrows)
# Typing overloads for read_csv. The four stubs below differ only in the
# static types of `iterator` and `chunksize`, so type checkers can narrow
# the return type; the runtime behavior lives in the decorated read_csv
# implementation that follows.

# iterator=True -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...


# default case -> DataFrame
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame | TextFileReader:
    ...
@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    squeeze: bool | None = None,
    prefix: str | lib.NoDefault = lib.no_default,
    mangle_dupe_cols: bool = True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates=None,
    infer_datetime_format: bool = False,
    keep_date_col: bool = False,
    date_parser=None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    error_bad_lines: bool | None = None,
    warn_bad_lines: bool | None = None,
    # TODO(2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions = None,
) -> DataFrame | TextFileReader:
    # The public docstring is attached by the @Appender decorator above.
    #
    # Snapshot every parameter by name BEFORE any new local variable is
    # created, so the dict holds exactly the caller-supplied/default
    # arguments and nothing else.
    # locals() should never be modified
    kwds = locals().copy()
    # filepath_or_buffer is passed to _read positionally; ``sep`` is folded
    # into the refined "delimiter" option below rather than forwarded as-is.
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interacting/deprecated options (dialect vs sep/delimiter,
    # error_bad_lines/warn_bad_lines vs on_bad_lines, names/prefix) into the
    # final parser kwargs; for read_csv the delimiter defaults to ",".
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": ","},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
# iterator=True -> TextFileReader
# Typing-only overload: when the caller passes ``iterator=True`` explicitly,
# read_table always returns a TextFileReader. The body is never executed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...
# chunksize=int -> TextFileReader
# Typing-only overload: a non-None integer ``chunksize`` makes read_table
# return a TextFileReader for chunked iteration. The body is never executed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> TextFileReader:
    ...
# default -> DataFrame
# Typing-only overload: with iterator=False and chunksize=None (the
# defaults), read_table returns a plain DataFrame. The body is never executed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame:
    ...
# Unions -> DataFrame | TextFileReader
# Typing-only overload: catch-all for iterator/chunksize values that are not
# statically known, yielding the union return type. The body is never executed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    squeeze: bool | None = ...,
    prefix: str | lib.NoDefault = ...,
    mangle_dupe_cols: bool = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates=...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    error_bad_lines: bool | None = ...,
    warn_bad_lines: bool | None = ...,
    on_bad_lines=...,
    delim_whitespace=...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame | TextFileReader:
    ...
@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    squeeze: bool | None = None,
    prefix: str | lib.NoDefault = lib.no_default,
    mangle_dupe_cols: bool = True,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format: bool = False,
    keep_date_col: bool = False,
    date_parser=None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    error_bad_lines: bool | None = None,
    warn_bad_lines: bool | None = None,
    # TODO(2.0): set on_bad_lines to "error".
    # See _refine_defaults_read comment for why we do this.
    on_bad_lines=None,
    # Internal
    delim_whitespace=False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions = None,
) -> DataFrame | TextFileReader:
    # The public docstring is attached by the @Appender decorator above.
    #
    # Snapshot every parameter by name BEFORE any new local variable is
    # created, so the dict holds exactly the caller-supplied/default
    # arguments and nothing else.
    # locals() should never be modified
    kwds = locals().copy()
    # filepath_or_buffer is passed to _read positionally; ``sep`` is folded
    # into the refined "delimiter" option below rather than forwarded as-is.
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interacting/deprecated options (dialect vs sep/delimiter,
    # error_bad_lines/warn_bad_lines vs on_bad_lines, names/prefix) into the
    # final parser kwargs; for read_table the delimiter defaults to "\t".
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        error_bad_lines,
        warn_bad_lines,
        on_bad_lines,
        names,
        prefix,
        defaults={"delimiter": "\t"},
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"])
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a
        URL; valid URL schemes include http, ftp, s3, and file. For file URLs,
        a host is expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer'. optional
        A list of tuples giving the extents of the fixed-width fields of each
        line as half-open intervals (i.e., [from, to[ ). The string value
        'infer' instructs the parser to detect the column specifications from
        the first 100 rows of data that are not skipped via skiprows
        (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextFileReader
        A comma-separated values (csv) file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Argument validation: exactly one of 'colspecs'/'widths' may carry the
    # field layout (the default colspecs="infer" counts as "not given").
    if widths is not None:
        if colspecs not in (None, "infer"):
            raise ValueError("You must specify only one of 'widths' and 'colspecs'")
    elif colspecs is None:
        raise ValueError("Must specify either colspecs or widths")

    # Contiguous widths are translated into half-open (start, stop) spans.
    if widths is not None:
        spans = []
        start = 0
        for field_width in widths:
            spans.append((start, start + field_width))
            start += field_width
        colspecs = spans

    # for mypy
    assert colspecs is not None

    # GH#40830: when explicit column specs and names are both given, their
    # lengths must agree (allowing extra specs for unnamed index columns).
    names = kwds.get("names")
    if names is not None and colspecs != "infer" and len(names) != len(colspecs):
        # index_col may reference unnamed index columns whose names are not
        # required, so count them before comparing lengths.
        len_index = 0
        index_col: Any = kwds.get("index_col")
        if index_col is not None and index_col is not False:
            len_index = len(index_col) if is_list_like(index_col) else 1
        if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
            # If usecols is used colspec may be longer than names
            raise ValueError("Length of colspecs must match length of names")

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)
class TextFileReader(abc.Iterator):
    """
    Iterator over chunks of a parsed delimited/fixed-width file.

    Wraps one of the parser engines ("c", "python", "pyarrow",
    "python-fwf"), owning the open file handles and exposing ``read``,
    ``get_chunk``, iteration and context-manager semantics.

    Passed dialect overrides any of the related parser options.
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        # Remember whether the caller chose the engine explicitly: an
        # explicit choice turns engine-fallback conditions into hard errors
        # in _clean_options instead of silent fallback to "python".
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        # A csv.Dialect (or registered dialect name) overrides the related
        # individual parser options; pyarrow has no dialect support.
        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            kwds = _merge_with_dialect_properties(dialect, kwds)

        # header="infer": first row is the header unless explicit names were
        # given, in which case there is no header row.
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0  # number of rows already yielded (for RangeIndex/nrows)

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        # chunksize/nrows are consumed by this reader, not the engine.
        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        # _clean_options may swap the engine (fallback to "python").
        self.options, self.engine = self._clean_options(options, engine)

        self.squeeze = self.options.pop("squeeze", False)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        # Release our own file handles first, then the engine's resources.
        if self.handles is not None:
            self.handles.close()
        self._engine.close()

    def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
        """Merge user-supplied kwargs with the defaults for *engine*,
        rejecting options the chosen engine does not support."""
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if (
                engine == "pyarrow"
                and argname in _pyarrow_unsupported
                and value != default
                and value != getattr(value, "value", default)
            ):
                # Report the deprecated alias the user actually passed,
                # not the internal on_bad_lines name.
                if (
                    argname == "on_bad_lines"
                    and kwds.get("error_bad_lines") is not None
                ):
                    argname = "error_bad_lines"
                elif (
                    argname == "on_bad_lines" and kwds.get("warn_bad_lines") is not None
                ):
                    argname = "warn_bad_lines"

                raise ValueError(
                    f"The {repr(argname)} option is not supported with the "
                    f"'pyarrow' engine"
                )
            elif argname == "mangle_dupe_cols" and value is False:
                # GH12935
                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
            else:
                options[argname] = value

        # C-engine-only options: allowed on other engines only when they are
        # left at their (possibly deprecated) default value.
        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif (
                        value
                        == _deprecated_defaults.get(
                            argname, _DeprecationConfig(default, None)
                        ).default_value
                    ):
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = _deprecated_defaults.get(
                    argname, _DeprecationConfig(default, None)
                ).default_value
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
            # The C engine doesn't need the file-like to have the "__iter__"
            # attribute. However, the Python engine needs "__iter__(...)"
            # when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(
        self, options: dict[str, Any], engine: CSVEngine
    ) -> tuple[dict[str, Any], CSVEngine]:
        """Validate *options* against *engine*, falling back to the
        "python" engine when the requested one cannot honor them.

        Returns the cleaned options together with the (possibly changed)
        engine. If the user explicitly requested the engine, a fallback
        condition raises instead of falling back.
        """
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            # sep=None means "sniff the delimiter" -> python engine only.
            if engine in ("c", "pyarrow"):
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            # Multi-byte encoded separators are only handled by the
            # python engines.
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    f"is > 1 char long, and the '{engine}' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    f"and the '{engine}' engine does not support such quotechars"
                )
                engine = "python"

        # An explicitly requested engine must not be silently replaced.
        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                # A fallback must not silently drop a non-default option the
                # python engine cannot honor.
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=find_stack_level(),
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        # Warn for options whose defaults are deprecated; reset untouched
        # ones to the current (non-deprecated) parser default.
        for arg in _deprecated_defaults.keys():
            parser_default = _c_parser_defaults.get(arg, parser_defaults[arg])
            depr_default = _deprecated_defaults[arg]
            if result.get(arg, depr_default) != depr_default.default_value:
                msg = (
                    f"The {arg} argument has been deprecated and will be "
                    f"removed in a future version. {depr_default.msg}\n\n"
                )
                warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
            else:
                result[arg] = parser_default

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python and pyarrow parsers
        if engine == "pyarrow":
            if not is_integer(skiprows) and skiprows is not None:
                # pyarrow expects skiprows to be passed as an integer
                raise ValueError(
                    "skiprows argument must be an integer when using "
                    "engine='pyarrow'"
                )
        else:
            # Normalize skiprows to a set of row indices (or keep a callable).
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows
        # Default for squeeze is none since we need to check
        # if user sets it. We then set to False to preserve
        # previous behavior.
        result["squeeze"] = False if options["squeeze"] is None else options["squeeze"]

        return result, engine

    def __next__(self) -> DataFrame:
        # Close resources on exhaustion before propagating StopIteration.
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
        engine: CSVEngine = "c",
    ) -> ParserBase:
        """Open *f* if needed and instantiate the parser for *engine*."""
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "pyarrow": ArrowParserWrapper,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        if not isinstance(f, list):
            # open file here
            is_text = True
            mode = "r"
            if engine == "pyarrow":
                is_text = False
                mode = "rb"
            elif (
                engine == "c"
                and self.options.get("encoding", "utf-8") == "utf-8"
                and isinstance(stringify_path(f), str)
            ):
                # c engine can decode utf-8 bytes, adding TextIOWrapper makes
                # the c-engine especially for memory_map=True far slower
                is_text = False
                if "b" not in mode:
                    mode += "b"
            self.handles = get_handle(
                f,
                mode,
                encoding=self.options.get("encoding", None),
                compression=self.options.get("compression", None),
                memory_map=self.options.get("memory_map", False),
                is_text=is_text,
                errors=self.options.get("encoding_errors", "strict"),
                storage_options=self.options.get("storage_options", None),
            )
            assert self.handles is not None
            f = self.handles.handle

        elif engine != "python":
            # Only the python engine accepts a pre-parsed list of rows.
            msg = f"Invalid file path or buffer object type: {type(f)}"
            raise ValueError(msg)

        try:
            return mapping[engine](f, **self.options)
        except Exception:
            # Don't leak the handle opened above if engine construction fails.
            if self.handles is not None:
                self.handles.close()
            raise

    def _failover_to_python(self) -> None:
        # Not implemented on the base reader.
        raise AbstractMethodError(self)

    def read(self, nrows: int | None = None) -> DataFrame:
        """Read up to *nrows* rows (all remaining rows when None) and
        return them as a DataFrame (or squeezed Series, see below)."""
        if self.engine == "pyarrow":
            # pyarrow reads the whole input at once; nrows is not used here.
            try:
                # error: "ParserBase" has no attribute "read"
                df = self._engine.read()  # type: ignore[attr-defined]
            except Exception:
                self.close()
                raise
        else:
            nrows = validate_integer("nrows", nrows)
            try:
                # error: "ParserBase" has no attribute "read"
                (
                    index,
                    columns,
                    col_dict,
                ) = self._engine.read(  # type: ignore[attr-defined]
                    nrows
                )
            except Exception:
                self.close()
                raise

            if index is None:
                if col_dict:
                    # Any column is actually fine:
                    new_rows = len(next(iter(col_dict.values())))
                    index = RangeIndex(self._currow, self._currow + new_rows)
                else:
                    new_rows = 0
            else:
                new_rows = len(index)

            df = DataFrame(col_dict, columns=columns, index=index)

            self._currow += new_rows

        # Legacy squeeze option: a single-column result becomes a Series.
        if self.squeeze and len(df.columns) == 1:
            return df.squeeze("columns").copy()
        return df

    def get_chunk(self, size: int | None = None) -> DataFrame:
        """Read the next chunk (default ``self.chunksize`` rows), honoring
        an overall ``nrows`` limit across chunks."""
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self) -> TextFileReader:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()
def TextParser(*args, **kwds) -> TextFileReader:
    """
    Build a TextFileReader over in-memory rows or a file-like object.

    Converts lists of lists/tuples into DataFrames with proper type
    inference and optional (e.g. string to datetime) conversion, and
    enables lazy iteration over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, default
    header : int, default 0
        Row to use to parse column labels; rows before it are discarded.
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at the bottom of the file to skip
    converters : dict, optional
        Dict mapping column (integer position or label) to a function that
        takes one cell value (not a whole column) and returns the
        transformed value.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    squeeze : bool, default False
        Return a Series when only one column is present.
    infer_datetime_format : bool, default False
        If True and `parse_dates` is True for a column, try to infer the
        datetime format from the first datetime string; an inferable
        format often gives a large parsing speed-up.
    float_precision : str, optional
        Which converter the C engine should use for floating-point values:
        `None` or `high` for the ordinary converter, `legacy` for the
        original lower-precision pandas converter, and `round_trip` for
        the round-trip converter.

        .. versionchanged:: 1.2
    """
    # Always drive this path with the pure-python engine, overriding any
    # caller-supplied engine keyword.
    parser_kwds = dict(kwds, engine="python")
    return TextFileReader(*args, **parser_kwds)
1879def _clean_na_values(na_values, keep_default_na=True):
1880 na_fvalues: set | dict
1881 if na_values is None:
1882 if keep_default_na:
1883 na_values = STR_NA_VALUES
1884 else:
1885 na_values = set()
1886 na_fvalues = set()
1887 elif isinstance(na_values, dict):
1888 old_na_values = na_values.copy()
1889 na_values = {} # Prevent aliasing.
1891 # Convert the values in the na_values dictionary
1892 # into array-likes for further use. This is also
1893 # where we append the default NaN values, provided
1894 # that `keep_default_na=True`.
1895 for k, v in old_na_values.items():
1896 if not is_list_like(v):
1897 v = [v]
1899 if keep_default_na:
1900 v = set(v) | STR_NA_VALUES
1902 na_values[k] = v
1903 na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
1904 else:
1905 if not is_list_like(na_values):
1906 na_values = [na_values]
1907 na_values = _stringify_na_values(na_values)
1908 if keep_default_na:
1909 na_values = na_values | STR_NA_VALUES
1911 na_fvalues = _floatify_na_values(na_values)
1913 return na_values, na_fvalues
1916def _floatify_na_values(na_values):
1917 # create float versions of the na_values
1918 result = set()
1919 for v in na_values:
1920 try:
1921 v = float(v)
1922 if not np.isnan(v):
1923 result.add(v)
1924 except (TypeError, ValueError, OverflowError):
1925 pass
1926 return result
1929def _stringify_na_values(na_values):
1930 """return a stringified and numeric for these values"""
1931 result: list[str | float] = []
1932 for x in na_values:
1933 result.append(str(x))
1934 result.append(x)
1935 try:
1936 v = float(x)
1938 # we are like 999 here
1939 if v == int(v):
1940 v = int(v)
1941 result.append(f"{v}.0")
1942 result.append(str(v))
1944 result.append(v)
1945 except (TypeError, ValueError, OverflowError):
1946 pass
1947 try:
1948 result.append(int(x))
1949 except (TypeError, ValueError, OverflowError):
1950 pass
1951 return set(result)
1954def _refine_defaults_read(
1955 dialect: str | csv.Dialect | None,
1956 delimiter: str | None | lib.NoDefault,
1957 delim_whitespace: bool,
1958 engine: CSVEngine | None,
1959 sep: str | None | lib.NoDefault,
1960 error_bad_lines: bool | None,
1961 warn_bad_lines: bool | None,
1962 on_bad_lines: str | Callable | None,
1963 names: Sequence[Hashable] | None | lib.NoDefault,
1964 prefix: str | None | lib.NoDefault,
1965 defaults: dict[str, Any],
1966):
1967 """Validate/refine default values of input parameters of read_csv, read_table.
1969 Parameters
1970 ----------
1971 dialect : str or csv.Dialect
1972 If provided, this parameter will override values (default or not) for the
1973 following parameters: `delimiter`, `doublequote`, `escapechar`,
1974 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
1975 override values, a ParserWarning will be issued. See csv.Dialect
1976 documentation for more details.
1977 delimiter : str or object
1978 Alias for sep.
1979 delim_whitespace : bool
1980 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
1981 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
1982 is set to True, nothing should be passed in for the ``delimiter``
1983 parameter.
1984 engine : {{'c', 'python'}}
1985 Parser engine to use. The C engine is faster while the python engine is
1986 currently more feature-complete.
1987 sep : str or object
1988 A delimiter provided by the user (str) or a sentinel value, i.e.
1989 pandas._libs.lib.no_default.
1990 error_bad_lines : str or None
1991 Whether to error on a bad line or not.
1992 warn_bad_lines : str or None
1993 Whether to warn on a bad line or not.
1994 on_bad_lines : str, callable or None
1995 An option for handling bad lines or a sentinel value(None).
1996 names : array-like, optional
1997 List of column names to use. If the file contains a header row,
1998 then you should explicitly pass ``header=0`` to override the column names.
1999 Duplicates in this list are not allowed.
2000 prefix : str, optional
2001 Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
2002 defaults: dict
2003 Default values of input parameters.
2005 Returns
2006 -------
2007 kwds : dict
2008 Input parameters with correct values.
2010 Raises
2011 ------
2012 ValueError :
2013 If a delimiter was specified with ``sep`` (or ``delimiter``) and
2014 ``delim_whitespace=True``.
2015 If on_bad_lines is specified(not ``None``) and ``error_bad_lines``/
2016 ``warn_bad_lines`` is True.
2017 """
2018 # fix types for sep, delimiter to Union(str, Any)
2019 delim_default = defaults["delimiter"]
2020 kwds: dict[str, Any] = {}
2021 # gh-23761
2022 #
2023 # When a dialect is passed, it overrides any of the overlapping
2024 # parameters passed in directly. We don't want to warn if the
2025 # default parameters were passed in (since it probably means
2026 # that the user didn't pass them in explicitly in the first place).
2027 #
2028 # "delimiter" is the annoying corner case because we alias it to
2029 # "sep" before doing comparison to the dialect values later on.
2030 # Thus, we need a flag to indicate that we need to "override"
2031 # the comparison to dialect values by checking if default values
2032 # for BOTH "delimiter" and "sep" were provided.
2033 if dialect is not None:
2034 kwds["sep_override"] = delimiter is None and (
2035 sep is lib.no_default or sep == delim_default
2036 )
2038 if delimiter and (sep is not lib.no_default):
2039 raise ValueError("Specified a sep and a delimiter; you can only specify one.")
2041 if (
2042 names is not None
2043 and names is not lib.no_default
2044 and prefix is not None
2045 and prefix is not lib.no_default
2046 ):
2047 raise ValueError("Specified named and prefix; you can only specify one.")
2049 kwds["names"] = None if names is lib.no_default else names
2050 kwds["prefix"] = None if prefix is lib.no_default else prefix
2052 # Alias sep -> delimiter.
2053 if delimiter is None:
2054 delimiter = sep
2056 if delim_whitespace and (delimiter is not lib.no_default):
2057 raise ValueError(
2058 "Specified a delimiter with both sep and "
2059 "delim_whitespace=True; you can only specify one."
2060 )
2062 if delimiter == "\n":
2063 raise ValueError(
2064 r"Specified \n as separator or delimiter. This forces the python engine "
2065 "which does not accept a line terminator. Hence it is not allowed to use "
2066 "the line terminator as separator.",
2067 )
2069 if delimiter is lib.no_default:
2070 # assign default separator value
2071 kwds["delimiter"] = delim_default
2072 else:
2073 kwds["delimiter"] = delimiter
2075 if engine is not None:
2076 kwds["engine_specified"] = True
2077 else:
2078 kwds["engine"] = "c"
2079 kwds["engine_specified"] = False
2081 # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines
2082 # aren't specified at the same time. If so, raise. Otherwise,
2083 # alias on_bad_lines to "error" if error/warn_bad_lines not set
2084 # and on_bad_lines is not set. on_bad_lines is defaulted to None
2085 # so we can tell if it is set (this is why this hack exists).
2086 if on_bad_lines is not None:
2087 if error_bad_lines is not None or warn_bad_lines is not None:
2088 raise ValueError(
2089 "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. "
2090 "Please only set on_bad_lines."
2091 )
2092 if on_bad_lines == "error":
2093 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
2094 elif on_bad_lines == "warn":
2095 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
2096 elif on_bad_lines == "skip":
2097 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
2098 elif callable(on_bad_lines):
2099 if engine != "python":
2100 raise ValueError(
2101 "on_bad_line can only be a callable function if engine='python'"
2102 )
2103 kwds["on_bad_lines"] = on_bad_lines
2104 else:
2105 raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
2106 else:
2107 if error_bad_lines is not None:
2108 # Must check is_bool, because other stuff(e.g. non-empty lists) eval to true
2109 validate_bool_kwarg(error_bad_lines, "error_bad_lines")
2110 if error_bad_lines:
2111 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
2112 else:
2113 if warn_bad_lines is not None:
2114 # This is the case where error_bad_lines is False
2115 # We can only warn/skip if error_bad_lines is False
2116 # None doesn't work because backwards-compatibility reasons
2117 validate_bool_kwarg(warn_bad_lines, "warn_bad_lines")
2118 if warn_bad_lines:
2119 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
2120 else:
2121 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
2122 else:
2123 # Backwards compat, when only error_bad_lines = false, we warn
2124 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
2125 else:
2126 # Everything None -> Error
2127 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
2129 return kwds
2132def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
2133 """
2134 Extract concrete csv dialect instance.
2136 Returns
2137 -------
2138 csv.Dialect or None
2139 """
2140 if kwds.get("dialect") is None:
2141 return None
2143 dialect = kwds["dialect"]
2144 if dialect in csv.list_dialects():
2145 dialect = csv.get_dialect(dialect)
2147 _validate_dialect(dialect)
2149 return dialect
2152MANDATORY_DIALECT_ATTRS = (
2153 "delimiter",
2154 "doublequote",
2155 "escapechar",
2156 "skipinitialspace",
2157 "quotechar",
2158 "quoting",
2159)
2162def _validate_dialect(dialect: csv.Dialect) -> None:
2163 """
2164 Validate csv dialect instance.
2166 Raises
2167 ------
2168 ValueError
2169 If incorrect dialect is provided.
2170 """
2171 for param in MANDATORY_DIALECT_ATTRS:
2172 if not hasattr(dialect, param):
2173 raise ValueError(f"Invalid dialect {dialect} provided")
2176def _merge_with_dialect_properties(
2177 dialect: csv.Dialect,
2178 defaults: dict[str, Any],
2179) -> dict[str, Any]:
2180 """
2181 Merge default kwargs in TextFileReader with dialect parameters.
2183 Parameters
2184 ----------
2185 dialect : csv.Dialect
2186 Concrete csv dialect. See csv.Dialect documentation for more details.
2187 defaults : dict
2188 Keyword arguments passed to TextFileReader.
2190 Returns
2191 -------
2192 kwds : dict
2193 Updated keyword arguments, merged with dialect parameters.
2194 """
2195 kwds = defaults.copy()
2197 for param in MANDATORY_DIALECT_ATTRS:
2198 dialect_val = getattr(dialect, param)
2200 parser_default = parser_defaults[param]
2201 provided = kwds.get(param, parser_default)
2203 # Messages for conflicting values between the dialect
2204 # instance and the actual parameters provided.
2205 conflict_msgs = []
2207 # Don't warn if the default parameter was passed in,
2208 # even if it conflicts with the dialect (gh-23761).
2209 if provided != parser_default and provided != dialect_val:
2210 msg = (
2211 f"Conflicting values for '{param}': '{provided}' was "
2212 f"provided, but the dialect specifies '{dialect_val}'. "
2213 "Using the dialect-specified value."
2214 )
2216 # Annoying corner case for not warning about
2217 # conflicts between dialect and delimiter parameter.
2218 # Refer to the outer "_read_" function for more info.
2219 if not (param == "delimiter" and kwds.pop("sep_override", False)):
2220 conflict_msgs.append(msg)
2222 if conflict_msgs:
2223 warnings.warn(
2224 "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level()
2225 )
2226 kwds[param] = dialect_val
2227 return kwds
2230def _validate_skipfooter(kwds: dict[str, Any]) -> None:
2231 """
2232 Check whether skipfooter is compatible with other kwargs in TextFileReader.
2234 Parameters
2235 ----------
2236 kwds : dict
2237 Keyword arguments passed to TextFileReader.
2239 Raises
2240 ------
2241 ValueError
2242 If skipfooter is not compatible with other parameters.
2243 """
2244 if kwds.get("skipfooter"):
2245 if kwds.get("iterator") or kwds.get("chunksize"):
2246 raise ValueError("'skipfooter' not supported for iteration")
2247 if kwds.get("nrows"):
2248 raise ValueError("'skipfooter' not supported with 'nrows'")