Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py: 11%
559 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from collections import defaultdict
4from copy import copy
5import csv
6import datetime
7from enum import Enum
8import itertools
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13 DefaultDict,
14 Hashable,
15 Iterable,
16 List,
17 Mapping,
18 Sequence,
19 Tuple,
20 cast,
21 final,
22 overload,
23)
24import warnings
26import numpy as np
28import pandas._libs.lib as lib
29import pandas._libs.ops as libops
30import pandas._libs.parsers as parsers
31from pandas._libs.parsers import STR_NA_VALUES
32from pandas._libs.tslibs import parsing
33from pandas._typing import (
34 ArrayLike,
35 DtypeArg,
36 Scalar,
37)
38from pandas.errors import (
39 ParserError,
40 ParserWarning,
41)
42from pandas.util._exceptions import find_stack_level
44from pandas.core.dtypes.astype import astype_nansafe
45from pandas.core.dtypes.common import (
46 ensure_object,
47 is_bool_dtype,
48 is_categorical_dtype,
49 is_dict_like,
50 is_dtype_equal,
51 is_extension_array_dtype,
52 is_integer,
53 is_integer_dtype,
54 is_list_like,
55 is_object_dtype,
56 is_scalar,
57 is_string_dtype,
58 pandas_dtype,
59)
60from pandas.core.dtypes.dtypes import CategoricalDtype
61from pandas.core.dtypes.missing import isna
63from pandas.core import algorithms
64from pandas.core.arrays import Categorical
65from pandas.core.indexes.api import (
66 Index,
67 MultiIndex,
68 ensure_index_from_sequences,
69)
70from pandas.core.series import Series
71from pandas.core.tools import datetimes as tools
73from pandas.io.date_converters import generic_parser
75if TYPE_CHECKING: 75 ↛ 76line 75 didn't jump to line 76, because the condition on line 75 was never true
76 from pandas import DataFrame
class ParserBase:
    """
    Functionality common to the concrete CSV parser engines
    (C parser, Python parser, pyarrow parser): option validation,
    NA handling, dtype casting, date parsing and index construction.
    """

    class BadLineHandleMethod(Enum):
        # How to react to a malformed line: raise, warn-and-skip, or skip silently.
        ERROR = 0
        WARN = 1
        SKIP = 2

    # True when an index is implied by the data rather than given via index_col.
    _implicit_index: bool = False
    _first_chunk: bool

    def __init__(self, kwds) -> None:
        """Initialize parser state from the keyword-argument dict ``kwds``."""
        self.names = kwds.get("names")
        self.orig_names: list | None = None
        self.prefix = kwds.pop("prefix", None)

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        # copy() so later mutation of our dtype mapping cannot leak to the caller
        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            dayfirst=self.dayfirst,
            infer_datetime_format=self.infer_datetime_format,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            # A list-like header means a MultiIndex header; several options
            # are incompatible with that.
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate index_col that only contains integers
            if self.index_col is not None:
                if not (
                    is_list_like(self.index_col, allow_sets=False)
                    and all(map(is_integer, self.index_col))
                    or is_integer(self.index_col)
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )
        elif self.header is not None and self.prefix is not None:
            # GH 27394
            raise ValueError(
                "Argument prefix must be None if argument header is not None"
            )

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
        # Normally, this arg would get pre-processed earlier on
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

    def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
        """
        Check if parse_dates are in columns.

        If user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Returns
        -------
        The names of the columns which will get parsed later if a dict or list
        is given as specification.

        Raises
        ------
        ValueError
            If column to parse_date is not in dataframe.
        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) and not isinstance(col, tuple) else [col]
                for col in self.parse_dates
            )
        else:
            cols_needed = []

        cols_needed = list(cols_needed)

        # get only columns that are references using names (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
        # Convert positions to actual column names
        return [
            col if (isinstance(col, str) or col in columns) else columns[col]
            for col in cols_needed
        ]

    def close(self) -> None:
        # Concrete engines override this to release their resources.
        pass

    @final
    @property
    def _has_complex_date_col(self) -> bool:
        # True when parse_dates combines multiple columns into one date
        # (dict spec, or a list whose first element is itself a list).
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    @final
    def _should_parse_dates(self, i: int) -> bool:
        """Return True if index level ``i`` should be parsed as dates."""
        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            # j is the positional column number of this index level
            j = i if self.index_col is None else self.index_col[i]

            if is_scalar(self.parse_dates):
                return (j == self.parse_dates) or (
                    name is not None and name == self.parse_dates
                )
            else:
                return (j in self.parse_dates) or (
                    name is not None and name in self.parse_dates
                )

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: list | None,
        passed_names: bool = False,
    ):
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header: list of lists
            The header rows
        index_names: list, optional
            The names of the future index
        passed_names: bool, default False
            A flag specifying if names where passed

        """
        if len(header) < 2:
            # single header row: nothing multi-index about it
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        def extract(r):
            # drop the index columns from one header row
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    @final
    def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
        # see gh-7160 and gh-9424: this helps to provide
        # immediate alleviation of the duplicate names
        # issue and appears to be satisfactory to users,
        # but ultimately, not needing to butcher the names
        # would be nice!
        if self.mangle_dupe_cols:
            names = list(names)  # so we can index
            counts: DefaultDict[Hashable, int] = defaultdict(int)
            is_potential_mi = _is_potential_multi_index(names, self.index_col)

            for i, col in enumerate(names):
                cur_count = counts[col]

                while cur_count > 0:
                    counts[col] = cur_count + 1

                    # rename "x" duplicates to "x.1", "x.2", ... (last tuple
                    # element only for potential MultiIndex columns)
                    if is_potential_mi:
                        # for mypy
                        assert isinstance(col, tuple)
                        col = col[:-1] + (f"{col[-1]}.{cur_count}",)
                    else:
                        col = f"{col}.{cur_count}"
                    cur_count = counts[col]

                names[i] = col
                counts[col] = cur_count + 1

        return names

    @final
    def _maybe_make_multi_index_columns(
        self,
        columns: Sequence[Hashable],
        col_names: Sequence[Hashable] | None = None,
    ) -> Sequence[Hashable] | MultiIndex:
        # possibly create a column mi here
        if _is_potential_multi_index(columns):
            list_columns = cast(List[Tuple], columns)
            return MultiIndex.from_tuples(list_columns, names=col_names)
        return columns

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        """Build the row index (and possibly MultiIndex columns) for a chunk."""
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            # dates were already parsed while building the complex date index
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    @final
    def _get_simple_index(self, data, columns):
        """Pop the positional index_col columns out of ``data``/``columns``."""

        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    @final
    def _get_complex_date_index(self, data, col_names):
        """Pop the index columns (by name) used for combined date parsing."""

        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    def _clean_mapping(self, mapping):
        """converts col numbers to names"""
        if not isinstance(mapping, dict):
            return mapping
        clean = {}
        # for mypy
        assert self.orig_names is not None

        for col, v in mapping.items():
            # translate positional keys to column names (unless the integer
            # itself is a column name)
            if isinstance(col, int) and col not in self.orig_names:
                col = self.orig_names[col]
            clean[col] = v
        if isinstance(mapping, defaultdict):
            # keep the default for every column the user did not mention
            remaining_cols = set(self.orig_names) - set(clean.keys())
            clean.update({col: mapping[col] for col in remaining_cols})
        return clean

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        """Convert the raw index arrays into a pandas Index / MultiIndex."""
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            # skip numeric/bool inference when the user pinned a string dtype
            # or supplied a converter for this index level
            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        """Apply converters, NA filtering, inference and dtype casts column-wise."""
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    # error: Argument 2 to "isin" has incompatible type "List[Any]";
                    # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                    mask = algorithms.isin(
                        values, list(na_values)  # type: ignore[arg-type]
                    ).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool=False
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type and (
                    not is_dtype_equal(cvals, cast_type)
                    or is_extension_array_dtype(cast_type)
                ):
                    if not is_ea and na_count > 0:
                        try:
                            if is_bool_dtype(cast_type):
                                raise ValueError(
                                    f"Bool column has NA values in column {c}"
                                )
                        except (AttributeError, TypeError):
                            # invalid input to is_bool_dtype
                            pass
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result

    @final
    def _set_noconvert_dtype_columns(
        self, col_indices: list[int], names: Sequence[Hashable]
    ) -> set[int]:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions. If usecols is specified, the positions of the columns
        not to cast is relative to the usecols not to all columns.

        Parameters
        ----------
        col_indices: The indices specifying order and positions of the columns
        names: The column names which order is corresponding with the order
            of col_indices

        Returns
        -------
        A set of integers containing the positions of the columns not to convert.
        """
        usecols: list[int] | list[str] | None
        noconvert_columns = set()
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = sorted(self.usecols)
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = col_indices
        else:
            # Usecols is empty.
            usecols = None

        def _set(x) -> int:
            # normalize a parse_dates entry (position or name) to a position
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = col_indices[names.index(x)]

            return x

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    noconvert_columns.add(_set(k))
            elif self.index_col is not None:
                noconvert_columns.add(_set(self.index_col))

        return noconvert_columns

    def _infer_types(self, values, na_values, try_num_bool=True):
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        try_num_bool : bool, default try
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    # ints cannot hold NaN; upcast to float first
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g DatetimeIndex here
            try:
                result, _ = lib.maybe_convert_numeric(values, na_values, False)
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                result = values
                na_count = parsers.sanitize_objects(result, na_values)
            else:
                na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            # numeric conversion failed; try mapping true_values/false_values
            result, _ = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
            )

        return result, na_count

    def _cast_types(self, values, cast_type, column):
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray
        cast_type : string or np.dtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray
        """
        if is_categorical_dtype(cast_type):
            known_cats = (
                isinstance(cast_type, CategoricalDtype)
                and cast_type.categories is not None
            )

            if not is_object_dtype(values) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings

                values = astype_nansafe(values, np.dtype(str))

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif is_extension_array_dtype(cast_type):
            # ensure cast_type is an actual dtype and not a string
            cast_type = pandas_dtype(cast_type)
            array_type = cast_type.construct_array_type()
            try:
                if is_bool_dtype(cast_type):
                    return array_type._from_sequence_of_strings(
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        else:
            try:
                values = astype_nansafe(values, cast_type, copy=True, skipna=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
            )

        return names, data

    def _check_data_length(
        self,
        columns: Sequence[Hashable],
        data: Sequence[ArrayLike],
    ) -> None:
        """Checks if length of data is equal to length of column names.

        One set of trailing commas is allowed. self.index_col not False
        results in a ParserError previously when lengths do not match.

        Parameters
        ----------
        columns: list of column names
        data: list of array-likes containing the data column-wise.
        """
        if not self.index_col and len(columns) != len(data) and columns:
            empty_str = is_object_dtype(data[-1]) and data[-1] == ""
            # error: No overload variant of "__ror__" of "ndarray" matches
            # argument type "ExtensionArray"
            empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
            if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                # exactly one extra, all-empty trailing column: tolerated
                return
            warnings.warn(
                "Length of header or names does not match length of data. This leads "
                "to a loss of data with index_col=False.",
                ParserWarning,
                stacklevel=find_stack_level(),
            )

    @overload
    def _evaluate_usecols(
        self,
        usecols: set[int] | Callable[[Hashable], object],
        names: Sequence[Hashable],
    ) -> set[int]:
        ...

    @overload
    def _evaluate_usecols(
        self, usecols: set[str], names: Sequence[Hashable]
    ) -> set[str]:
        ...

    def _evaluate_usecols(
        self,
        usecols: Callable[[Hashable], object] | set[str] | set[int],
        names: Sequence[Hashable],
    ) -> set[str] | set[int]:
        """
        Check whether or not the 'usecols' parameter
        is a callable. If so, enumerates the 'names'
        parameter and returns a set of indices for
        each entry in 'names' that evaluates to True.
        If not a callable, returns 'usecols'.
        """
        if callable(usecols):
            return {i for i, name in enumerate(names) if usecols(name)}
        return usecols

    def _validate_usecols_names(self, usecols, names):
        """
        Validates that all usecols are present in a given
        list of names. If not, raise a ValueError that
        shows what usecols are missing.

        Parameters
        ----------
        usecols : iterable of usecols
            The columns to validate are present in names.
        names : iterable of names
            The column names to check against.

        Returns
        -------
        usecols : iterable of usecols
            The `usecols` parameter if the validation succeeds.

        Raises
        ------
        ValueError : Columns were missing. Error message will list them.
        """
        missing = [c for c in usecols if c not in names]
        if len(missing) > 0:
            raise ValueError(
                f"Usecols do not match columns, columns expected but not found: "
                f"{missing}"
            )

        return usecols

    def _validate_usecols_arg(self, usecols):
        """
        Validate the 'usecols' parameter.

        Checks whether or not the 'usecols' parameter contains all integers
        (column selection by index), strings (column by name) or is a callable.
        Raises a ValueError if that is not the case.

        Parameters
        ----------
        usecols : list-like, callable, or None
            List of columns to use when parsing or a callable that can be used
            to filter a list of table columns.

        Returns
        -------
        usecols_tuple : tuple
            A tuple of (verified_usecols, usecols_dtype).

            'verified_usecols' is either a set if an array-like is passed in or
            'usecols' if a callable or None is passed in.

            'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
            is passed in or None if a callable or None is passed in.
        """
        msg = (
            "'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable."
        )
        if usecols is not None:
            if callable(usecols):
                return usecols, None

            if not is_list_like(usecols):
                # see gh-20529
                #
                # Ensure it is iterable container but not string.
                raise ValueError(msg)

            usecols_dtype = lib.infer_dtype(usecols, skipna=False)

            if usecols_dtype not in ("empty", "integer", "string"):
                raise ValueError(msg)

            usecols = set(usecols)

            return usecols, usecols_dtype
        return usecols, None

    def _clean_index_names(self, columns, index_col):
        """Resolve index_col entries to positions and pull their names out."""
        if not is_index_col(index_col):
            return None, columns, index_col

        columns = list(columns)

        # In case of no rows and multiindex columns we have to set index_names to
        # list of Nones GH#38292
        if not columns:
            return [None] * len(index_col), columns, index_col

        cp_cols = list(columns)
        index_names: list[str | int | None] = []

        # don't mutate
        index_col = list(index_col)

        for i, c in enumerate(index_col):
            if isinstance(c, str):
                index_names.append(c)
                for j, name in enumerate(cp_cols):
                    if name == c:
                        index_col[i] = j
                        columns.remove(name)
                        break
            else:
                name = cp_cols[c]
                columns.remove(name)
                index_names.append(name)

        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None

        return index_names, columns, index_col

    def _get_empty_meta(
        self, columns, index_col, index_names, dtype: DtypeArg | None = None
    ):
        """Build an (index, columns, col_dict) triple for a zero-row result."""
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        if (index_col is None or index_col is False) or index_names is None:
            index = Index([])
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
        }

        return index, columns, col_dict
def _make_date_converter(
    date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
):
    """
    Return a converter callable that parses one or more date columns.

    The converter accepts the raw column arrays as positional arguments and
    returns the parsed datetime values. The fallback chain below is
    order-sensitive and deliberately best-effort (errors="ignore").
    """

    def converter(*date_cols):
        if date_parser is None:
            # No user parser: concatenate the columns into strings and let
            # to_datetime infer the format.
            strs = parsing.concat_date_cols(date_cols)

            try:
                return tools.to_datetime(
                    ensure_object(strs),
                    utc=None,
                    dayfirst=dayfirst,
                    errors="ignore",
                    infer_datetime_format=infer_datetime_format,
                    cache=cache_dates,
                ).to_numpy()

            except ValueError:
                # fall back to element-wise parsing of the concatenated strings
                return tools.to_datetime(
                    parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
                )
        else:
            try:
                # first try: pass the raw column arrays straight to the user parser
                result = tools.to_datetime(
                    date_parser(*date_cols), errors="ignore", cache=cache_dates
                )
                if isinstance(result, datetime.datetime):
                    # a scalar means the parser does not accept arrays;
                    # trigger the element-wise fallback below
                    raise Exception("scalar parser")
                return result
            except Exception:
                try:
                    # second try: apply the user parser element-wise over the
                    # concatenated string representation
                    return tools.to_datetime(
                        parsing.try_parse_dates(
                            parsing.concat_date_cols(date_cols),
                            parser=date_parser,
                            dayfirst=dayfirst,
                        ),
                        errors="ignore",
                    )
                except Exception:
                    # last resort: generic row-by-row invocation of date_parser
                    return generic_parser(date_parser, *date_cols)

    return converter
# Default values for every keyword the parser engines understand; individual
# read_* front-ends override entries from user-supplied arguments.
parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "prefix": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "squeeze": None,
    "compression": None,
    "mangle_dupe_cols": True,
    "infer_datetime_format": False,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "error_bad_lines": None,
    "warn_bad_lines": None,
}
1148def _process_date_conversion(
1149 data_dict,
1150 converter: Callable,
1151 parse_spec,
1152 index_col,
1153 index_names,
1154 columns,
1155 keep_date_col: bool = False,
1156):
1157 def _isindex(colspec):
1158 return (isinstance(index_col, list) and colspec in index_col) or (
1159 isinstance(index_names, list) and colspec in index_names
1160 )
1162 new_cols = []
1163 new_data = {}
1165 orig_names = columns
1166 columns = list(columns)
1168 date_cols = set()
1170 if parse_spec is None or isinstance(parse_spec, bool):
1171 return data_dict, columns
1173 if isinstance(parse_spec, list):
1174 # list of column lists
1175 for colspec in parse_spec:
1176 if is_scalar(colspec) or isinstance(colspec, tuple):
1177 if isinstance(colspec, int) and colspec not in data_dict:
1178 colspec = orig_names[colspec]
1179 if _isindex(colspec):
1180 continue
1181 # Pyarrow engine returns Series which we need to convert to
1182 # numpy array before converter, its a no-op for other parsers
1183 data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
1184 else:
1185 new_name, col, old_names = _try_convert_dates(
1186 converter, colspec, data_dict, orig_names
1187 )
1188 if new_name in data_dict:
1189 raise ValueError(f"New date column already in dict {new_name}")
1190 new_data[new_name] = col
1191 new_cols.append(new_name)
1192 date_cols.update(old_names)
1194 elif isinstance(parse_spec, dict):
1195 # dict of new name to column list
1196 for new_name, colspec in parse_spec.items():
1197 if new_name in data_dict:
1198 raise ValueError(f"Date column {new_name} already in dict")
1200 _, col, old_names = _try_convert_dates(
1201 converter, colspec, data_dict, orig_names
1202 )
1204 new_data[new_name] = col
1206 # If original column can be converted to date we keep the converted values
1207 # This can only happen if values are from single column
1208 if len(colspec) == 1:
1209 new_data[colspec[0]] = col
1211 new_cols.append(new_name)
1212 date_cols.update(old_names)
1214 data_dict.update(new_data)
1215 new_cols.extend(columns)
1217 if not keep_date_col:
1218 for c in list(date_cols):
1219 data_dict.pop(c)
1220 new_cols.remove(c)
1222 return data_dict, new_cols
1225def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
1226 colset = set(columns)
1227 colnames = []
1229 for c in colspec:
1230 if c in colset:
1231 colnames.append(c)
1232 elif isinstance(c, int) and c not in columns:
1233 colnames.append(columns[c])
1234 else:
1235 colnames.append(c)
1237 new_name: tuple | str
1238 if all(isinstance(x, tuple) for x in colnames):
1239 new_name = tuple(map("_".join, zip(*colnames)))
1240 else:
1241 new_name = "_".join([str(x) for x in colnames])
1242 to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]
1244 new_col = parser(*to_parse)
1245 return new_name, new_col, colnames
1248def _get_na_values(col, na_values, na_fvalues, keep_default_na):
1249 """
1250 Get the NaN values for a given column.
1252 Parameters
1253 ----------
1254 col : str
1255 The name of the column.
1256 na_values : array-like, dict
1257 The object listing the NaN values as strings.
1258 na_fvalues : array-like, dict
1259 The object listing the NaN values as floats.
1260 keep_default_na : bool
1261 If `na_values` is a dict, and the column is not mapped in the
1262 dictionary, whether to return the default NaN values or the empty set.
1264 Returns
1265 -------
1266 nan_tuple : A length-two tuple composed of
1268 1) na_values : the string NaN values for that column.
1269 2) na_fvalues : the float NaN values for that column.
1270 """
1271 if isinstance(na_values, dict):
1272 if col in na_values:
1273 return na_values[col], na_fvalues[col]
1274 else:
1275 if keep_default_na:
1276 return STR_NA_VALUES, set()
1278 return set(), set()
1279 else:
1280 return na_values, na_fvalues
1283def _is_potential_multi_index(
1284 columns: Sequence[Hashable] | MultiIndex,
1285 index_col: bool | Sequence[int] | None = None,
1286) -> bool:
1287 """
1288 Check whether or not the `columns` parameter
1289 could be converted into a MultiIndex.
1291 Parameters
1292 ----------
1293 columns : array-like
1294 Object which may or may not be convertible into a MultiIndex
1295 index_col : None, bool or list, optional
1296 Column or columns to use as the (possibly hierarchical) index
1298 Returns
1299 -------
1300 bool : Whether or not columns could become a MultiIndex
1301 """
1302 if index_col is None or isinstance(index_col, bool):
1303 index_col = []
1305 return bool(
1306 len(columns)
1307 and not isinstance(columns, MultiIndex)
1308 and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
1309 )
1312def _validate_parse_dates_arg(parse_dates):
1313 """
1314 Check whether or not the 'parse_dates' parameter
1315 is a non-boolean scalar. Raises a ValueError if
1316 that is the case.
1317 """
1318 msg = (
1319 "Only booleans, lists, and dictionaries are accepted "
1320 "for the 'parse_dates' parameter"
1321 )
1323 if parse_dates is not None:
1324 if is_scalar(parse_dates):
1325 if not lib.is_bool(parse_dates):
1326 raise TypeError(msg)
1328 elif not isinstance(parse_dates, (list, dict)):
1329 raise TypeError(msg)
1331 return parse_dates
def is_index_col(col) -> bool:
    """Return True when ``col`` is a usable index_col specification
    (i.e. anything other than None or the literal False)."""
    return not (col is None or col is False)