Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py: 11%
559 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from collections import defaultdict
4from copy import copy
5import csv
6import datetime
7from enum import Enum
8import itertools
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13 DefaultDict,
14 Hashable,
15 Iterable,
16 List,
17 Mapping,
18 Sequence,
19 Tuple,
20 cast,
21 final,
22 overload,
23)
24import warnings
26import numpy as np
28import pandas._libs.lib as lib
29import pandas._libs.ops as libops
30import pandas._libs.parsers as parsers
31from pandas._libs.parsers import STR_NA_VALUES
32from pandas._libs.tslibs import parsing
33from pandas._typing import (
34 ArrayLike,
35 DtypeArg,
36 Scalar,
37)
38from pandas.errors import (
39 ParserError,
40 ParserWarning,
41)
42from pandas.util._exceptions import find_stack_level
44from pandas.core.dtypes.astype import astype_nansafe
45from pandas.core.dtypes.common import (
46 ensure_object,
47 is_bool_dtype,
48 is_categorical_dtype,
49 is_dict_like,
50 is_dtype_equal,
51 is_extension_array_dtype,
52 is_integer,
53 is_integer_dtype,
54 is_list_like,
55 is_object_dtype,
56 is_scalar,
57 is_string_dtype,
58 pandas_dtype,
59)
60from pandas.core.dtypes.dtypes import CategoricalDtype
61from pandas.core.dtypes.missing import isna
63from pandas.core import algorithms
64from pandas.core.arrays import Categorical
65from pandas.core.indexes.api import (
66 Index,
67 MultiIndex,
68 ensure_index_from_sequences,
69)
70from pandas.core.series import Series
71from pandas.core.tools import datetimes as tools
73from pandas.io.date_converters import generic_parser
75if TYPE_CHECKING: 75 ↛ 76line 75 didn't jump to line 76, because the condition on line 75 was never true
76 from pandas import DataFrame
class ParserBase:
    """
    Functionality common to the concrete CSV parser engines
    (C parser, Python parser, pyarrow parser): option validation,
    NA handling, dtype casting, date parsing and index construction.
    """

    class BadLineHandleMethod(Enum):
        # How to react to a malformed line: raise, warn-and-skip, or skip silently.
        ERROR = 0
        WARN = 1
        SKIP = 2

    # True when an index is implied by the data rather than given via index_col.
    _implicit_index: bool = False
    _first_chunk: bool

    def __init__(self, kwds) -> None:
        """Initialize parser state from the keyword-argument dict ``kwds``."""
        self.names = kwds.get("names")
        self.orig_names: list | None = None
        self.prefix = kwds.pop("prefix", None)

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        # copy() so later mutation of our dtype mapping cannot leak to the caller
        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            dayfirst=self.dayfirst,
            infer_datetime_format=self.infer_datetime_format,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            # A list-like header means a MultiIndex header; several options
            # are incompatible with that.
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate index_col that only contains integers
            if self.index_col is not None:
                if not (
                    is_list_like(self.index_col, allow_sets=False)
                    and all(map(is_integer, self.index_col))
                    or is_integer(self.index_col)
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )
        elif self.header is not None and self.prefix is not None:
            # GH 27394
            raise ValueError(
                "Argument prefix must be None if argument header is not None"
            )

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
        # Normally, this arg would get pre-processed earlier on
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

    def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
        """
        Check if parse_dates are in columns.

        If user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Returns
        -------
        The names of the columns which will get parsed later if a dict or list
        is given as specification.

        Raises
        ------
        ValueError
            If column to parse_date is not in dataframe.
        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) and not isinstance(col, tuple) else [col]
                for col in self.parse_dates
            )
        else:
            cols_needed = []

        cols_needed = list(cols_needed)

        # get only columns that are references using names (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
        # Convert positions to actual column names
        return [
            col if (isinstance(col, str) or col in columns) else columns[col]
            for col in cols_needed
        ]

    def close(self) -> None:
        # Concrete engines override this to release their resources.
        pass

    @final
    @property
    def _has_complex_date_col(self) -> bool:
        # True when parse_dates combines multiple columns into one date
        # (dict spec, or a list whose first element is itself a list).
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    @final
    def _should_parse_dates(self, i: int) -> bool:
        """Return True if index level ``i`` should be parsed as dates."""
        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            # j is the positional column number of this index level
            j = i if self.index_col is None else self.index_col[i]

            if is_scalar(self.parse_dates):
                return (j == self.parse_dates) or (
                    name is not None and name == self.parse_dates
                )
            else:
                return (j in self.parse_dates) or (
                    name is not None and name in self.parse_dates
                )

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: list | None,
        passed_names: bool = False,
    ):
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header: list of lists
            The header rows
        index_names: list, optional
            The names of the future index
        passed_names: bool, default False
            A flag specifying if names where passed

        """
        if len(header) < 2:
            # single header row: nothing multi-index about it
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        def extract(r):
            # drop the index columns from one header row
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    @final
    def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
        # see gh-7160 and gh-9424: this helps to provide
        # immediate alleviation of the duplicate names
        # issue and appears to be satisfactory to users,
        # but ultimately, not needing to butcher the names
        # would be nice!
        if self.mangle_dupe_cols:
            names = list(names)  # so we can index
            counts: DefaultDict[Hashable, int] = defaultdict(int)
            is_potential_mi = _is_potential_multi_index(names, self.index_col)

            for i, col in enumerate(names):
                cur_count = counts[col]

                while cur_count > 0:
                    counts[col] = cur_count + 1

                    # rename "x" duplicates to "x.1", "x.2", ... (last tuple
                    # element only for potential MultiIndex columns)
                    if is_potential_mi:
                        # for mypy
                        assert isinstance(col, tuple)
                        col = col[:-1] + (f"{col[-1]}.{cur_count}",)
                    else:
                        col = f"{col}.{cur_count}"
                    cur_count = counts[col]

                names[i] = col
                counts[col] = cur_count + 1

        return names

    @final
    def _maybe_make_multi_index_columns(
        self,
        columns: Sequence[Hashable],
        col_names: Sequence[Hashable] | None = None,
    ) -> Sequence[Hashable] | MultiIndex:
        # possibly create a column mi here
        if _is_potential_multi_index(columns):
            list_columns = cast(List[Tuple], columns)
            return MultiIndex.from_tuples(list_columns, names=col_names)
        return columns

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        """Build the row index (and possibly MultiIndex columns) for a chunk."""
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            # dates were already parsed while building the complex date index
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    @final
    def _get_simple_index(self, data, columns):
        """Pop the positional index_col columns out of ``data``/``columns``."""

        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    @final
    def _get_complex_date_index(self, data, col_names):
        """Pop the index columns (by name) used for combined date parsing."""

        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    def _clean_mapping(self, mapping):
        """converts col numbers to names"""
        if not isinstance(mapping, dict):
            return mapping
        clean = {}
        # for mypy
        assert self.orig_names is not None

        for col, v in mapping.items():
            # translate positional keys to column names (unless the integer
            # itself is a column name)
            if isinstance(col, int) and col not in self.orig_names:
                col = self.orig_names[col]
            clean[col] = v
        if isinstance(mapping, defaultdict):
            # keep the default for every column the user did not mention
            remaining_cols = set(self.orig_names) - set(clean.keys())
            clean.update({col: mapping[col] for col in remaining_cols})
        return clean

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        """Convert the raw index arrays into a pandas Index / MultiIndex."""
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            # skip numeric/bool inference when the user pinned a string dtype
            # or supplied a converter for this index level
            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        """Apply converters, NA filtering, inference and dtype casts column-wise."""
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    # error: Argument 2 to "isin" has incompatible type "List[Any]";
                    # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                    mask = algorithms.isin(
                        values, list(na_values)  # type: ignore[arg-type]
                    ).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool=False
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type and (
                    not is_dtype_equal(cvals, cast_type)
                    or is_extension_array_dtype(cast_type)
                ):
                    if not is_ea and na_count > 0:
                        try:
                            if is_bool_dtype(cast_type):
                                raise ValueError(
                                    f"Bool column has NA values in column {c}"
                                )
                        except (AttributeError, TypeError):
                            # invalid input to is_bool_dtype
                            pass
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result

    @final
    def _set_noconvert_dtype_columns(
        self, col_indices: list[int], names: Sequence[Hashable]
    ) -> set[int]:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions. If usecols is specified, the positions of the columns
        not to cast is relative to the usecols not to all columns.

        Parameters
        ----------
        col_indices: The indices specifying order and positions of the columns
        names: The column names which order is corresponding with the order
            of col_indices

        Returns
        -------
        A set of integers containing the positions of the columns not to convert.
        """
        usecols: list[int] | list[str] | None
        noconvert_columns = set()
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = sorted(self.usecols)
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = col_indices
        else:
            # Usecols is empty.
            usecols = None

        def _set(x) -> int:
            # normalize a parse_dates entry (position or name) to a position
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = col_indices[names.index(x)]

            return x

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    noconvert_columns.add(_set(k))
            elif self.index_col is not None:
                noconvert_columns.add(_set(self.index_col))

        return noconvert_columns

    def _infer_types(self, values, na_values, try_num_bool=True):
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        try_num_bool : bool, default try
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    # ints cannot hold NaN; upcast to float first
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g DatetimeIndex here
            try:
                result, _ = lib.maybe_convert_numeric(values, na_values, False)
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                result = values
                na_count = parsers.sanitize_objects(result, na_values)
            else:
                na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            # numeric conversion failed; try mapping true_values/false_values
            result, _ = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
            )

        return result, na_count

    def _cast_types(self, values, cast_type, column):
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray
        cast_type : string or np.dtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray
        """
        if is_categorical_dtype(cast_type):
            known_cats = (
                isinstance(cast_type, CategoricalDtype)
                and cast_type.categories is not None
            )

            if not is_object_dtype(values) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings

                values = astype_nansafe(values, np.dtype(str))

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif is_extension_array_dtype(cast_type):
            # ensure cast_type is an actual dtype and not a string
            cast_type = pandas_dtype(cast_type)
            array_type = cast_type.construct_array_type()
            try:
                if is_bool_dtype(cast_type):
                    return array_type._from_sequence_of_strings(
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        else:
            try:
                values = astype_nansafe(values, cast_type, copy=True, skipna=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
            )

        return names, data

    def _check_data_length(
        self,
        columns: Sequence[Hashable],
        data: Sequence[ArrayLike],
    ) -> None:
        """Checks if length of data is equal to length of column names.

        One set of trailing commas is allowed. self.index_col not False
        results in a ParserError previously when lengths do not match.

        Parameters
        ----------
        columns: list of column names
        data: list of array-likes containing the data column-wise.
        """
        if not self.index_col and len(columns) != len(data) and columns:
            empty_str = is_object_dtype(data[-1]) and data[-1] == ""
            # error: No overload variant of "__ror__" of "ndarray" matches
            # argument type "ExtensionArray"
            empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
            if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                # exactly one extra, all-empty trailing column: tolerated
                return
            warnings.warn(
                "Length of header or names does not match length of data. This leads "
                "to a loss of data with index_col=False.",
                ParserWarning,
                stacklevel=find_stack_level(),
            )

    @overload
    def _evaluate_usecols(
        self,
        usecols: set[int] | Callable[[Hashable], object],
        names: Sequence[Hashable],
    ) -> set[int]:
        ...

    @overload
    def _evaluate_usecols(
        self, usecols: set[str], names: Sequence[Hashable]
    ) -> set[str]:
        ...

    def _evaluate_usecols(
        self,
        usecols: Callable[[Hashable], object] | set[str] | set[int],
        names: Sequence[Hashable],
    ) -> set[str] | set[int]:
        """
        Check whether or not the 'usecols' parameter
        is a callable. If so, enumerates the 'names'
        parameter and returns a set of indices for
        each entry in 'names' that evaluates to True.
        If not a callable, returns 'usecols'.
        """
        if callable(usecols):
            return {i for i, name in enumerate(names) if usecols(name)}
        return usecols

    def _validate_usecols_names(self, usecols, names):
        """
        Validates that all usecols are present in a given
        list of names. If not, raise a ValueError that
        shows what usecols are missing.

        Parameters
        ----------
        usecols : iterable of usecols
            The columns to validate are present in names.
        names : iterable of names
            The column names to check against.

        Returns
        -------
        usecols : iterable of usecols
            The `usecols` parameter if the validation succeeds.

        Raises
        ------
        ValueError : Columns were missing. Error message will list them.
        """
        missing = [c for c in usecols if c not in names]
        if len(missing) > 0:
            raise ValueError(
                f"Usecols do not match columns, columns expected but not found: "
                f"{missing}"
            )

        return usecols

    def _validate_usecols_arg(self, usecols):
        """
        Validate the 'usecols' parameter.

        Checks whether or not the 'usecols' parameter contains all integers
        (column selection by index), strings (column by name) or is a callable.
        Raises a ValueError if that is not the case.

        Parameters
        ----------
        usecols : list-like, callable, or None
            List of columns to use when parsing or a callable that can be used
            to filter a list of table columns.

        Returns
        -------
        usecols_tuple : tuple
            A tuple of (verified_usecols, usecols_dtype).

            'verified_usecols' is either a set if an array-like is passed in or
            'usecols' if a callable or None is passed in.

            'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
            is passed in or None if a callable or None is passed in.
        """
        msg = (
            "'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable."
        )
        if usecols is not None:
            if callable(usecols):
                return usecols, None

            if not is_list_like(usecols):
                # see gh-20529
                #
                # Ensure it is iterable container but not string.
                raise ValueError(msg)

            usecols_dtype = lib.infer_dtype(usecols, skipna=False)

            if usecols_dtype not in ("empty", "integer", "string"):
                raise ValueError(msg)

            usecols = set(usecols)

            return usecols, usecols_dtype
        return usecols, None

    def _clean_index_names(self, columns, index_col):
        """Resolve index_col entries to positions and pull their names out."""
        if not is_index_col(index_col):
            return None, columns, index_col

        columns = list(columns)

        # In case of no rows and multiindex columns we have to set index_names to
        # list of Nones GH#38292
        if not columns:
            return [None] * len(index_col), columns, index_col

        cp_cols = list(columns)
        index_names: list[str | int | None] = []

        # don't mutate
        index_col = list(index_col)

        for i, c in enumerate(index_col):
            if isinstance(c, str):
                index_names.append(c)
                for j, name in enumerate(cp_cols):
                    if name == c:
                        index_col[i] = j
                        columns.remove(name)
                        break
            else:
                name = cp_cols[c]
                columns.remove(name)
                index_names.append(name)

        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None

        return index_names, columns, index_col

    def _get_empty_meta(
        self, columns, index_col, index_names, dtype: DtypeArg | None = None
    ):
        """Build an (index, columns, col_dict) triple for a zero-row result."""
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        if (index_col is None or index_col is False) or index_names is None:
            index = Index([])
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
        }

        return index, columns, col_dict
def _make_date_converter(
    date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
):
    """
    Return a converter callable that parses one or more date columns.

    The converter accepts the raw column arrays as positional arguments and
    returns the parsed datetime values. The fallback chain below is
    order-sensitive and deliberately best-effort (errors="ignore").
    """

    def converter(*date_cols):
        if date_parser is None:
            # No user parser: concatenate the columns into strings and let
            # to_datetime infer the format.
            strs = parsing.concat_date_cols(date_cols)

            try:
                return tools.to_datetime(
                    ensure_object(strs),
                    utc=None,
                    dayfirst=dayfirst,
                    errors="ignore",
                    infer_datetime_format=infer_datetime_format,
                    cache=cache_dates,
                ).to_numpy()

            except ValueError:
                # fall back to element-wise parsing of the concatenated strings
                return tools.to_datetime(
                    parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
                )
        else:
            try:
                # first try: pass the raw column arrays straight to the user parser
                result = tools.to_datetime(
                    date_parser(*date_cols), errors="ignore", cache=cache_dates
                )
                if isinstance(result, datetime.datetime):
                    # a scalar means the parser does not accept arrays;
                    # trigger the element-wise fallback below
                    raise Exception("scalar parser")
                return result
            except Exception:
                try:
                    # second try: apply the user parser element-wise over the
                    # concatenated string representation
                    return tools.to_datetime(
                        parsing.try_parse_dates(
                            parsing.concat_date_cols(date_cols),
                            parser=date_parser,
                            dayfirst=dayfirst,
                        ),
                        errors="ignore",
                    )
                except Exception:
                    # last resort: generic row-by-row invocation of date_parser
                    return generic_parser(date_parser, *date_cols)

    return converter
# Default values for every keyword the parser engines understand; individual
# read_* front-ends override entries from user-supplied arguments.
parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "prefix": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "squeeze": None,
    "compression": None,
    "mangle_dupe_cols": True,
    "infer_datetime_format": False,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "error_bad_lines": None,
    "warn_bad_lines": None,
}
1148def _process_date_conversion(
1149 data_dict,
1150 converter: Callable,
1151 parse_spec,
1152 index_col,
1153 index_names,
1154 columns,
1155 keep_date_col: bool = False,
1156):
1157 def _isindex(colspec):
1158 return (isinstance(index_col, list) and colspec in index_col) or (
1159 isinstance(index_names, list) and colspec in index_names
1160 )
1162 new_cols = []
1163 new_data = {}
1165 orig_names = columns
1166 columns = list(columns)
1168 date_cols = set()
1170 if parse_spec is None or isinstance(parse_spec, bool):
1171 return data_dict, columns
1173 if isinstance(parse_spec, list):
1174 # list of column lists
1175 for colspec in parse_spec:
1176 if is_scalar(colspec) or isinstance(colspec, tuple):
1177 if isinstance(colspec, int) and colspec not in data_dict:
1178 colspec = orig_names[colspec]
1179 if _isindex(colspec):
1180 continue
1181 # Pyarrow engine returns Series which we need to convert to
1182 # numpy array before converter, its a no-op for other parsers
1183 data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
1184 else:
1185 new_name, col, old_names = _try_convert_dates(
1186 converter, colspec, data_dict, orig_names
1187 )
1188 if new_name in data_dict:
1189 raise ValueError(f"New date column already in dict {new_name}")
1190 new_data[new_name] = col
1191 new_cols.append(new_name)
1192 date_cols.update(old_names)
1194 elif isinstance(parse_spec, dict):
1195 # dict of new name to column list
1196 for new_name, colspec in parse_spec.items():
1197 if new_name in data_dict:
1198 raise ValueError(f"Date column {new_name} already in dict")
1200 _, col, old_names = _try_convert_dates(
1201 converter, colspec, data_dict, orig_names
1202 )
1204 new_data[new_name] = col
1206 # If original column can be converted to date we keep the converted values
1207 # This can only happen if values are from single column
1208 if len(colspec) == 1:
1209 new_data[colspec[0]] = col
1211 new_cols.append(new_name)
1212 date_cols.update(old_names)
1214 data_dict.update(new_data)
1215 new_cols.extend(columns)
1217 if not keep_date_col:
1218 for c in list(date_cols):
1219 data_dict.pop(c)
1220 new_cols.remove(c)
1222 return data_dict, new_cols
1225def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
1226 colset = set(columns)
1227 colnames = []
1229 for c in colspec:
1230 if c in colset:
1231 colnames.append(c)
1232 elif isinstance(c, int) and c not in columns:
1233 colnames.append(columns[c])
1234 else:
1235 colnames.append(c)
1237 new_name: tuple | str
1238 if all(isinstance(x, tuple) for x in colnames):
1239 new_name = tuple(map("_".join, zip(*colnames)))
1240 else:
1241 new_name = "_".join([str(x) for x in colnames])
1242 to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]
1244 new_col = parser(*to_parse)
1245 return new_name, new_col, colnames
1248def _get_na_values(col, na_values, na_fvalues, keep_default_na):
1249 """
1250 Get the NaN values for a given column.
1252 Parameters
1253 ----------
1254 col : str
1255 The name of the column.
1256 na_values : array-like, dict
1257 The object listing the NaN values as strings.
1258 na_fvalues : array-like, dict
1259 The object listing the NaN values as floats.
1260 keep_default_na : bool
1261 If `na_values` is a dict, and the column is not mapped in the
1262 dictionary, whether to return the default NaN values or the empty set.
1264 Returns
1265 -------
1266 nan_tuple : A length-two tuple composed of
1268 1) na_values : the string NaN values for that column.
1269 2) na_fvalues : the float NaN values for that column.
1270 """
1271 if isinstance(na_values, dict):
1272 if col in na_values:
1273 return na_values[col], na_fvalues[col]
1274 else:
1275 if keep_default_na:
1276 return STR_NA_VALUES, set()
1278 return set(), set()
1279 else:
1280 return na_values, na_fvalues
1283def _is_potential_multi_index(
1284 columns: Sequence[Hashable] | MultiIndex,
1285 index_col: bool | Sequence[int] | None = None,
1286) -> bool:
1287 """
1288 Check whether or not the `columns` parameter
1289 could be converted into a MultiIndex.
1291 Parameters
1292 ----------
1293 columns : array-like
1294 Object which may or may not be convertible into a MultiIndex
1295 index_col : None, bool or list, optional
1296 Column or columns to use as the (possibly hierarchical) index
1298 Returns
1299 -------
1300 bool : Whether or not columns could become a MultiIndex
1301 """
1302 if index_col is None or isinstance(index_col, bool):
1303 index_col = []
1305 return bool(
1306 len(columns)
1307 and not isinstance(columns, MultiIndex)
1308 and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
1309 )
1312def _validate_parse_dates_arg(parse_dates):
1313 """
1314 Check whether or not the 'parse_dates' parameter
1315 is a non-boolean scalar. Raises a ValueError if
1316 that is the case.
1317 """
1318 msg = (
1319 "Only booleans, lists, and dictionaries are accepted "
1320 "for the 'parse_dates' parameter"
1321 )
1323 if parse_dates is not None:
1324 if is_scalar(parse_dates):
1325 if not lib.is_bool(parse_dates):
1326 raise TypeError(msg)
1328 elif not isinstance(parse_dates, (list, dict)):
1329 raise TypeError(msg)
1331 return parse_dates
def is_index_col(col) -> bool:
    """Return True when ``col`` is a usable index_col specification
    (i.e. anything other than None or the literal False)."""
    return not (col is None or col is False)