Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py: 11%

559 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3from collections import defaultdict 

4from copy import copy 

5import csv 

6import datetime 

7from enum import Enum 

8import itertools 

9from typing import ( 

10 TYPE_CHECKING, 

11 Any, 

12 Callable, 

13 DefaultDict, 

14 Hashable, 

15 Iterable, 

16 List, 

17 Mapping, 

18 Sequence, 

19 Tuple, 

20 cast, 

21 final, 

22 overload, 

23) 

24import warnings 

25 

26import numpy as np 

27 

28import pandas._libs.lib as lib 

29import pandas._libs.ops as libops 

30import pandas._libs.parsers as parsers 

31from pandas._libs.parsers import STR_NA_VALUES 

32from pandas._libs.tslibs import parsing 

33from pandas._typing import ( 

34 ArrayLike, 

35 DtypeArg, 

36 Scalar, 

37) 

38from pandas.errors import ( 

39 ParserError, 

40 ParserWarning, 

41) 

42from pandas.util._exceptions import find_stack_level 

43 

44from pandas.core.dtypes.astype import astype_nansafe 

45from pandas.core.dtypes.common import ( 

46 ensure_object, 

47 is_bool_dtype, 

48 is_categorical_dtype, 

49 is_dict_like, 

50 is_dtype_equal, 

51 is_extension_array_dtype, 

52 is_integer, 

53 is_integer_dtype, 

54 is_list_like, 

55 is_object_dtype, 

56 is_scalar, 

57 is_string_dtype, 

58 pandas_dtype, 

59) 

60from pandas.core.dtypes.dtypes import CategoricalDtype 

61from pandas.core.dtypes.missing import isna 

62 

63from pandas.core import algorithms 

64from pandas.core.arrays import Categorical 

65from pandas.core.indexes.api import ( 

66 Index, 

67 MultiIndex, 

68 ensure_index_from_sequences, 

69) 

70from pandas.core.series import Series 

71from pandas.core.tools import datetimes as tools 

72 

73from pandas.io.date_converters import generic_parser 

74 

75if TYPE_CHECKING: 75 ↛ 76line 75 didn't jump to line 76, because the condition on line 75 was never true

76 from pandas import DataFrame 

77 

78 

class ParserBase:
    # Shared base for the parser engines: stores the common parsing options
    # (dates, NA handling, dtypes, usecols, ...) and implements the shared
    # column/index post-processing logic.

    class BadLineHandleMethod(Enum):
        # How to react to a malformed data row.
        ERROR = 0
        WARN = 1
        SKIP = 2

    # True when the index is implied by the data layout rather than given
    # explicitly via ``index_col`` — TODO confirm against engine subclasses.
    _implicit_index: bool = False
    # True until the first chunk of data has been processed (set in __init__).
    _first_chunk: bool

    def __init__(self, kwds) -> None:
        """
        Capture the parsing options shared by all engines from ``kwds``.

        NOTE: ``kwds.pop`` is used for options consumed only here, while
        ``kwds.get`` leaves the option in place (presumably for engine-specific
        initialization — verify against subclasses).
        """
        self.names = kwds.get("names")
        self.orig_names: list | None = None
        self.prefix = kwds.pop("prefix", None)

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names = None

        # Date-parsing options; parse_dates is normalized/validated up front.
        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        # NA handling options.
        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        # Copy so later mutation of our dtype mapping does not leak back
        # into the caller's kwds.
        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
        self.cache_dates = kwds.pop("cache_dates", True)

        # Single callable used for every date conversion in this parser.
        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            dayfirst=self.dayfirst,
            infer_datetime_format=self.infer_datetime_format,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            # A list-like header means a MultiIndex header: usecols/names are
            # incompatible with it.
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate index_col that only contains integers
            if self.index_col is not None:
                if not (
                    is_list_like(self.index_col, allow_sets=False)
                    and all(map(is_integer, self.index_col))
                    or is_integer(self.index_col)
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )
        elif self.header is not None and self.prefix is not None:
            # GH 27394
            raise ValueError(
                "Argument prefix must be None if argument header is not None"
            )

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
        # Normally, this arg would get pre-processed earlier on
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

165 def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable: 

166 """ 

167 Check if parse_dates are in columns. 

168 

169 If user has provided names for parse_dates, check if those columns 

170 are available. 

171 

172 Parameters 

173 ---------- 

174 columns : list 

175 List of names of the dataframe. 

176 

177 Returns 

178 ------- 

179 The names of the columns which will get parsed later if a dict or list 

180 is given as specification. 

181 

182 Raises 

183 ------ 

184 ValueError 

185 If column to parse_date is not in dataframe. 

186 

187 """ 

188 cols_needed: Iterable 

189 if is_dict_like(self.parse_dates): 

190 cols_needed = itertools.chain(*self.parse_dates.values()) 

191 elif is_list_like(self.parse_dates): 

192 # a column in parse_dates could be represented 

193 # ColReference = Union[int, str] 

194 # DateGroups = List[ColReference] 

195 # ParseDates = Union[DateGroups, List[DateGroups], 

196 # Dict[ColReference, DateGroups]] 

197 cols_needed = itertools.chain.from_iterable( 

198 col if is_list_like(col) and not isinstance(col, tuple) else [col] 

199 for col in self.parse_dates 

200 ) 

201 else: 

202 cols_needed = [] 

203 

204 cols_needed = list(cols_needed) 

205 

206 # get only columns that are references using names (str), not by index 

207 missing_cols = ", ".join( 

208 sorted( 

209 { 

210 col 

211 for col in cols_needed 

212 if isinstance(col, str) and col not in columns 

213 } 

214 ) 

215 ) 

216 if missing_cols: 

217 raise ValueError( 

218 f"Missing column provided to 'parse_dates': '{missing_cols}'" 

219 ) 

220 # Convert positions to actual column names 

221 return [ 

222 col if (isinstance(col, str) or col in columns) else columns[col] 

223 for col in cols_needed 

224 ] 

225 

226 def close(self) -> None: 

227 pass 

228 

229 @final 

230 @property 

231 def _has_complex_date_col(self) -> bool: 

232 return isinstance(self.parse_dates, dict) or ( 

233 isinstance(self.parse_dates, list) 

234 and len(self.parse_dates) > 0 

235 and isinstance(self.parse_dates[0], list) 

236 ) 

237 

238 @final 

239 def _should_parse_dates(self, i: int) -> bool: 

240 if isinstance(self.parse_dates, bool): 

241 return self.parse_dates 

242 else: 

243 if self.index_names is not None: 

244 name = self.index_names[i] 

245 else: 

246 name = None 

247 j = i if self.index_col is None else self.index_col[i] 

248 

249 if is_scalar(self.parse_dates): 

250 return (j == self.parse_dates) or ( 

251 name is not None and name == self.parse_dates 

252 ) 

253 else: 

254 return (j in self.parse_dates) or ( 

255 name is not None and name in self.parse_dates 

256 ) 

257 

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: list | None,
        passed_names: bool = False,
    ):
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header: list of lists
            The header rows
        index_names: list, optional
            The names of the future index
        passed_names: bool, default False
            A flag specifying if names where passed

        """
        # A single header row cannot produce a MultiIndex: pass through.
        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        # NOTE: mutates ``header`` — the last row is consumed as index names.
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        # Collect, per row, the entries that are not index columns.
        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        # zip the per-row entries into per-column tuples.
        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        # Re-insert placeholders at the index positions so positions line up.
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            # Per header row, take the entry over the first index column as
            # the level name, unless it is missing/unnamed.
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

326 

327 @final 

328 def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: 

329 # see gh-7160 and gh-9424: this helps to provide 

330 # immediate alleviation of the duplicate names 

331 # issue and appears to be satisfactory to users, 

332 # but ultimately, not needing to butcher the names 

333 # would be nice! 

334 if self.mangle_dupe_cols: 

335 names = list(names) # so we can index 

336 counts: DefaultDict[Hashable, int] = defaultdict(int) 

337 is_potential_mi = _is_potential_multi_index(names, self.index_col) 

338 

339 for i, col in enumerate(names): 

340 cur_count = counts[col] 

341 

342 while cur_count > 0: 

343 counts[col] = cur_count + 1 

344 

345 if is_potential_mi: 

346 # for mypy 

347 assert isinstance(col, tuple) 

348 col = col[:-1] + (f"{col[-1]}.{cur_count}",) 

349 else: 

350 col = f"{col}.{cur_count}" 

351 cur_count = counts[col] 

352 

353 names[i] = col 

354 counts[col] = cur_count + 1 

355 

356 return names 

357 

358 @final 

359 def _maybe_make_multi_index_columns( 

360 self, 

361 columns: Sequence[Hashable], 

362 col_names: Sequence[Hashable] | None = None, 

363 ) -> Sequence[Hashable] | MultiIndex: 

364 # possibly create a column mi here 

365 if _is_potential_multi_index(columns): 

366 list_columns = cast(List[Tuple], columns) 

367 return MultiIndex.from_tuples(list_columns, names=col_names) 

368 return columns 

369 

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        """
        Build the row index from the parsed data and finalize the columns
        (possibly as a MultiIndex).  Returns ``(index, columns)``; index is
        None when no index_col was requested.
        """
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            # Plain positional index columns.
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            # Date columns were combined; dates are already parsed, so skip
            # date parsing in _agg_index below.
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

400 

401 @final 

402 def _get_simple_index(self, data, columns): 

403 def ix(col): 

404 if not isinstance(col, str): 

405 return col 

406 raise ValueError(f"Index {col} invalid") 

407 

408 to_remove = [] 

409 index = [] 

410 for idx in self.index_col: 

411 i = ix(idx) 

412 to_remove.append(i) 

413 index.append(data[i]) 

414 

415 # remove index items from content and columns, don't pop in 

416 # loop 

417 for i in sorted(to_remove, reverse=True): 

418 data.pop(i) 

419 if not self._implicit_index: 

420 columns.pop(i) 

421 

422 return index 

423 

424 @final 

425 def _get_complex_date_index(self, data, col_names): 

426 def _get_name(icol): 

427 if isinstance(icol, str): 

428 return icol 

429 

430 if col_names is None: 

431 raise ValueError(f"Must supply column order to use {icol!s} as index") 

432 

433 for i, c in enumerate(col_names): 

434 if i == icol: 

435 return c 

436 

437 to_remove = [] 

438 index = [] 

439 for idx in self.index_col: 

440 name = _get_name(idx) 

441 to_remove.append(name) 

442 index.append(data[name]) 

443 

444 # remove index items from content and columns, don't pop in 

445 # loop 

446 for c in sorted(to_remove, reverse=True): 

447 data.pop(c) 

448 col_names.remove(c) 

449 

450 return index 

451 

452 def _clean_mapping(self, mapping): 

453 """converts col numbers to names""" 

454 if not isinstance(mapping, dict): 

455 return mapping 

456 clean = {} 

457 # for mypy 

458 assert self.orig_names is not None 

459 

460 for col, v in mapping.items(): 

461 if isinstance(col, int) and col not in self.orig_names: 

462 col = self.orig_names[col] 

463 clean[col] = v 

464 if isinstance(mapping, defaultdict): 

465 remaining_cols = set(self.orig_names) - set(clean.keys()) 

466 clean.update({col: mapping[col] for col in remaining_cols}) 

467 return clean 

468 

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        """
        Convert the raw index arrays (date-parse, NA-filter, type-infer each
        level) and assemble them into an Index / MultiIndex.
        """
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            # Column-independent NA values; refined per-column below when
            # na_values is a dict.
            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            # Skip numeric/bool inference when the user requested a string
            # dtype or supplied a converter for this level.
            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

518 

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        """
        Convert each raw column in ``dct`` to an ndarray: apply converters,
        replace NA values, infer types, and cast to any requested dtype.
        Returns a dict of column name -> converted array.
        """
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    # Retry, skipping entries that are known NA values.
                    # error: Argument 2 to "isin" has incompatible type "List[Any]";
                    # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                    mask = algorithms.isin(
                        values, list(na_values)  # type: ignore[arg-type]
                    ).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                # Converter output is trusted: no numeric/bool inference here.
                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool=False
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type and (
                    not is_dtype_equal(cvals, cast_type)
                    or is_extension_array_dtype(cast_type)
                ):
                    if not is_ea and na_count > 0:
                        try:
                            if is_bool_dtype(cast_type):
                                raise ValueError(
                                    f"Bool column has NA values in column {c}"
                                )
                        except (AttributeError, TypeError):
                            # invalid input to is_bool_dtype
                            pass
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result

611 

612 @final 

613 def _set_noconvert_dtype_columns( 

614 self, col_indices: list[int], names: Sequence[Hashable] 

615 ) -> set[int]: 

616 """ 

617 Set the columns that should not undergo dtype conversions. 

618 

619 Currently, any column that is involved with date parsing will not 

620 undergo such conversions. If usecols is specified, the positions of the columns 

621 not to cast is relative to the usecols not to all columns. 

622 

623 Parameters 

624 ---------- 

625 col_indices: The indices specifying order and positions of the columns 

626 names: The column names which order is corresponding with the order 

627 of col_indices 

628 

629 Returns 

630 ------- 

631 A set of integers containing the positions of the columns not to convert. 

632 """ 

633 usecols: list[int] | list[str] | None 

634 noconvert_columns = set() 

635 if self.usecols_dtype == "integer": 

636 # A set of integers will be converted to a list in 

637 # the correct order every single time. 

638 usecols = sorted(self.usecols) 

639 elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): 

640 # The names attribute should have the correct columns 

641 # in the proper order for indexing with parse_dates. 

642 usecols = col_indices 

643 else: 

644 # Usecols is empty. 

645 usecols = None 

646 

647 def _set(x) -> int: 

648 if usecols is not None and is_integer(x): 

649 x = usecols[x] 

650 

651 if not is_integer(x): 

652 x = col_indices[names.index(x)] 

653 

654 return x 

655 

656 if isinstance(self.parse_dates, list): 

657 for val in self.parse_dates: 

658 if isinstance(val, list): 

659 for k in val: 

660 noconvert_columns.add(_set(k)) 

661 else: 

662 noconvert_columns.add(_set(val)) 

663 

664 elif isinstance(self.parse_dates, dict): 

665 for val in self.parse_dates.values(): 

666 if isinstance(val, list): 

667 for k in val: 

668 noconvert_columns.add(_set(k)) 

669 else: 

670 noconvert_columns.add(_set(val)) 

671 

672 elif self.parse_dates: 

673 if isinstance(self.index_col, list): 

674 for k in self.index_col: 

675 noconvert_columns.add(_set(k)) 

676 elif self.index_col is not None: 

677 noconvert_columns.add(_set(self.index_col)) 

678 

679 return noconvert_columns 

680 

    def _infer_types(self, values, na_values, try_num_bool=True):
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        try_num_bool : bool, default try
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                # int arrays cannot hold NaN: upcast to float first.
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g DatetimeIndex here
            try:
                result, _ = lib.maybe_convert_numeric(values, na_values, False)
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                result = values
                # NOTE: sanitize_objects replaces NA strings in place and
                # returns the count.
                na_count = parsers.sanitize_objects(result, na_values)
            else:
                na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            # Last resort: try mapping true_values/false_values to booleans.
            result, _ = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
            )

        return result, na_count

733 

    def _cast_types(self, values, cast_type, column):
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray
        cast_type : string or np.dtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray
        """
        if is_categorical_dtype(cast_type):
            known_cats = (
                isinstance(cast_type, CategoricalDtype)
                and cast_type.categories is not None
            )

            if not is_object_dtype(values) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings

                values = astype_nansafe(values, np.dtype(str))

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif is_extension_array_dtype(cast_type):
            # ensure cast_type is an actual dtype and not a string
            cast_type = pandas_dtype(cast_type)
            array_type = cast_type.construct_array_type()
            try:
                if is_bool_dtype(cast_type):
                    # Boolean EAs need the configured true/false string values.
                    return array_type._from_sequence_of_strings(
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        else:
            try:
                values = astype_nansafe(values, cast_type, copy=True, skipna=True)
            except ValueError as err:
                # Re-raise with the column name for a usable error message.
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values

797 

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        """
        Apply the configured ``parse_dates`` conversions to ``data`` and
        return the (possibly updated) names and data.
        """
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
            )

        return names, data

833 

834 def _check_data_length( 

835 self, 

836 columns: Sequence[Hashable], 

837 data: Sequence[ArrayLike], 

838 ) -> None: 

839 """Checks if length of data is equal to length of column names. 

840 

841 One set of trailing commas is allowed. self.index_col not False 

842 results in a ParserError previously when lengths do not match. 

843 

844 Parameters 

845 ---------- 

846 columns: list of column names 

847 data: list of array-likes containing the data column-wise. 

848 """ 

849 if not self.index_col and len(columns) != len(data) and columns: 

850 empty_str = is_object_dtype(data[-1]) and data[-1] == "" 

851 # error: No overload variant of "__ror__" of "ndarray" matches 

852 # argument type "ExtensionArray" 

853 empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator] 

854 if len(columns) == len(data) - 1 and np.all(empty_str_or_na): 

855 return 

856 warnings.warn( 

857 "Length of header or names does not match length of data. This leads " 

858 "to a loss of data with index_col=False.", 

859 ParserWarning, 

860 stacklevel=find_stack_level(), 

861 ) 

862 

863 @overload 

864 def _evaluate_usecols( 

865 self, 

866 usecols: set[int] | Callable[[Hashable], object], 

867 names: Sequence[Hashable], 

868 ) -> set[int]: 

869 ... 

870 

871 @overload 

872 def _evaluate_usecols( 

873 self, usecols: set[str], names: Sequence[Hashable] 

874 ) -> set[str]: 

875 ... 

876 

877 def _evaluate_usecols( 

878 self, 

879 usecols: Callable[[Hashable], object] | set[str] | set[int], 

880 names: Sequence[Hashable], 

881 ) -> set[str] | set[int]: 

882 """ 

883 Check whether or not the 'usecols' parameter 

884 is a callable. If so, enumerates the 'names' 

885 parameter and returns a set of indices for 

886 each entry in 'names' that evaluates to True. 

887 If not a callable, returns 'usecols'. 

888 """ 

889 if callable(usecols): 

890 return {i for i, name in enumerate(names) if usecols(name)} 

891 return usecols 

892 

893 def _validate_usecols_names(self, usecols, names): 

894 """ 

895 Validates that all usecols are present in a given 

896 list of names. If not, raise a ValueError that 

897 shows what usecols are missing. 

898 

899 Parameters 

900 ---------- 

901 usecols : iterable of usecols 

902 The columns to validate are present in names. 

903 names : iterable of names 

904 The column names to check against. 

905 

906 Returns 

907 ------- 

908 usecols : iterable of usecols 

909 The `usecols` parameter if the validation succeeds. 

910 

911 Raises 

912 ------ 

913 ValueError : Columns were missing. Error message will list them. 

914 """ 

915 missing = [c for c in usecols if c not in names] 

916 if len(missing) > 0: 

917 raise ValueError( 

918 f"Usecols do not match columns, columns expected but not found: " 

919 f"{missing}" 

920 ) 

921 

922 return usecols 

923 

924 def _validate_usecols_arg(self, usecols): 

925 """ 

926 Validate the 'usecols' parameter. 

927 

928 Checks whether or not the 'usecols' parameter contains all integers 

929 (column selection by index), strings (column by name) or is a callable. 

930 Raises a ValueError if that is not the case. 

931 

932 Parameters 

933 ---------- 

934 usecols : list-like, callable, or None 

935 List of columns to use when parsing or a callable that can be used 

936 to filter a list of table columns. 

937 

938 Returns 

939 ------- 

940 usecols_tuple : tuple 

941 A tuple of (verified_usecols, usecols_dtype). 

942 

943 'verified_usecols' is either a set if an array-like is passed in or 

944 'usecols' if a callable or None is passed in. 

945 

946 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like 

947 is passed in or None if a callable or None is passed in. 

948 """ 

949 msg = ( 

950 "'usecols' must either be list-like of all strings, all unicode, " 

951 "all integers or a callable." 

952 ) 

953 if usecols is not None: 

954 if callable(usecols): 

955 return usecols, None 

956 

957 if not is_list_like(usecols): 

958 # see gh-20529 

959 # 

960 # Ensure it is iterable container but not string. 

961 raise ValueError(msg) 

962 

963 usecols_dtype = lib.infer_dtype(usecols, skipna=False) 

964 

965 if usecols_dtype not in ("empty", "integer", "string"): 

966 raise ValueError(msg) 

967 

968 usecols = set(usecols) 

969 

970 return usecols, usecols_dtype 

971 return usecols, None 

972 

973 def _clean_index_names(self, columns, index_col): 

974 if not is_index_col(index_col): 

975 return None, columns, index_col 

976 

977 columns = list(columns) 

978 

979 # In case of no rows and multiindex columns we have to set index_names to 

980 # list of Nones GH#38292 

981 if not columns: 

982 return [None] * len(index_col), columns, index_col 

983 

984 cp_cols = list(columns) 

985 index_names: list[str | int | None] = [] 

986 

987 # don't mutate 

988 index_col = list(index_col) 

989 

990 for i, c in enumerate(index_col): 

991 if isinstance(c, str): 

992 index_names.append(c) 

993 for j, name in enumerate(cp_cols): 

994 if name == c: 

995 index_col[i] = j 

996 columns.remove(name) 

997 break 

998 else: 

999 name = cp_cols[c] 

1000 columns.remove(name) 

1001 index_names.append(name) 

1002 

1003 # Only clean index names that were placeholders. 

1004 for i, name in enumerate(index_names): 

1005 if isinstance(name, str) and name in self.unnamed_cols: 

1006 index_names[i] = None 

1007 

1008 return index_names, columns, index_col 

1009 

1010 def _get_empty_meta( 

1011 self, columns, index_col, index_names, dtype: DtypeArg | None = None 

1012 ): 

1013 columns = list(columns) 

1014 

1015 # Convert `dtype` to a defaultdict of some kind. 

1016 # This will enable us to write `dtype[col_name]` 

1017 # without worrying about KeyError issues later on. 

1018 dtype_dict: defaultdict[Hashable, Any] 

1019 if not is_dict_like(dtype): 

1020 # if dtype == None, default will be object. 

1021 default_dtype = dtype or object 

1022 dtype_dict = defaultdict(lambda: default_dtype) 

1023 else: 

1024 dtype = cast(dict, dtype) 

1025 dtype_dict = defaultdict( 

1026 lambda: object, 

1027 {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, 

1028 ) 

1029 

1030 # Even though we have no data, the "index" of the empty DataFrame 

1031 # could for example still be an empty MultiIndex. Thus, we need to 

1032 # check whether we have any index columns specified, via either: 

1033 # 

1034 # 1) index_col (column indices) 

1035 # 2) index_names (column names) 

1036 # 

1037 # Both must be non-null to ensure a successful construction. Otherwise, 

1038 # we have to create a generic empty Index. 

1039 if (index_col is None or index_col is False) or index_names is None: 

1040 index = Index([]) 

1041 else: 

1042 data = [Series([], dtype=dtype_dict[name]) for name in index_names] 

1043 index = ensure_index_from_sequences(data, names=index_names) 

1044 index_col.sort() 

1045 

1046 for i, n in enumerate(index_col): 

1047 columns.pop(n - i) 

1048 

1049 col_dict = { 

1050 col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns 

1051 } 

1052 

1053 return index, columns, col_dict 

1054 

1055 

def _make_date_converter(
    date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
):
    """
    Build the callable used to convert one or more raw columns into datetimes.

    Parameters
    ----------
    date_parser : callable, optional
        User-supplied parser. When None, ``to_datetime`` does all parsing.
    dayfirst : bool, default False
        Treat ambiguous dates as day-first.
    infer_datetime_format : bool, default False
        Let ``to_datetime`` try to infer a single format for speed.
    cache_dates : bool, default True
        Cache converted values to avoid re-parsing duplicates.

    Returns
    -------
    callable
        ``converter(*date_cols)`` returning an array-like of datetimes.
    """

    def converter(*date_cols):
        if date_parser is None:
            # No user parser: join the columns into one string array and
            # let to_datetime infer everything.
            strs = parsing.concat_date_cols(date_cols)

            try:
                return tools.to_datetime(
                    ensure_object(strs),
                    utc=None,
                    dayfirst=dayfirst,
                    errors="ignore",
                    infer_datetime_format=infer_datetime_format,
                    cache=cache_dates,
                ).to_numpy()

            except ValueError:
                # Fall back to the slower element-wise parsing path.
                return tools.to_datetime(
                    parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
                )
        else:
            try:
                result = tools.to_datetime(
                    date_parser(*date_cols), errors="ignore", cache=cache_dates
                )
                if isinstance(result, datetime.datetime):
                    # A scalar result means the parser only handled a single
                    # value; raising here deliberately jumps to the fallback
                    # paths below, which apply the parser element-wise.
                    raise Exception("scalar parser")
                return result
            except Exception:
                try:
                    # Second attempt: element-wise parse of the concatenated
                    # string columns using the user parser.
                    return tools.to_datetime(
                        parsing.try_parse_dates(
                            parsing.concat_date_cols(date_cols),
                            parser=date_parser,
                            dayfirst=dayfirst,
                        ),
                        errors="ignore",
                    )
                except Exception:
                    # Last resort: generic row-by-row application.
                    return generic_parser(date_parser, *date_cols)

    return converter

1099 

1100 

# Default values for every keyword accepted by the parser front-ends
# (read_csv / read_table). Engine implementations and kwarg validation
# compare user-supplied options against these defaults.
parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "prefix": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "squeeze": None,
    "compression": None,
    "mangle_dupe_cols": True,
    "infer_datetime_format": False,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "error_bad_lines": None,
    "warn_bad_lines": None,
}

1146 

1147 

def _process_date_conversion(
    data_dict,
    converter: Callable,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col: bool = False,
):
    """
    Apply `converter` to the columns selected by `parse_spec`, inserting the
    converted (and possibly combined) date columns into `data_dict`.

    Parameters
    ----------
    data_dict : dict
        Mapping of column name -> column values; mutated in place.
    converter : Callable
        Date conversion function (see ``_make_date_converter``).
    parse_spec : None, bool, list or dict
        The user's ``parse_dates`` argument: list of columns / column groups,
        or dict mapping a new column name -> list of source columns.
        None or a bool means no per-column conversion here.
    index_col, index_names
        Index specification; index columns are skipped (converted elsewhere).
    columns : sequence
        Current column order.
    keep_date_col : bool, default False
        When False, source columns consumed by a multi-column date spec are
        removed from the result.

    Returns
    -------
    tuple
        (data_dict, new column-name list).
    """

    def _isindex(colspec):
        # True when the spec refers to an index column (by position or name).
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    # Source columns consumed by multi-column specs (removal candidates).
    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec) or isinstance(colspec, tuple):
                # Single column: convert in place under its label.
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                # Pyarrow engine returns Series which we need to convert to
                # numpy array before converter, its a no-op for other parsers
                data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
            else:
                # Group of columns combined into one new date column.
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter, colspec, data_dict, orig_names
            )

            new_data[new_name] = col

            # If original column can be converted to date we keep the converted values
            # This can only happen if values are from single column
            if len(colspec) == 1:
                new_data[colspec[0]] = col

            new_cols.append(new_name)
            date_cols.update(old_names)

    data_dict.update(new_data)
    # Newly created date columns come first, then the original order.
    new_cols.extend(columns)

    if not keep_date_col:
        # Drop the source columns that were merged into new date columns.
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols

1223 

1224 

1225def _try_convert_dates(parser: Callable, colspec, data_dict, columns): 

1226 colset = set(columns) 

1227 colnames = [] 

1228 

1229 for c in colspec: 

1230 if c in colset: 

1231 colnames.append(c) 

1232 elif isinstance(c, int) and c not in columns: 

1233 colnames.append(columns[c]) 

1234 else: 

1235 colnames.append(c) 

1236 

1237 new_name: tuple | str 

1238 if all(isinstance(x, tuple) for x in colnames): 

1239 new_name = tuple(map("_".join, zip(*colnames))) 

1240 else: 

1241 new_name = "_".join([str(x) for x in colnames]) 

1242 to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict] 

1243 

1244 new_col = parser(*to_parse) 

1245 return new_name, new_col, colnames 

1246 

1247 

1248def _get_na_values(col, na_values, na_fvalues, keep_default_na): 

1249 """ 

1250 Get the NaN values for a given column. 

1251 

1252 Parameters 

1253 ---------- 

1254 col : str 

1255 The name of the column. 

1256 na_values : array-like, dict 

1257 The object listing the NaN values as strings. 

1258 na_fvalues : array-like, dict 

1259 The object listing the NaN values as floats. 

1260 keep_default_na : bool 

1261 If `na_values` is a dict, and the column is not mapped in the 

1262 dictionary, whether to return the default NaN values or the empty set. 

1263 

1264 Returns 

1265 ------- 

1266 nan_tuple : A length-two tuple composed of 

1267 

1268 1) na_values : the string NaN values for that column. 

1269 2) na_fvalues : the float NaN values for that column. 

1270 """ 

1271 if isinstance(na_values, dict): 

1272 if col in na_values: 

1273 return na_values[col], na_fvalues[col] 

1274 else: 

1275 if keep_default_na: 

1276 return STR_NA_VALUES, set() 

1277 

1278 return set(), set() 

1279 else: 

1280 return na_values, na_fvalues 

1281 

1282 

1283def _is_potential_multi_index( 

1284 columns: Sequence[Hashable] | MultiIndex, 

1285 index_col: bool | Sequence[int] | None = None, 

1286) -> bool: 

1287 """ 

1288 Check whether or not the `columns` parameter 

1289 could be converted into a MultiIndex. 

1290 

1291 Parameters 

1292 ---------- 

1293 columns : array-like 

1294 Object which may or may not be convertible into a MultiIndex 

1295 index_col : None, bool or list, optional 

1296 Column or columns to use as the (possibly hierarchical) index 

1297 

1298 Returns 

1299 ------- 

1300 bool : Whether or not columns could become a MultiIndex 

1301 """ 

1302 if index_col is None or isinstance(index_col, bool): 

1303 index_col = [] 

1304 

1305 return bool( 

1306 len(columns) 

1307 and not isinstance(columns, MultiIndex) 

1308 and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) 

1309 ) 

1310 

1311 

1312def _validate_parse_dates_arg(parse_dates): 

1313 """ 

1314 Check whether or not the 'parse_dates' parameter 

1315 is a non-boolean scalar. Raises a ValueError if 

1316 that is the case. 

1317 """ 

1318 msg = ( 

1319 "Only booleans, lists, and dictionaries are accepted " 

1320 "for the 'parse_dates' parameter" 

1321 ) 

1322 

1323 if parse_dates is not None: 

1324 if is_scalar(parse_dates): 

1325 if not lib.is_bool(parse_dates): 

1326 raise TypeError(msg) 

1327 

1328 elif not isinstance(parse_dates, (list, dict)): 

1329 raise TypeError(msg) 

1330 

1331 return parse_dates 

1332 

1333 

def is_index_col(col) -> bool:
    """Return True when `col` actually designates index column(s), i.e. is
    neither None nor False (the two "no index" sentinels)."""
    return not (col is None or col is False)