# Coverage for
# /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/json/_json.py:
# 19% of 498 statements (coverage.py v6.4.4, created at 2023-07-17 14:22 -0600)

from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
from collections import abc
import functools
from io import StringIO
from itertools import islice
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Generic,
    Literal,
    Mapping,
    TypeVar,
    overload,
)

import numpy as np

import pandas._libs.json as json
from pandas._libs.tslibs import iNaT
from pandas._typing import (
    CompressionOptions,
    DtypeArg,
    FilePath,
    IndexLabel,
    JSONSerializable,
    ReadBuffer,
    StorageOptions,
    WriteBuffer,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import (
    deprecate_kwarg,
    deprecate_nonkeyword_arguments,
    doc,
)

from pandas.core.dtypes.common import (
    ensure_str,
    is_period_dtype,
)

from pandas import (
    DataFrame,
    MultiIndex,
    Series,
    isna,
    notna,
    to_datetime,
)
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs

from pandas.io.common import (
    IOHandles,
    _extension_to_compression,
    file_exists,
    get_handle,
    is_fsspec_url,
    is_url,
    stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import (
    build_table_schema,
    parse_table_schema,
)
from pandas.io.parsers.readers import validate_integer

if TYPE_CHECKING:
    from pandas.core.generic import NDFrame

FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])
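
# Note: ``pandas._libs.json`` is pandas' C JSON codec (a vendored ujson
# variant), not the stdlib ``json`` module; its ``loads``/``dumps`` accept the
# pandas-specific keywords (``orient``, ``iso_dates``, ``numpy``, ...) used below.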

loads = json.loads
dumps = json.dumps

# interface to/from
@overload
def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
) -> None:
    ...


@overload
def to_json(
    path_or_buf: None,
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
) -> str:
    ...


def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
    obj: NDFrame,
    orient: str | None = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Callable[[Any], JSONSerializable] | None = None,
    lines: bool = False,
    compression: CompressionOptions = "infer",
    index: bool = True,
    indent: int = 0,
    storage_options: StorageOptions = None,
) -> str | None:

    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'"
        )

    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: type[Writer]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if path_or_buf is not None:
        # apply compression and byte/text conversion
        with get_handle(
            path_or_buf, "w", compression=compression, storage_options=storage_options
        ) as handles:
            handles.handle.write(s)
    else:
        return s
    return None
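
# Example (illustrative): ``to_json(None, df, orient="records", lines=True)``
# serializes ``df`` to newline-delimited JSON, one record per line, and returns
# it as a string, because passing ``path_or_buf=None`` means "return the text".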


class Writer(ABC):
    _default_orient: str

    def __init__(
        self,
        obj,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        self.obj = obj

        if orient is None:
            orient = self._default_orient

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self):
        raise AbstractMethodError(self)

    def write(self) -> str:
        iso_dates = self.date_format == "iso"
        return dumps(
            self.obj_to_write,
            orient=self.orient,
            double_precision=self.double_precision,
            ensure_ascii=self.ensure_ascii,
            date_unit=self.date_unit,
            iso_dates=iso_dates,
            default_handler=self.default_handler,
            indent=self.indent,
        )

    @property
    @abstractmethod
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        """Object to write in JSON format."""
        pass


class SeriesWriter(Writer):
    _default_orient = "index"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
            return {"name": self.obj.name, "data": self.obj.values}
        else:
            return self.obj

    def _format_axes(self):
        if not self.obj.index.is_unique and self.orient == "index":
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")


class FrameWriter(Writer):
    _default_orient = "columns"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
            obj_to_write = self.obj.to_dict(orient="split")
            del obj_to_write["index"]
        else:
            obj_to_write = self.obj
        return obj_to_write

    def _format_axes(self):
        """
        Try to format axes if they are datelike.
        """
        if not self.obj.index.is_unique and self.orient in ("index", "columns"):
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if not self.obj.columns.is_unique and self.orient in (
            "index",
            "columns",
            "records",
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )

class JSONTableWriter(FrameWriter):
    _default_orient = "records"

    def __init__(
        self,
        obj,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        """
        Adds a `schema` attribute with the Table Schema, resets the index
        (can't do in caller, because the schema inference needs to know what
        the index is), forces orient to records, and forces date_format
        to 'iso'.
        """
        super().__init__(
            obj,
            orient,
            date_format,
            double_precision,
            ensure_ascii,
            date_unit,
            index,
            default_handler=default_handler,
            indent=indent,
        )

        if date_format != "iso":
            msg = (
                "Trying to write with `orient='table'` and "
                f"`date_format='{date_format}'`. Table Schema requires dates "
                "to be formatted with `date_format='iso'`"
            )
            raise ValueError(msg)

        self.schema = build_table_schema(obj, index=self.index)

        # NotImplemented on a column MultiIndex
        if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
            raise NotImplementedError(
                "orient='table' is not supported for MultiIndex columns"
            )

        # TODO: Do this timedelta properly in objToJSON.c. See GH#15137.
        if (
            (obj.ndim == 1)
            and (obj.name in set(obj.index.names))
            or len(obj.columns.intersection(obj.index.names))
        ):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=["timedelta"]).columns
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if is_period_dtype(obj.index.dtype):
            obj.index = obj.index.to_timestamp()

        # exclude index from obj if index=False
        if not self.index:
            self.obj = obj.reset_index(drop=True)
        else:
            self.obj = obj.reset_index(drop=False)
        self.date_format = "iso"
        self.orient = "records"
        self.index = index

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        return {"schema": self.schema, "data": self.obj}
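
# For reference: ``obj_to_write`` above serializes as
#     {"schema": {"fields": [...], "primaryKey": [...], "pandas_version": ...},
#      "data": [...]}
# matching the ``orient='table'`` example shown in ``read_json``'s docstring.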


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    numpy: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
) -> JsonReader[Literal["frame"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    numpy: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
) -> JsonReader[Literal["series"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    numpy: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
) -> Series:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    numpy: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame:
    ...

@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
)
@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None)
@deprecate_nonkeyword_arguments(version="2.0", allowed_args=["path_or_buf"])
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    orient: str | None = None,
    typ: Literal["frame", "series"] = "frame",
    dtype: DtypeArg | None = None,
    convert_axes=None,
    convert_dates: bool | list[str] = True,
    keep_default_dates: bool = True,
    numpy: bool = False,
    precise_float: bool = False,
    date_unit: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    lines: bool = False,
    chunksize: int | None = None,
    compression: CompressionOptions = "infer",
    nrows: int | None = None,
    storage_options: StorageOptions = None,
) -> DataFrame | Series | JsonReader:
    """
    Convert a JSON string to a pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.json``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    orient : str
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{{index -> [index], columns -> [columns], data -> [values]}}``
        - ``'records'`` : list like
          ``[{{column -> value}}, ... , {{column -> value}}]``
        - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
        - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{{'split','records','index'}}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{{'split','records','index',
            'columns','values', 'table'}}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : {{'frame', 'series'}}, default 'frame'
        The type of object to recover.

    dtype : bool or dict, default None
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all. Applies only to the data.

        For all ``orient`` values except ``'table'``, default is True.

        .. versionchanged:: 0.25.0

           Not applicable for ``orient='table'``.

    convert_axes : bool, default None
        Try to convert the axes to the proper dtypes.

        For all ``orient`` values except ``'table'``, default is True.

        .. versionchanged:: 0.25.0

           Not applicable for ``orient='table'``.

    convert_dates : bool or list of str, default True
        If True then default datelike columns may be converted (depending on
        keep_default_dates).
        If False, no dates will be converted.
        If a list of column names, then those columns will be converted and
        default datelike columns may also be converted (depending on
        keep_default_dates).

    keep_default_dates : bool, default True
        If parsing dates (convert_dates is not False), then try to parse the
        default datelike columns.
        A column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``,

        * it is ``'date'``, or

        * it is ``'datetime'``.

    numpy : bool, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.

        .. deprecated:: 1.0.0

    precise_float : bool, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality.

    date_unit : str, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

    encoding_errors : str, optional, default "strict"
        How encoding errors are treated. `List of possible values
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

        .. versionadded:: 1.3.0

    lines : bool, default False
        Read the file as a json object per line.

    chunksize : int, optional
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.

        .. versionchanged:: 1.2

           ``JsonReader`` is a context manager.

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    nrows : int, optional
        The number of lines to read from a line-delimited JSON file.
        This can only be passed if `lines=True`.
        If this is None, all the rows will be returned.

        .. versionadded:: 1.1

    {storage_options}

        .. versionadded:: 1.2.0

    Returns
    -------
    Series or DataFrame
        The type returned depends on the value of `typ`.

    See Also
    --------
    DataFrame.to_json : Convert a DataFrame to a JSON string.
    Series.to_json : Convert a Series to a JSON string.
    json_normalize : Normalize semi-structured JSON data into a flat table.

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    Examples
    --------
    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a DataFrame using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '\
{{\
"columns":["col 1","col 2"],\
"index":["row 1","row 2"],\
"data":[["a","b"],["c","d"]]\
}}\
'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'

    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema

    >>> df.to_json(orient='table')
    '\
{{"schema":{{"fields":[\
{{"name":"index","type":"string"}},\
{{"name":"col 1","type":"string"}},\
{{"name":"col 2","type":"string"}}],\
"primaryKey":["index"],\
"pandas_version":"1.4.0"}},\
"data":[\
{{"index":"row 1","col 1":"a","col 2":"b"}},\
{{"index":"row 2","col 1":"c","col 2":"d"}}]\
}}\
'
    """
    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    if dtype is None and orient != "table":
        # error: Incompatible types in assignment (expression has type "bool", variable
        # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
        # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object]]], None]")
        dtype = True  # type: ignore[assignment]
    if convert_axes is None and orient != "table":
        convert_axes = True

    json_reader = JsonReader(
        path_or_buf,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        numpy=numpy,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        nrows=nrows,
        storage_options=storage_options,
        encoding_errors=encoding_errors,
    )

    if chunksize:
        return json_reader

    with json_reader:
        return json_reader.read()
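
# Example (illustrative): reading newline-delimited JSON in chunks; each
# iteration of the JsonReader below yields a DataFrame of up to 1000 rows.
#
#     with read_json("data.jsonl", lines=True, chunksize=1000) as reader:
#         for chunk in reader:
#             ...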


class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ: FrameSeriesStrT,
        dtype,
        convert_axes,
        convert_dates,
        keep_default_dates: bool,
        numpy: bool,
        precise_float: bool,
        date_unit,
        encoding,
        lines: bool,
        chunksize: int | None,
        compression: CompressionOptions,
        nrows: int | None,
        storage_options: StorageOptions = None,
        encoding_errors: str | None = "strict",
    ) -> None:

        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.numpy = numpy
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.nrows = nrows
        self.encoding_errors = encoding_errors
        self.handles: IOHandles[str] | None = None

        if self.chunksize is not None:
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")

        data = self._get_data_from_filepath(filepath_or_buffer)
        self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, "read") and not (self.chunksize or self.nrows):
            with self:
                data = data.read()
        if not hasattr(data, "read") and (self.chunksize or self.nrows):
            data = StringIO(data)

        return data

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.

        It raises FileNotFoundError if the input is a string ending in
        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (
            not isinstance(filepath_or_buffer, str)
            or is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle
        elif (
            isinstance(filepath_or_buffer, str)
            and filepath_or_buffer.lower().endswith(
                (".json",) + tuple(f".json{c}" for c in _extension_to_compression)
            )
            and not file_exists(filepath_or_buffer)
        ):
            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

        return filepath_or_buffer

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON object.
        """
        return (
            f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
        )
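    # Illustrative: ``self._combine_lines(['{"a": 1}', '{"a": 2}', ''])``
    # returns ``'[{"a": 1},{"a": 2}]'``; newline-delimited records become one
    # JSON array that the object parser can consume.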

    @overload
    def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def read(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.
        """
        obj: DataFrame | Series
        if self.lines:
            if self.chunksize:
                obj = concat(self)
            elif self.nrows:
                lines = list(islice(self.data, self.nrows))
                lines_json = self._combine_lines(lines)
                obj = self._get_object_parser(lines_json)
            else:
                data = ensure_str(self.data)
                data_lines = data.split("\n")
                obj = self._get_object_parser(self._combine_lines(data_lines))
        else:
            obj = self._get_object_parser(self.data)
        self.close()
        return obj

    def _get_object_parser(self, json) -> DataFrame | Series:
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "numpy": self.numpy,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self) -> None:
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.handles is not None:
            self.handles.close()

    def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]:
        return self

    @overload
    def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def __next__(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def __next__(self) -> DataFrame | Series:
        if self.nrows:
            if self.nrows_seen >= self.nrows:
                self.close()
                raise StopIteration

        lines = list(islice(self.data, self.chunksize))
        if lines:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)

            return obj

        self.close()
        raise StopIteration

    def __enter__(self) -> JsonReader[FrameSeriesStrT]:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()


class Parser:
    _split_keys: tuple[str, ...]
    _default_orient: str

    _STAMP_UNITS = ("s", "ms", "us", "ns")
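    # One year's worth of each unit since the epoch: _try_convert_to_date only
    # treats numbers above this floor (see ``min_stamp``) as timestamps.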

    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }


    def __init__(
        self,
        json,
        orient,
        dtype: DtypeArg | None = None,
        convert_axes: bool = True,
        convert_dates: bool | list[str] = True,
        keep_default_dates: bool = False,
        numpy: bool = False,
        precise_float: bool = False,
        date_unit=None,
    ) -> None:
        self.json = json

        if orient is None:
            orient = self._default_orient

        self.orient = orient

        self.dtype = dtype

        if orient == "split":
            numpy = False

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS["s"]

        self.numpy = numpy
        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj: DataFrame | Series | None = None

    def check_keys_split(self, decoded) -> None:
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys_joined = ", ".join(bad_keys)
            raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")

    def parse(self):

        if self.numpy:
            self._parse_numpy()
        else:
            self._parse_no_numpy()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj

    def _parse_numpy(self):
        raise AbstractMethodError(self)

    def _parse_no_numpy(self):
        raise AbstractMethodError(self)

    def _convert_axes(self):
        """
        Try to convert axes.
        """
        obj = self.obj
        assert obj is not None  # for mypy
        for axis_name in obj._AXIS_ORDERS:
            new_axis, result = self._try_convert_data(
                name=axis_name,
                data=obj._get_axis(axis_name),
                use_dtypes=False,
                convert_dates=True,
            )
            if result:
                setattr(self.obj, axis_name, new_axis)

    def _try_convert_types(self):
        raise AbstractMethodError(self)

    def _try_convert_data(
        self,
        name,
        data,
        use_dtypes: bool = True,
        convert_dates: bool | list[str] = True,
    ):
        """
        Try to parse an ndarray-like into a column by inferring dtype.
        """
        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if not self.dtype:
                if all(notna(data)):
                    return data, False
                return data.fillna(np.nan), True

            # error: Non-overlapping identity check (left operand type:
            # "Union[ExtensionDtype, str, dtype[Any], Type[object],
            # Dict[Hashable, Union[ExtensionDtype, Union[str, dtype[Any]],
            # Type[str], Type[float], Type[int], Type[complex], Type[bool],
            # Type[object]]]]", right operand type: "Literal[True]")
            elif self.dtype is True:  # type: ignore[comparison-overlap]
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        if data.dtype == "object":
            # try float
            try:
                data = data.astype("float64")
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f":
            if data.dtype != "float64":
                # coerce floats to 64
                try:
                    data = data.astype("float64")
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and (data.dtype == "float" or data.dtype == "object"):
            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
            except (TypeError, ValueError, OverflowError):
                pass

        # coerce ints to 64
        if data.dtype == "int":
            try:
                data = data.astype("int64")
            except (TypeError, ValueError):
                pass

        # if we have an index, we want to preserve dtypes
        if name == "index" and len(data):
            if self.orient == "split":
                return data, False

        return data, True

    def _try_convert_to_date(self, data):
        """
        Try to parse an ndarray-like into a date column.

        Try to coerce object in epoch/iso formats and integer/float in epoch
        formats. Return a boolean if parsing was successful.
        """
        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data
        if new_data.dtype == "object":
            try:
                new_data = data.astype("int64")
            except (TypeError, ValueError, OverflowError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (
                isna(new_data._values)
                | (new_data > self.min_stamp)
                | (new_data._values == iNaT)
            )
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                new_data = to_datetime(new_data, errors="raise", unit=date_unit)
            except (ValueError, OverflowError, TypeError):
                continue
            return new_data, True
        return data, False

    def _try_convert_dates(self):
        raise AbstractMethodError(self)


class SeriesParser(Parser):
    _default_orient = "index"
    _split_keys = ("name", "index", "data")

    def _parse_no_numpy(self):
        data = loads(self.json, precise_float=self.precise_float)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = create_series_with_explicit_dtype(**decoded)
        else:
            self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object)

    def _parse_numpy(self):
        load_kwargs = {
            "dtype": None,
            "numpy": True,
            "precise_float": self.precise_float,
        }
        if self.orient in ["columns", "index"]:
            load_kwargs["labelled"] = True
        loads_ = functools.partial(loads, **load_kwargs)
        data = loads_(self.json)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = create_series_with_explicit_dtype(**decoded)
        elif self.orient in ["columns", "index"]:
            # error: "create_series_with_explicit_dtype" gets multiple values
            # for keyword argument "dtype_if_empty"
            self.obj = create_series_with_explicit_dtype(
                *data, dtype_if_empty=object
            )  # type: ignore[misc]
        else:
            self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object)

    def _try_convert_types(self):
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if result:
            self.obj = obj


class FrameParser(Parser):
    _default_orient = "columns"
    _split_keys = ("columns", "index", "data")

    def _parse_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            args = loads(
                json,
                dtype=None,
                numpy=True,
                labelled=True,
                precise_float=self.precise_float,
            )
            if len(args):
                args = (args[0].T, args[2], args[1])
            self.obj = DataFrame(*args)
        elif orient == "split":
            decoded = loads(
                json, dtype=None, numpy=True, precise_float=self.precise_float
            )
            decoded = {str(k): v for k, v in decoded.items()}
            self.check_keys_split(decoded)
            self.obj = DataFrame(**decoded)
        elif orient == "values":
            self.obj = DataFrame(
                loads(json, dtype=None, numpy=True, precise_float=self.precise_float)
            )
        else:
            self.obj = DataFrame(
                *loads(
                    json,
                    dtype=None,
                    numpy=True,
                    labelled=True,
                    precise_float=self.precise_float,
                )
            )

    def _parse_no_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )
        elif orient == "split":
            decoded = {
                str(k): v
                for k, v in loads(json, precise_float=self.precise_float).items()
            }
            self.check_keys_split(decoded)
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame.from_dict(
                loads(json, precise_float=self.precise_float),
                dtype=None,
                orient="index",
            )
        elif orient == "table":
            self.obj = parse_table_schema(json, precise_float=self.precise_float)
        else:
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )

    def _process_converter(self, f, filt=None):
        """
        Take a conversion function and possibly recreate the frame.
        """
        if filt is None:
            filt = lambda col, c: True

        obj = self.obj
        assert obj is not None  # for mypy

        needs_new_obj = False
        new_obj = {}
        for i, (col, c) in enumerate(obj.items()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:

            # possibly handle dup columns
            new_frame = DataFrame(new_obj, index=obj.index)
            new_frame.columns = obj.columns
            self.obj = new_frame

    def _try_convert_types(self):
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False)
        )

    def _try_convert_dates(self):
        if self.obj is None:
            return

        # our columns to parse
        convert_dates_list_bool = self.convert_dates
        if isinstance(convert_dates_list_bool, bool):
            convert_dates_list_bool = []
        convert_dates = set(convert_dates_list_bool)

        def is_ok(col) -> bool:
            """
            Return if this col is ok to try for a date parse.
            """
            if not isinstance(col, str):
                return False

            col_lower = col.lower()
            if (
                col_lower.endswith("_at")
                or col_lower.endswith("_time")
                or col_lower == "modified"
                or col_lower == "date"
                or col_lower == "datetime"
                or col_lower.startswith("timestamp")
            ):
                return True
            return False
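
        # Illustrative: with keep_default_dates=True, column names like
        # "created_at", "updated_time", "timestamp_ms", "modified", "date",
        # and "datetime" pass is_ok above; names like "price" do not.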


        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: (
                (self.keep_default_dates and is_ok(col)) or col in convert_dates
            ),
        )