Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/json/_json.py: 19%
498 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from abc import (
4 ABC,
5 abstractmethod,
6)
7from collections import abc
8import functools
9from io import StringIO
10from itertools import islice
11from typing import (
12 TYPE_CHECKING,
13 Any,
14 Callable,
15 Generic,
16 Literal,
17 Mapping,
18 TypeVar,
19 overload,
20)
22import numpy as np
24import pandas._libs.json as json
25from pandas._libs.tslibs import iNaT
26from pandas._typing import (
27 CompressionOptions,
28 DtypeArg,
29 FilePath,
30 IndexLabel,
31 JSONSerializable,
32 ReadBuffer,
33 StorageOptions,
34 WriteBuffer,
35)
36from pandas.errors import AbstractMethodError
37from pandas.util._decorators import (
38 deprecate_kwarg,
39 deprecate_nonkeyword_arguments,
40 doc,
41)
43from pandas.core.dtypes.common import (
44 ensure_str,
45 is_period_dtype,
46)
48from pandas import (
49 DataFrame,
50 MultiIndex,
51 Series,
52 isna,
53 notna,
54 to_datetime,
55)
56from pandas.core.construction import create_series_with_explicit_dtype
57from pandas.core.reshape.concat import concat
58from pandas.core.shared_docs import _shared_docs
60from pandas.io.common import (
61 IOHandles,
62 _extension_to_compression,
63 file_exists,
64 get_handle,
65 is_fsspec_url,
66 is_url,
67 stringify_path,
68)
69from pandas.io.json._normalize import convert_to_line_delimits
70from pandas.io.json._table_schema import (
71 build_table_schema,
72 parse_table_schema,
73)
74from pandas.io.parsers.readers import validate_integer
76if TYPE_CHECKING:
77 from pandas.core.generic import NDFrame
# Type variable constraining JsonReader's generic parameter to the two
# supported output kinds ("frame" -> DataFrame, "series" -> Series).
FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])

# Module-level aliases for the C-accelerated JSON codec in pandas._libs.
loads = json.loads
dumps = json.dumps
85# interface to/from
# Overload: writing to a path/buffer returns None (output goes to the sink).
@overload
def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
) -> None:
    ...
# Overload: with no path/buffer the JSON document is returned as a str.
@overload
def to_json(
    path_or_buf: None,
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
) -> str:
    ...
def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
    obj: NDFrame,
    orient: str | None = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Callable[[Any], JSONSerializable] | None = None,
    lines: bool = False,
    compression: CompressionOptions = "infer",
    index: bool = True,
    indent: int = 0,
    storage_options: StorageOptions = None,
) -> str | None:
    """
    Serialize ``obj`` to JSON.

    Returns the JSON document as a string when ``path_or_buf`` is None;
    otherwise writes it to the given path/buffer and returns None.
    """
    # index=False is only meaningful for orients that represent the index
    # explicitly; other orients embed it implicitly in the structure.
    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'"
        )

    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    # The table-schema writer only handles DataFrames, so promote a Series.
    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    # Select the writer implementation for this object/orient combination.
    writer: type[Writer]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    json_string = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        json_string = convert_to_line_delimits(json_string)

    if path_or_buf is None:
        return json_string

    # apply compression and byte/text conversion
    with get_handle(
        path_or_buf, "w", compression=compression, storage_options=storage_options
    ) as handles:
        handles.handle.write(json_string)
    return None
class Writer(ABC):
    """Abstract base class for serializing a pandas object to JSON."""

    # Orient used when the caller passes orient=None; set by subclasses.
    _default_orient: str

    def __init__(
        self,
        obj,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        self.obj = obj

        self.orient = orient if orient is not None else self._default_orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self):
        # Subclasses validate axis-uniqueness requirements here.
        raise AbstractMethodError(self)

    def write(self) -> str:
        """Serialize ``obj_to_write`` and return the JSON document."""
        return dumps(
            self.obj_to_write,
            orient=self.orient,
            double_precision=self.double_precision,
            ensure_ascii=self.ensure_ascii,
            date_unit=self.date_unit,
            iso_dates=self.date_format == "iso",
            default_handler=self.default_handler,
            indent=self.indent,
        )

    @property
    @abstractmethod
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        """Object to write in JSON format."""
class SeriesWriter(Writer):
    """JSON writer for Series; orient defaults to ``'index'``."""

    _default_orient = "index"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if self.orient == "split" and not self.index:
            # Drop the index entirely: emit only the name and the raw values.
            return {"name": self.obj.name, "data": self.obj.values}
        return self.obj

    def _format_axes(self):
        # orient='index' keys the output by index label, so labels must be
        # unique or data would be silently lost.
        if self.orient == "index" and not self.obj.index.is_unique:
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")
class FrameWriter(Writer):
    """JSON writer for DataFrame; orient defaults to ``'columns'``."""

    _default_orient = "columns"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if self.orient == "split" and not self.index:
            # Build the 'split' mapping ourselves so the index can be dropped.
            as_split = self.obj.to_dict(orient="split")
            del as_split["index"]
            return as_split
        return self.obj

    def _format_axes(self):
        """
        Validate that the axes are unique where the orient requires it.
        """
        if self.orient in ("index", "columns") and not self.obj.index.is_unique:
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if (
            self.orient in ("index", "columns", "records")
            and not self.obj.columns.is_unique
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )
class JSONTableWriter(FrameWriter):
    # Writer for orient='table': emits {"schema": <Table Schema>, "data": ...}.
    _default_orient = "records"

    def __init__(
        self,
        obj,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        """
        Adds a `schema` attribute with the Table Schema, resets
        the index (can't do in caller, because the schema inference needs
        to know what the index is, forces orient to records, and forces
        date_format to 'iso'.
        """
        super().__init__(
            obj,
            orient,
            date_format,
            double_precision,
            ensure_ascii,
            date_unit,
            index,
            default_handler=default_handler,
            indent=indent,
        )

        # Table Schema mandates ISO 8601 dates; any other format is an error.
        if date_format != "iso":
            msg = (
                "Trying to write with `orient='table'` and "
                f"`date_format='{date_format}'`. Table Schema requires dates "
                "to be formatted with `date_format='iso'`"
            )
            raise ValueError(msg)

        # Build the schema BEFORE the index is reset below, so that index
        # levels are still recognized as such during inference.
        self.schema = build_table_schema(obj, index=self.index)

        # NotImplemented on a column MultiIndex
        if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
            raise NotImplementedError(
                "orient='table' is not supported for MultiIndex columns"
            )

        # TODO: Do this timedelta properly in objToJSON.c See GH #15137
        # reset_index (below) would collide if a column shares a name with an
        # index level, so reject the overlap up front.
        if (
            (obj.ndim == 1)
            and (obj.name in set(obj.index.names))
            or len(obj.columns.intersection(obj.index.names))
        ):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        # Work on a copy: the timedelta/period conversions below mutate obj.
        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=["timedelta"]).columns
        # Timedeltas are serialized as ISO 8601 duration strings.
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if is_period_dtype(obj.index.dtype):
            obj.index = obj.index.to_timestamp()

        # exclude index from obj if index=False
        if not self.index:
            self.obj = obj.reset_index(drop=True)
        else:
            self.obj = obj.reset_index(drop=False)
        self.date_format = "iso"
        self.orient = "records"
        self.index = index

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        # Payload is the pre-computed schema plus the reset-index data.
        return {"schema": self.schema, "data": self.obj}
# Overload: typ='frame' with an explicit chunksize returns a lazy reader.
@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    numpy: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
) -> JsonReader[Literal["frame"]]:
    ...
# Overload: typ='series' with an explicit chunksize returns a lazy reader.
@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    numpy: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
) -> JsonReader[Literal["series"]]:
    ...
# Overload: typ='series' without a chunksize returns an eager Series.
@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    numpy: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
) -> Series:
    ...
# Overload: default typ='frame' without a chunksize returns a DataFrame.
@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    numpy: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame:
    ...
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
)
@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None)
@deprecate_nonkeyword_arguments(version="2.0", allowed_args=["path_or_buf"])
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    orient: str | None = None,
    typ: Literal["frame", "series"] = "frame",
    dtype: DtypeArg | None = None,
    convert_axes=None,
    convert_dates: bool | list[str] = True,
    keep_default_dates: bool = True,
    numpy: bool = False,
    precise_float: bool = False,
    date_unit: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    lines: bool = False,
    chunksize: int | None = None,
    compression: CompressionOptions = "infer",
    nrows: int | None = None,
    storage_options: StorageOptions = None,
) -> DataFrame | Series | JsonReader:
    """
    Convert a JSON string to pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.json``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    orient : str
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{{index -> [index], columns -> [columns], data -> [values]}}``
        - ``'records'`` : list like
          ``[{{column -> value}}, ... , {{column -> value}}]``
        - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
        - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{{'split','records','index'}}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{{'split','records','index',
            'columns','values', 'table'}}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : {{'frame', 'series'}}, default 'frame'
        The type of object to recover.

    dtype : bool or dict, default None
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all, applies only to the data.

        For all ``orient`` values except ``'table'``, default is True.

        .. versionchanged:: 0.25.0

           Not applicable for ``orient='table'``.

    convert_axes : bool, default None
        Try to convert the axes to the proper dtypes.

        For all ``orient`` values except ``'table'``, default is True.

        .. versionchanged:: 0.25.0

           Not applicable for ``orient='table'``.

    convert_dates : bool or list of str, default True
        If True then default datelike columns may be converted (depending on
        keep_default_dates).
        If False, no dates will be converted.
        If a list of column names, then those columns will be converted and
        default datelike columns may also be converted (depending on
        keep_default_dates).

    keep_default_dates : bool, default True
        If parsing dates (convert_dates is not False), then try to parse the
        default datelike columns.
        A column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``.

    numpy : bool, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.

        .. deprecated:: 1.0.0

    precise_float : bool, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality.

    date_unit : str, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

    encoding_errors : str, optional, default "strict"
        How encoding errors are treated. `List of possible values
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

        .. versionadded:: 1.3.0

    lines : bool, default False
        Read the file as a json object per line.

    chunksize : int, optional
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.

        .. versionchanged:: 1.2

           ``JsonReader`` is a context manager.

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    nrows : int, optional
        The number of lines from the line-delimited jsonfile that has to be read.
        This can only be passed if `lines=True`.
        If this is None, all the rows will be returned.

        .. versionadded:: 1.1

    {storage_options}

        .. versionadded:: 1.2.0

    Returns
    -------
    Series or DataFrame
        The type returned depends on the value of `typ`.

    See Also
    --------
    DataFrame.to_json : Convert a DataFrame to a JSON string.
    Series.to_json : Convert a Series to a JSON string.
    json_normalize : Normalize semi-structured JSON data into a flat table.

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    Examples
    --------
    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
        '\
{{\
"columns":["col 1","col 2"],\
"index":["row 1","row 2"],\
"data":[["a","b"],["c","d"]]\
}}\
'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'

    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema

    >>> df.to_json(orient='table')
        '\
{{"schema":{{"fields":[\
{{"name":"index","type":"string"}},\
{{"name":"col 1","type":"string"}},\
{{"name":"col 2","type":"string"}}],\
"primaryKey":["index"],\
"pandas_version":"1.4.0"}},\
"data":[\
{{"index":"row 1","col 1":"a","col 2":"b"}},\
{{"index":"row 2","col 1":"c","col 2":"d"}}]\
}}\
'
    """
    # For orient='table' the Table Schema already fixes dtypes and axes, so
    # explicitly passing either option is rejected.
    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    # Defaults for dtype/convert_axes are True everywhere except 'table'.
    if dtype is None and orient != "table":
        # error: Incompatible types in assignment (expression has type "bool", variable
        # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
        # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object]]], None]")
        dtype = True  # type: ignore[assignment]
    if convert_axes is None and orient != "table":
        convert_axes = True

    json_reader = JsonReader(
        path_or_buf,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        numpy=numpy,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        nrows=nrows,
        storage_options=storage_options,
        encoding_errors=encoding_errors,
    )

    # With a chunksize the caller iterates the reader lazily; otherwise read
    # the whole document now and close any handle we opened.
    if chunksize:
        return json_reader

    with json_reader:
        return json_reader.read()
class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ: FrameSeriesStrT,
        dtype,
        convert_axes,
        convert_dates,
        keep_default_dates: bool,
        numpy: bool,
        precise_float: bool,
        date_unit,
        encoding,
        lines: bool,
        chunksize: int | None,
        compression: CompressionOptions,
        nrows: int | None,
        storage_options: StorageOptions = None,
        encoding_errors: str | None = "strict",
    ) -> None:

        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.numpy = numpy
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
        self.chunksize = chunksize
        # Running count of rows already yielded; used to re-index chunks.
        self.nrows_seen = 0
        self.nrows = nrows
        self.encoding_errors = encoding_errors
        # Set by _get_data_from_filepath when we open the stream ourselves;
        # close() only releases handles we own.
        self.handles: IOHandles[str] | None = None

        # chunksize/nrows are only meaningful for line-delimited JSON.
        if self.chunksize is not None:
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")

        data = self._get_data_from_filepath(filepath_or_buffer)
        self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        # Whole-document mode: drain a file-like object now (closing our
        # handle via the context manager), leaving a plain string.
        if hasattr(data, "read") and not (self.chunksize or self.nrows):
            with self:
                data = data.read()
        # Chunked/nrows mode needs something iterable line-by-line, so wrap
        # a bare string in StringIO.
        if not hasattr(data, "read") and (self.chunksize or self.nrows):
            data = StringIO(data)

        return data

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.

        It raises FileNotFoundError if the input is a string ending in
        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (
            not isinstance(filepath_or_buffer, str)
            or is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            # Open the resource ourselves and remember the handles so close()
            # can release them later.
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle
        elif (
            isinstance(filepath_or_buffer, str)
            and filepath_or_buffer.lower().endswith(
                (".json",) + tuple(f".json{c}" for c in _extension_to_compression)
            )
            and not file_exists(filepath_or_buffer)
        ):
            # Looks like a path to a JSON file but nothing is there: raise a
            # clear error instead of trying to parse the path as JSON text.
            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

        return filepath_or_buffer

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON object.
        """
        # Strip whitespace, drop empty lines, and wrap the rest in an array.
        return (
            f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
        )

    @overload
    def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def read(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.
        """
        obj: DataFrame | Series
        if self.lines:
            if self.chunksize:
                # Iterate ourselves chunk by chunk and stitch the result.
                obj = concat(self)
            elif self.nrows:
                # Parse only the first nrows lines.
                lines = list(islice(self.data, self.nrows))
                lines_json = self._combine_lines(lines)
                obj = self._get_object_parser(lines_json)
            else:
                data = ensure_str(self.data)
                data_lines = data.split("\n")
                obj = self._get_object_parser(self._combine_lines(data_lines))
        else:
            obj = self._get_object_parser(self.data)
        self.close()
        return obj

    def _get_object_parser(self, json) -> DataFrame | Series:
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "numpy": self.numpy,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        # Fall back to a Series when typ='series' or the frame parse yielded
        # nothing.
        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self) -> None:
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.handles is not None:
            self.handles.close()

    def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]:
        return self

    @overload
    def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def __next__(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def __next__(self) -> DataFrame | Series:
        # Stop once nrows rows have been produced in total.
        if self.nrows:
            if self.nrows_seen >= self.nrows:
                self.close()
                raise StopIteration

        lines = list(islice(self.data, self.chunksize))
        if lines:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)

            return obj

        # Input exhausted.
        self.close()
        raise StopIteration

    def __enter__(self) -> JsonReader[FrameSeriesStrT]:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()
class Parser:
    # Base class for decoding a JSON document into a pandas object.
    # Keys permitted in an orient='split' payload; set by subclasses.
    _split_keys: tuple[str, ...]
    # Orient used when the caller passes orient=None; set by subclasses.
    _default_orient: str

    _STAMP_UNITS = ("s", "ms", "us", "ns")
    # Smallest epoch value, per unit, treated as a plausible timestamp:
    # one year past the epoch expressed in each supported resolution.
    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }
1010 def __init__(
1011 self,
1012 json,
1013 orient,
1014 dtype: DtypeArg | None = None,
1015 convert_axes: bool = True,
1016 convert_dates: bool | list[str] = True,
1017 keep_default_dates: bool = False,
1018 numpy: bool = False,
1019 precise_float: bool = False,
1020 date_unit=None,
1021 ) -> None:
1022 self.json = json
1024 if orient is None:
1025 orient = self._default_orient
1027 self.orient = orient
1029 self.dtype = dtype
1031 if orient == "split":
1032 numpy = False
1034 if date_unit is not None:
1035 date_unit = date_unit.lower()
1036 if date_unit not in self._STAMP_UNITS:
1037 raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
1038 self.min_stamp = self._MIN_STAMPS[date_unit]
1039 else:
1040 self.min_stamp = self._MIN_STAMPS["s"]
1042 self.numpy = numpy
1043 self.precise_float = precise_float
1044 self.convert_axes = convert_axes
1045 self.convert_dates = convert_dates
1046 self.date_unit = date_unit
1047 self.keep_default_dates = keep_default_dates
1048 self.obj: DataFrame | Series | None = None
1050 def check_keys_split(self, decoded) -> None:
1051 """
1052 Checks that dict has only the appropriate keys for orient='split'.
1053 """
1054 bad_keys = set(decoded.keys()).difference(set(self._split_keys))
1055 if bad_keys:
1056 bad_keys_joined = ", ".join(bad_keys)
1057 raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")
1059 def parse(self):
1061 if self.numpy:
1062 self._parse_numpy()
1063 else:
1064 self._parse_no_numpy()
1066 if self.obj is None:
1067 return None
1068 if self.convert_axes:
1069 self._convert_axes()
1070 self._try_convert_types()
1071 return self.obj
    def _parse_numpy(self):
        # Subclass hook: decode self.json via the deprecated numpy fast path.
        raise AbstractMethodError(self)
    def _parse_no_numpy(self):
        # Subclass hook: decode self.json via the standard (non-numpy) path.
        raise AbstractMethodError(self)
1079 def _convert_axes(self):
1080 """
1081 Try to convert axes.
1082 """
1083 obj = self.obj
1084 assert obj is not None # for mypy
1085 for axis_name in obj._AXIS_ORDERS:
1086 new_axis, result = self._try_convert_data(
1087 name=axis_name,
1088 data=obj._get_axis(axis_name),
1089 use_dtypes=False,
1090 convert_dates=True,
1091 )
1092 if result:
1093 setattr(self.obj, axis_name, new_axis)
    def _try_convert_types(self):
        # Subclass hook: coerce value dtypes after the document is decoded.
        raise AbstractMethodError(self)
    def _try_convert_data(
        self,
        name,
        data,
        use_dtypes: bool = True,
        convert_dates: bool | list[str] = True,
    ):
        """
        Try to parse a ndarray like into a column by inferring dtype.

        Returns a ``(data, converted)`` pair; ``converted`` reports whether
        any conversion was applied.
        """
        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if not self.dtype:
                # dtype handling disabled: only normalize missing values.
                if all(notna(data)):
                    return data, False
                return data.fillna(np.nan), True

            # error: Non-overlapping identity check (left operand type:
            # "Union[ExtensionDtype, str, dtype[Any], Type[object],
            # Dict[Hashable, Union[ExtensionDtype, Union[str, dtype[Any]],
            # Type[str], Type[float], Type[int], Type[complex], Type[bool],
            # Type[object]]]]", right operand type: "Literal[True]")
            elif self.dtype is True:  # type: ignore[comparison-overlap]
                # dtype=True means "infer below", so fall through.
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        if data.dtype == "object":

            # try float
            try:
                data = data.astype("float64")
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f":

            if data.dtype != "float64":

                # coerce floats to 64
                try:
                    data = data.astype("float64")
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and (data.dtype == "float" or data.dtype == "object"):

            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
            except (TypeError, ValueError, OverflowError):
                pass

        # coerce ints to 64
        if data.dtype == "int":

            # coerce floats to 64
            try:
                data = data.astype("int64")
            except (TypeError, ValueError):
                pass

        # if we have an index, we want to preserve dtypes
        if name == "index" and len(data):
            if self.orient == "split":
                return data, False

        return data, True
1183 def _try_convert_to_date(self, data):
1184 """
1185 Try to parse a ndarray like into a date column.
1187 Try to coerce object in epoch/iso formats and integer/float in epoch
1188 formats. Return a boolean if parsing was successful.
1189 """
1190 # no conversion on empty
1191 if not len(data):
1192 return data, False
1194 new_data = data
1195 if new_data.dtype == "object":
1196 try:
1197 new_data = data.astype("int64")
1198 except (TypeError, ValueError, OverflowError):
1199 pass
1201 # ignore numbers that are out of range
1202 if issubclass(new_data.dtype.type, np.number):
1203 in_range = (
1204 isna(new_data._values)
1205 | (new_data > self.min_stamp)
1206 | (new_data._values == iNaT)
1207 )
1208 if not in_range.all():
1209 return data, False
1211 date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
1212 for date_unit in date_units:
1213 try:
1214 new_data = to_datetime(new_data, errors="raise", unit=date_unit)
1215 except (ValueError, OverflowError, TypeError):
1216 continue
1217 return new_data, True
1218 return data, False
def _try_convert_dates(self):
    """Date-conversion hook; concrete subclasses must implement."""
    raise AbstractMethodError(self)
class SeriesParser(Parser):
    """Parser specialization that materializes a Series from decoded JSON."""

    _default_orient = "index"
    _split_keys = ("name", "index", "data")

    def _parse_no_numpy(self):
        # Plain decoding path: decode once, then build the Series.
        data = loads(self.json, precise_float=self.precise_float)

        if self.orient != "split":
            self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object)
        else:
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = create_series_with_explicit_dtype(**decoded)

    def _parse_numpy(self):
        # Numpy-accelerated decoding path; labelled decoding is only
        # meaningful for the "columns" and "index" orients.
        load_kwargs = {
            "dtype": None,
            "numpy": True,
            "precise_float": self.precise_float,
        }
        if self.orient in ("columns", "index"):
            load_kwargs["labelled"] = True
        data = loads(self.json, **load_kwargs)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = create_series_with_explicit_dtype(**decoded)
        elif self.orient in ("columns", "index"):
            # error: "create_series_with_explicit_dtype"
            # gets multiple values for keyword argument "dtype_if_empty
            self.obj = create_series_with_explicit_dtype(
                *data, dtype_if_empty=object
            )  # type: ignore[misc]
        else:
            self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object)

    def _try_convert_types(self):
        """Coerce the whole Series to a better dtype when possible."""
        if self.obj is None:
            return
        converted, success = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if success:
            self.obj = converted
1272class FrameParser(Parser):
1273 _default_orient = "columns"
1274 _split_keys = ("columns", "index", "data")
def _parse_numpy(self):
    """
    Build ``self.obj`` (a DataFrame) from ``self.json`` using the
    numpy-accelerated decoder; the decoding call shape depends on
    ``self.orient``.
    """

    json = self.json
    orient = self.orient

    if orient == "columns":
        args = loads(
            json,
            dtype=None,
            numpy=True,
            labelled=True,
            precise_float=self.precise_float,
        )
        if len(args):
            # Reorder the decoded tuple to line up with DataFrame's
            # positional (data, index, columns) parameters.  The values
            # are transposed — presumably the labelled decoder returns
            # column-major data for this orient; confirm against the
            # C decoder if touching this.
            args = (args[0].T, args[2], args[1])
        self.obj = DataFrame(*args)
    elif orient == "split":
        decoded = loads(
            json, dtype=None, numpy=True, precise_float=self.precise_float
        )
        # normalize keys to str before validating against _split_keys
        decoded = {str(k): v for k, v in decoded.items()}
        self.check_keys_split(decoded)
        self.obj = DataFrame(**decoded)
    elif orient == "values":
        self.obj = DataFrame(
            loads(json, dtype=None, numpy=True, precise_float=self.precise_float)
        )
    else:
        # remaining orients: splat the labelled decode result straight
        # into the DataFrame constructor
        self.obj = DataFrame(
            *loads(
                json,
                dtype=None,
                numpy=True,
                labelled=True,
                precise_float=self.precise_float,
            )
        )
1314 def _parse_no_numpy(self):
1316 json = self.json
1317 orient = self.orient
1319 if orient == "columns":
1320 self.obj = DataFrame(
1321 loads(json, precise_float=self.precise_float), dtype=None
1322 )
1323 elif orient == "split":
1324 decoded = {
1325 str(k): v
1326 for k, v in loads(json, precise_float=self.precise_float).items()
1327 }
1328 self.check_keys_split(decoded)
1329 self.obj = DataFrame(dtype=None, **decoded)
1330 elif orient == "index":
1331 self.obj = DataFrame.from_dict(
1332 loads(json, precise_float=self.precise_float),
1333 dtype=None,
1334 orient="index",
1335 )
1336 elif orient == "table":
1337 self.obj = parse_table_schema(json, precise_float=self.precise_float)
1338 else:
1339 self.obj = DataFrame(
1340 loads(json, precise_float=self.precise_float), dtype=None
1341 )
1343 def _process_converter(self, f, filt=None):
1344 """
1345 Take a conversion function and possibly recreate the frame.
1346 """
1347 if filt is None:
1348 filt = lambda col, c: True
1350 obj = self.obj
1351 assert obj is not None # for mypy
1353 needs_new_obj = False
1354 new_obj = {}
1355 for i, (col, c) in enumerate(obj.items()):
1356 if filt(col, c):
1357 new_data, result = f(col, c)
1358 if result:
1359 c = new_data
1360 needs_new_obj = True
1361 new_obj[i] = c
1363 if needs_new_obj:
1365 # possibly handle dup columns
1366 new_frame = DataFrame(new_obj, index=obj.index)
1367 new_frame.columns = obj.columns
1368 self.obj = new_frame
1370 def _try_convert_types(self):
1371 if self.obj is None:
1372 return
1373 if self.convert_dates:
1374 self._try_convert_dates()
1376 self._process_converter(
1377 lambda col, c: self._try_convert_data(col, c, convert_dates=False)
1378 )
1380 def _try_convert_dates(self):
1381 if self.obj is None:
1382 return
1384 # our columns to parse
1385 convert_dates_list_bool = self.convert_dates
1386 if isinstance(convert_dates_list_bool, bool):
1387 convert_dates_list_bool = []
1388 convert_dates = set(convert_dates_list_bool)
1390 def is_ok(col) -> bool:
1391 """
1392 Return if this col is ok to try for a date parse.
1393 """
1394 if not isinstance(col, str):
1395 return False
1397 col_lower = col.lower()
1398 if (
1399 col_lower.endswith("_at")
1400 or col_lower.endswith("_time")
1401 or col_lower == "modified"
1402 or col_lower == "date"
1403 or col_lower == "datetime"
1404 or col_lower.startswith("timestamp")
1405 ):
1406 return True
1407 return False
1409 self._process_converter(
1410 lambda col, c: self._try_convert_to_date(c),
1411 lambda col, c: (
1412 (self.keep_default_dates and is_ok(col)) or col in convert_dates
1413 ),
1414 )