Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/json/_normalize.py: 7%
138 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1# ---------------------------------------------------------------------
2# JSON normalization routines
3from __future__ import annotations
5from collections import (
6 abc,
7 defaultdict,
8)
9import copy
10from typing import (
11 Any,
12 DefaultDict,
13 Iterable,
14)
16import numpy as np
18from pandas._libs.writers import convert_json_to_lines
19from pandas._typing import (
20 IgnoreRaise,
21 Scalar,
22)
23from pandas.util._decorators import deprecate
25import pandas as pd
26from pandas import DataFrame
def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.

    Parameters
    ----------
    s : str
        A JSON document serialized as a single string.

    Returns
    -------
    str
        If ``s`` is a JSON list (``[...]``), the list elements joined by
        newlines; otherwise ``s`` unchanged — only lists can be converted
        to line-delimited JSON.
    """
    # Guard the empty string: the bracket checks below index s[0]/s[-1]
    # and would raise IndexError.
    if not s:
        return s
    # Precedence fix: the previous `not s[0] == "[" and s[-1] == "]"` parsed
    # as `(not s[0] == "[") and (s[-1] == "]")`, so a plain JSON object
    # ("{...}") did NOT return early and had its braces stripped. The intended
    # test is "not a JSON list".
    if not (s[0] == "[" and s[-1] == "]"):
        return s
    # Strip the enclosing brackets; the C helper splits the remaining
    # top-level comma-separated elements onto separate lines.
    s = s[1:-1]

    return convert_json_to_lines(s)
def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize: flatten nested dicts into a single-level
    "record" dict. Unlike json_normalize it does not attempt to extract a
    subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : str, default ""
        Key prefix accumulated from enclosing levels.
    sep : str, default '.'
        Separator joining nested key names, e.g. for sep='.',
        {'foo': {'bar': 0}} -> 'foo.bar'.
    level : int, default 0
        Current recursion depth (0 at the top level).
    max_level : int, optional, default None
        Maximum depth to flatten; deeper dicts are left as values.

        .. versionadded:: 0.25.0

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2, 'nested.e.c': 1, 'nested.e.d': 2, 'nested.d': 2}
    """
    # A bare dict is processed as a one-element list and unwrapped on return.
    is_single = isinstance(ds, dict)
    records = [ds] if is_single else ds

    flattened = []
    for record in records:
        # Work on a deep copy so the caller's input is never mutated.
        flat = copy.deepcopy(record)
        for key, value in record.items():
            # Keys are coerced to str so they can carry the prefix.
            key_str = key if isinstance(key, str) else str(key)
            # Top-level keys keep their name; nested keys get the dotted path.
            new_key = key_str if level == 0 else f"{prefix}{sep}{key_str}"

            # Stop descending for non-dict values, or once max_level is hit.
            reached_limit = max_level is not None and level >= max_level
            if not isinstance(value, dict) or reached_limit:
                if level != 0:
                    # Rename in place; top level keeps keys as-is (common case).
                    flat[new_key] = flat.pop(key_str)
                continue

            # Replace the nested dict with its flattened children.
            flat.update(
                nested_to_record(flat.pop(key_str), new_key, sep, level + 1, max_level)
            )
        flattened.append(flat)

    return flattened[0] if is_single else flattened
125def _normalise_json(
126 data: Any,
127 key_string: str,
128 normalized_dict: dict[str, Any],
129 separator: str,
130) -> dict[str, Any]:
131 """
132 Main recursive function
133 Designed for the most basic use case of pd.json_normalize(data)
134 intended as a performance improvement, see #15621
136 Parameters
137 ----------
138 data : Any
139 Type dependent on types contained within nested Json
140 key_string : str
141 New key (with separator(s) in) for data
142 normalized_dict : dict
143 The new normalized/flattened Json dict
144 separator : str, default '.'
145 Nested records will generate names separated by sep,
146 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
147 """
148 if isinstance(data, dict):
149 for key, value in data.items():
150 new_key = f"{key_string}{separator}{key}"
151 _normalise_json(
152 data=value,
153 # to avoid adding the separator to the start of every key
154 # GH#43831 avoid adding key if key_string blank
155 key_string=new_key
156 if new_key[: len(separator)] != separator
157 else new_key[len(separator) :],
158 normalized_dict=normalized_dict,
159 separator=separator,
160 )
161 else:
162 normalized_dict[key_string] = data
163 return normalized_dict
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Partition the top level: scalar values keep their keys verbatim,
    # dict values are flattened recursively.
    scalars: dict[str, Any] = {}
    nested: dict[str, Any] = {}
    for key, value in data.items():
        if isinstance(value, dict):
            nested[key] = value
        else:
            scalars[key] = value

    flattened = _normalise_json(
        data=nested,
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    # Scalars first, then the flattened nested keys — same ordering the
    # two-comprehension formulation produced.
    return {**scalars, **flattened}
def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    A optimized basic json_normalize

    Converts a nested dict into a flat dict ("record"), unlike
    json_normalize and nested_to_record it doesn't do anything clever.
    But for the most basic use cases it enhances performance.
    E.g. pd.json_normalize(data)

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    frame : DataFrame
    d - dict or list of dicts, matching `normalised_json_object`

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    # Most JSON payloads are objects; lists are normalized element-wise.
    if isinstance(ds, dict):
        return _normalise_json_ordered(data=ds, separator=sep)
    if isinstance(ds, list):
        return [_simple_json_normalize(row, sep=sep) for row in ds]
    # Any other input mirrors the original's untouched default accumulator.
    return {}
def _json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

        .. versionadded:: 0.25.0

    Returns
    -------
    frame : DataFrame
    Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
      Prefix.0
    0        1
    1        2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """Internal function to pull field"""
        # Walks `spec` (a single key or a list of keys forming a path) into
        # `js`. Closes over `errors` from the enclosing _json_normalize call.
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    if result is None:
                        # A None along the path means the key chain is broken;
                        # surface it as a missing key.
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            if extract_record:
                # record_path lookups must exist in every element.
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            elif errors == "ignore":
                # Missing meta field: substitute NaN instead of raising.
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result

    # Normalize `data` itself into a list of dicts.
    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        # NOTE(review): `any(...)` iterates over *list* objects here, and a
        # non-empty list is always truthy — so this effectively tests "does
        # any record have at least one value", not "does any record contain a
        # dict". Flattening is idempotent for flat records, so the result is
        # still correct; confirm intent before changing.
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Each meta entry becomes a key path (list of keys).
    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    lengths = []

    # meta_vals collects, per meta key, one value per *object* visited;
    # `lengths` records how many records each object produced so the meta
    # values can be repeated to line up with `records` at the end.
    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        # Walk `path` one component at a time, carrying meta values seen at
        # shallower levels in `seen_meta` (keyed by joined meta path).
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        # This meta path bottoms out here; capture its value
                        # for all records found deeper down.
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                # Flatten each dict record; non-dict records pass through.
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        # Meta path was resolved at a shallower level.
                        meta_val = seen_meta[key]
                    else:
                        # Remaining path components are relative to `obj`.
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        # Prefix every record-derived column name.
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            # A meta column may not collide with a record column.
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # Repeat each object's meta value across that object's records.
        result[k] = np.array(v, dtype=object).repeat(lengths)
    return result
# Backwards-compatibility alias: pandas.io.json.json_normalize was deprecated
# in pandas 1.0.0 in favour of the top-level pandas.json_normalize; `deprecate`
# wraps _json_normalize so calls emit a FutureWarning pointing at the new name.
json_normalize = deprecate(
    "pandas.io.json.json_normalize", _json_normalize, "1.0.0", "pandas.json_normalize"
)