Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/json/_normalize.py: 7%
138 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1# ---------------------------------------------------------------------
2# JSON normalization routines
3from __future__ import annotations
5from collections import (
6 abc,
7 defaultdict,
8)
9import copy
10from typing import (
11 Any,
12 DefaultDict,
13 Iterable,
14)
16import numpy as np
18from pandas._libs.writers import convert_json_to_lines
19from pandas._typing import (
20 IgnoreRaise,
21 Scalar,
22)
23from pandas.util._decorators import deprecate
25import pandas as pd
26from pandas import DataFrame
def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.

    Parameters
    ----------
    s : str
        A JSON document serialized as a single string.

    Returns
    -------
    str
        If ``s`` is a JSON list (``[...]``), the list elements joined by
        newlines; otherwise ``s`` unchanged — only lists can be converted
        to line-delimited JSON.
    """
    # Guard the empty string: the bracket checks below index s[0]/s[-1]
    # and would raise IndexError.
    if not s:
        return s
    # Precedence fix: the previous `not s[0] == "[" and s[-1] == "]"` parsed
    # as `(not s[0] == "[") and (s[-1] == "]")`, so a plain JSON object
    # ("{...}") did NOT return early and had its braces stripped. The intended
    # test is "not a JSON list".
    if not (s[0] == "[" and s[-1] == "]"):
        return s
    # Strip the enclosing brackets; the C helper splits the remaining
    # top-level comma-separated elements onto separate lines.
    s = s[1:-1]

    return convert_json_to_lines(s)
def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize: flatten nested dicts into a single-level
    "record" dict. Unlike json_normalize it does not attempt to extract a
    subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : str, default ""
        Key prefix accumulated from enclosing levels.
    sep : str, default '.'
        Separator joining nested key names, e.g. for sep='.',
        {'foo': {'bar': 0}} -> 'foo.bar'.
    level : int, default 0
        Current recursion depth (0 at the top level).
    max_level : int, optional, default None
        Maximum depth to flatten; deeper dicts are left as values.

        .. versionadded:: 0.25.0

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2, 'nested.e.c': 1, 'nested.e.d': 2, 'nested.d': 2}
    """
    # A bare dict is processed as a one-element list and unwrapped on return.
    is_single = isinstance(ds, dict)
    records = [ds] if is_single else ds

    flattened = []
    for record in records:
        # Work on a deep copy so the caller's input is never mutated.
        flat = copy.deepcopy(record)
        for key, value in record.items():
            # Keys are coerced to str so they can carry the prefix.
            key_str = key if isinstance(key, str) else str(key)
            # Top-level keys keep their name; nested keys get the dotted path.
            new_key = key_str if level == 0 else f"{prefix}{sep}{key_str}"

            # Stop descending for non-dict values, or once max_level is hit.
            reached_limit = max_level is not None and level >= max_level
            if not isinstance(value, dict) or reached_limit:
                if level != 0:
                    # Rename in place; top level keeps keys as-is (common case).
                    flat[new_key] = flat.pop(key_str)
                continue

            # Replace the nested dict with its flattened children.
            flat.update(
                nested_to_record(flat.pop(key_str), new_key, sep, level + 1, max_level)
            )
        flattened.append(flat)

    return flattened[0] if is_single else flattened
125def _normalise_json(
126 data: Any,
127 key_string: str,
128 normalized_dict: dict[str, Any],
129 separator: str,
130) -> dict[str, Any]:
131 """
132 Main recursive function
133 Designed for the most basic use case of pd.json_normalize(data)
134 intended as a performance improvement, see #15621
136 Parameters
137 ----------
138 data : Any
139 Type dependent on types contained within nested Json
140 key_string : str
141 New key (with separator(s) in) for data
142 normalized_dict : dict
143 The new normalized/flattened Json dict
144 separator : str, default '.'
145 Nested records will generate names separated by sep,
146 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
147 """
148 if isinstance(data, dict):
149 for key, value in data.items():
150 new_key = f"{key_string}{separator}{key}"
151 _normalise_json(
152 data=value,
153 # to avoid adding the separator to the start of every key
154 # GH#43831 avoid adding key if key_string blank
155 key_string=new_key
156 if new_key[: len(separator)] != separator
157 else new_key[len(separator) :],
158 normalized_dict=normalized_dict,
159 separator=separator,
160 )
161 else:
162 normalized_dict[key_string] = data
163 return normalized_dict
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Partition the top level: scalar values keep their keys verbatim,
    # dict values are flattened recursively.
    scalars: dict[str, Any] = {}
    nested: dict[str, Any] = {}
    for key, value in data.items():
        if isinstance(value, dict):
            nested[key] = value
        else:
            scalars[key] = value

    flattened = _normalise_json(
        data=nested,
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    # Scalars first, then the flattened nested keys — same ordering the
    # two-comprehension formulation produced.
    return {**scalars, **flattened}
def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    A optimized basic json_normalize

    Converts a nested dict into a flat dict ("record"), unlike
    json_normalize and nested_to_record it doesn't do anything clever.
    But for the most basic use cases it enhances performance.
    E.g. pd.json_normalize(data)

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    frame : DataFrame
    d - dict or list of dicts, matching `normalised_json_object`

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    # Most JSON payloads are objects; lists are normalized element-wise.
    if isinstance(ds, dict):
        return _normalise_json_ordered(data=ds, separator=sep)
    if isinstance(ds, list):
        return [_simple_json_normalize(row, sep=sep) for row in ds]
    # Any other input mirrors the original's untouched default accumulator.
    return {}
def _json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

        .. versionadded:: 0.25.0

    Returns
    -------
    frame : DataFrame
    Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
      Prefix.0
    0        1
    1        2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """Internal function to pull field"""
        # Walks `spec` (a single key or a list of keys forming a path) into
        # `js`. Closes over `errors` from the enclosing _json_normalize call.
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    if result is None:
                        # A None along the path means the key chain is broken;
                        # surface it as a missing key.
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            if extract_record:
                # record_path lookups must exist in every element.
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            elif errors == "ignore":
                # Missing meta field: substitute NaN instead of raising.
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result

    # Normalize `data` itself into a list of dicts.
    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        # NOTE(review): `any(...)` iterates over *list* objects here, and a
        # non-empty list is always truthy — so this effectively tests "does
        # any record have at least one value", not "does any record contain a
        # dict". Flattening is idempotent for flat records, so the result is
        # still correct; confirm intent before changing.
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Each meta entry becomes a key path (list of keys).
    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    lengths = []

    # meta_vals collects, per meta key, one value per *object* visited;
    # `lengths` records how many records each object produced so the meta
    # values can be repeated to line up with `records` at the end.
    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        # Walk `path` one component at a time, carrying meta values seen at
        # shallower levels in `seen_meta` (keyed by joined meta path).
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        # This meta path bottoms out here; capture its value
                        # for all records found deeper down.
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                # Flatten each dict record; non-dict records pass through.
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        # Meta path was resolved at a shallower level.
                        meta_val = seen_meta[key]
                    else:
                        # Remaining path components are relative to `obj`.
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        # Prefix every record-derived column name.
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            # A meta column may not collide with a record column.
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # Repeat each object's meta value across that object's records.
        result[k] = np.array(v, dtype=object).repeat(lengths)
    return result
# Backwards-compatibility alias: pandas.io.json.json_normalize was deprecated
# in pandas 1.0.0 in favour of the top-level pandas.json_normalize; `deprecate`
# wraps _json_normalize so calls emit a FutureWarning pointing at the new name.
json_normalize = deprecate(
    "pandas.io.json.json_normalize", _json_normalize, "1.0.0", "pandas.json_normalize"
)