Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/json/_normalize.py: 7%

138 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

# ---------------------------------------------------------------------
# JSON normalization routines
from __future__ import annotations

from collections import (
    abc,
    defaultdict,
)
import copy
from typing import (
    Any,
    DefaultDict,
    Iterable,
)

import numpy as np

from pandas._libs.writers import convert_json_to_lines
from pandas._typing import (
    IgnoreRaise,
    Scalar,
)
from pandas.util._decorators import deprecate

import pandas as pd
from pandas import DataFrame


def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.
    """
    # Determine we have a JSON list to turn to lines otherwise just return the
    # json object, only lists can
    if not s[0] == "[" and s[-1] == "]":
        return s
    s = s[1:-1]

    return convert_json_to_lines(s)


def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize

    Converts a nested dict into a flat dict ("record"), unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix: the prefix, optional, default: ""
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    level: int, optional, default: 0
        The number of levels in the json string.

    max_level: int, optional, default: None
        The max depth to normalize.

        .. versionadded:: 0.25.0

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True
    new_ds = []
    for d in ds:
        new_d = copy.deepcopy(d)
        for k, v in d.items():
            # each key gets renamed with prefix
            if not isinstance(k, str):
                k = str(k)
            if level == 0:
                newkey = k
            else:
                newkey = prefix + sep + k

            # flatten if type is dict and
            # current dict level < maximum level provided and
            # only dicts gets recurse-flattened
            # only at level>1 do we rename the rest of the keys
            if not isinstance(v, dict) or (
                max_level is not None and level >= max_level
            ):
                if level != 0:  # so we skip copying for top level, common case
                    v = new_d.pop(k)
                    new_d[newkey] = v
                continue
            else:
                v = new_d.pop(k)
                new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds
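
# --- Editor's illustrative example (not part of the original pandas source) --
# A minimal sketch of how max_level caps the flattening done by
# nested_to_record: with max_level=1 only one level of nesting is flattened
# and any deeper dicts are left in place as values; with the default
# max_level=None everything is flattened.  Outputs were traced by hand
# against the function above.
#
# >>> nested_to_record({"a": {"b": {"c": 1}}}, max_level=1)
# {'a.b': {'c': 1}}
# >>> nested_to_record({"a": {"b": {"c": 1}}})
# {'a.b.c': 1}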



def _normalise_json(
    data: Any,
    key_string: str,
    normalized_dict: dict[str, Any],
    separator: str,
) -> dict[str, Any]:
    """
    Main recursive function
    Designed for the most basic use case of pd.json_normalize(data)
    intended as a performance improvement, see #15621

    Parameters
    ----------
    data : Any
        Type dependent on types contained within nested Json
    key_string : str
        New key (with separator(s) in) for data
    normalized_dict : dict
        The new normalized/flattened Json dict
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    """
    if isinstance(data, dict):
        for key, value in data.items():
            new_key = f"{key_string}{separator}{key}"
            _normalise_json(
                data=value,
                # to avoid adding the separator to the start of every key
                # GH#43831 avoid adding key if key_string blank
                key_string=new_key
                if new_key[: len(separator)] != separator
                else new_key[len(separator) :],
                normalized_dict=normalized_dict,
                separator=separator,
            )
    else:
        normalized_dict[key_string] = data
    return normalized_dict


def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
    nested_dict_ = _normalise_json(
        data={k: v for k, v in data.items() if isinstance(v, dict)},
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    return {**top_dict_, **nested_dict_}
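
# --- Editor's illustrative example (not part of the original pandas source) --
# A small hand-traced sketch of this fast path: _normalise_json_ordered keeps
# non-dict values under their original keys and routes only the dict-valued
# keys through the recursive _normalise_json above.
#
# >>> _normalise_json_ordered({"flat": 0, "nest": {"a": 1}}, separator=".")
# {'flat': 0, 'nest.a': 1}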



def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    An optimized basic json_normalize

    Converts a nested dict into a flat dict ("record"), unlike
    json_normalize and nested_to_record it doesn't do anything clever.
    But for the most basic use cases it enhances performance.
    E.g. pd.json_normalize(data)

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    frame : DataFrame
    d - dict or list of dicts, matching `normalised_json_object`

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    normalised_json_object = {}
    # expect a dictionary, as most jsons are. However, lists are perfectly valid
    if isinstance(ds, dict):
        normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
    elif isinstance(ds, list):
        normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
        return normalised_json_list
    return normalised_json_object
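
# --- Editor's illustrative example (not part of the original pandas source) --
# For a list input, _simple_json_normalize flattens each row in turn, so the
# fast path of pd.json_normalize(data) on a list of records reduces to one
# flat dict per record (hand-traced against the functions above).
#
# >>> _simple_json_normalize([{"a": {"b": 1}}, {"a": {"b": 2}}])
# [{'a.b': 1}, {'a.b': 2}]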



def _json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

        .. versionadded:: 0.25.0

    Returns
    -------
    frame : DataFrame
    Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """Internal function to pull field"""
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    if result is None:
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            if extract_record:
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            elif errors == "ignore":
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result
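
    # --- Editor's note (not part of the original pandas source) --------------
    # _pull_field walks a path such as ["info", "governor"] into a single
    # record; a missing meta key either raises KeyError or, with
    # errors="ignore", comes back as np.nan.  _pull_records wraps it for the
    # record_path lookup and additionally requires a list result (null is
    # treated as an empty list, anything else raises TypeError).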


    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    lengths = []

    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        result[k] = np.array(v, dtype=object).repeat(lengths)
    return result


json_normalize = deprecate(
    "pandas.io.json.json_normalize", _json_normalize, "1.0.0", "pandas.json_normalize"
)
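
# --- Editor's illustrative example (not part of the original pandas source) --
# A hand-traced sketch of the errors= handling documented in _json_normalize:
# with errors="ignore", a meta path that is missing from some records becomes
# NaN instead of raising KeyError.
#
# >>> data = [
# ...     {"info": {"name": "a"}, "items": [{"x": 1}]},
# ...     {"items": [{"x": 2}]},
# ... ]
# >>> pd.json_normalize(
# ...     data, record_path="items", meta=[["info", "name"]], errors="ignore"
# ... )
#    x info.name
# 0  1         a
# 1  2       NaN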