Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/json/_table_schema.py: 9%

132 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Table Schema builders 

3 

4https://specs.frictionlessdata.io/json-table-schema/ 

5""" 

6from __future__ import annotations 

7 

8from typing import ( 

9 TYPE_CHECKING, 

10 Any, 

11 cast, 

12) 

13import warnings 

14 

15import pandas._libs.json as json 

16from pandas._typing import ( 

17 DtypeObj, 

18 JSONSerializable, 

19) 

20from pandas.util._exceptions import find_stack_level 

21 

22from pandas.core.dtypes.base import _registry as registry 

23from pandas.core.dtypes.common import ( 

24 is_bool_dtype, 

25 is_categorical_dtype, 

26 is_datetime64_dtype, 

27 is_datetime64tz_dtype, 

28 is_extension_array_dtype, 

29 is_integer_dtype, 

30 is_numeric_dtype, 

31 is_period_dtype, 

32 is_string_dtype, 

33 is_timedelta64_dtype, 

34) 

35from pandas.core.dtypes.dtypes import CategoricalDtype 

36 

37from pandas import DataFrame 

38import pandas.core.common as com 

39 

40if TYPE_CHECKING: 40 ↛ 41line 40 didn't jump to line 41, because the condition on line 40 was never true

41 from pandas import Series 

42 from pandas.core.indexes.multi import MultiIndex 

43 

44loads = json.loads 

45 

46TABLE_SCHEMA_VERSION = "1.4.0" 

47 

48 

def as_json_table_type(x: DtypeObj) -> str:
    """
    Convert a NumPy / pandas dtype to its corresponding Table Schema type.

    Parameters
    ----------
    x : np.dtype or ExtensionDtype

    Returns
    -------
    str
        The name of the Table Schema data type.

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes
    and Table Schema types.

    ===============  =================
    pandas dtype     Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================

    Timezone-aware datetime and period dtypes are also reported as
    ``datetime``. The checks below run in order and the first match wins;
    anything that matches none of them is reported as ``any``.
    """
    if is_integer_dtype(x):
        return "integer"
    if is_bool_dtype(x):
        return "boolean"
    # Integers and booleans were handled above, so what is still numeric
    # here is reported as the generic "number" type.
    if is_numeric_dtype(x):
        return "number"
    if is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
        return "datetime"
    if is_timedelta64_dtype(x):
        return "duration"
    # Must precede the string check so that extension dtypes are never
    # reported as plain strings.
    if is_categorical_dtype(x) or is_extension_array_dtype(x):
        return "any"
    if is_string_dtype(x):
        return "string"
    return "any"

97 

98 

def set_default_names(data):
    """
    Ensure every index level of ``data`` has a name.

    An unnamed single-level index is named ``'index'``; missing names of a
    multi-level index are filled in by ``com.fill_missing_names``.

    Parameters
    ----------
    data : Series or DataFrame

    Returns
    -------
    Series or DataFrame
        ``data`` itself when all index names are already set, otherwise a
        copy of ``data`` whose index carries the default names.

    Warns
    -----
    UserWarning
        If the existing names collide with the defaults (``'index'`` for a
        single level, a string starting with ``'level_'`` for multiple
        levels), because such names cannot be told apart from generated
        ones when the data is read back.
    """
    names = data.index.names
    if com.all_not_none(*names):
        if len(names) == 1 and data.index.name == "index":
            warnings.warn(
                "Index name of 'index' is not round-trippable.",
                stacklevel=find_stack_level(),
            )
        elif len(names) > 1 and any(
            # Index names may be any hashable (e.g. integers); only strings
            # can collide with the generated 'level_<n>' names.
            isinstance(name, str) and name.startswith("level_")
            for name in names
        ):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable.",
                stacklevel=find_stack_level(),
            )
        return data

    # Work on a copy so the caller's index names are left untouched.
    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        data.index.name = data.index.name or "index"
    return data

121 

122 

def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
    """
    Build the Table Schema field descriptor for a Series or Index.

    Parameters
    ----------
    arr : Series or Index
        Object exposing ``dtype`` and ``name``.

    Returns
    -------
    dict
        Always contains ``"name"`` (``"values"`` when ``arr.name`` is None)
        and ``"type"``. Depending on the dtype it additionally contains
        ``"constraints"`` and ``"ordered"`` (categorical), ``"freq"``
        (period), ``"tz"`` (timezone-aware datetime) or ``"extDtype"``
        (any other extension dtype).
    """
    dtype = arr.dtype
    name: JSONSerializable = "values" if arr.name is None else arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if is_categorical_dtype(dtype):
        field["constraints"] = {"enum": list(dtype.categories)}
        field["ordered"] = dtype.ordered
    elif is_period_dtype(dtype):
        field["freq"] = dtype.freq.freqstr
    elif is_datetime64tz_dtype(dtype):
        tz = dtype.tz
        if hasattr(tz, "zone"):
            field["tz"] = tz.zone
        else:
            # Not every tzinfo implementation exposes ``zone`` (for example
            # ``datetime.timezone`` and ``zoneinfo.ZoneInfo`` do not); fall
            # back to ``key`` when present, else the string form.
            field["tz"] = getattr(tz, "key", None) or str(tz)
    elif is_extension_array_dtype(dtype):
        field["extDtype"] = dtype.name
    return field

148 

149 

def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
    """
    Map a Table Schema field descriptor to a NumPy / pandas dtype.

    Parameters
    ----------
    field : dict
        A JSON field descriptor; its ``"type"`` entry selects the dtype.

    Returns
    -------
    dtype
        A dtype string, a ``CategoricalDtype``, or whatever the extension
        dtype registry returns for the field's ``"extDtype"``.

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
    'int64'

    >>> convert_json_field_to_pandas_type(
    ...     {
    ...         "name": "a_categorical",
    ...         "type": "any",
    ...         "constraints": {"enum": ["a", "b", "c"]},
    ...         "ordered": True,
    ...     }
    ... )
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

    >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
    'datetime64[ns]'

    >>> convert_json_field_to_pandas_type(
    ...     {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
    ... )
    'datetime64[ns, US/Central]'
    """
    kind = field["type"]

    # Types whose dtype does not depend on any other key of the descriptor.
    scalar_types = (
        ("string", "object"),
        ("integer", "int64"),
        ("number", "float64"),
        ("boolean", "bool"),
        ("duration", "timedelta64"),
    )
    for schema_type, pandas_dtype in scalar_types:
        if kind == schema_type:
            return pandas_dtype

    if kind == "datetime":
        tz = field.get("tz")
        if tz:
            return f"datetime64[ns, {tz}]"
        freq = field.get("freq")
        if freq:
            # GH#47747 periods are written with the "datetime" type; the
            # extra "freq" key is what identifies them on the way back.
            return f"period[{freq}]"
        return "datetime64[ns]"

    if kind == "any":
        if "constraints" in field and "ordered" in field:
            enum_values = field["constraints"]["enum"]
            return CategoricalDtype(categories=enum_values, ordered=field["ordered"])
        if "extDtype" in field:
            return registry.find(field["extDtype"])
        return "object"

    raise ValueError(f"Unsupported or invalid field type: {kind}")

221 

222 

def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : column names or None, default None
        Column names to designate as the primary key. Any value other than
        `None` is stored under `'primaryKey'` as given.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is included and unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    schema : dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    # Use plain truthiness here, matching every later ``index`` check, so the
    # index fields always carry the default names when they are emitted.
    if index:
        data = set_default_names(data)

    schema: dict[str, Any] = {}
    fields = []

    if index:
        if data.index.nlevels > 1:
            # Narrow the type locally instead of assigning back to
            # ``data.index``, which may still be the caller's object.
            multi = cast("MultiIndex", data.index)
            for level, name in zip(multi.levels, multi.names):
                new_field = convert_pandas_type_to_json_field(level)
                # Report the name as listed on the index itself.
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        fields.extend(
            convert_pandas_type_to_json_field(column) for _, column in data.items()
        )
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema

312 

313 

def parse_table_schema(json, precise_float):
    """
    Build a DataFrame from a JSON document written with ``orient="table"``.

    Parameters
    ----------
    json :
        A JSON string holding a ``"schema"`` entry (the table schema) and a
        ``"data"`` entry (the records).
    precise_float : bool
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains timedelta data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the index name of the
    returned :class:`DataFrame` to ``None`` when said string is encountered
    with a normal :class:`Index`. For a :class:`MultiIndex`, the same
    limitation applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    schema = table["schema"]
    schema_fields = schema["fields"]

    # Passing ``columns`` fixes the column order to that of the schema.
    col_order = [field["name"] for field in schema_fields]
    df = DataFrame(table["data"], columns=col_order)

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in schema_fields
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'orient="table" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in schema:
        df = df.set_index(schema["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            # Only string names can be generated 'level_<n>' placeholders;
            # leave any other kind of name untouched.
            df.index.names = [
                None if isinstance(x, str) and x.startswith("level_") else x
                for x in df.index.names
            ]

    return df

376 

377 return df