Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/from_dataframe.py: 8%

192 statements

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

from __future__ import annotations

import ctypes
import re
from typing import Any

import numpy as np

import pandas as pd
from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import (
    Buffer,
    Column,
    ColumnNullType,
    DataFrame as DataFrameXchg,
    DtypeKind,
)
from pandas.core.interchange.utils import (
    ArrowCTypes,
    Endianness,
)

_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
    DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
    DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
    DtypeKind.BOOL: {8: bool},
}


def from_dataframe(df, allow_copy=True) -> pd.DataFrame:
    """
    Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. it has a
        ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    if isinstance(df, pd.DataFrame):
        return df

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
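
# Usage sketch (illustrative, not part of the module): any object exposing a
# ``__dataframe__`` method can be converted. The pyarrow input below is an
# assumed example and requires a pyarrow version implementing the protocol.
#
#     import pyarrow as pa
#     table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
#     pdf = from_dataframe(table)    # -> pd.DataFrame with columns "a" and "b"
#     from_dataframe(pdf) is pdf     # True: pandas input is returned unchanged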



def _from_dataframe(df: DataFrameXchg, allow_copy=True):
    """
    Build a ``pd.DataFrame`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. it has a
        ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    pandas_dfs = []
    for chunk in df.get_chunks():
        pandas_df = protocol_df_chunk_to_pandas(chunk)
        pandas_dfs.append(pandas_df)

    if not allow_copy and len(pandas_dfs) > 1:
        raise RuntimeError(
            "To join chunks a copy is required which is forbidden by allow_copy=False"
        )
    if len(pandas_dfs) == 1:
        pandas_df = pandas_dfs[0]
    else:
        pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)

    index_obj = df.metadata.get("pandas.index", None)
    if index_obj is not None:
        pandas_df.index = index_obj

    return pandas_df



def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
    """
    Convert an interchange protocol chunk to ``pd.DataFrame``.

    Parameters
    ----------
    df : DataFrameXchg

    Returns
    -------
    pd.DataFrame
    """
    # We need a dict of columns here, with each column being a NumPy array (at
    # least for now, deal with non-NumPy dtypes later).
    columns: dict[str, Any] = {}
    buffers = []  # hold on to buffers, keeps memory alive
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")
        col = df.get_column_by_name(name)
        dtype = col.dtype[0]
        if dtype in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
        ):
            columns[name], buf = primitive_column_to_ndarray(col)
        elif dtype == DtypeKind.CATEGORICAL:
            columns[name], buf = categorical_column_to_series(col)
        elif dtype == DtypeKind.STRING:
            columns[name], buf = string_column_to_ndarray(col)
        elif dtype == DtypeKind.DATETIME:
            columns[name], buf = datetime_column_to_ndarray(col)
        else:
            raise NotImplementedError(f"Data type {dtype} not handled yet")

        buffers.append(buf)

    pandas_df = pd.DataFrame(columns)
    pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
    return pandas_df
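
# Note (an observation, not from the module): the buffers stashed in
# ``pandas_df.attrs`` are what keep the producer's memory alive, since the
# per-column converters return zero-copy views over it where possible.
# ``chunk`` below is a hypothetical interchange chunk:
#
#     pdf = protocol_df_chunk_to_pandas(chunk)
#     pdf.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"]   # one entry per column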



def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding one of the primitive dtypes to a NumPy array.

    A primitive type is one of: int, uint, float, bool.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    data_buff, data_dtype = buffers["data"]
    data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers
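
# For reference, paraphrasing the interchange protocol spec (not this module):
# ``col.get_buffers()`` returns a dict of up to three (Buffer, dtype) pairs.
# A nullable int32 column might look roughly like:
#
#     {
#         "data": (<Buffer, 4 * n bytes>, (DtypeKind.INT, 32, "i", "=")),
#         "validity": (<Buffer, bitmask>, (DtypeKind.BOOL, 1, "b", "=")),
#         "offsets": None,   # only present for variable-size types (strings)
#     }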



def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
    """
    Convert a column holding categorical data to a pandas Series.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of pd.Series holding the data and the memory owner object
        that keeps the memory alive.
    """
    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError("Non-dictionary categoricals not supported yet")

    cat_column = categorical["categories"]
    # for mypy/pyright
    assert isinstance(cat_column, PandasColumn), "categories must be a PandasColumn"
    categories = np.array(cat_column._col)
    buffers = col.get_buffers()

    codes_buff, codes_dtype = buffers["data"]
    codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size)

    # Doing a modulo here so that out-of-bounds sentinel values in `codes`
    # don't raise an ``IndexError``; those positions are overwritten by
    # ``set_nulls`` below.
    values = categories[codes % len(categories)]

    cat = pd.Categorical(
        values, categories=categories, ordered=categorical["is_ordered"]
    )
    data = pd.Series(cat)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers
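
# Worked example (hypothetical values): with categories ["a", "b", "c"] and a
# sentinel of -1 for nulls, codes [0, 2, -1] index as [0, 2, 2] after the
# modulo (-1 % 3 == 2), giving ["a", "c", "c"]; set_nulls then replaces the
# third position with a missing value.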



def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding string data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    null_kind, sentinel_val = col.describe_null

    if null_kind not in (
        ColumnNullType.NON_NULLABLE,
        ColumnNullType.USE_BITMASK,
        ColumnNullType.USE_BYTEMASK,
    ):
        raise NotImplementedError(
            f"{null_kind} null kind is not yet supported for string columns."
        )

    buffers = col.get_buffers()

    assert buffers["offsets"], "String buffers must contain offsets"
    # Retrieve the data buffer containing the UTF-8 code units
    data_buff, protocol_data_dtype = buffers["data"]
    # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
    assert protocol_data_dtype[1] == 8  # bitwidth == 8
    assert protocol_data_dtype[2] == ArrowCTypes.STRING  # format_str == utf-8
    # Convert the buffers to NumPy arrays. In order to go from STRING to
    # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
    data_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        Endianness.NATIVE,
    )
    # Specify zero offset as we don't want to chunk the string data
    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size)

    # Retrieve the offsets buffer containing the index offsets demarcating
    # the beginning and the end of each string
    offset_buff, offset_dtype = buffers["offsets"]
    # The offsets buffer contains start-stop positions of strings in the data
    # buffer, meaning it has one more element than the column has rows, so pass
    # `col.size + 1` as the proper offsets buffer size
    offsets = buffer_to_ndarray(
        offset_buff, offset_dtype, col.offset, length=col.size + 1
    )

    null_pos = None
    if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert buffers["validity"], "Validity buffers cannot be empty for masks"
        valid_buff, valid_dtype = buffers["validity"]
        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size)
        if sentinel_val == 0:
            null_pos = ~null_pos

    # Assemble the strings from the code units
    str_list: list[None | float | str] = [None] * col.size
    for i in range(col.size):
        # Check for missing values
        if null_pos is not None and null_pos[i]:
            str_list[i] = np.nan
            continue

        # Extract a range of code units
        units = data[offsets[i] : offsets[i + 1]]

        # Convert the list of code units to bytes
        str_bytes = bytes(units)

        # Create the string
        string = str_bytes.decode(encoding="utf-8")

        # Add to our list of strings
        str_list[i] = string

    # Convert the string list to a NumPy array
    return np.asarray(str_list, dtype="object"), buffers
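
# Worked example (hypothetical values): for the two strings "hi" and "there",
# the Arrow-style layout is
#
#     data    = b"hithere"    # UTF-8 code units, concatenated
#     offsets = [0, 2, 7]     # string i spans data[offsets[i]:offsets[i + 1]]
#
# so the loop above reconstructs ["hi", "there"].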



def parse_datetime_format_str(format_str, data):
    """Parse datetime `format_str` to interpret the `data`."""
    # timestamp 'ts{unit}:tz'
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta:
        unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
        if tz != "":
            raise NotImplementedError("Timezones are not supported yet")
        if unit != "s":
            # the format string describes only the first letter of the unit, so
            # add one extra letter to convert the unit to numpy-style:
            # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
            unit += "s"
        data = data.astype(f"datetime64[{unit}]")
        return data

    # date 'td{D|m}' (D = day, m = millisecond)
    date_meta = re.match(r"td([Dm])", format_str)
    if date_meta:
        unit = date_meta.group(1)
        if unit == "D":
            # NumPy doesn't support the DAY unit, so convert days to seconds
            # (going through uint64 to avoid overflow)
            data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
        elif unit == "m":
            data = data.astype("datetime64[ms]")
        else:
            raise NotImplementedError(f"Date unit is not supported: {unit}")
        return data

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
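
# Examples of the Arrow "C format" strings handled above (assumed inputs):
#
#     parse_datetime_format_str("tsu:", np.array([1_000_000], dtype="uint64"))
#     # -> 1970-01-01T00:00:01 as datetime64[us]
#     parse_datetime_format_str("tdD", np.array([1], dtype="uint32"))
#     # -> 1970-01-02 as datetime64[s]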



def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding DateTime data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    _, _, format_str, _ = col.dtype
    dbuf, dtype = buffers["data"]
    # Treat the dtype as `uint` to get the number of units elapsed since 1970-01-01
    data = buffer_to_ndarray(
        dbuf,
        (
            DtypeKind.UINT,
            dtype[1],
            getattr(ArrowCTypes, f"UINT{dtype[1]}"),
            Endianness.NATIVE,
        ),
        col.offset,
        col.size,
    )

    data = parse_datetime_format_str(format_str, data)
    data = set_nulls(data, col, buffers["validity"])
    return data, buffers



def buffer_to_ndarray(
    buffer: Buffer,
    dtype: tuple[DtypeKind, int, str, str],
    offset: int = 0,
    length: int | None = None,
) -> np.ndarray:
    """
    Build a NumPy array from the passed buffer.

    Parameters
    ----------
    buffer : Buffer
        Buffer to build a NumPy array from.
    dtype : tuple
        Data type of the buffer conforming to the protocol dtypes format.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    length : int, optional
        If the buffer is a bit-mask, specifies a number of bits to read
        from the buffer. Has no effect otherwise.

    Returns
    -------
    np.ndarray

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function is
    responsible for keeping the memory owner object alive as long as
    the returned NumPy array is being used.
    """
    kind, bit_width, _, _ = dtype

    column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
    if column_dtype is None:
        raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")

    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
    # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
    # it since https://github.com/numpy/numpy/pull/19083
    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
    data_pointer = ctypes.cast(
        buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
    )

    if bit_width == 1:
        assert length is not None, "`length` must be specified for a bit-mask buffer."
        arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
        return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8)
    else:
        return np.ctypeslib.as_array(
            data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
        )
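
# Standalone sketch of the same zero-copy trick (illustrative, not part of the
# module): reinterpret a raw pointer as a typed ndarray view.
#
#     import ctypes
#     import numpy as np
#
#     src = np.array([1.0, 2.0, 3.0], dtype=np.float64)
#     ptr = src.ctypes.data                        # raw address, like Buffer.ptr
#     p = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_double))
#     view = np.ctypeslib.as_array(p, shape=(3,))  # no copy; `src` must outlive it
#     view[0] = 42.0                               # also visible through `src`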



def bitmask_to_bool_ndarray(
    bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
) -> np.ndarray:
    """
    Convert a bit-mask to a boolean NumPy array.

    Parameters
    ----------
    bitmask : np.ndarray[uint8]
        NumPy array of uint8 dtype representing the bitmask.
    mask_length : int
        Number of elements in the mask to interpret.
    first_byte_offset : int, default: 0
        Number of bits to skip from the start of the first byte.

    Returns
    -------
    np.ndarray[bool]
    """
    bytes_to_skip = first_byte_offset // 8
    bitmask = bitmask[bytes_to_skip:]
    first_byte_offset %= 8

    bool_mask = np.zeros(mask_length, dtype=bool)

    # Process the first byte separately as it has its own offset
    val = bitmask[0]
    mask_idx = 0
    bits_in_first_byte = min(8 - first_byte_offset, mask_length)
    for j in range(bits_in_first_byte):
        if val & (1 << (j + first_byte_offset)):
            bool_mask[mask_idx] = True
        mask_idx += 1

    # `(mask_length - bits_in_first_byte) // 8` full bytes remain to process
    for i in range((mask_length - bits_in_first_byte) // 8):
        # doing `+ 1` as we already processed the first byte
        val = bitmask[i + 1]
        for j in range(8):
            if val & (1 << j):
                bool_mask[mask_idx] = True
            mask_idx += 1

    if len(bitmask) > 1:
        # Process the remainder of the last byte
        val = bitmask[-1]
        for j in range(len(bool_mask) - mask_idx):
            if val & (1 << j):
                bool_mask[mask_idx] = True
            mask_idx += 1

    return bool_mask
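
# Worked example (hypothetical values): the byte 0b00001010 read LSB-first
# with mask_length=4 yields [False, True, False, True]:
#
#     bitmask_to_bool_ndarray(np.array([0b00001010], dtype=np.uint8), 4)
#     # -> array([False,  True, False,  True])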



def set_nulls(
    data: np.ndarray | pd.Series,
    col: Column,
    validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
    allow_modify_inplace: bool = True,
):
    """
    Set null values for the data according to the column null kind.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Data to set nulls in.
    col : Column
        Column object that describes the `data`.
    validity : tuple(Buffer, dtype) or None
        The return value of ``col.buffers()``. We do not access ``col.buffers()``
        here to avoid taking ownership of the memory of buffer objects.
    allow_modify_inplace : bool, default: True
        Whether to modify the `data` inplace when zero-copy is possible (True) or always
        modify a copy of the `data` (False).

    Returns
    -------
    np.ndarray or pd.Series
        Data with the nulls set.
    """
    null_kind, sentinel_val = col.describe_null
    null_pos = None

    if null_kind == ColumnNullType.USE_SENTINEL:
        null_pos = pd.Series(data) == sentinel_val
    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert validity, "Expected to have a validity buffer for the mask"
        valid_buff, valid_dtype = validity
        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size)
        if sentinel_val == 0:
            null_pos = ~null_pos
    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
        pass
    else:
        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")

    if null_pos is not None and np.any(null_pos):
        if not allow_modify_inplace:
            data = data.copy()
        try:
            data[null_pos] = None
        except TypeError:
            # TypeError happens if the `data` dtype appears to be non-nullable
            # in numpy notation (bool, int, uint). If this happens,
            # cast the `data` to a nullable float dtype.
            data = data.astype(float)
            data[null_pos] = None

    return data
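
# Worked example (hypothetical values): for an int64 column that marks nulls
# with a sentinel, assigning None raises TypeError on the integer ndarray, so
# the data is cast to float and the null positions become NaN:
#
#     data = np.array([1, -1, 3])     # sentinel_val == -1
#     # set_nulls(...) then effectively does:
#     null_pos = pd.Series(data) == -1
#     data = data.astype(float)
#     data[null_pos] = None           # -> array([ 1., nan,  3.])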