Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/from_dataframe.py: 8%

192 statements

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

from __future__ import annotations

import ctypes
import re
from typing import Any

import numpy as np

import pandas as pd
from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import (
    Buffer,
    Column,
    ColumnNullType,
    DataFrame as DataFrameXchg,
    DtypeKind,
)
from pandas.core.interchange.utils import (
    ArrowCTypes,
    Endianness,
)

_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
    DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
    DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
    DtypeKind.BOOL: {8: bool},
}


def from_dataframe(df, allow_copy=True) -> pd.DataFrame:
    """
    Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. it has a
        ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    if isinstance(df, pd.DataFrame):
        return df

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
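
# Usage sketch (illustrative, not part of the module): any object exposing a
# ``__dataframe__`` method can be converted. The pyarrow input below is an
# assumed example and requires a pyarrow version implementing the protocol.
#
#     import pyarrow as pa
#     table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
#     pdf = from_dataframe(table)    # -> pd.DataFrame with columns "a" and "b"
#     from_dataframe(pdf) is pdf     # True: pandas input is returned unchanged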



def _from_dataframe(df: DataFrameXchg, allow_copy=True):
    """
    Build a ``pd.DataFrame`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. it has a
        ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    pandas_dfs = []
    for chunk in df.get_chunks():
        pandas_df = protocol_df_chunk_to_pandas(chunk)
        pandas_dfs.append(pandas_df)

    if not allow_copy and len(pandas_dfs) > 1:
        raise RuntimeError(
            "To join chunks a copy is required which is forbidden by allow_copy=False"
        )
    if len(pandas_dfs) == 1:
        pandas_df = pandas_dfs[0]
    else:
        pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)

    index_obj = df.metadata.get("pandas.index", None)
    if index_obj is not None:
        pandas_df.index = index_obj

    return pandas_df



def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
    """
    Convert an interchange protocol chunk to ``pd.DataFrame``.

    Parameters
    ----------
    df : DataFrameXchg

    Returns
    -------
    pd.DataFrame
    """
    # We need a dict of columns here, with each column being a NumPy array (at
    # least for now, deal with non-NumPy dtypes later).
    columns: dict[str, Any] = {}
    buffers = []  # hold on to buffers, keeps memory alive
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")
        col = df.get_column_by_name(name)
        dtype = col.dtype[0]
        if dtype in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
        ):
            columns[name], buf = primitive_column_to_ndarray(col)
        elif dtype == DtypeKind.CATEGORICAL:
            columns[name], buf = categorical_column_to_series(col)
        elif dtype == DtypeKind.STRING:
            columns[name], buf = string_column_to_ndarray(col)
        elif dtype == DtypeKind.DATETIME:
            columns[name], buf = datetime_column_to_ndarray(col)
        else:
            raise NotImplementedError(f"Data type {dtype} not handled yet")

        buffers.append(buf)

    pandas_df = pd.DataFrame(columns)
    pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
    return pandas_df
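
# Note (an observation, not from the module): the buffers stashed in
# ``pandas_df.attrs`` are what keep the producer's memory alive, since the
# per-column converters return zero-copy views over it where possible.
# ``chunk`` below is a hypothetical interchange chunk:
#
#     pdf = protocol_df_chunk_to_pandas(chunk)
#     pdf.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"]   # one entry per column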



def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding one of the primitive dtypes to a NumPy array.

    A primitive type is one of: int, uint, float, bool.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    data_buff, data_dtype = buffers["data"]
    data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers
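
# For reference, paraphrasing the interchange protocol spec (not this module):
# ``col.get_buffers()`` returns a dict of up to three (Buffer, dtype) pairs.
# A nullable int32 column might look roughly like:
#
#     {
#         "data": (<Buffer, 4 * n bytes>, (DtypeKind.INT, 32, "i", "=")),
#         "validity": (<Buffer, bitmask>, (DtypeKind.BOOL, 1, "b", "=")),
#         "offsets": None,   # only present for variable-size types (strings)
#     }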



def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
    """
    Convert a column holding categorical data to a pandas Series.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of pd.Series holding the data and the memory owner object
        that keeps the memory alive.
    """
    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError("Non-dictionary categoricals not supported yet")

    cat_column = categorical["categories"]
    # for mypy/pyright
    assert isinstance(cat_column, PandasColumn), "categories must be a PandasColumn"
    categories = np.array(cat_column._col)
    buffers = col.get_buffers()

    codes_buff, codes_dtype = buffers["data"]
    codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size)

    # Doing a modulo here so that out-of-bounds sentinel values in `codes`
    # don't raise an ``IndexError``; those positions are overwritten by
    # ``set_nulls`` below.
    values = categories[codes % len(categories)]

    cat = pd.Categorical(
        values, categories=categories, ordered=categorical["is_ordered"]
    )
    data = pd.Series(cat)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers
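
# Worked example (hypothetical values): with categories ["a", "b", "c"] and a
# sentinel of -1 for nulls, codes [0, 2, -1] index as [0, 2, 2] after the
# modulo (-1 % 3 == 2), giving ["a", "c", "c"]; set_nulls then replaces the
# third position with a missing value.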



def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding string data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    null_kind, sentinel_val = col.describe_null

    if null_kind not in (
        ColumnNullType.NON_NULLABLE,
        ColumnNullType.USE_BITMASK,
        ColumnNullType.USE_BYTEMASK,
    ):
        raise NotImplementedError(
            f"{null_kind} null kind is not yet supported for string columns."
        )

    buffers = col.get_buffers()

    assert buffers["offsets"], "String buffers must contain offsets"
    # Retrieve the data buffer containing the UTF-8 code units
    data_buff, protocol_data_dtype = buffers["data"]
    # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
    assert protocol_data_dtype[1] == 8  # bitwidth == 8
    assert protocol_data_dtype[2] == ArrowCTypes.STRING  # format_str == utf-8
    # Convert the buffers to NumPy arrays. In order to go from STRING to
    # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
    data_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        Endianness.NATIVE,
    )
    # Specify zero offset as we don't want to chunk the string data
    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size)

    # Retrieve the offsets buffer containing the index offsets demarcating
    # the beginning and the end of each string
    offset_buff, offset_dtype = buffers["offsets"]
    # The offsets buffer contains start-stop positions of strings in the data
    # buffer, meaning it has one more element than the column has rows, so pass
    # `col.size + 1` as the proper offsets buffer size
    offsets = buffer_to_ndarray(
        offset_buff, offset_dtype, col.offset, length=col.size + 1
    )

    null_pos = None
    if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert buffers["validity"], "Validity buffers cannot be empty for masks"
        valid_buff, valid_dtype = buffers["validity"]
        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size)
        if sentinel_val == 0:
            null_pos = ~null_pos

    # Assemble the strings from the code units
    str_list: list[None | float | str] = [None] * col.size
    for i in range(col.size):
        # Check for missing values
        if null_pos is not None and null_pos[i]:
            str_list[i] = np.nan
            continue

        # Extract a range of code units
        units = data[offsets[i] : offsets[i + 1]]

        # Convert the list of code units to bytes
        str_bytes = bytes(units)

        # Create the string
        string = str_bytes.decode(encoding="utf-8")

        # Add to our list of strings
        str_list[i] = string

    # Convert the string list to a NumPy array
    return np.asarray(str_list, dtype="object"), buffers
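
# Worked example (hypothetical values): for the two strings "hi" and "there",
# the Arrow-style layout is
#
#     data    = b"hithere"    # UTF-8 code units, concatenated
#     offsets = [0, 2, 7]     # string i spans data[offsets[i]:offsets[i + 1]]
#
# so the loop above reconstructs ["hi", "there"].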



def parse_datetime_format_str(format_str, data):
    """Parse datetime `format_str` to interpret the `data`."""
    # timestamp 'ts{unit}:tz'
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta:
        unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
        if tz != "":
            raise NotImplementedError("Timezones are not supported yet")
        if unit != "s":
            # the format string describes only the first letter of the unit, so
            # add one extra letter to convert the unit to numpy-style:
            # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
            unit += "s"
        data = data.astype(f"datetime64[{unit}]")
        return data

    # date 'td{D|m}' (D = day, m = millisecond)
    date_meta = re.match(r"td([Dm])", format_str)
    if date_meta:
        unit = date_meta.group(1)
        if unit == "D":
            # NumPy doesn't support the DAY unit, so convert days to seconds
            # (going through uint64 to avoid overflow)
            data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
        elif unit == "m":
            data = data.astype("datetime64[ms]")
        else:
            raise NotImplementedError(f"Date unit is not supported: {unit}")
        return data

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
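
# Examples of the Arrow "C format" strings handled above (assumed inputs):
#
#     parse_datetime_format_str("tsu:", np.array([1_000_000], dtype="uint64"))
#     # -> 1970-01-01T00:00:01 as datetime64[us]
#     parse_datetime_format_str("tdD", np.array([1], dtype="uint32"))
#     # -> 1970-01-02 as datetime64[s]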



def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding DateTime data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    _, _, format_str, _ = col.dtype
    dbuf, dtype = buffers["data"]
    # Treat the dtype as `uint` to get the number of units elapsed since 1970-01-01
    data = buffer_to_ndarray(
        dbuf,
        (
            DtypeKind.UINT,
            dtype[1],
            getattr(ArrowCTypes, f"UINT{dtype[1]}"),
            Endianness.NATIVE,
        ),
        col.offset,
        col.size,
    )

    data = parse_datetime_format_str(format_str, data)
    data = set_nulls(data, col, buffers["validity"])
    return data, buffers



def buffer_to_ndarray(
    buffer: Buffer,
    dtype: tuple[DtypeKind, int, str, str],
    offset: int = 0,
    length: int | None = None,
) -> np.ndarray:
    """
    Build a NumPy array from the passed buffer.

    Parameters
    ----------
    buffer : Buffer
        Buffer to build a NumPy array from.
    dtype : tuple
        Data type of the buffer conforming to the protocol dtypes format.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    length : int, optional
        If the buffer is a bit-mask, specifies a number of bits to read
        from the buffer. Has no effect otherwise.

    Returns
    -------
    np.ndarray

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function is
    responsible for keeping the memory owner object alive as long as
    the returned NumPy array is being used.
    """
    kind, bit_width, _, _ = dtype

    column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
    if column_dtype is None:
        raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")

    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
    # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
    # it since https://github.com/numpy/numpy/pull/19083
    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
    data_pointer = ctypes.cast(
        buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
    )

    if bit_width == 1:
        assert length is not None, "`length` must be specified for a bit-mask buffer."
        arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
        return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8)
    else:
        return np.ctypeslib.as_array(
            data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
        )
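
# Standalone sketch of the same zero-copy trick (illustrative, not part of the
# module): reinterpret a raw pointer as a typed ndarray view.
#
#     import ctypes
#     import numpy as np
#
#     src = np.array([1.0, 2.0, 3.0], dtype=np.float64)
#     ptr = src.ctypes.data                        # raw address, like Buffer.ptr
#     p = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_double))
#     view = np.ctypeslib.as_array(p, shape=(3,))  # no copy; `src` must outlive it
#     view[0] = 42.0                               # also visible through `src`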



def bitmask_to_bool_ndarray(
    bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
) -> np.ndarray:
    """
    Convert a bit-mask to a boolean NumPy array.

    Parameters
    ----------
    bitmask : np.ndarray[uint8]
        NumPy array of uint8 dtype representing the bitmask.
    mask_length : int
        Number of elements in the mask to interpret.
    first_byte_offset : int, default: 0
        Number of bits to skip from the start of the first byte.

    Returns
    -------
    np.ndarray[bool]
    """
    bytes_to_skip = first_byte_offset // 8
    bitmask = bitmask[bytes_to_skip:]
    first_byte_offset %= 8

    bool_mask = np.zeros(mask_length, dtype=bool)

    # Process the first byte separately as it has its own offset
    val = bitmask[0]
    mask_idx = 0
    bits_in_first_byte = min(8 - first_byte_offset, mask_length)
    for j in range(bits_in_first_byte):
        if val & (1 << (j + first_byte_offset)):
            bool_mask[mask_idx] = True
        mask_idx += 1

    # `(mask_length - bits_in_first_byte) // 8` full bytes remain to process
    for i in range((mask_length - bits_in_first_byte) // 8):
        # doing `+ 1` as we already processed the first byte
        val = bitmask[i + 1]
        for j in range(8):
            if val & (1 << j):
                bool_mask[mask_idx] = True
            mask_idx += 1

    if len(bitmask) > 1:
        # Process the remainder of the last byte
        val = bitmask[-1]
        for j in range(len(bool_mask) - mask_idx):
            if val & (1 << j):
                bool_mask[mask_idx] = True
            mask_idx += 1

    return bool_mask
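
# Worked example (hypothetical values): the byte 0b00001010 read LSB-first
# with mask_length=4 yields [False, True, False, True]:
#
#     bitmask_to_bool_ndarray(np.array([0b00001010], dtype=np.uint8), 4)
#     # -> array([False,  True, False,  True])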



def set_nulls(
    data: np.ndarray | pd.Series,
    col: Column,
    validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
    allow_modify_inplace: bool = True,
):
    """
    Set null values for the data according to the column null kind.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Data to set nulls in.
    col : Column
        Column object that describes the `data`.
    validity : tuple(Buffer, dtype) or None
        The return value of ``col.buffers()``. We do not access ``col.buffers()``
        here to avoid taking ownership of the memory of buffer objects.
    allow_modify_inplace : bool, default: True
        Whether to modify the `data` inplace when zero-copy is possible (True) or always
        modify a copy of the `data` (False).

    Returns
    -------
    np.ndarray or pd.Series
        Data with the nulls set.
    """
    null_kind, sentinel_val = col.describe_null
    null_pos = None

    if null_kind == ColumnNullType.USE_SENTINEL:
        null_pos = pd.Series(data) == sentinel_val
    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert validity, "Expected to have a validity buffer for the mask"
        valid_buff, valid_dtype = validity
        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size)
        if sentinel_val == 0:
            null_pos = ~null_pos
    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
        pass
    else:
        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")

    if null_pos is not None and np.any(null_pos):
        if not allow_modify_inplace:
            data = data.copy()
        try:
            data[null_pos] = None
        except TypeError:
            # TypeError happens if the `data` dtype appears to be non-nullable
            # in numpy notation (bool, int, uint). If this happens,
            # cast the `data` to a nullable float dtype.
            data = data.astype(float)
            data[null_pos] = None

    return data
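
# Worked example (hypothetical values): for an int64 column that marks nulls
# with a sentinel, assigning None raises TypeError on the integer ndarray, so
# the data is cast to float and the null positions become NaN:
#
#     data = np.array([1, -1, 3])     # sentinel_val == -1
#     # set_nulls(...) then effectively does:
#     null_pos = pd.Series(data) == -1
#     data = data.astype(float)
#     data[null_pos] = None           # -> array([ 1., nan,  3.])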