Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/from_dataframe.py: 8%
192 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3import ctypes
4import re
5from typing import Any
7import numpy as np
9import pandas as pd
10from pandas.core.interchange.column import PandasColumn
11from pandas.core.interchange.dataframe_protocol import (
12 Buffer,
13 Column,
14 ColumnNullType,
15 DataFrame as DataFrameXchg,
16 DtypeKind,
17)
18from pandas.core.interchange.utils import (
19 ArrowCTypes,
20 Endianness,
21)
# Mapping from (DtypeKind, bit width) to the NumPy dtype used to reinterpret
# a raw data buffer of that protocol dtype. Kinds/widths absent here are not
# directly convertible (see ``buffer_to_ndarray``).
_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
    DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
    DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
    DtypeKind.BOOL: {8: bool},
}
def from_dataframe(df, allow_copy=True) -> pd.DataFrame:
    """
    Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. `__dataframe__` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If `df` does not implement ``__dataframe__``.
    """
    # Fast path: an actual pandas DataFrame needs no conversion.
    if isinstance(df, pd.DataFrame):
        return df

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    # BUG FIX: `allow_copy` was not forwarded to `_from_dataframe`, so the
    # "joining chunks requires a copy" check always ran with the default
    # (True) and `allow_copy=False` was silently ignored for chunked input.
    return _from_dataframe(
        df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
    )
def _from_dataframe(df: DataFrameXchg, allow_copy=True):
    """
    Build a ``pd.DataFrame`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. `__dataframe__` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    # Convert every protocol chunk to a pandas DataFrame.
    chunks = [protocol_df_chunk_to_pandas(chunk) for chunk in df.get_chunks()]

    # Concatenating multiple chunks necessarily copies memory, which the
    # caller may have forbidden.
    if not allow_copy and len(chunks) > 1:
        raise RuntimeError(
            "To join chunks a copy is required which is forbidden by allow_copy=False"
        )
    if len(chunks) == 1:
        result = chunks[0]
    else:
        result = pd.concat(chunks, axis=0, ignore_index=True, copy=False)

    # Restore a preserved pandas index, if the producer stashed one.
    index_obj = df.metadata.get("pandas.index", None)
    if index_obj is not None:
        result.index = index_obj

    return result
def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
    """
    Convert interchange protocol chunk to ``pd.DataFrame``.

    Parameters
    ----------
    df : DataFrameXchg

    Returns
    -------
    pd.DataFrame
    """
    # We need a dict of columns here, with each column being a NumPy array (at
    # least for now, deal with non-NumPy dtypes later).
    columns: dict[str, Any] = {}
    buffers = []  # hold on to buffers, keeps memory alive
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")

        col = df.get_column_by_name(name)
        kind = col.dtype[0]
        # Pick the converter matching the column's dtype kind.
        if kind in {DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT, DtypeKind.BOOL}:
            converter = primitive_column_to_ndarray
        elif kind == DtypeKind.CATEGORICAL:
            converter = categorical_column_to_series
        elif kind == DtypeKind.STRING:
            converter = string_column_to_ndarray
        elif kind == DtypeKind.DATETIME:
            converter = datetime_column_to_ndarray
        else:
            raise NotImplementedError(f"Data type {kind} not handled yet")

        columns[name], buf = converter(col)
        buffers.append(buf)

    pandas_df = pd.DataFrame(columns)
    # Keep the buffer owners alive for as long as the DataFrame exists,
    # since the converted arrays may be zero-copy views into them.
    pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
    return pandas_df
def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding one of the primitive dtypes to a NumPy array.

    A primitive type is one of: int, uint, float, bool.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    # Reinterpret the raw data buffer, then apply the column's null encoding.
    data_buffer, data_dtype = buffers["data"]
    values = buffer_to_ndarray(data_buffer, data_dtype, col.offset, col.size)
    values = set_nulls(values, col, buffers["validity"])
    return values, buffers
def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
    """
    Convert a column holding categorical data to a pandas Series.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of pd.Series holding the data and the memory owner object
        that keeps the memory alive.

    Raises
    ------
    NotImplementedError
        If the categorical is not dictionary-encoded.
    """
    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError("Non-dictionary categoricals not supported yet")

    cat_column = categorical["categories"]
    # for mypy/pyright
    assert isinstance(cat_column, PandasColumn), "categories must be a PandasColumn"
    categories = np.array(cat_column._col)
    buffers = col.get_buffers()

    codes_buff, codes_dtype = buffers["data"]
    codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size)

    # BUG FIX: guard against an empty categories array. ``codes % 0`` on a
    # NumPy integer array produces zeros (with a RuntimeWarning), and indexing
    # an empty ``categories`` with them raises IndexError.
    if len(categories) > 0:
        # Doing module in order to not get ``IndexError`` for
        # out-of-bounds sentinel values in `codes`
        values = categories[codes % len(categories)]
    else:
        values = codes

    cat = pd.Categorical(
        values, categories=categories, ordered=categorical["is_ordered"]
    )
    data = pd.Series(cat)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers
def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding string data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    null_kind, sentinel_val = col.describe_null

    # Only a limited set of null encodings is supported for strings.
    supported_null_kinds = (
        ColumnNullType.NON_NULLABLE,
        ColumnNullType.USE_BITMASK,
        ColumnNullType.USE_BYTEMASK,
    )
    if null_kind not in supported_null_kinds:
        raise NotImplementedError(
            f"{null_kind} null kind is not yet supported for string columns."
        )

    buffers = col.get_buffers()

    assert buffers["offsets"], "String buffers must contain offsets"
    # The data buffer holds the concatenated UTF-8 code units of all strings.
    data_buff, protocol_data_dtype = buffers["data"]
    # We reinterpret the buffer as uint8 below; make sure that is safe first.
    assert protocol_data_dtype[1] == 8  # bitwidth == 8
    assert protocol_data_dtype[2] == ArrowCTypes.STRING  # format_str == utf-8
    # Claim the buffer is a plain byte array so it maps onto an ndarray.
    uint8_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        Endianness.NATIVE,
    )
    # Zero offset: we never chunk the string data buffer itself.
    data = buffer_to_ndarray(data_buff, uint8_dtype, offset=0, length=col.size)

    # The offsets buffer holds start/stop positions of each string inside the
    # data buffer; it therefore has one more element than the column, hence
    # the ``col.size + 1`` length.
    offset_buff, offset_dtype = buffers["offsets"]
    offsets = buffer_to_ndarray(
        offset_buff, offset_dtype, col.offset, length=col.size + 1
    )

    null_pos = None
    if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert buffers["validity"], "Validity buffers cannot be empty for masks"
        valid_buff, valid_dtype = buffers["validity"]
        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size)
        # sentinel_val == 0 means the mask marks *valid* entries; invert it.
        if sentinel_val == 0:
            null_pos = ~null_pos

    # Assemble the strings from the code units.
    str_list: list[None | float | str] = [None] * col.size
    for i in range(col.size):
        # Missing values become NaN.
        if null_pos is not None and null_pos[i]:
            str_list[i] = np.nan
            continue

        # Slice this string's code units out of the byte buffer and decode.
        units = data[offsets[i] : offsets[i + 1]]
        str_list[i] = bytes(units).decode(encoding="utf-8")

    # Object-dtype array, since elements are Python strings (or NaN).
    return np.asarray(str_list, dtype="object"), buffers
def parse_datetime_format_str(format_str, data):
    """Parse datetime `format_str` to interpret the `data`."""
    # timestamp 'ts{unit}:tz'
    ts_match = re.match(r"ts([smun]):(.*)", format_str)
    if ts_match is not None:
        unit, tz = ts_match.groups()
        if tz != "":
            raise NotImplementedError("Timezones are not supported yet")
        # The format string spells only the first letter of the unit; NumPy
        # wants 'ms'/'us'/'ns', while 's' stays as-is.
        np_unit = unit if unit == "s" else unit + "s"
        return data.astype(f"datetime64[{np_unit}]")

    # date 'td{Days/Ms}'
    td_match = re.match(r"td([Dm])", format_str)
    if td_match is not None:
        unit = td_match.group(1)
        if unit == "D":
            # NumPy doesn't support DAY unit, so converting days to seconds
            # (converting to uint64 to avoid overflow)
            return (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
        if unit == "m":
            return data.astype("datetime64[ms]")
        raise NotImplementedError(f"Date unit is not supported: {unit}")

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding DateTime data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    _, _, format_str, _ = col.dtype
    dbuf, dtype = buffers["data"]
    bit_width = dtype[1]
    # Read the buffer as unsigned integers: the count of time units elapsed
    # since the epoch (1970-01-01).
    raw = buffer_to_ndarray(
        dbuf,
        (
            DtypeKind.UINT,
            bit_width,
            getattr(ArrowCTypes, f"UINT{bit_width}"),
            Endianness.NATIVE,
        ),
        col.offset,
        col.size,
    )

    # The format string determines the time unit / date interpretation.
    result = parse_datetime_format_str(format_str, raw)
    result = set_nulls(result, col, buffers["validity"])
    return result, buffers
def buffer_to_ndarray(
    buffer: Buffer,
    dtype: tuple[DtypeKind, int, str, str],
    offset: int = 0,
    length: int | None = None,
) -> np.ndarray:
    """
    Build a NumPy array from the passed buffer.

    Parameters
    ----------
    buffer : Buffer
        Buffer to build a NumPy array from.
    dtype : tuple
        Data type of the buffer conforming protocol dtypes format.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    length : int, optional
        If the buffer is a bit-mask, specifies a number of bits to read
        from the buffer. Has no effect otherwise.

    Returns
    -------
    np.ndarray

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function is
    responsible for keeping the memory owner object alive as long as
    the returned NumPy array is being used.
    """
    kind, bit_width, _, _ = dtype

    np_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
    if np_dtype is None:
        raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")

    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
    # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
    # it since https://github.com/numpy/numpy/pull/19083
    ctype = np.ctypeslib.as_ctypes_type(np_dtype)
    # Advance the raw pointer by `offset` elements (bit_width is in bits).
    base = ctypes.cast(buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctype))

    if bit_width != 1:
        # Plain element buffer: reinterpret the whole buffer as np_dtype.
        n_elements = buffer.bufsize // (bit_width // 8)
        return np.ctypeslib.as_array(base, shape=(n_elements,))

    # Bit-mask buffer: unpack the packed bits into a boolean array.
    assert length is not None, "`length` must be specified for a bit-mask buffer."
    packed = np.ctypeslib.as_array(base, shape=(buffer.bufsize,))
    return bitmask_to_bool_ndarray(packed, length, first_byte_offset=offset % 8)
def bitmask_to_bool_ndarray(
    bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
) -> np.ndarray:
    """
    Convert bit-mask to a boolean NumPy array.

    Parameters
    ----------
    bitmask : np.ndarray[uint8]
        NumPy array of uint8 dtype representing the bitmask.
    mask_length : int
        Number of elements in the mask to interpret.
    first_byte_offset : int, default: 0
        Number of elements to offset from the start of the first byte.

    Returns
    -------
    np.ndarray[bool]
    """
    bytes_to_skip = first_byte_offset // 8
    bitmask = bitmask[bytes_to_skip:]
    first_byte_offset %= 8

    bool_mask = np.zeros(mask_length, dtype=bool)

    # The first byte is special: its low `first_byte_offset` bits belong to
    # preceding elements and must be skipped.
    first_byte = bitmask[0]
    out_idx = 0
    bits_in_first_byte = min(8 - first_byte_offset, mask_length)
    for bit in range(bits_in_first_byte):
        if first_byte & (1 << (bit + first_byte_offset)):
            bool_mask[out_idx] = True
        out_idx += 1

    # Then unpack every remaining *full* byte, eight bits at a time.
    full_byte_count = (mask_length - bits_in_first_byte) // 8
    for byte_idx in range(full_byte_count):
        # `+ 1` because the first byte was already consumed above.
        byte_val = bitmask[byte_idx + 1]
        for bit in range(8):
            if byte_val & (1 << bit):
                bool_mask[out_idx] = True
            out_idx += 1

    if len(bitmask) > 1:
        # Finally, the leftover bits of the last (partial) byte.
        last_byte = bitmask[-1]
        for bit in range(len(bool_mask) - out_idx):
            if last_byte & (1 << bit):
                bool_mask[out_idx] = True
            out_idx += 1

    return bool_mask
def set_nulls(
    data: np.ndarray | pd.Series,
    col: Column,
    validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
    allow_modify_inplace: bool = True,
):
    """
    Set null values for the data according to the column null kind.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Data to set nulls in.
    col : Column
        Column object that describes the `data`.
    validity : tuple(Buffer, dtype) or None
        The return value of ``col.buffers()``. We do not access the ``col.buffers()``
        here to not take the ownership of the memory of buffer objects.
    allow_modify_inplace : bool, default: True
        Whether to modify the `data` inplace when zero-copy is possible (True) or always
        modify a copy of the `data` (False).

    Returns
    -------
    np.ndarray or pd.Series
        Data with the nulls being set.
    """
    null_kind, sentinel_val = col.describe_null
    null_pos = None

    if null_kind == ColumnNullType.USE_SENTINEL:
        # Nulls are encoded in-band as a sentinel value.
        null_pos = pd.Series(data) == sentinel_val
    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert validity, "Expected to have a validity buffer for the mask"
        valid_buff, valid_dtype = validity
        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size)
        # sentinel_val == 0 means the mask flags *valid* entries; invert it.
        if sentinel_val == 0:
            null_pos = ~null_pos
    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
        # Nothing to do: either no nulls exist or NaN already encodes them.
        pass
    else:
        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")

    if null_pos is None or not np.any(null_pos):
        return data

    if not allow_modify_inplace:
        data = data.copy()
    try:
        data[null_pos] = None
    except TypeError:
        # TypeError happens if the `data` dtype appears to be non-nullable
        # in numpy notation (bool, int, uint). If this happens,
        # cast the `data` to nullable float dtype.
        data = data.astype(float)
        data[null_pos] = None

    return data