Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/column.py: 22%
132 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from typing import Any
5import numpy as np
7from pandas._libs.lib import infer_dtype
8from pandas._libs.tslibs import iNaT
9from pandas.util._decorators import cache_readonly
11import pandas as pd
12from pandas.api.types import (
13 is_categorical_dtype,
14 is_string_dtype,
15)
16from pandas.core.interchange.buffer import PandasBuffer
17from pandas.core.interchange.dataframe_protocol import (
18 Column,
19 ColumnBuffers,
20 ColumnNullType,
21 DtypeKind,
22)
23from pandas.core.interchange.utils import (
24 ArrowCTypes,
25 Endianness,
26 NoBufferPresent,
27 dtype_to_arrow_c_fmt,
28)
30_NP_KINDS = {
31 "i": DtypeKind.INT,
32 "u": DtypeKind.UINT,
33 "f": DtypeKind.FLOAT,
34 "b": DtypeKind.BOOL,
35 "U": DtypeKind.STRING,
36 "M": DtypeKind.DATETIME,
37 "m": DtypeKind.DATETIME,
38}
40_NULL_DESCRIPTION = {
41 DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
42 DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
43 DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None),
44 DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None),
45 DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None),
46 # Null values for categoricals are stored as `-1` sentinel values
47 # in the category date (e.g., `col.values.codes` is int8 np.ndarray)
48 DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
49 # follow Arrow in using 1 as valid value and 0 for missing/null value
50 DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
51}
53_NO_VALIDITY_BUFFER = {
54 ColumnNullType.NON_NULLABLE: "This column is non-nullable",
55 ColumnNullType.USE_NAN: "This column uses NaN as null",
56 ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
57}
60class PandasColumn(Column):
61 """
62 A column object, with only the methods and properties required by the
63 interchange protocol defined.
64 A column can contain one or more chunks. Each chunk can contain up to three
65 buffers - a data buffer, a mask buffer (depending on null representation),
66 and an offsets buffer (if variable-size binary; e.g., variable-length
67 strings).
68 Note: this Column object can only be produced by ``__dataframe__``, so
69 doesn't need its own version or ``__column__`` protocol.
70 """
72 def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
73 """
74 Note: doesn't deal with extension arrays yet, just assume a regular
75 Series/ndarray for now.
76 """
77 if not isinstance(column, pd.Series):
78 raise NotImplementedError(f"Columns of type {type(column)} not handled yet")
80 # Store the column as a private attribute
81 self._col = column
82 self._allow_copy = allow_copy
84 def size(self) -> int:
85 """
86 Size of the column, in elements.
87 """
88 return self._col.size
90 @property
91 def offset(self) -> int:
92 """
93 Offset of first element. Always zero.
94 """
95 # TODO: chunks are implemented now, probably this should return something
96 return 0
98 @cache_readonly
99 def dtype(self) -> tuple[DtypeKind, int, str, str]:
100 dtype = self._col.dtype
102 if is_categorical_dtype(dtype):
103 codes = self._col.values.codes
104 (
105 _,
106 bitwidth,
107 c_arrow_dtype_f_str,
108 _,
109 ) = self._dtype_from_pandasdtype(codes.dtype)
110 return (
111 DtypeKind.CATEGORICAL,
112 bitwidth,
113 c_arrow_dtype_f_str,
114 Endianness.NATIVE,
115 )
116 elif is_string_dtype(dtype):
117 if infer_dtype(self._col) == "string":
118 return (
119 DtypeKind.STRING,
120 8,
121 dtype_to_arrow_c_fmt(dtype),
122 Endianness.NATIVE,
123 )
124 raise NotImplementedError("Non-string object dtypes are not supported yet")
125 else:
126 return self._dtype_from_pandasdtype(dtype)
128 def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
129 """
130 See `self.dtype` for details.
131 """
132 # Note: 'c' (complex) not handled yet (not in array spec v1).
133 # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
134 # datetime and timedelta both map to datetime (is timedelta handled?)
136 kind = _NP_KINDS.get(dtype.kind, None)
137 if kind is None:
138 # Not a NumPy dtype. Check if it's a categorical maybe
139 raise ValueError(f"Data type {dtype} not supported by interchange protocol")
141 return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder
143 @property
144 def describe_categorical(self):
145 """
146 If the dtype is categorical, there are two options:
147 - There are only values in the data buffer.
148 - There is a separate non-categorical Column encoding for categorical values.
150 Raises TypeError if the dtype is not categorical
152 Content of returned dict:
153 - "is_ordered" : bool, whether the ordering of dictionary indices is
154 semantically meaningful.
155 - "is_dictionary" : bool, whether a dictionary-style mapping of
156 categorical values to other objects exists
157 - "categories" : Column representing the (implicit) mapping of indices to
158 category values (e.g. an array of cat1, cat2, ...).
159 None if not a dictionary-style categorical.
160 """
161 if not self.dtype[0] == DtypeKind.CATEGORICAL:
162 raise TypeError(
163 "describe_categorical only works on a column with categorical dtype!"
164 )
166 return {
167 "is_ordered": self._col.cat.ordered,
168 "is_dictionary": True,
169 "categories": PandasColumn(pd.Series(self._col.cat.categories)),
170 }
172 @property
173 def describe_null(self):
174 kind = self.dtype[0]
175 try:
176 null, value = _NULL_DESCRIPTION[kind]
177 except KeyError:
178 raise NotImplementedError(f"Data type {kind} not yet supported")
180 return null, value
182 @cache_readonly
183 def null_count(self) -> int:
184 """
185 Number of null elements. Should always be known.
186 """
187 return self._col.isna().sum().item()
189 @property
190 def metadata(self) -> dict[str, pd.Index]:
191 """
192 Store specific metadata of the column.
193 """
194 return {"pandas.index": self._col.index}
196 def num_chunks(self) -> int:
197 """
198 Return the number of chunks the column consists of.
199 """
200 return 1
202 def get_chunks(self, n_chunks: int | None = None):
203 """
204 Return an iterator yielding the chunks.
205 See `DataFrame.get_chunks` for details on ``n_chunks``.
206 """
207 if n_chunks and n_chunks > 1:
208 size = len(self._col)
209 step = size // n_chunks
210 if size % n_chunks != 0:
211 step += 1
212 for start in range(0, step * n_chunks, step):
213 yield PandasColumn(
214 self._col.iloc[start : start + step], self._allow_copy
215 )
216 else:
217 yield self
219 def get_buffers(self) -> ColumnBuffers:
220 """
221 Return a dictionary containing the underlying buffers.
222 The returned dictionary has the following contents:
223 - "data": a two-element tuple whose first element is a buffer
224 containing the data and whose second element is the data
225 buffer's associated dtype.
226 - "validity": a two-element tuple whose first element is a buffer
227 containing mask values indicating missing data and
228 whose second element is the mask value buffer's
229 associated dtype. None if the null representation is
230 not a bit or byte mask.
231 - "offsets": a two-element tuple whose first element is a buffer
232 containing the offset values for variable-size binary
233 data (e.g., variable-length strings) and whose second
234 element is the offsets buffer's associated dtype. None
235 if the data buffer does not have an associated offsets
236 buffer.
237 """
238 buffers: ColumnBuffers = {
239 "data": self._get_data_buffer(),
240 "validity": None,
241 "offsets": None,
242 }
244 try:
245 buffers["validity"] = self._get_validity_buffer()
246 except NoBufferPresent:
247 pass
249 try:
250 buffers["offsets"] = self._get_offsets_buffer()
251 except NoBufferPresent:
252 pass
254 return buffers
256 def _get_data_buffer(
257 self,
258 ) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple
259 """
260 Return the buffer containing the data and the buffer's associated dtype.
261 """
262 if self.dtype[0] in (
263 DtypeKind.INT,
264 DtypeKind.UINT,
265 DtypeKind.FLOAT,
266 DtypeKind.BOOL,
267 DtypeKind.DATETIME,
268 ):
269 buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy)
270 dtype = self.dtype
271 elif self.dtype[0] == DtypeKind.CATEGORICAL:
272 codes = self._col.values._codes
273 buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
274 dtype = self._dtype_from_pandasdtype(codes.dtype)
275 elif self.dtype[0] == DtypeKind.STRING:
276 # Marshal the strings from a NumPy object array into a byte array
277 buf = self._col.to_numpy()
278 b = bytearray()
280 # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
281 for obj in buf:
282 if isinstance(obj, str):
283 b.extend(obj.encode(encoding="utf-8"))
285 # Convert the byte array to a Pandas "buffer" using
286 # a NumPy array as the backing store
287 buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))
289 # Define the dtype for the returned buffer
290 dtype = (
291 DtypeKind.STRING,
292 8,
293 ArrowCTypes.STRING,
294 Endianness.NATIVE,
295 ) # note: currently only support native endianness
296 else:
297 raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
299 return buffer, dtype
301 def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
302 """
303 Return the buffer containing the mask values indicating missing data and
304 the buffer's associated dtype.
305 Raises NoBufferPresent if null representation is not a bit or byte mask.
306 """
307 null, invalid = self.describe_null
309 if self.dtype[0] == DtypeKind.STRING:
310 # For now, use byte array as the mask.
311 # TODO: maybe store as bit array to save space?..
312 buf = self._col.to_numpy()
314 # Determine the encoding for valid values
315 valid = invalid == 0
316 invalid = not valid
318 mask = np.zeros(shape=(len(buf),), dtype=np.bool8)
319 for i, obj in enumerate(buf):
320 mask[i] = valid if isinstance(obj, str) else invalid
322 # Convert the mask array to a Pandas "buffer" using
323 # a NumPy array as the backing store
324 buffer = PandasBuffer(mask)
326 # Define the dtype of the returned buffer
327 dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
329 return buffer, dtype
331 try:
332 msg = _NO_VALIDITY_BUFFER[null] + " so does not have a separate mask"
333 except KeyError:
334 # TODO: implement for other bit/byte masks?
335 raise NotImplementedError("See self.describe_null")
337 raise NoBufferPresent(msg)
339 def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
340 """
341 Return the buffer containing the offset values for variable-size binary
342 data (e.g., variable-length strings) and the buffer's associated dtype.
343 Raises NoBufferPresent if the data buffer does not have an associated
344 offsets buffer.
345 """
346 if self.dtype[0] == DtypeKind.STRING:
347 # For each string, we need to manually determine the next offset
348 values = self._col.to_numpy()
349 ptr = 0
350 offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
351 for i, v in enumerate(values):
352 # For missing values (in this case, `np.nan` values)
353 # we don't increment the pointer
354 if isinstance(v, str):
355 b = v.encode(encoding="utf-8")
356 ptr += len(b)
358 offsets[i + 1] = ptr
360 # Convert the offsets to a Pandas "buffer" using
361 # the NumPy array as the backing store
362 buffer = PandasBuffer(offsets)
364 # Assemble the buffer dtype info
365 dtype = (
366 DtypeKind.INT,
367 64,
368 ArrowCTypes.INT64,
369 Endianness.NATIVE,
370 ) # note: currently only support native endianness
371 else:
372 raise NoBufferPresent(
373 "This column has a fixed-length dtype so "
374 "it does not have an offsets buffer"
375 )
377 return buffer, dtype