Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/dataframe_protocol.py: 82%
127 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
"""
A verbatim (vendored) copy of the spec from
https://github.com/data-apis/dataframe-api
"""
5from __future__ import annotations
7from abc import (
8 ABC,
9 abstractmethod,
10)
11import enum
12from typing import (
13 Any,
14 Iterable,
15 Sequence,
16 TypedDict,
17)
class DlpackDeviceType(enum.IntEnum):
    """Integer enum of device type codes, numbered to match DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    # Codes 5 and 6 are not assigned by this enum; numbering resumes at 7.
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10
class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Signed integer data type.
    UINT : int
        Unsigned integer data type.
    FLOAT : int
        Floating point data type.
    BOOL : int
        Boolean data type.
    STRING : int
        String data type (UTF-8 encoded).
    DATETIME : int
        Datetime data type.
    CATEGORICAL : int
        Categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    # The numbering deliberately jumps from 2 to 20 here.
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23
class ColumnNullType(enum.IntEnum):
    """
    Integer enum for the way a column represents null (missing) values.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        A bit is set/unset to mark a null at a given position.
    USE_BYTEMASK : int
        A byte is set/unset to mark a null at a given position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4
class ColumnBuffers(TypedDict):
    # Typed dictionary describing the buffers that back one column chunk.
    # Every entry is a ``(buffer, dtype)`` pair; two of them may be None.

    # (buffer holding the column data, dtype associated with that buffer)
    data: tuple[Buffer, Any]

    # (buffer holding mask values that flag missing data, dtype associated
    # with that mask buffer).
    # None if the null representation is not a bit or byte mask.
    validity: tuple[Buffer, Any] | None

    # (buffer holding the offset values for variable-size binary data, e.g.
    # variable-length strings, dtype associated with that offsets buffer).
    # None if the data buffer does not have an associated offsets buffer.
    offsets: tuple[Buffer, Any] | None
class CategoricalDescription(TypedDict):
    # Typed dictionary describing how to interpret a categorical column.

    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool

    # whether a dictionary-style mapping of categorical values to other
    # objects exists
    is_dictionary: bool

    # Column holding the category values; the mapping itself is Python-level
    # only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Column | None
class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack
    on a buffer and (b) dtypes like variable-length strings which do not have
    a fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Useful to have to connect to array libraries. Support is optional
        because it's not completely trivial to implement for a Python-only
        library.

        Raises
        ------
        TypeError
            If the buffer contains unsupported dtypes.
        NotImplementedError
            If DLPack support is not implemented.
        """
        # Default body: reached when a subclass delegates to ``super()``
        # without providing its own DLPack export.
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.

        Uses device type codes matching DLPack.

        Note: must be implemented even if ``__dlpack__`` is not.
        """
class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @property
    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leave enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical
          values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the data
        buffer:
            - "is_ordered" : bool, whether the ordering of dictionary indices
                             is semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of
                             indices to category values (e.g. an array of
                             cat1, cat2, ...).
                             None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
# def get_children(self) -> Iterable[Column]:
#     """
#     Children columns underneath the column, each object in this iterator
#     must adhere to the column specification.
#     """
#     pass
class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything, they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library
        followed by a period and the desired name, e.g, ``pandas.indexcol``.
        """

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        """
        Return the number of rows in the DataFrame, if available.
        """
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """