Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/dataframe_protocol.py: 82%

127 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api 

3""" 

4 

5from __future__ import annotations 

6 

7from abc import ( 

8 ABC, 

9 abstractmethod, 

10) 

11import enum 

12from typing import ( 

13 Any, 

14 Iterable, 

15 Sequence, 

16 TypedDict, 

17) 

18 

19 

class DlpackDeviceType(enum.IntEnum):
    """Integer enum for device type codes matching DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10

class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23

class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4

class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[Buffer, Any]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[Buffer, Any] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[Buffer, Any] | None
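
# An illustrative sketch (not part of the vendored spec): the shape of a
# ``ColumnBuffers`` value for a variable-length string column. The helper name
# and the buffer arguments are hypothetical, and the exact format strings
# depend on the producing library.
def _example_string_column_buffers(
    data_buf: Buffer, validity_buf: Buffer, offsets_buf: Buffer
) -> ColumnBuffers:
    return {
        # UTF-8 code units, one byte per element
        "data": (data_buf, (DtypeKind.STRING, 8, "u", "=")),
        # byte mask; assumed here to mark missing values with 0
        "validity": (validity_buf, (DtypeKind.BOOL, 8, "b", "=")),
        # int64 offsets delimiting each string within the data buffer
        "offsets": (offsets_buf, (DtypeKind.INT, 64, "l", "=")),
    }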

class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: bool
    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Column | None

class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present; a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack on
    a buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        pass

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        pass

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:

            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented.

        Useful to have to connect to array libraries. Support is optional
        because it's not completely trivial to implement for a Python-only
        library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """
        pass
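
# An illustrative sketch (not part of the vendored spec): a toy CPU-only
# ``Buffer`` wrapping a ``bytearray``, roughly how a pure-Python producer
# might satisfy the interface. The class name is hypothetical; ctypes is used
# only to obtain a raw pointer for this sketch.
import ctypes


class _BytearrayBuffer(Buffer):
    def __init__(self, data: bytearray) -> None:
        # the bytearray must stay alive (and must not be resized) for as long
        # as consumers hold on to ``ptr``
        self._data = data

    @property
    def bufsize(self) -> int:
        return len(self._data)

    @property
    def ptr(self) -> int:
        # address of the first byte of the underlying buffer
        return ctypes.addressof(ctypes.c_char.from_buffer(self._data))

    def __dlpack__(self):
        # DLPack support is optional; a Python-only producer may simply decline
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        # required even without __dlpack__: plain host memory, no device ID
        return (DlpackDeviceType.CPU, None)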

class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @property
    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """
        pass

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """
        pass

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leaving enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for bit
              masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the future
              we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary, decimal,
              and nested (list, struct, map, union) dtypes.
        """
        pass
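
    # An illustrative sketch (not part of the vendored spec): ``dtype`` tuples
    # a producer might return, using Apache Arrow C Data Interface format
    # strings (example values only, not normative):
    #
    #     (DtypeKind.INT, 64, "l", "=")          # int64
    #     (DtypeKind.FLOAT, 32, "f", "=")        # float32
    #     (DtypeKind.BOOL, 8, "b", "=")          # boolean stored as one byte
    #     (DtypeKind.STRING, 8, "u", "=")        # variable-length UTF-8 string
    #     (DtypeKind.DATETIME, 64, "tsn:", "=")  # nanosecond timestamp, no timezone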

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the data buffer:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
                             semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
                             category values (e.g. an array of cat1, cat2, ...).
                             None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """
        pass
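
    # An illustrative sketch (not part of the vendored spec): a dictionary-style
    # categorical might be described as
    #
    #     {
    #         "is_ordered": False,
    #         "is_dictionary": True,
    #         "categories": <Column of category values, e.g. strings>,
    #     }
    #
    # whereas a categorical whose values sit directly in the data buffer would
    # report ``"is_dictionary": False`` and ``"categories": None``.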

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """
        pass
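
    # An illustrative sketch (not part of the vendored spec): possible
    # ``describe_null`` return values (the sentinel -1 is only an example):
    #
    #     (ColumnNullType.NON_NULLABLE, None)  # column cannot contain nulls
    #     (ColumnNullType.USE_NAN, None)       # NaN marks missing values
    #     (ColumnNullType.USE_SENTINEL, -1)    # a chosen sentinel marks missing values
    #     (ColumnNullType.USE_BITMASK, 0)      # mask bit 0 marks a missing value
    #     (ColumnNullType.USE_BYTEMASK, 0)     # mask byte 0 marks a missing value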

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """
        pass

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """
        pass

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        pass

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """
        pass

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
        pass


#    def get_children(self) -> Iterable[Column]:
#        """
#        Children columns underneath the column, each object in this iterator
#        must adhere to the column specification.
#        """
#        pass

class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""
        pass

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything; they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library
        followed by a period and the desired name, e.g., ``pandas.indexcol``.
        """
        pass
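
    # An illustrative sketch (not part of the vendored spec): a pandas-style
    # producer might expose ``{"pandas.index": <index object>}`` here so a
    # pandas consumer can restore the index when round-tripping.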

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """
        pass

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """
        pass

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """
        pass

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """
        pass

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """
        pass

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """
        pass

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """
        pass

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """
        pass

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """
        pass

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """
        pass
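
# An illustrative sketch (not part of the vendored spec): how a consumer might
# walk an interchange object chunk by chunk. ``consume_buffers`` is a
# hypothetical callback standing in for whatever the consumer does with the
# raw buffers of each column.
def _walk_interchange_frame(df: DataFrame, consume_buffers) -> None:
    for chunk in df.get_chunks():
        for name in chunk.column_names():
            col = chunk.get_column_by_name(name)
            # dtype and describe_null tell the consumer how to interpret
            # the memory handed over by get_buffers()
            consume_buffers(name, col.dtype, col.describe_null, col.get_buffers())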