Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/dataframe_protocol.py: 82%

127 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api 

3""" 

4 

5from __future__ import annotations 

6 

7from abc import ( 

8 ABC, 

9 abstractmethod, 

10) 

11import enum 

12from typing import ( 

13 Any, 

14 Iterable, 

15 Sequence, 

16 TypedDict, 

17) 

18 

19 

class DlpackDeviceType(enum.IntEnum):
    """Integer enum for device type codes matching DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10

class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23

class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4

class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[Buffer, Any]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[Buffer, Any] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[Buffer, Any] | None
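
# An illustrative sketch (not part of the vendored spec): the shape of a
# ``ColumnBuffers`` value for a variable-length string column. The helper name
# and the buffer arguments are hypothetical, and the exact format strings
# depend on the producing library.
def _example_string_column_buffers(
    data_buf: Buffer, validity_buf: Buffer, offsets_buf: Buffer
) -> ColumnBuffers:
    return {
        # UTF-8 code units, one byte per element
        "data": (data_buf, (DtypeKind.STRING, 8, "u", "=")),
        # byte mask; assumed here to mark missing values with 0
        "validity": (validity_buf, (DtypeKind.BOOL, 8, "b", "=")),
        # int64 offsets delimiting each string within the data buffer
        "offsets": (offsets_buf, (DtypeKind.INT, 64, "l", "=")),
    }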

class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: bool
    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Column | None

class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present; a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack on
    a buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        pass

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        pass

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:

            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented.

        Useful to have to connect to array libraries. Support is optional
        because it's not completely trivial to implement for a Python-only
        library.
        """
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """
        pass
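
# An illustrative sketch (not part of the vendored spec): a toy CPU-only
# ``Buffer`` wrapping a ``bytearray``, roughly how a pure-Python producer
# might satisfy the interface. The class name is hypothetical; ctypes is used
# only to obtain a raw pointer for this sketch.
import ctypes


class _BytearrayBuffer(Buffer):
    def __init__(self, data: bytearray) -> None:
        # the bytearray must stay alive (and must not be resized) for as long
        # as consumers hold on to ``ptr``
        self._data = data

    @property
    def bufsize(self) -> int:
        return len(self._data)

    @property
    def ptr(self) -> int:
        # address of the first byte of the underlying buffer
        return ctypes.addressof(ctypes.c_char.from_buffer(self._data))

    def __dlpack__(self):
        # DLPack support is optional; a Python-only producer may simply decline
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        # required even without __dlpack__: plain host memory, no device ID
        return (DlpackDeviceType.CPU, None)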

class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @property
    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """
        pass

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """
        pass

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leaving enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for bit
              masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the future
              we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary, decimal,
              and nested (list, struct, map, union) dtypes.
        """
        pass
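
    # An illustrative sketch (not part of the vendored spec): ``dtype`` tuples
    # a producer might return, using Apache Arrow C Data Interface format
    # strings (example values only, not normative):
    #
    #     (DtypeKind.INT, 64, "l", "=")          # int64
    #     (DtypeKind.FLOAT, 32, "f", "=")        # float32
    #     (DtypeKind.BOOL, 8, "b", "=")          # boolean stored as one byte
    #     (DtypeKind.STRING, 8, "u", "=")        # variable-length UTF-8 string
    #     (DtypeKind.DATETIME, 64, "tsn:", "=")  # nanosecond timestamp, no timezone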

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the data buffer:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
                             semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
                             category values (e.g. an array of cat1, cat2, ...).
                             None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """
        pass
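
    # An illustrative sketch (not part of the vendored spec): a dictionary-style
    # categorical might be described as
    #
    #     {
    #         "is_ordered": False,
    #         "is_dictionary": True,
    #         "categories": <Column of category values, e.g. strings>,
    #     }
    #
    # whereas a categorical whose values sit directly in the data buffer would
    # report ``"is_dictionary": False`` and ``"categories": None``.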

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value. None
        otherwise.
        """
        pass
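
    # An illustrative sketch (not part of the vendored spec): possible
    # ``describe_null`` return values (the sentinel -1 is only an example):
    #
    #     (ColumnNullType.NON_NULLABLE, None)  # column cannot contain nulls
    #     (ColumnNullType.USE_NAN, None)       # NaN marks missing values
    #     (ColumnNullType.USE_SENTINEL, -1)    # a chosen sentinel marks missing values
    #     (ColumnNullType.USE_BITMASK, 0)      # mask bit 0 marks a missing value
    #     (ColumnNullType.USE_BYTEMASK, 0)     # mask byte 0 marks a missing value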

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """
        pass

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """
        pass

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        pass

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """
        pass

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
        pass


#    def get_children(self) -> Iterable[Column]:
#        """
#        Children columns underneath the column, each object in this iterator
#        must adhere to the column specification.
#        """
#        pass

class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""
        pass

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything; they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library
        followed by a period and the desired name, e.g., ``pandas.indexcol``.
        """
        pass
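
    # An illustrative sketch (not part of the vendored spec): a pandas-style
    # producer might expose ``{"pandas.index": <index object>}`` here so a
    # pandas consumer can restore the index when round-tripping.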

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """
        pass

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """
        pass

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """
        pass

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """
        pass

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """
        pass

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """
        pass

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """
        pass

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """
        pass

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """
        pass

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """
        pass
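
# An illustrative sketch (not part of the vendored spec): how a consumer might
# walk an interchange object chunk by chunk. ``consume_buffers`` is a
# hypothetical callback standing in for whatever the consumer does with the
# raw buffers of each column.
def _walk_interchange_frame(df: DataFrame, consume_buffers) -> None:
    for chunk in df.get_chunks():
        for name in chunk.column_names():
            col = chunk.get_column_by_name(name)
            # dtype and describe_null tell the consumer how to interpret
            # the memory handed over by get_buffers()
            consume_buffers(name, col.dtype, col.describe_null, col.get_buffers())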