Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/column.py: 22%

132 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3from typing import Any 

4 

5import numpy as np 

6 

7from pandas._libs.lib import infer_dtype 

8from pandas._libs.tslibs import iNaT 

9from pandas.util._decorators import cache_readonly 

10 

11import pandas as pd 

12from pandas.api.types import ( 

13 is_categorical_dtype, 

14 is_string_dtype, 

15) 

16from pandas.core.interchange.buffer import PandasBuffer 

17from pandas.core.interchange.dataframe_protocol import ( 

18 Column, 

19 ColumnBuffers, 

20 ColumnNullType, 

21 DtypeKind, 

22) 

23from pandas.core.interchange.utils import ( 

24 ArrowCTypes, 

25 Endianness, 

26 NoBufferPresent, 

27 dtype_to_arrow_c_fmt, 

28) 

29 

# Map NumPy ``dtype.kind`` character codes to interchange-protocol DtypeKind
# values. Note both datetime64 ("M") and timedelta64 ("m") map to DATETIME.
_NP_KINDS = {
    "i": DtypeKind.INT,
    "u": DtypeKind.UINT,
    "f": DtypeKind.FLOAT,
    "b": DtypeKind.BOOL,
    "U": DtypeKind.STRING,
    "M": DtypeKind.DATETIME,
    "m": DtypeKind.DATETIME,
}

39 

# For each DtypeKind, how pandas represents nulls for that kind:
# a ``(ColumnNullType, sentinel-or-mask value)`` pair, consumed by
# ``PandasColumn.describe_null``.
_NULL_DESCRIPTION = {
    DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
    DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
    DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None),
    DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None),
    DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None),
    # Null values for categoricals are stored as `-1` sentinel values
    # in the category data (e.g., `col.values.codes` is int8 np.ndarray)
    DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
    # follow Arrow in using 1 as valid value and 0 for missing/null value
    DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
}

52 

# Explanations (per null type) for why no separate validity buffer exists;
# used to build the NoBufferPresent message in ``_get_validity_buffer``.
_NO_VALIDITY_BUFFER = {
    ColumnNullType.NON_NULLABLE: "This column is non-nullable",
    ColumnNullType.USE_NAN: "This column uses NaN as null",
    ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
}

58 

59 

class PandasColumn(Column):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.
    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).
    Note: this Column object can only be produced by ``__dataframe__``, so
    doesn't need its own version or ``__column__`` protocol.
    """

    def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
        """
        Note: doesn't deal with extension arrays yet, just assume a regular
        Series/ndarray for now.
        """
        if not isinstance(column, pd.Series):
            raise NotImplementedError(f"Columns of type {type(column)} not handled yet")

        # Store the column as a private attribute
        self._col = column
        self._allow_copy = allow_copy

    def size(self) -> int:
        """
        Size of the column, in elements.
        """
        return self._col.size

    @property
    def offset(self) -> int:
        """
        Offset of first element. Always zero.
        """
        # TODO: chunks are implemented now, probably this should return something
        return 0

    @cache_readonly
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a ``(kind, bit-width, format string, endianness)``
        tuple.

        Raises NotImplementedError for object columns that do not hold strings
        and ValueError for dtypes not covered by the interchange protocol.
        """
        dtype = self._col.dtype

        if is_categorical_dtype(dtype):
            # The physical layout is that of the integer codes array; only the
            # reported kind is CATEGORICAL.
            codes = self._col.values.codes
            (
                _,
                bitwidth,
                c_arrow_dtype_f_str,
                _,
            ) = self._dtype_from_pandasdtype(codes.dtype)
            return (
                DtypeKind.CATEGORICAL,
                bitwidth,
                c_arrow_dtype_f_str,
                Endianness.NATIVE,
            )
        elif is_string_dtype(dtype):
            # An object dtype may hold anything; only report STRING when the
            # values actually are strings.
            if infer_dtype(self._col) == "string":
                return (
                    DtypeKind.STRING,
                    8,
                    dtype_to_arrow_c_fmt(dtype),
                    Endianness.NATIVE,
                )
            raise NotImplementedError("Non-string object dtypes are not supported yet")
        else:
            return self._dtype_from_pandasdtype(dtype)

    def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
        """
        See `self.dtype` for details.
        """
        # Note: 'c' (complex) not handled yet (not in array spec v1).
        # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
        # datetime and timedelta both map to datetime (is timedelta handled?)

        kind = _NP_KINDS.get(dtype.kind, None)
        if kind is None:
            # Not a NumPy dtype. Check if it's a categorical maybe
            raise ValueError(f"Data type {dtype} not supported by interchange protocol")

        return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder

    @property
    def describe_categorical(self):
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Content of returned dict:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
                             semantically meaningful.
            - "is_dictionary" : bool, whether a dictionary-style mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
                             category values (e.g. an array of cat1, cat2, ...).
                             None if not a dictionary-style categorical.
        """
        if not self.dtype[0] == DtypeKind.CATEGORICAL:
            raise TypeError(
                "describe_categorical only works on a column with categorical dtype!"
            )

        return {
            "is_ordered": self._col.cat.ordered,
            "is_dictionary": True,
            # Expose the categories themselves as another interchange column.
            "categories": PandasColumn(pd.Series(self._col.cat.categories)),
        }

    @property
    def describe_null(self):
        """
        Return the null representation for this column's dtype as a
        ``(ColumnNullType, value)`` pair looked up in ``_NULL_DESCRIPTION``.
        """
        kind = self.dtype[0]
        try:
            null, value = _NULL_DESCRIPTION[kind]
        except KeyError:
            raise NotImplementedError(f"Data type {kind} not yet supported")

        return null, value

    @cache_readonly
    def null_count(self) -> int:
        """
        Number of null elements. Should always be known.
        """
        # .item() converts the NumPy integer from sum() to a plain Python int.
        return self._col.isna().sum().item()

    @property
    def metadata(self) -> dict[str, pd.Index]:
        """
        Store specific metadata of the column.
        """
        return {"pandas.index": self._col.index}

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        return 1

    def get_chunks(self, n_chunks: int | None = None):
        """
        Return an iterator yielding the chunks.
        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """
        if n_chunks and n_chunks > 1:
            size = len(self._col)
            step = size // n_chunks
            if size % n_chunks != 0:
                # Round the chunk length up so every element is covered; the
                # trailing chunk(s) may then be shorter (possibly empty).
                step += 1
            for start in range(0, step * n_chunks, step):
                yield PandasColumn(
                    self._col.iloc[start : start + step], self._allow_copy
                )
        else:
            yield self

    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.
        The returned dictionary has the following contents:
            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
        buffers: ColumnBuffers = {
            "data": self._get_data_buffer(),
            "validity": None,
            "offsets": None,
        }

        # Validity and offsets buffers are optional; NoBufferPresent simply
        # means "not applicable for this dtype/null representation".
        try:
            buffers["validity"] = self._get_validity_buffer()
        except NoBufferPresent:
            pass

        try:
            buffers["offsets"] = self._get_offsets_buffer()
        except NoBufferPresent:
            pass

        return buffers

    def _get_data_buffer(
        self,
    ) -> tuple[PandasBuffer, Any]:  # Any is for self.dtype tuple
        """
        Return the buffer containing the data and the buffer's associated dtype.
        """
        if self.dtype[0] in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
            DtypeKind.DATETIME,
        ):
            buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy)
            dtype = self.dtype
        elif self.dtype[0] == DtypeKind.CATEGORICAL:
            codes = self._col.values._codes
            buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
            dtype = self._dtype_from_pandasdtype(codes.dtype)
        elif self.dtype[0] == DtypeKind.STRING:
            # Marshal the strings from a NumPy object array into a byte array
            buf = self._col.to_numpy()
            b = bytearray()

            # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
            for obj in buf:
                if isinstance(obj, str):
                    b.extend(obj.encode(encoding="utf-8"))

            # Convert the byte array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))

            # Define the dtype for the returned buffer
            dtype = (
                DtypeKind.STRING,
                8,
                ArrowCTypes.STRING,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")

        return buffer, dtype

    def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the mask values indicating missing data and
        the buffer's associated dtype.
        Raises NoBufferPresent if null representation is not a bit or byte mask.
        """
        null, invalid = self.describe_null

        if self.dtype[0] == DtypeKind.STRING:
            # For now, use byte array as the mask.
            # TODO: maybe store as bit array to save space?..
            buf = self._col.to_numpy()

            # Determine the encoding for valid values
            valid = invalid == 0
            invalid = not valid

            # Use the builtin ``bool`` here: ``np.bool8`` is a deprecated
            # alias that was removed in NumPy 1.24.
            mask = np.zeros(shape=(len(buf),), dtype=bool)
            for i, obj in enumerate(buf):
                mask[i] = valid if isinstance(obj, str) else invalid

            # Convert the mask array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(mask)

            # Define the dtype of the returned buffer
            dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)

            return buffer, dtype

        try:
            msg = _NO_VALIDITY_BUFFER[null] + " so does not have a separate mask"
        except KeyError:
            # TODO: implement for other bit/byte masks?
            raise NotImplementedError("See self.describe_null")

        raise NoBufferPresent(msg)

    def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the offset values for variable-size binary
        data (e.g., variable-length strings) and the buffer's associated dtype.
        Raises NoBufferPresent if the data buffer does not have an associated
        offsets buffer.
        """
        if self.dtype[0] == DtypeKind.STRING:
            # For each string, we need to manually determine the next offset
            values = self._col.to_numpy()
            ptr = 0
            offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
            for i, v in enumerate(values):
                # For missing values (in this case, `np.nan` values)
                # we don't increment the pointer
                if isinstance(v, str):
                    b = v.encode(encoding="utf-8")
                    ptr += len(b)

                offsets[i + 1] = ptr

            # Convert the offsets to a Pandas "buffer" using
            # the NumPy array as the backing store
            buffer = PandasBuffer(offsets)

            # Assemble the buffer dtype info
            dtype = (
                DtypeKind.INT,
                64,
                ArrowCTypes.INT64,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NoBufferPresent(
                "This column has a fixed-length dtype so "
                "it does not have an offsets buffer"
            )

        return buffer, dtype