Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/interchange/column.py: 22%

132 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3from typing import Any 

4 

5import numpy as np 

6 

7from pandas._libs.lib import infer_dtype 

8from pandas._libs.tslibs import iNaT 

9from pandas.util._decorators import cache_readonly 

10 

11import pandas as pd 

12from pandas.api.types import ( 

13 is_categorical_dtype, 

14 is_string_dtype, 

15) 

16from pandas.core.interchange.buffer import PandasBuffer 

17from pandas.core.interchange.dataframe_protocol import ( 

18 Column, 

19 ColumnBuffers, 

20 ColumnNullType, 

21 DtypeKind, 

22) 

23from pandas.core.interchange.utils import ( 

24 ArrowCTypes, 

25 Endianness, 

26 NoBufferPresent, 

27 dtype_to_arrow_c_fmt, 

28) 

29 

# Map NumPy ``dtype.kind`` character codes to interchange-protocol DtypeKind
# values. Note both datetime64 ("M") and timedelta64 ("m") map to DATETIME.
_NP_KINDS = {
    "i": DtypeKind.INT,
    "u": DtypeKind.UINT,
    "f": DtypeKind.FLOAT,
    "b": DtypeKind.BOOL,
    "U": DtypeKind.STRING,
    "M": DtypeKind.DATETIME,
    "m": DtypeKind.DATETIME,
}

39 

# For each DtypeKind, how pandas represents nulls for that kind:
# a ``(ColumnNullType, sentinel-or-mask value)`` pair, consumed by
# ``PandasColumn.describe_null``.
_NULL_DESCRIPTION = {
    DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
    DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
    DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None),
    DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None),
    DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None),
    # Null values for categoricals are stored as `-1` sentinel values
    # in the category data (e.g., `col.values.codes` is int8 np.ndarray)
    DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
    # follow Arrow in using 1 as valid value and 0 for missing/null value
    DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
}

52 

# Explanations (per null type) for why no separate validity buffer exists;
# used to build the NoBufferPresent message in ``_get_validity_buffer``.
_NO_VALIDITY_BUFFER = {
    ColumnNullType.NON_NULLABLE: "This column is non-nullable",
    ColumnNullType.USE_NAN: "This column uses NaN as null",
    ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
}

58 

59 

class PandasColumn(Column):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.
    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).
    Note: this Column object can only be produced by ``__dataframe__``, so
    doesn't need its own version or ``__column__`` protocol.
    """

    def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
        """
        Note: doesn't deal with extension arrays yet, just assume a regular
        Series/ndarray for now.
        """
        if not isinstance(column, pd.Series):
            raise NotImplementedError(f"Columns of type {type(column)} not handled yet")

        # Store the column as a private attribute
        self._col = column
        self._allow_copy = allow_copy

    def size(self) -> int:
        """
        Size of the column, in elements.
        """
        return self._col.size

    @property
    def offset(self) -> int:
        """
        Offset of first element. Always zero.
        """
        # TODO: chunks are implemented now, probably this should return something
        return 0

    @cache_readonly
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a ``(kind, bit-width, format string, endianness)``
        tuple.

        Raises NotImplementedError for object columns that do not hold strings
        and ValueError for dtypes not covered by the interchange protocol.
        """
        dtype = self._col.dtype

        if is_categorical_dtype(dtype):
            # The physical layout is that of the integer codes array; only the
            # reported kind is CATEGORICAL.
            codes = self._col.values.codes
            (
                _,
                bitwidth,
                c_arrow_dtype_f_str,
                _,
            ) = self._dtype_from_pandasdtype(codes.dtype)
            return (
                DtypeKind.CATEGORICAL,
                bitwidth,
                c_arrow_dtype_f_str,
                Endianness.NATIVE,
            )
        elif is_string_dtype(dtype):
            # An object dtype may hold anything; only report STRING when the
            # values actually are strings.
            if infer_dtype(self._col) == "string":
                return (
                    DtypeKind.STRING,
                    8,
                    dtype_to_arrow_c_fmt(dtype),
                    Endianness.NATIVE,
                )
            raise NotImplementedError("Non-string object dtypes are not supported yet")
        else:
            return self._dtype_from_pandasdtype(dtype)

    def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
        """
        See `self.dtype` for details.
        """
        # Note: 'c' (complex) not handled yet (not in array spec v1).
        # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
        # datetime and timedelta both map to datetime (is timedelta handled?)

        kind = _NP_KINDS.get(dtype.kind, None)
        if kind is None:
            # Not a NumPy dtype. Check if it's a categorical maybe
            raise ValueError(f"Data type {dtype} not supported by interchange protocol")

        return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder

    @property
    def describe_categorical(self):
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Content of returned dict:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
                             semantically meaningful.
            - "is_dictionary" : bool, whether a dictionary-style mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
                             category values (e.g. an array of cat1, cat2, ...).
                             None if not a dictionary-style categorical.
        """
        if not self.dtype[0] == DtypeKind.CATEGORICAL:
            raise TypeError(
                "describe_categorical only works on a column with categorical dtype!"
            )

        return {
            "is_ordered": self._col.cat.ordered,
            "is_dictionary": True,
            # Expose the categories themselves as another interchange column.
            "categories": PandasColumn(pd.Series(self._col.cat.categories)),
        }

    @property
    def describe_null(self):
        """
        Return the null representation for this column's dtype as a
        ``(ColumnNullType, value)`` pair looked up in ``_NULL_DESCRIPTION``.
        """
        kind = self.dtype[0]
        try:
            null, value = _NULL_DESCRIPTION[kind]
        except KeyError:
            raise NotImplementedError(f"Data type {kind} not yet supported")

        return null, value

    @cache_readonly
    def null_count(self) -> int:
        """
        Number of null elements. Should always be known.
        """
        # .item() converts the NumPy integer from sum() to a plain Python int.
        return self._col.isna().sum().item()

    @property
    def metadata(self) -> dict[str, pd.Index]:
        """
        Store specific metadata of the column.
        """
        return {"pandas.index": self._col.index}

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        return 1

    def get_chunks(self, n_chunks: int | None = None):
        """
        Return an iterator yielding the chunks.
        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """
        if n_chunks and n_chunks > 1:
            size = len(self._col)
            step = size // n_chunks
            if size % n_chunks != 0:
                # Round the chunk length up so every element is covered; the
                # trailing chunk(s) may then be shorter (possibly empty).
                step += 1
            for start in range(0, step * n_chunks, step):
                yield PandasColumn(
                    self._col.iloc[start : start + step], self._allow_copy
                )
        else:
            yield self

    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.
        The returned dictionary has the following contents:
            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
        buffers: ColumnBuffers = {
            "data": self._get_data_buffer(),
            "validity": None,
            "offsets": None,
        }

        # Validity and offsets buffers are optional; NoBufferPresent simply
        # means "not applicable for this dtype/null representation".
        try:
            buffers["validity"] = self._get_validity_buffer()
        except NoBufferPresent:
            pass

        try:
            buffers["offsets"] = self._get_offsets_buffer()
        except NoBufferPresent:
            pass

        return buffers

    def _get_data_buffer(
        self,
    ) -> tuple[PandasBuffer, Any]:  # Any is for self.dtype tuple
        """
        Return the buffer containing the data and the buffer's associated dtype.
        """
        if self.dtype[0] in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
            DtypeKind.DATETIME,
        ):
            buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy)
            dtype = self.dtype
        elif self.dtype[0] == DtypeKind.CATEGORICAL:
            codes = self._col.values._codes
            buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
            dtype = self._dtype_from_pandasdtype(codes.dtype)
        elif self.dtype[0] == DtypeKind.STRING:
            # Marshal the strings from a NumPy object array into a byte array
            buf = self._col.to_numpy()
            b = bytearray()

            # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
            for obj in buf:
                if isinstance(obj, str):
                    b.extend(obj.encode(encoding="utf-8"))

            # Convert the byte array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))

            # Define the dtype for the returned buffer
            dtype = (
                DtypeKind.STRING,
                8,
                ArrowCTypes.STRING,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")

        return buffer, dtype

    def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the mask values indicating missing data and
        the buffer's associated dtype.
        Raises NoBufferPresent if null representation is not a bit or byte mask.
        """
        null, invalid = self.describe_null

        if self.dtype[0] == DtypeKind.STRING:
            # For now, use byte array as the mask.
            # TODO: maybe store as bit array to save space?..
            buf = self._col.to_numpy()

            # Determine the encoding for valid values
            valid = invalid == 0
            invalid = not valid

            # Use the builtin ``bool`` here: ``np.bool8`` is a deprecated
            # alias that was removed in NumPy 1.24.
            mask = np.zeros(shape=(len(buf),), dtype=bool)
            for i, obj in enumerate(buf):
                mask[i] = valid if isinstance(obj, str) else invalid

            # Convert the mask array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(mask)

            # Define the dtype of the returned buffer
            dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)

            return buffer, dtype

        try:
            msg = _NO_VALIDITY_BUFFER[null] + " so does not have a separate mask"
        except KeyError:
            # TODO: implement for other bit/byte masks?
            raise NotImplementedError("See self.describe_null")

        raise NoBufferPresent(msg)

    def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the offset values for variable-size binary
        data (e.g., variable-length strings) and the buffer's associated dtype.
        Raises NoBufferPresent if the data buffer does not have an associated
        offsets buffer.
        """
        if self.dtype[0] == DtypeKind.STRING:
            # For each string, we need to manually determine the next offset
            values = self._col.to_numpy()
            ptr = 0
            offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
            for i, v in enumerate(values):
                # For missing values (in this case, `np.nan` values)
                # we don't increment the pointer
                if isinstance(v, str):
                    b = v.encode(encoding="utf-8")
                    ptr += len(b)

                offsets[i + 1] = ptr

            # Convert the offsets to a Pandas "buffer" using
            # the NumPy array as the backing store
            buffer = PandasBuffer(offsets)

            # Assemble the buffer dtype info
            dtype = (
                DtypeKind.INT,
                64,
                ArrowCTypes.INT64,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NoBufferPresent(
                "This column has a fixed-length dtype so "
                "it does not have an offsets buffer"
            )

        return buffer, dtype