Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/arrays/sparse/accessor.py: 27%

110 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1"""Sparse accessor""" 

2from __future__ import annotations 

3 

4from typing import TYPE_CHECKING 

5 

6import numpy as np 

7 

8from pandas.compat._optional import import_optional_dependency 

9 

10from pandas.core.dtypes.cast import find_common_type 

11 

12from pandas.core.accessor import ( 

13 PandasDelegate, 

14 delegate_names, 

15) 

16from pandas.core.arrays.sparse.array import SparseArray 

17from pandas.core.arrays.sparse.dtype import SparseDtype 

18 

19if TYPE_CHECKING: 19 ↛ 20line 19 didn't jump to line 20, because the condition on line 19 was never true

20 from pandas import ( 

21 DataFrame, 

22 Series, 

23 ) 

24 

25 

class BaseAccessor:
    """
    Shared base for the ``.sparse`` accessors.

    On construction the wrapped object is stored on ``_parent`` and then
    handed to :meth:`_validate`, which each concrete accessor overrides to
    reject objects that do not hold sparse data.
    """

    # Error text used by subclasses when validation fails.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        # The parent is recorded before validation runs, so it is already
        # set on the instance when ``_validate`` is called.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Validation hook; concrete accessors must override this."""
        raise NotImplementedError

35 

36 

@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for a Series holding sparse values.

    Exposes the ``npoints``, ``density``, ``fill_value`` and ``sp_values``
    properties of the underlying :class:`SparseArray`, plus conversions to
    and from ``scipy.sparse.coo_matrix`` and to a dense Series.
    """

    def _validate(self, data):
        """Raise ``AttributeError`` unless ``data`` has a ``SparseDtype``."""
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        """Read the delegated property ``name`` from the parent's array."""
        return getattr(self._parent.array, name)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Dispatch a delegated method call by name.

        Only ``"from_coo"`` and ``"to_coo"`` are supported; any other name
        raises ``ValueError``.
        """
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            # Same exception type as before, now with the offending name so
            # the failure can be diagnosed.
            raise ValueError(
                f"Cannot delegate method {name!r}; "
                "only 'from_coo' and 'to_coo' are supported."
            )

    @classmethod
    def from_coo(cls, A, dense_index=False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
        with 3 stored elements in COOrdinate format>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        # Rebuild as a plain Series around the same array and index,
        # without copying the data.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
        with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        Series:
            A Series with the same values, stored as a dense array.
            The parent's index and name are carried over.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
        )

225 

226 

class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Usable only when every column of the DataFrame has a ``SparseDtype``.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        """Raise ``AttributeError`` unless every column dtype is sparse."""
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Raises
        ------
        ValueError
            If ``index`` or ``columns`` is given and its length does not
            match the corresponding dimension of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False ).
        # Indices may already be sorted in scipy in which case this adds
        # a small overhead.
        # NOTE(review): if tocsc() hands back the caller's own matrix, this
        # in-place sort is visible to the caller -- confirm against scipy.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        # Entries not stored in the matrix are represented by fill value 0.
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            # indptr[i]:indptr[i + 1] delimits column i's stored entries.
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense() for k, v in self._parent.items()}
        return DataFrame(data, index=self._parent.index, columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ValueError
            If any column has a fill value other than 0.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            # The COO matrix stores plain values, so use the wrapped subtype.
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            # Row positions of this column's stored values, paired with a
            # column position repeated once per stored value.
            row = sp_arr.sp_index.indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.

        Computed as the mean of the per-column densities.
        """
        per_column = [column.array.density for _, column in self._parent.items()]
        # np.mean yields a NumPy floating scalar; convert it so the returned
        # value matches the annotated built-in ``float``.
        return float(np.mean(per_column))

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Resolve and validate row/column labels against ``data.shape``.

        Missing labels are replaced by a default index of the matching
        length; supplied labels are passed through ``ensure_index``.

        Returns
        -------
        tuple
            ``(index, columns)``.

        Raises
        ------
        ValueError
            If the length of ``columns`` or ``index`` does not match the
            corresponding dimension of ``data``.
        """
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns
396 return index, columns