Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/describe.py: 19%

149 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Module responsible for execution of NDFrame.describe() method. 

3 

4Method NDFrame.describe() delegates actual execution to function describe_ndframe(). 

5""" 

6from __future__ import annotations 

7 

8from abc import ( 

9 ABC, 

10 abstractmethod, 

11) 

12from typing import ( 

13 TYPE_CHECKING, 

14 Any, 

15 Callable, 

16 Hashable, 

17 Sequence, 

18 cast, 

19) 

20import warnings 

21 

22import numpy as np 

23 

24from pandas._libs.tslibs import Timestamp 

25from pandas._typing import ( 

26 DtypeObj, 

27 NDFrameT, 

28 npt, 

29) 

30from pandas.util._exceptions import find_stack_level 

31from pandas.util._validators import validate_percentile 

32 

33from pandas.core.dtypes.common import ( 

34 is_bool_dtype, 

35 is_complex_dtype, 

36 is_datetime64_any_dtype, 

37 is_extension_array_dtype, 

38 is_numeric_dtype, 

39 is_timedelta64_dtype, 

40) 

41 

42import pandas as pd 

43from pandas.core.reshape.concat import concat 

44 

45from pandas.io.formats.format import format_percentiles 

46 

47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true

48 from pandas import ( 

49 DataFrame, 

50 Series, 

51 ) 

52 

53 

def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    datetime_is_numeric: bool,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.
    """
    # Normalise first so both describers receive a sorted, duplicate-free array.
    percentiles = refine_percentiles(percentiles)

    describer: NDFrameDescriberAbstract
    if obj.ndim == 1:
        # One-dimensional input: ``include``/``exclude`` are not forwarded.
        describer = SeriesDescriber(
            obj=cast("Series", obj),
            datetime_is_numeric=datetime_is_numeric,
        )
    else:
        describer = DataFrameDescriber(
            obj=cast("DataFrame", obj),
            include=include,
            exclude=exclude,
            datetime_is_numeric=datetime_is_numeric,
        )

    result = describer.describe(percentiles=percentiles)
    return cast(NDFrameT, result)

103 

104 

class NDFrameDescriberAbstract(ABC):
    """Abstract class for describing dataframe or series.

    Parameters
    ----------
    obj : Series or DataFrame
        Object to be described.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.
    """

    def __init__(self, obj: DataFrame | Series, datetime_is_numeric: bool) -> None:
        # Both values are stored unchanged; subclasses read them in ``describe``.
        self.obj = obj
        self.datetime_is_numeric = datetime_is_numeric

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Do describe either series or dataframe.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """

129 

130 

class SeriesDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating series description."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe the wrapped series.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.

        Returns
        -------
        Series
            Result of the dtype-specific describe function chosen by
            ``select_describe_func``.
        """
        describe_func = select_describe_func(
            self.obj,
            self.datetime_is_numeric,
        )
        return describe_func(self.obj, percentiles)

142 

143 

class DataFrameDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating dataobj description.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        A white list of data types to include in the result.
    exclude : list-like of dtypes or None
        A black list of data types to omit from the result.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
        datetime_is_numeric: bool,
    ) -> None:
        self.include = include
        self.exclude = exclude

        # Reject column-less frames before any further state is stored.
        if obj.ndim == 2 and obj.columns.size == 0:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj, datetime_is_numeric=datetime_is_numeric)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and assemble the results column-wise.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.

        Returns
        -------
        DataFrame
            One column per selected input column; rows follow the order
            produced by ``reorder_columns``.
        """
        data = self._select_data()

        # Each column is dispatched on its own dtype, so the per-column
        # results may carry different row labels.
        ldesc: list[Series] = [
            select_describe_func(series, self.datetime_is_numeric)(series, percentiles)
            for _, series in data.items()
        ]

        col_names = reorder_columns(ldesc)
        # Reindex every per-column result onto the shared row order before
        # concatenating along the column axis.
        d = concat(
            [x.reindex(col_names, copy=False) for x in ldesc],
            axis=1,
            sort=False,
        )
        d.columns = data.columns.copy()
        return d

    def _select_data(self) -> DataFrame:
        """Select columns to be described.

        Raises
        ------
        ValueError
            If ``include == "all"`` while ``exclude`` is not None.
        """
        if (self.include is None) and (self.exclude is None):
            # when some numerics are found, keep only numerics
            default_include: list[npt.DTypeLike] = [np.number]
            if self.datetime_is_numeric:
                default_include.append("datetime")
            data = self.obj.select_dtypes(include=default_include)
            if len(data.columns) == 0:
                # Nothing matched the default dtypes: describe all columns.
                data = self.obj
        elif self.include == "all":
            if self.exclude is not None:
                msg = "exclude must be None when include is 'all'"
                raise ValueError(msg)
            data = self.obj
        else:
            data = self.obj.select_dtypes(
                include=self.include,
                exclude=self.exclude,
            )
        return data

213 

214 

def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Set a convenient order for rows for display.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column describe results whose index labels are to be merged.

    Returns
    -------
    list of Hashable
        Every distinct index label, in first-seen order, visiting the
        shortest indexes first.
    """
    # ``sorted`` is stable, so indexes of equal length keep their input order.
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    # A dict preserves insertion order and drops repeats in a single pass,
    # replacing the quadratic ``name not in list`` scan.
    return list(
        dict.fromkeys(name for idxnames in ldesc_indexes for name in idxnames)
    )

224 

225 

def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Indexed by ``count``, ``mean``, ``std``, ``min``, the formatted
        percentile labels and ``max``; carries the name of ``series``.
    """
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = (
        [series.count(), series.mean(), series.std(), series.min()]
        + series.quantile(percentiles).tolist()
        + [series.max()]
    )
    # GH#48340 - always return float on non-complex numeric data
    dtype: DtypeObj | None
    if is_extension_array_dtype(series):
        dtype = pd.Float64Dtype()
    elif is_numeric_dtype(series) and not is_complex_dtype(series):
        dtype = np.dtype("float")
    else:
        # Leave dtype inference to the Series constructor.
        dtype = None
    return Series(d, index=stat_index, name=series.name, dtype=dtype)

255 

256 

def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Indexed by ``count``, ``unique``, ``top`` and ``freq``; carries the
        name of ``data``.
    """
    from pandas import Series

    names = ["count", "unique", "top", "freq"]
    objcounts = data.value_counts()
    # Only values that actually occur are counted as unique.
    count_unique = len(objcounts[objcounts != 0])
    if count_unique > 0:
        # The first ``value_counts`` entry supplies ``top`` and ``freq``.
        top, freq = objcounts.index[0], objcounts.iloc[0]
        dtype = None
    else:
        # No values were counted: fill 'top' and 'freq' with NaN so the
        # output keeps the same four rows, and force object dtype.
        top, freq = np.nan, np.nan
        dtype = "object"

    result = [data.count(), count_unique, top, freq]

    return Series(result, index=names, name=data.name, dtype=dtype)

287 

288 

def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Indexed by ``count``, ``unique``, ``top`` and ``freq``, followed by
        ``first`` and ``last`` when at least one value was counted; carries
        the name of ``data``.
    """
    from pandas import Series

    names = ["count", "unique"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    result = [data.count(), count_unique]
    dtype = None
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        tz = data.dt.tz
        # Integer view of the non-null values, used for the first/last rows.
        # NOTE(review): ``Timestamp(int)`` below presumably expects
        # nanosecond-resolution data - confirm for other units.
        asint = data.dropna().values.view("i8")
        top = Timestamp(top)
        if top.tzinfo is not None and tz is not None:
            # Don't tz_localize(None) if key is already tz-aware
            top = top.tz_convert(tz)
        else:
            top = top.tz_localize(tz)
        names += ["top", "freq", "first", "last"]
        result += [
            top,
            freq,
            Timestamp(asint.min(), tz=tz),
            Timestamp(asint.max(), tz=tz),
        ]
    else:
        # No values were counted: fill 'top' and 'freq' with NaN so those
        # rows are still present, and force object dtype.
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    return Series(result, index=names, name=data.name, dtype=dtype)

335 

336 

def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Indexed by ``count``, ``mean``, ``min``, the formatted percentile
        labels and ``max``; carries the name of ``data``.
    """
    # GH-30164
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    # Unlike the numeric variant, no ``std`` row is produced here.
    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
    d = (
        [data.count(), data.mean(), data.min()]
        + data.quantile(percentiles).tolist()
        + [data.max()]
    )
    return Series(d, index=stat_index, name=data.name)

359 

360 

def select_describe_func(
    data: Series,
    datetime_is_numeric: bool,
) -> Callable:
    """Select proper function for describing series based on data type.

    Parameters
    ----------
    data : Series
        Series to be described.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Returns
    -------
    Callable
        One of the ``describe_*_1d`` functions; each takes the series and the
        percentiles.

    Warns
    -----
    FutureWarning
        If ``data`` has a datetime64 dtype and ``datetime_is_numeric`` is False.
    """
    # The bool check comes before the numeric check, so bool data is always
    # described as categorical.
    if is_bool_dtype(data.dtype):
        return describe_categorical_1d
    elif is_numeric_dtype(data):
        return describe_numeric_1d
    elif is_datetime64_any_dtype(data.dtype):
        if datetime_is_numeric:
            return describe_timestamp_1d
        else:
            warnings.warn(
                "Treating datetime data as categorical rather than numeric in "
                "`.describe` is deprecated and will be removed in a future "
                "version of pandas. Specify `datetime_is_numeric=True` to "
                "silence this warning and adopt the future behavior now.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            return describe_timestamp_as_categorical_1d
    elif is_timedelta64_dtype(data.dtype):
        # Timedelta data shares the numeric describe function.
        return describe_numeric_1d
    else:
        return describe_categorical_1d

395 

396 

def refine_percentiles(
    percentiles: Sequence[float] | np.ndarray | None,
) -> np.ndarray[Any, np.dtype[np.float64]]:
    """
    Ensure that percentiles are unique and sorted.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output.

    Returns
    -------
    np.ndarray
        Sorted percentiles, always containing the median (0.5). When
        ``percentiles`` is None this is ``[0.25, 0.5, 0.75]``.

    Raises
    ------
    ValueError
        If ``percentiles`` contains duplicates. ``validate_percentile`` is
        also called on the input and may raise on its own.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # Work on a private list so the caller's sequence is never mutated.
    pct_list = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(pct_list)

    # median should always be included
    if 0.5 not in pct_list:
        pct_list.append(0.5)

    # ``np.unique`` sorts and drops repeats, so a shorter result means the
    # input held duplicates.
    unique_pcts = np.unique(np.asarray(pct_list))
    if len(unique_pcts) < len(pct_list):
        raise ValueError("percentiles cannot contain duplicates")

    return unique_pcts