Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/describe.py: 19%

1"""

2Module responsible for execution of NDFrame.describe() method.

4Method NDFrame.describe() delegates actual execution to function describe_ndframe().

5"""

6from __future__ import annotations

8from abc import (

9 ABC,

10 abstractmethod,

11)

12from typing import (

13 TYPE_CHECKING,

14 Any,

15 Callable,

16 Hashable,

17 Sequence,

18 cast,

19)

20import warnings

22import numpy as np

24from pandas._libs.tslibs import Timestamp

25from pandas._typing import (

26 DtypeObj,

27 NDFrameT,

28 npt,

29)

30from pandas.util._exceptions import find_stack_level

31from pandas.util._validators import validate_percentile

33from pandas.core.dtypes.common import (

34 is_bool_dtype,

35 is_complex_dtype,

36 is_datetime64_any_dtype,

37 is_extension_array_dtype,

38 is_numeric_dtype,

39 is_timedelta64_dtype,

40)

42import pandas as pd

43from pandas.core.reshape.concat import concat

45from pandas.io.formats.format import format_percentiles

47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true

48 from pandas import (

49 DataFrame,

50 Series,

51 )

def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    datetime_is_numeric: bool,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.
    """
    # Normalise once up front so both describers receive the same array.
    percentiles = refine_percentiles(percentiles)

    describer: NDFrameDescriberAbstract
    if obj.ndim == 1:
        # One-dimensional input: ``include``/``exclude`` are not forwarded.
        describer = SeriesDescriber(
            obj=cast("Series", obj),
            datetime_is_numeric=datetime_is_numeric,
        )
    else:
        describer = DataFrameDescriber(
            obj=cast("DataFrame", obj),
            include=include,
            exclude=exclude,
            datetime_is_numeric=datetime_is_numeric,
        )

    result = describer.describe(percentiles=percentiles)
    return cast(NDFrameT, result)

103

104

class NDFrameDescriberAbstract(ABC):
    """Abstract class for describing dataframe or series.

    Parameters
    ----------
    obj : Series or DataFrame
        Object to be described.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.
    """

    def __init__(self, obj: DataFrame | Series, datetime_is_numeric: bool) -> None:
        # Both values are only stored here; concrete ``describe``
        # implementations read them later.
        self.obj = obj
        self.datetime_is_numeric = datetime_is_numeric

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Do describe either series or dataframe.

        Subclasses must override this method; the base class cannot be
        instantiated while it remains abstract.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """

129

130

class SeriesDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating series description."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe the wrapped series.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.

        Returns
        -------
        Series
            Result of the describe function chosen by ``select_describe_func``.
        """
        describe_func = select_describe_func(
            self.obj,
            self.datetime_is_numeric,
        )
        return describe_func(self.obj, percentiles)

142

143

class DataFrameDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating DataFrame description.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        A white list of data types to include in the result.
    exclude : list-like of dtypes or None
        A black list of data types to omit from the result.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
        datetime_is_numeric: bool,
    ) -> None:
        self.include = include
        self.exclude = exclude

        if obj.ndim == 2 and obj.columns.size == 0:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj, datetime_is_numeric=datetime_is_numeric)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and combine the results.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.

        Returns
        -------
        DataFrame
            One column per selected input column, rows ordered by
            ``reorder_columns``.
        """
        data = self._select_data()

        # ``items()`` yields each column as a Series; the label is not needed
        # because the column index is restored from ``data.columns`` below.
        ldesc: list[Series] = [
            select_describe_func(series, self.datetime_is_numeric)(series, percentiles)
            for _, series in data.items()
        ]

        col_names = reorder_columns(ldesc)
        d = concat(
            [x.reindex(col_names, copy=False) for x in ldesc],
            axis=1,
            sort=False,
        )
        d.columns = data.columns.copy()
        return d

    def _select_data(self) -> DataFrame:
        """Select columns to be described.

        Raises
        ------
        ValueError
            If ``include`` is ``'all'`` while ``exclude`` is not None.
        """
        if (self.include is None) and (self.exclude is None):
            # when some numerics are found, keep only numerics
            default_include: list[npt.DTypeLike] = [np.number]
            if self.datetime_is_numeric:
                default_include.append("datetime")
            data = self.obj.select_dtypes(include=default_include)
            if len(data.columns) == 0:
                # Nothing matched the default selection: describe everything.
                data = self.obj
        elif self.include == "all":
            if self.exclude is not None:
                msg = "exclude must be None when include is 'all'"
                raise ValueError(msg)
            data = self.obj
        else:
            data = self.obj.select_dtypes(
                include=self.include,
                exclude=self.exclude,
            )
        return data

213

214

def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Set a convenient order for rows for display.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column descriptions whose index labels are to be merged.

    Returns
    -------
    list of Hashable
        Every distinct index label, in first-seen order when the indexes are
        visited from shortest to longest.
    """
    names: list[Hashable] = []
    # Companion set gives O(1) membership tests; ``names`` keeps the order.
    seen_names: set[Hashable] = set()
    # ``sorted`` is stable, so indexes of equal length keep their input order.
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in seen_names:
                seen_names.add(name)
                names.append(name)
    return names

224

225

def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        count, mean, std, min, the requested percentiles and max, indexed by
        statistic name and named after ``series``.
    """
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = (
        [series.count(), series.mean(), series.std(), series.min()]
        + series.quantile(percentiles).tolist()
        + [series.max()]
    )
    # GH#48340 - always return float on non-complex numeric data
    dtype: DtypeObj | None
    if is_extension_array_dtype(series):
        dtype = pd.Float64Dtype()
    elif is_numeric_dtype(series) and not is_complex_dtype(series):
        dtype = np.dtype("float")
    else:
        # Leave dtype inference to the Series constructor.
        dtype = None
    return Series(d, index=stat_index, name=series.name, dtype=dtype)

255

256

def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        ``count``, ``unique``, ``top`` and ``freq``, named after ``data``.
    """
    from pandas import Series

    names = ["count", "unique", "top", "freq"]
    objcounts = data.value_counts()
    # Only entries with a non-zero count are treated as observed values.
    count_unique = len(objcounts[objcounts != 0])
    if count_unique > 0:
        # The first entry of ``value_counts`` is taken as the most frequent.
        top, freq = objcounts.index[0], objcounts.iloc[0]
        dtype = None
    else:
        # If nothing was counted, set 'top' and 'freq' to NaN and force
        # object dtype to maintain output shape consistency
        top, freq = np.nan, np.nan
        dtype = "object"

    result = [data.count(), count_unique, top, freq]

    return Series(result, index=names, name=data.name, dtype=dtype)

287

288

def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        ``count``, ``unique``, ``top`` and ``freq``; ``first`` and ``last``
        are appended when at least one value was counted.
    """
    from pandas import Series

    names = ["count", "unique"]
    objcounts = data.value_counts()
    # Only entries with a non-zero count are treated as observed values.
    count_unique = len(objcounts[objcounts != 0])
    result = [data.count(), count_unique]
    dtype = None
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        tz = data.dt.tz
        top = Timestamp(top)
        if top.tzinfo is not None and tz is not None:
            # Don't tz_localize(None) if key is already tz-aware
            top = top.tz_convert(tz)
        else:
            top = top.tz_localize(tz)
        names += ["top", "freq", "first", "last"]
        # Take the extremes through the Series reductions instead of a raw
        # int64 view: ``Timestamp(int)`` reads the integer as nanoseconds,
        # which is wrong for data stored at another resolution.
        result += [
            top,
            freq,
            data.min(),
            data.max(),
        ]
    else:
        # If nothing was counted, set 'top' and 'freq' to NaN and force
        # object dtype to maintain output shape consistency
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    return Series(result, index=names, name=data.name, dtype=dtype)

335

336

def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        count, mean, min, the requested percentiles and max, indexed by
        statistic name and named after ``data``.
    """
    # GH-30164
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
    d = (
        [data.count(), data.mean(), data.min()]
        + data.quantile(percentiles).tolist()
        + [data.max()]
    )
    return Series(d, index=stat_index, name=data.name)

359

360

def select_describe_func(
    data: Series,
    datetime_is_numeric: bool,
) -> Callable:
    """Select proper function for describing series based on data type.

    Parameters
    ----------
    data : Series
        Series to be described.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Returns
    -------
    Callable
        One of the ``describe_*_1d`` functions, to be called as
        ``func(series, percentiles)``.
    """
    # The bool check runs before the numeric check, so boolean data is
    # always described as categorical.
    if is_bool_dtype(data.dtype):
        return describe_categorical_1d

    if is_numeric_dtype(data):
        return describe_numeric_1d

    if is_datetime64_any_dtype(data.dtype):
        if datetime_is_numeric:
            return describe_timestamp_1d

        warnings.warn(
            "Treating datetime data as categorical rather than numeric in "
            "`.describe` is deprecated and will be removed in a future "
            "version of pandas. Specify `datetime_is_numeric=True` to "
            "silence this warning and adopt the future behavior now.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return describe_timestamp_as_categorical_1d

    if is_timedelta64_dtype(data.dtype):
        # Timedeltas share the numeric summary.
        return describe_numeric_1d

    return describe_categorical_1d

395

396

def refine_percentiles(
    percentiles: Sequence[float] | np.ndarray | None,
) -> np.ndarray[Any, np.dtype[np.float64]]:
    """
    Ensure that percentiles are unique and sorted.

    The values are checked with ``validate_percentile`` and the median
    (0.5) is added when it is missing.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. ``None`` selects the
        default ``[0.25, 0.5, 0.75]``.

    Returns
    -------
    np.ndarray
        Sorted, duplicate-free percentiles.

    Raises
    ------
    ValueError
        If ``percentiles`` contains duplicates.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # explicit conversion of `percentiles` to list
    percentiles = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(percentiles)

    # median should always be included
    if 0.5 not in percentiles:
        percentiles.append(0.5)

    pcts = np.asarray(percentiles)

    # sort and check for duplicates
    unique_pcts = np.unique(pcts)
    if len(unique_pcts) < len(pcts):
        raise ValueError("percentiles cannot contain duplicates")

    return unique_pcts

429 return unique_pcts