Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/describe.py: 19%
149 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Module responsible for execution of NDFrame.describe() method.
4Method NDFrame.describe() delegates actual execution to function describe_ndframe().
5"""
6from __future__ import annotations
8from abc import (
9 ABC,
10 abstractmethod,
11)
12from typing import (
13 TYPE_CHECKING,
14 Any,
15 Callable,
16 Hashable,
17 Sequence,
18 cast,
19)
20import warnings
22import numpy as np
24from pandas._libs.tslibs import Timestamp
25from pandas._typing import (
26 DtypeObj,
27 NDFrameT,
28 npt,
29)
30from pandas.util._exceptions import find_stack_level
31from pandas.util._validators import validate_percentile
33from pandas.core.dtypes.common import (
34 is_bool_dtype,
35 is_complex_dtype,
36 is_datetime64_any_dtype,
37 is_extension_array_dtype,
38 is_numeric_dtype,
39 is_timedelta64_dtype,
40)
42import pandas as pd
43from pandas.core.reshape.concat import concat
45from pandas.io.formats.format import format_percentiles
47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true
48 from pandas import (
49 DataFrame,
50 Series,
51 )
def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    datetime_is_numeric: bool,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj : DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    DataFrame or Series
        Description of ``obj``, as produced by the describer selected below.
    """
    # Normalise once up front so both describers receive the same percentiles.
    percentiles = refine_percentiles(percentiles)

    describer: NDFrameDescriberAbstract
    if obj.ndim == 1:
        # One-dimensional input: include/exclude are not forwarded.
        describer = SeriesDescriber(
            obj=cast("Series", obj),
            datetime_is_numeric=datetime_is_numeric,
        )
    else:
        describer = DataFrameDescriber(
            obj=cast("DataFrame", obj),
            include=include,
            exclude=exclude,
            datetime_is_numeric=datetime_is_numeric,
        )

    result = describer.describe(percentiles=percentiles)
    return cast(NDFrameT, result)
class NDFrameDescriberAbstract(ABC):
    """Abstract class for describing dataframe or series.

    Parameters
    ----------
    obj : Series or DataFrame
        Object to be described.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.
    """

    def __init__(self, obj: DataFrame | Series, datetime_is_numeric: bool) -> None:
        # Both values are stored unchanged for use by concrete ``describe``
        # implementations.
        self.obj = obj
        self.datetime_is_numeric = datetime_is_numeric

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Do describe either series or dataframe.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
class SeriesDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating series description."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe the wrapped series.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.

        Returns
        -------
        Series
            Result of the describe function selected for the series.
        """
        # The concrete describe function is selected by ``select_describe_func``
        # from the series and the ``datetime_is_numeric`` flag.
        describe_func = select_describe_func(
            self.obj,
            self.datetime_is_numeric,
        )
        return describe_func(self.obj, percentiles)
class DataFrameDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating DataFrame description.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        A white list of data types to include in the result.
    exclude : list-like of dtypes or None
        A black list of data types to omit from the result.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
        datetime_is_numeric: bool,
    ) -> None:
        self.include = include
        self.exclude = exclude

        if obj.ndim == 2 and obj.columns.size == 0:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj, datetime_is_numeric=datetime_is_numeric)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and combine the results.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.

        Returns
        -------
        DataFrame
            One column per selected input column, one row per statistic.
        """
        data = self._select_data()

        # Describe each selected column independently.
        ldesc: list[Series] = []
        for _, series in data.items():
            describe_func = select_describe_func(series, self.datetime_is_numeric)
            ldesc.append(describe_func(series, percentiles))

        # Align every per-column description on a common row order before
        # concatenating them side by side.
        col_names = reorder_columns(ldesc)
        d = concat(
            [x.reindex(col_names, copy=False) for x in ldesc],
            axis=1,
            sort=False,
        )
        d.columns = data.columns.copy()
        return d

    def _select_data(self) -> DataFrame:
        """Select columns to be described.

        Raises
        ------
        ValueError
            If ``include`` is ``'all'`` and ``exclude`` is not None.
        """
        if (self.include is None) and (self.exclude is None):
            # when some numerics are found, keep only numerics
            default_include: list[npt.DTypeLike] = [np.number]
            if self.datetime_is_numeric:
                default_include.append("datetime")
            data = self.obj.select_dtypes(include=default_include)
            if len(data.columns) == 0:
                # Nothing matched the default selection: describe all columns.
                data = self.obj
        elif self.include == "all":
            if self.exclude is not None:
                msg = "exclude must be None when include is 'all'"
                raise ValueError(msg)
            data = self.obj
        else:
            data = self.obj.select_dtypes(
                include=self.include,
                exclude=self.exclude,
            )
        return data
def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Set a convenient order for rows for display.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column descriptions whose index labels are to be merged.

    Returns
    -------
    list of Hashable
        Every distinct index label, in first-seen order, visiting the
        descriptions from shortest index to longest.
    """
    names: list[Hashable] = []
    # Membership is tracked in a set so each lookup is O(1) rather than a
    # scan of ``names``; ``names`` alone preserves the output order.
    seen: set[Hashable] = set()
    # ``sorted`` is stable, so descriptions of equal length keep input order.
    for idxnames in sorted((x.index for x in ldesc), key=len):
        for name in idxnames:
            if name not in seen:
                seen.add(name)
                names.append(name)
    return names
def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Values for count, mean, std, min, each requested percentile and max,
        named after ``series``.
    """
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = (
        [series.count(), series.mean(), series.std(), series.min()]
        + series.quantile(percentiles).tolist()
        + [series.max()]
    )
    # GH#48340 - always return float on non-complex numeric data
    dtype: DtypeObj | None
    if is_extension_array_dtype(series):
        dtype = pd.Float64Dtype()
    elif is_numeric_dtype(series) and not is_complex_dtype(series):
        dtype = np.dtype("float")
    else:
        # Leave the result dtype to Series inference.
        dtype = None
    return Series(d, index=stat_index, name=series.name, dtype=dtype)
def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Values for count, unique, top and freq, named after ``data``.
    """
    from pandas import Series

    names = ["count", "unique", "top", "freq"]
    objcounts = data.value_counts()
    # Only values with a non-zero count are treated as present.
    count_unique = len(objcounts[objcounts != 0])
    if count_unique > 0:
        # ``top``/``freq`` come from the first entry of ``value_counts``.
        top, freq = objcounts.index[0], objcounts.iloc[0]
        dtype = None
    else:
        # If the Series has no counted values, set 'top' and 'freq' to NaN
        # to maintain output shape consistency
        top, freq = np.nan, np.nan
        dtype = "object"

    result = [data.count(), count_unique, top, freq]

    return Series(result, index=names, name=data.name, dtype=dtype)
def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Values for count, unique, top and freq, plus first and last when the
        series has at least one counted value; named after ``data``.
    """
    from pandas import Series

    names = ["count", "unique"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    result = [data.count(), count_unique]
    dtype = None
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        tz = data.dt.tz
        # Integer view of the non-missing values, used for first/last below.
        asint = data.dropna().values.view("i8")
        top = Timestamp(top)
        if top.tzinfo is not None and tz is not None:
            # Don't tz_localize(None) if key is already tz-aware
            top = top.tz_convert(tz)
        else:
            top = top.tz_localize(tz)
        names += ["top", "freq", "first", "last"]
        result += [
            top,
            freq,
            Timestamp(asint.min(), tz=tz),
            Timestamp(asint.max(), tz=tz),
        ]
    else:
        # If the Series has no counted values, set 'top' and 'freq' to NaN
        # to maintain output shape consistency
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    return Series(result, index=names, name=data.name, dtype=dtype)
def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Values for count, mean, min, each requested percentile and max,
        named after ``data``.
    """
    # GH-30164
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
    d = (
        [data.count(), data.mean(), data.min()]
        + data.quantile(percentiles).tolist()
        + [data.max()]
    )
    return Series(d, index=stat_index, name=data.name)
def select_describe_func(
    data: Series,
    datetime_is_numeric: bool,
) -> Callable:
    """Select proper function for describing series based on data type.

    Parameters
    ----------
    data : Series
        Series to be described.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Returns
    -------
    Callable
        One of the ``describe_*_1d`` functions; each takes the series and the
        percentiles.
    """
    # Bool is checked before the numeric test, so boolean data is always
    # described categorically.
    if is_bool_dtype(data.dtype):
        return describe_categorical_1d
    elif is_numeric_dtype(data):
        return describe_numeric_1d
    elif is_datetime64_any_dtype(data.dtype):
        if datetime_is_numeric:
            return describe_timestamp_1d
        else:
            warnings.warn(
                "Treating datetime data as categorical rather than numeric in "
                "`.describe` is deprecated and will be removed in a future "
                "version of pandas. Specify `datetime_is_numeric=True` to "
                "silence this warning and adopt the future behavior now.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            return describe_timestamp_as_categorical_1d
    elif is_timedelta64_dtype(data.dtype):
        return describe_numeric_1d
    else:
        return describe_categorical_1d
def refine_percentiles(
    percentiles: Sequence[float] | np.ndarray | None,
) -> np.ndarray[Any, np.dtype[np.float64]]:
    """
    Ensure that percentiles are unique and sorted.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output.

    Returns
    -------
    np.ndarray
        Sorted percentiles, always containing 0.5. When ``percentiles`` is
        None, ``[0.25, 0.5, 0.75]`` is returned.

    Raises
    ------
    ValueError
        If ``percentiles`` contains duplicates. ``validate_percentile`` may
        also raise for values it rejects.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # explicit conversion of `percentiles` to list
    pct_list = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(pct_list)

    # median should always be included
    if 0.5 not in pct_list:
        pct_list.append(0.5)

    pct_array = np.asarray(pct_list)

    # sort and check for duplicates
    unique_pcts = np.unique(pct_array)
    if len(unique_pcts) < len(pct_array):
        raise ValueError("percentiles cannot contain duplicates")

    return unique_pcts