Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/dtypes/concat.py: 9%
110 statements
« prev ^ index » next — coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Utility functions related to concat.
3"""
4from __future__ import annotations
6from typing import (
7 TYPE_CHECKING,
8 cast,
9)
10import warnings
12import numpy as np
14from pandas._typing import (
15 ArrayLike,
16 DtypeObj,
17)
18from pandas.util._exceptions import find_stack_level
20from pandas.core.dtypes.astype import astype_array
21from pandas.core.dtypes.cast import (
22 common_dtype_categorical_compat,
23 find_common_type,
24)
25from pandas.core.dtypes.common import (
26 is_dtype_equal,
27 is_sparse,
28)
29from pandas.core.dtypes.dtypes import (
30 DatetimeTZDtype,
31 ExtensionDtype,
32)
33from pandas.core.dtypes.generic import (
34 ABCCategoricalIndex,
35 ABCExtensionArray,
36 ABCSeries,
37)
39if TYPE_CHECKING: 39 ↛ 40line 39 didn't jump to line 40, because the condition on line 39 was never true
40 from pandas.core.arrays import Categorical
41 from pandas.core.arrays.sparse import SparseArray
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Cast ``arr`` to ``dtype``, covering the special cases that a plain
    ``arr.astype(dtype)`` would handle incorrectly.

    Parameters
    ----------
    arr : ArrayLike
        Array to cast.
    dtype : DtypeObj
        Target dtype.

    Returns
    -------
    ArrayLike
        ``arr`` itself when the dtype already matches, otherwise a cast copy.
    """
    if not is_dtype_equal(arr.dtype, dtype):
        if is_sparse(arr) and not is_sparse(dtype):
            # TODO(2.0): remove special case once SparseArray.astype deprecation
            # is enforced.
            # SparseArray.astype(dtype) does not honor the requested dtype
            # exactly: it produces Sparse[dtype] instead.  Densify first so the
            # result really carries ``dtype``.
            sparse_arr = cast("SparseArray", arr)
            return sparse_arr.to_dense().astype(dtype, copy=False)  # type: ignore[arg-type]

        # astype_array includes ensure_wrapped_if_datetimelike
        return astype_array(arr, dtype=dtype, copy=False)

    # Already the right dtype: hand back the input unchanged.
    return arr
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    def _has_data(arr) -> bool:
        # 1-d values are always treated as non-empty; otherwise drop pieces
        # that are empty along the concatenation axis.
        return arr.ndim <= axis or arr.shape[axis] > 0

    # If every array is empty there is nothing to coerce; fall through to the
    # plain concatenation, #3121.  Building an empty result directly would
    # still need shape & dtype resolution, and np.concatenate already does
    # both in compiled code.
    non_empties = [arr for arr in to_concat if _has_data(arr)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    dtypes = {obj.dtype for obj in to_concat}
    kinds = {obj.dtype.kind for obj in to_concat}

    def _is_datetimelike(dtype) -> bool:
        return isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in "mM"

    contains_datetime = any(_is_datetimelike(dtype) for dtype in dtypes) or any(
        isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat
    )

    all_empty = len(non_empties) == 0
    single_dtype = len(dtypes) == 1
    any_ea = any(isinstance(obj.dtype, ExtensionDtype) for obj in to_concat)

    if contains_datetime:
        return _concat_datetime(to_concat, axis=axis)

    if any_ea:
        # axis is irrelevant here: concatenating EAs internally is always
        # performed along axis=0
        if not single_dtype:
            target_dtype = find_common_type([obj.dtype for obj in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
            to_concat = [cast_to_common_type(obj, target_dtype) for obj in to_concat]

        head = to_concat[0]
        if isinstance(head, ABCExtensionArray):
            # TODO: what about EA-backed Index?
            return type(head)._concat_same_type(to_concat)
        return np.concatenate(to_concat)

    if all_empty and len(kinds) != 1:
        # all empties with non-numeric operand kinds: coerce to object so
        # numpy does not silently cast the result to float
        numeric_only = not (kinds - {"i", "u", "f"})
        bool_int_only = not (kinds - {"b", "i", "u"})
        if not (numeric_only or bool_int_only):
            # coerce to object
            to_concat = [obj.astype("object") for obj in to_concat]
            kinds = {"o"}

    result = np.concatenate(to_concat, axis=axis)
    if "b" in kinds and result.dtype.kind in "iuf":
        # GH#39817
        warnings.warn(
            "Behavior when concatenating bool-dtype and numeric-dtype arrays is "
            "deprecated; in a future version these will cast to object dtype "
            "(instead of coercing bools to numeric values). To retain the old "
            "behavior, explicitly cast bool-dtype arrays to numeric dtype.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    return result
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    Combining categoricals with differing categories unions the categories,
    ordered as they appear in the data:

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    Pass `sort_categories=True` for lexsorted categories:

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    Identical ordered categoricals can be combined (appended):

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Ordered categoricals with non-identical categories raise `TypeError`:

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    Such inputs can still be combined with the `ignore_order=True` argument:

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `CategoricalIndex` and `Series` with categorical data also work, but the
    result is always a plain `Categorical`:

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _unwrap(obj):
        # Series / CategoricalIndex -> the underlying Categorical values.
        if isinstance(obj, (ABCCategoricalIndex, ABCSeries)):
            return obj._values
        if isinstance(obj, Categorical):
            return obj
        raise TypeError("all components to combine must be Categorical")

    cats = [_unwrap(obj) for obj in to_union]
    head = cats[0]

    if any(
        not is_dtype_equal(other.categories.dtype, head.categories.dtype)
        for other in cats[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(head._categories_match_up_to_permutation(other) for other in cats[1:]):
        # identical categories - fastpath
        categories = head.categories
        ordered = head.ordered

        new_codes = np.concatenate(
            [head._encode_with_my_categories(c)._codes for c in cats]
        )

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(head.categories)

            from pandas.core.algorithms import take_nd

            # remap codes onto the sorted category order
            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in cats):
        # different categories - union and recode
        combined = head.categories.append([c.categories for c in cats[1:]])
        categories = combined.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = np.concatenate(
            [recode_for_categories(c.codes, c.categories, categories) for c in cats]
        )
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in cats):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
331def _concatenate_2d(to_concat, axis: int):
332 # coerce to 2d if needed & concatenate
333 if axis == 1:
334 to_concat = [np.atleast_2d(x) for x in to_concat]
335 return np.concatenate(to_concat, axis=axis)
338def _concat_datetime(to_concat, axis=0):
339 """
340 provide concatenation of an datetimelike array of arrays each of which is a
341 single M8[ns], datetime64[ns, tz] or m8[ns] dtype
343 Parameters
344 ----------
345 to_concat : array of arrays
346 axis : axis to provide concatenation
348 Returns
349 -------
350 a single array, preserving the combined dtypes
351 """
352 from pandas.core.construction import ensure_wrapped_if_datetimelike
354 to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]
356 single_dtype = len({x.dtype for x in to_concat}) == 1
358 # multiple types, need to coerce to object
359 if not single_dtype:
360 # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
361 # in Timestamp/Timedelta
362 return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)
364 result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
365 return result