Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/dtypes/concat.py: 9%
110 statements
« prev ^ index » next — coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2Utility functions related to concat.
3"""
4from __future__ import annotations
6from typing import (
7 TYPE_CHECKING,
8 cast,
9)
10import warnings
12import numpy as np
14from pandas._typing import (
15 ArrayLike,
16 DtypeObj,
17)
18from pandas.util._exceptions import find_stack_level
20from pandas.core.dtypes.astype import astype_array
21from pandas.core.dtypes.cast import (
22 common_dtype_categorical_compat,
23 find_common_type,
24)
25from pandas.core.dtypes.common import (
26 is_dtype_equal,
27 is_sparse,
28)
29from pandas.core.dtypes.dtypes import (
30 DatetimeTZDtype,
31 ExtensionDtype,
32)
33from pandas.core.dtypes.generic import (
34 ABCCategoricalIndex,
35 ABCExtensionArray,
36 ABCSeries,
37)
39if TYPE_CHECKING: 39 ↛ 40line 39 didn't jump to line 40, because the condition on line 39 was never true
40 from pandas.core.arrays import Categorical
41 from pandas.core.arrays.sparse import SparseArray
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Cast ``arr`` to ``dtype``, covering the special cases that a plain
    ``arr.astype(dtype)`` would handle incorrectly.

    Parameters
    ----------
    arr : ArrayLike
        Array to cast.
    dtype : DtypeObj
        Target dtype.

    Returns
    -------
    ArrayLike
        ``arr`` itself when the dtype already matches, otherwise a cast copy.
    """
    if not is_dtype_equal(arr.dtype, dtype):
        if is_sparse(arr) and not is_sparse(dtype):
            # TODO(2.0): remove special case once SparseArray.astype deprecation
            # is enforced.
            # SparseArray.astype(dtype) does not honor the requested dtype
            # exactly: it produces Sparse[dtype] instead.  Densify first so the
            # result really carries ``dtype``.
            sparse_arr = cast("SparseArray", arr)
            return sparse_arr.to_dense().astype(dtype, copy=False)  # type: ignore[arg-type]

        # astype_array includes ensure_wrapped_if_datetimelike
        return astype_array(arr, dtype=dtype, copy=False)

    # Already the right dtype: hand back the input unchanged.
    return arr
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    def _has_data(arr) -> bool:
        # 1-d values are always treated as non-empty; otherwise drop pieces
        # that are empty along the concatenation axis.
        return arr.ndim <= axis or arr.shape[axis] > 0

    # If every array is empty there is nothing to coerce; fall through to the
    # plain concatenation, #3121.  Building an empty result directly would
    # still need shape & dtype resolution, and np.concatenate already does
    # both in compiled code.
    non_empties = [arr for arr in to_concat if _has_data(arr)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    dtypes = {obj.dtype for obj in to_concat}
    kinds = {obj.dtype.kind for obj in to_concat}

    def _is_datetimelike(dtype) -> bool:
        return isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in "mM"

    contains_datetime = any(_is_datetimelike(dtype) for dtype in dtypes) or any(
        isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat
    )

    all_empty = len(non_empties) == 0
    single_dtype = len(dtypes) == 1
    any_ea = any(isinstance(obj.dtype, ExtensionDtype) for obj in to_concat)

    if contains_datetime:
        return _concat_datetime(to_concat, axis=axis)

    if any_ea:
        # axis is irrelevant here: concatenating EAs internally is always
        # performed along axis=0
        if not single_dtype:
            target_dtype = find_common_type([obj.dtype for obj in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
            to_concat = [cast_to_common_type(obj, target_dtype) for obj in to_concat]

        head = to_concat[0]
        if isinstance(head, ABCExtensionArray):
            # TODO: what about EA-backed Index?
            return type(head)._concat_same_type(to_concat)
        return np.concatenate(to_concat)

    if all_empty and len(kinds) != 1:
        # all empties with non-numeric operand kinds: coerce to object so
        # numpy does not silently cast the result to float
        numeric_only = not (kinds - {"i", "u", "f"})
        bool_int_only = not (kinds - {"b", "i", "u"})
        if not (numeric_only or bool_int_only):
            # coerce to object
            to_concat = [obj.astype("object") for obj in to_concat]
            kinds = {"o"}

    result = np.concatenate(to_concat, axis=axis)
    if "b" in kinds and result.dtype.kind in "iuf":
        # GH#39817
        warnings.warn(
            "Behavior when concatenating bool-dtype and numeric-dtype arrays is "
            "deprecated; in a future version these will cast to object dtype "
            "(instead of coercing bools to numeric values). To retain the old "
            "behavior, explicitly cast bool-dtype arrays to numeric dtype.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    return result
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    Combining categoricals with differing categories unions the categories,
    ordered as they appear in the data:

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    Pass `sort_categories=True` for lexsorted categories:

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    Identical ordered categoricals can be combined (appended):

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Ordered categoricals with non-identical categories raise `TypeError`:

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    Such inputs can still be combined with the `ignore_order=True` argument:

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `CategoricalIndex` and `Series` with categorical data also work, but the
    result is always a plain `Categorical`:

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _unwrap(obj):
        # Series / CategoricalIndex -> the underlying Categorical values.
        if isinstance(obj, (ABCCategoricalIndex, ABCSeries)):
            return obj._values
        if isinstance(obj, Categorical):
            return obj
        raise TypeError("all components to combine must be Categorical")

    cats = [_unwrap(obj) for obj in to_union]
    head = cats[0]

    if any(
        not is_dtype_equal(other.categories.dtype, head.categories.dtype)
        for other in cats[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(head._categories_match_up_to_permutation(other) for other in cats[1:]):
        # identical categories - fastpath
        categories = head.categories
        ordered = head.ordered

        new_codes = np.concatenate(
            [head._encode_with_my_categories(c)._codes for c in cats]
        )

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(head.categories)

            from pandas.core.algorithms import take_nd

            # remap codes onto the sorted category order
            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in cats):
        # different categories - union and recode
        combined = head.categories.append([c.categories for c in cats[1:]])
        categories = combined.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = np.concatenate(
            [recode_for_categories(c.codes, c.categories, categories) for c in cats]
        )
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in cats):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
331def _concatenate_2d(to_concat, axis: int):
332 # coerce to 2d if needed & concatenate
333 if axis == 1:
334 to_concat = [np.atleast_2d(x) for x in to_concat]
335 return np.concatenate(to_concat, axis=axis)
338def _concat_datetime(to_concat, axis=0):
339 """
340 provide concatenation of an datetimelike array of arrays each of which is a
341 single M8[ns], datetime64[ns, tz] or m8[ns] dtype
343 Parameters
344 ----------
345 to_concat : array of arrays
346 axis : axis to provide concatenation
348 Returns
349 -------
350 a single array, preserving the combined dtypes
351 """
352 from pandas.core.construction import ensure_wrapped_if_datetimelike
354 to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]
356 single_dtype = len({x.dtype for x in to_concat}) == 1
358 # multiple types, need to coerce to object
359 if not single_dtype:
360 # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
361 # in Timestamp/Timedelta
362 return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)
364 result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
365 return result