Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/dtypes/concat.py: 9%

110 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Utility functions related to concat. 

3""" 

4from __future__ import annotations 

5 

6from typing import ( 

7 TYPE_CHECKING, 

8 cast, 

9) 

10import warnings 

11 

12import numpy as np 

13 

14from pandas._typing import ( 

15 ArrayLike, 

16 DtypeObj, 

17) 

18from pandas.util._exceptions import find_stack_level 

19 

20from pandas.core.dtypes.astype import astype_array 

21from pandas.core.dtypes.cast import ( 

22 common_dtype_categorical_compat, 

23 find_common_type, 

24) 

25from pandas.core.dtypes.common import ( 

26 is_dtype_equal, 

27 is_sparse, 

28) 

29from pandas.core.dtypes.dtypes import ( 

30 DatetimeTZDtype, 

31 ExtensionDtype, 

32) 

33from pandas.core.dtypes.generic import ( 

34 ABCCategoricalIndex, 

35 ABCExtensionArray, 

36 ABCSeries, 

37) 

38 

39if TYPE_CHECKING: 39 ↛ 40line 39 didn't jump to line 40, because the condition on line 39 was never true

40 from pandas.core.arrays import Categorical 

41 from pandas.core.arrays.sparse import SparseArray 

42 

43 

def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Cast ``arr`` to ``dtype``, like ``arr.astype(common_dtype)`` but routing
    through the special-case handlers a plain ``astype`` would get wrong.

    Parameters
    ----------
    arr : ArrayLike
        np.ndarray or ExtensionArray to cast.
    dtype : DtypeObj
        Target dtype.

    Returns
    -------
    ArrayLike
        ``arr`` itself when the dtype already matches, otherwise a cast result.
    """
    # Fast path: nothing to do when the dtype already matches.
    if is_dtype_equal(arr.dtype, dtype):
        return arr

    if is_sparse(arr) and not is_sparse(dtype):
        # TODO(2.0): remove special case once SparseArray.astype deprecation
        # is enforced.
        # Problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array.

        # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible type
        # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any], None, type, _
        # SupportsDType[dtype[Any]], str, Union[Tuple[Any, int], Tuple[Any,
        # Union[SupportsIndex, Sequence[SupportsIndex]]], List[Any], _DTypeDict,
        # Tuple[Any, Any]]]" [arg-type]
        sparse_arr = cast("SparseArray", arr)
        return sparse_arr.to_dense().astype(dtype, copy=False)  # type: ignore[arg-type]

    # astype_array includes ensure_wrapped_if_datetimelike
    return astype_array(arr, dtype=dtype, copy=False)

69 

70 

def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    # filter empty arrays
    # 1-d dtypes always are included here
    def is_nonempty(x) -> bool:
        if x.ndim <= axis:
            return True
        return x.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    non_empties = [x for x in to_concat if is_nonempty(x)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    dtypes = {obj.dtype for obj in to_concat}
    kinds = {obj.dtype.kind for obj in to_concat}
    contains_datetime = any(
        isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in ["m", "M"]
        for dtype in dtypes
    ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat)

    all_empty = not len(non_empties)
    # Reuse the `dtypes` set computed above instead of rebuilding the
    # identical set a second time (and re-walking the arrays for `any_ea`).
    single_dtype = len(dtypes) == 1
    any_ea = any(isinstance(dtype, ExtensionDtype) for dtype in dtypes)

    if contains_datetime:
        return _concat_datetime(to_concat, axis=axis)

    if any_ea:
        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if not single_dtype:
            target_dtype = find_common_type([x.dtype for x in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
            to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat]

        if isinstance(to_concat[0], ABCExtensionArray):
            # TODO: what about EA-backed Index?
            cls = type(to_concat[0])
            return cls._concat_same_type(to_concat)
        else:
            return np.concatenate(to_concat)

    elif all_empty:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        if len(kinds) != 1:

            if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
                # let numpy coerce
                pass
            else:
                # coerce to object
                to_concat = [x.astype("object") for x in to_concat]
                kinds = {"o"}

    result = np.concatenate(to_concat, axis=axis)
    if "b" in kinds and result.dtype.kind in ["i", "u", "f"]:
        # GH#39817
        warnings.warn(
            "Behavior when concatenating bool-dtype and numeric-dtype arrays is "
            "deprecated; in a future version these will cast to object dtype "
            "(instead of coercing bools to numeric values). To retain the old "
            "behavior, explicitly cast bool-dtype arrays to numeric dtype.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    return result

163 

164 

def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    The new categories will be the union of the categories being combined,
    ordered as they appear in the data (or lexsorted with
    ``sort_categories=True``).

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    Ordered categoricals with different categories or orderings can be
    combined by using the ``ignore_order=True`` argument; combining ordered
    categoricals with non-identical categories otherwise raises ``TypeError``.

    `union_categoricals` also accepts a `CategoricalIndex` or `Series`
    containing categorical data; the result is always a plain `Categorical`.
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if not len(to_union):
        raise ValueError("No Categoricals to union")

    def _unwrap(obj):
        # Series / CategoricalIndex carry their Categorical in ._values.
        if isinstance(obj, (ABCCategoricalIndex, ABCSeries)):
            return obj._values
        if isinstance(obj, Categorical):
            return obj
        raise TypeError("all components to combine must be Categorical")

    to_union = [_unwrap(obj) for obj in to_union]
    first = to_union[0]
    rest = to_union[1:]

    if any(
        not is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in rest
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in rest):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        all_codes = [first._encode_with_my_categories(c)._codes for c in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in rest])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = np.concatenate(
            [recode_for_categories(c.codes, c.categories, categories) for c in to_union]
        )
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            raise TypeError(
                "to union ordered Categoricals, all categories must be the same"
            )
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)

329 

330 

def _concatenate_2d(to_concat, axis: int):
    # For axis=1 concatenation every piece must be at least 2-D, so promote
    # any 1-D inputs before handing off to numpy.
    if axis == 1:
        to_concat = [np.atleast_2d(arr) for arr in to_concat]
    return np.concatenate(to_concat, axis=axis)

336 

337 

def _concat_datetime(to_concat, axis=0):
    """
    provide concatenation of a datetimelike array of arrays each of which is a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    from pandas.core.construction import ensure_wrapped_if_datetimelike

    # Wrap raw M8/m8 ndarrays so that astype(object) below boxes values
    # as Timestamp/Timedelta.
    to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]

    if len({x.dtype for x in to_concat}) != 1:
        # multiple types -> coerce everything to object and concatenate
        return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)

    return type(to_concat[0])._concat_same_type(to_concat, axis=axis)