Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/encoding.py: 6%

151 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

from __future__ import annotations

import itertools
from collections import defaultdict
from typing import Hashable

import numpy as np

from pandas._libs.sparse import IntIndex
from pandas._typing import Dtype
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.dtypes.common import (
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
)
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import Index
from pandas.core.series import Series

23 

24 

def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data.

    Raises
    ------
    TypeError
        If ``columns`` is given but is not list-like.
    ValueError
        If a list-like ``prefix``/``prefix_sep`` does not match the number of
        columns being encoded.

    See Also
    --------
    Series.str.get_dummies : Convert Series to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> pd.get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> pd.get_dummies(pd.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    # Imported locally to avoid a circular import at module load time.
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "string", "category"]

    if isinstance(data, DataFrame):
        # determine columns being encoded
        if columns is None:
            data_to_encode = data.select_dtypes(include=dtypes_to_encode)
        elif not is_list_like(columns):
            raise TypeError("Input must be a list-like for parameter `columns`")
        else:
            data_to_encode = data[columns]

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):
            # Only list-likes are length-checked; scalars/str/dict cycle or map.
            if is_list_like(item):
                if not len(item) == data_to_encode.shape[1]:
                    len_msg = (
                        f"Length of '{name}' ({len(item)}) did not match the "
                        "length of the columns being encoded "
                        f"({data_to_encode.shape[1]})."
                    )
                    raise ValueError(len_msg)

        check_len(prefix, "prefix")
        check_len(prefix_sep, "prefix_sep")

        # A scalar str prefix applies to every encoded column; a dict maps
        # column name -> prefix in encoded-column order.
        if isinstance(prefix, str):
            prefix = itertools.cycle([prefix])
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in data_to_encode.columns]

        if prefix is None:
            # Default prefix is the original column name.
            prefix = data_to_encode.columns

        # validate separators
        if isinstance(prefix_sep, str):
            prefix_sep = itertools.cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

        with_dummies: list[DataFrame]
        if data_to_encode.shape == data.shape:
            # Encoding the entire df, do not prepend any dropped columns
            with_dummies = []
        elif columns is not None:
            # Encoding only cols specified in columns. Get all cols not in
            # columns to prepend to result.
            with_dummies = [data.drop(columns, axis=1)]
        else:
            # Encoding only object and category dtype columns. Get remaining
            # columns to prepend to result.
            with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

        for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep):
            # col is (column_name, column), use just column data here
            dummy = _get_dummies_1d(
                col[1],
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        # 1-D input (Series/array/list): encode directly.
        result = _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )
    return result

214 

215 

def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    """
    Encode a single 1-D column of categoricals as dummy indicator columns.

    Parameters
    ----------
    data : 1-D array-like or Series
        Values to encode; NaN codes are -1 after factorization.
    prefix : str or None
        If not None, each output column is named ``f"{prefix}{prefix_sep}{level}"``.
    prefix_sep : str, default '_'
        Separator between prefix and level name.
    dummy_na : bool, default False
        If True, add an extra column marking NaN positions.
    sparse : bool, default False
        If True, back each dummy column with a :class:`SparseArray`.
    drop_first : bool, default False
        Drop the first level (k-1 encoding).
    dtype : dtype, default np.uint8
        dtype of the dummy columns; ``object`` is rejected.

    Returns
    -------
    DataFrame
        One indicator column per (kept) level, indexed like ``data`` when it
        is a Series.
    """
    # Imported locally to avoid a circular import at module load time.
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.dtype(np.uint8)
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # Zero-column frame preserving the input's index (or a fresh range).
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = Index(range(len(data)))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    # Copy before mutating: factorize may hand back a cached/shared array.
    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: bool | float
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        # One list of row positions per dummy column.
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        # Drop NaN rows (-1 codes): they contribute to no column.
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)

322 

323 

def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories they are
        character indicating the separation of the categorical names from the prefixes.
        For example, if your column names are 'prefix_A' and 'prefix_B',
        you can strip the underscore by specifying sep='_'.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
        col1    col2
    0    a       b
    1    b       a
    2    a       c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
        col1    col2
    0    a       b
    1    b       a
    2    d       e
    """
    # Imported locally to avoid a circular import at module load time.
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    if data.isna().any().any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{data.isna().any().idxmax()}'"
        )

    # index data with a list of all columns that are dummies
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError:
        raise TypeError("Passed DataFrame contains non-dummy data")

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        # No separator: all columns belong to one anonymous ("") variable.
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                # split found no separator in this column name
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            # A dict must name a default for every prefix.
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # Broadcast a single default to every prefix.
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        if sep is None:
            cats = prefix_slice.copy()
        else:
            # Category name is the column name with "prefix<sep>" stripped.
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        # Per-row count of set dummies for this variable: must be exactly 1.
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        elif any(assigned == 0):
            if isinstance(default_category, dict):
                # Append the implied category and an extra indicator column
                # that is True exactly where no dummy was set.
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = np.array(cats, dtype="object")
        # get indices of True entries along axis=1
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    return DataFrame(cat_data)