Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/encoding.py: 6%

151 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

from __future__ import annotations

import itertools
from collections import defaultdict
from typing import Hashable

import numpy as np

from pandas._libs.sparse import IntIndex
from pandas._typing import Dtype
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.dtypes.common import (
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
)
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import Index
from pandas.core.series import Series

23 

24 

def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data.

    Raises
    ------
    TypeError
        If ``columns`` is given but is not list-like.
    ValueError
        If a list-like ``prefix``/``prefix_sep`` does not match the number of
        columns being encoded.

    See Also
    --------
    Series.str.get_dummies : Convert Series to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> pd.get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> pd.get_dummies(pd.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    # Imported locally to avoid a circular import at module load time.
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "string", "category"]

    if isinstance(data, DataFrame):
        # determine columns being encoded
        if columns is None:
            data_to_encode = data.select_dtypes(include=dtypes_to_encode)
        elif not is_list_like(columns):
            raise TypeError("Input must be a list-like for parameter `columns`")
        else:
            data_to_encode = data[columns]

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):
            # Only list-likes are length-checked; scalars/str/dict cycle or map.
            if is_list_like(item):
                if not len(item) == data_to_encode.shape[1]:
                    len_msg = (
                        f"Length of '{name}' ({len(item)}) did not match the "
                        "length of the columns being encoded "
                        f"({data_to_encode.shape[1]})."
                    )
                    raise ValueError(len_msg)

        check_len(prefix, "prefix")
        check_len(prefix_sep, "prefix_sep")

        # A scalar str prefix applies to every encoded column; a dict maps
        # column name -> prefix in encoded-column order.
        if isinstance(prefix, str):
            prefix = itertools.cycle([prefix])
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in data_to_encode.columns]

        if prefix is None:
            # Default prefix is the original column name.
            prefix = data_to_encode.columns

        # validate separators
        if isinstance(prefix_sep, str):
            prefix_sep = itertools.cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

        with_dummies: list[DataFrame]
        if data_to_encode.shape == data.shape:
            # Encoding the entire df, do not prepend any dropped columns
            with_dummies = []
        elif columns is not None:
            # Encoding only cols specified in columns. Get all cols not in
            # columns to prepend to result.
            with_dummies = [data.drop(columns, axis=1)]
        else:
            # Encoding only object and category dtype columns. Get remaining
            # columns to prepend to result.
            with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

        for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep):
            # col is (column_name, column), use just column data here
            dummy = _get_dummies_1d(
                col[1],
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        # 1-D input (Series/array/list): encode directly.
        result = _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )
    return result

214 

215 

def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    """
    Encode a single 1-D column of categoricals as dummy indicator columns.

    Parameters
    ----------
    data : 1-D array-like or Series
        Values to encode; NaN codes are -1 after factorization.
    prefix : str or None
        If not None, each output column is named ``f"{prefix}{prefix_sep}{level}"``.
    prefix_sep : str, default '_'
        Separator between prefix and level name.
    dummy_na : bool, default False
        If True, add an extra column marking NaN positions.
    sparse : bool, default False
        If True, back each dummy column with a :class:`SparseArray`.
    drop_first : bool, default False
        Drop the first level (k-1 encoding).
    dtype : dtype, default np.uint8
        dtype of the dummy columns; ``object`` is rejected.

    Returns
    -------
    DataFrame
        One indicator column per (kept) level, indexed like ``data`` when it
        is a Series.
    """
    # Imported locally to avoid a circular import at module load time.
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.dtype(np.uint8)
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # Zero-column frame preserving the input's index (or a fresh range).
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = Index(range(len(data)))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    # Copy before mutating: factorize may hand back a cached/shared array.
    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: bool | float
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        # One list of row positions per dummy column.
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        # Drop NaN rows (-1 codes): they contribute to no column.
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)

322 

323 

def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories they are
        character indicating the separation of the categorical names from the prefixes.
        For example, if your column names are 'prefix_A' and 'prefix_B',
        you can strip the underscore by specifying sep='_'.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
        col1    col2
    0    a       b
    1    b       a
    2    a       c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
        col1    col2
    0    a       b
    1    b       a
    2    d       e
    """
    # Imported locally to avoid a circular import at module load time.
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    if data.isna().any().any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{data.isna().any().idxmax()}'"
        )

    # index data with a list of all columns that are dummies
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError:
        raise TypeError("Passed DataFrame contains non-dummy data")

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        # No separator: all columns belong to one anonymous ("") variable.
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                # split found no separator in this column name
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            # A dict must name a default for every prefix.
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # Broadcast a single default to every prefix.
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        if sep is None:
            cats = prefix_slice.copy()
        else:
            # Category name is the column name with "prefix<sep>" stripped.
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        # Per-row count of set dummies for this variable: must be exactly 1.
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        elif any(assigned == 0):
            if isinstance(default_category, dict):
                # Append the implied category and an extra indicator column
                # that is True exactly where no dummy was set.
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = np.array(cats, dtype="object")
        # get indices of True entries along axis=1
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    return DataFrame(cat_data)