Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/encoding.py: 6%
151 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3from collections import defaultdict
4import itertools
5from typing import Hashable
7import numpy as np
9from pandas._libs.sparse import IntIndex
10from pandas._typing import Dtype
12from pandas.core.dtypes.common import (
13 is_integer_dtype,
14 is_list_like,
15 is_object_dtype,
16)
18from pandas.core.arrays import SparseArray
19from pandas.core.arrays.categorical import factorize_from_iterable
20from pandas.core.frame import DataFrame
21from pandas.core.indexes.api import Index
22from pandas.core.series import Series
def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data whose categorical values become indicator columns.
    prefix : str, list of str, or dict of str, default None
        String to prepend to the generated column names. For a DataFrame,
        a list (one entry per encoded column) or a dict mapping column
        names to prefixes may be given instead.
    prefix_sep : str, default '_'
        Separator placed between the prefix and the level name; accepts a
        list or dict exactly like `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    columns : list-like, default None
        DataFrame columns to encode. When None, every column of
        `object`, `string`, or `category` dtype is encoded.
    sparse : bool, default False
        Whether the indicator columns are backed by a
        :class:`SparseArray` (True) or a dense NumPy array (False).
    drop_first : bool, default False
        Drop the first level, yielding k-1 indicators for k levels.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data.

    See Also
    --------
    Series.str.get_dummies : Convert Series to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.
    """
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "string", "category"]

    if not isinstance(data, DataFrame):
        # 1-D input (Series / array-like): encode it directly.
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    # Work out which columns are being encoded.
    if columns is None:
        data_to_encode = data.select_dtypes(include=dtypes_to_encode)
    elif not is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    else:
        data_to_encode = data[columns]

    n_encode = data_to_encode.shape[1]

    def check_len(item, name) -> None:
        # A list-like prefix/prefix_sep must line up 1:1 with the encoded
        # columns; validating here avoids silently dropping columns.
        if is_list_like(item) and len(item) != n_encode:
            raise ValueError(
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({n_encode})."
            )

    check_len(prefix, "prefix")
    check_len(prefix_sep, "prefix_sep")

    # Normalize prefix into something zip-able with the encoded columns.
    if isinstance(prefix, str):
        prefix = itertools.cycle([prefix])
    elif isinstance(prefix, dict):
        prefix = [prefix[col] for col in data_to_encode.columns]
    elif prefix is None:
        prefix = data_to_encode.columns

    # Same normalization for the separator.
    if isinstance(prefix_sep, str):
        prefix_sep = itertools.cycle([prefix_sep])
    elif isinstance(prefix_sep, dict):
        prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

    # Columns that are not encoded are carried through, prepended to the
    # dummy columns in the result.
    with_dummies: list[DataFrame]
    if data_to_encode.shape == data.shape:
        # Encoding the whole frame: nothing to carry through.
        with_dummies = []
    elif columns is not None:
        # Only the user-specified columns are encoded.
        with_dummies = [data.drop(columns, axis=1)]
    else:
        # Only object/string/category columns are encoded.
        with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

    for (_, column), pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
        with_dummies.append(
            _get_dummies_1d(
                column,
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    return concat(with_dummies, axis=1)
216def _get_dummies_1d(
217 data,
218 prefix,
219 prefix_sep="_",
220 dummy_na: bool = False,
221 sparse: bool = False,
222 drop_first: bool = False,
223 dtype: Dtype | None = None,
224) -> DataFrame:
225 from pandas.core.reshape.concat import concat
227 # Series avoids inconsistent NaN handling
228 codes, levels = factorize_from_iterable(Series(data))
230 if dtype is None:
231 dtype = np.dtype(np.uint8)
232 # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
233 # dtype[Any], Type[object]]"; expected "Type[Any]"
234 dtype = np.dtype(dtype) # type: ignore[arg-type]
236 if is_object_dtype(dtype):
237 raise ValueError("dtype=object is not a valid dtype for get_dummies")
239 def get_empty_frame(data) -> DataFrame:
240 index: Index | np.ndarray
241 if isinstance(data, Series):
242 index = data.index
243 else:
244 index = Index(range(len(data)))
245 return DataFrame(index=index)
247 # if all NaN
248 if not dummy_na and len(levels) == 0:
249 return get_empty_frame(data)
251 codes = codes.copy()
252 if dummy_na:
253 codes[codes == -1] = len(levels)
254 levels = levels.insert(len(levels), np.nan)
256 # if dummy_na, we just fake a nan level. drop_first will drop it again
257 if drop_first and len(levels) == 1:
258 return get_empty_frame(data)
260 number_of_cols = len(levels)
262 if prefix is None:
263 dummy_cols = levels
264 else:
265 dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])
267 index: Index | None
268 if isinstance(data, Series):
269 index = data.index
270 else:
271 index = None
273 if sparse:
275 fill_value: bool | float
276 if is_integer_dtype(dtype):
277 fill_value = 0
278 elif dtype == np.dtype(bool):
279 fill_value = False
280 else:
281 fill_value = 0.0
283 sparse_series = []
284 N = len(data)
285 sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
286 mask = codes != -1
287 codes = codes[mask]
288 n_idx = np.arange(N)[mask]
290 for ndx, code in zip(n_idx, codes):
291 sp_indices[code].append(ndx)
293 if drop_first:
294 # remove first categorical level to avoid perfect collinearity
295 # GH12042
296 sp_indices = sp_indices[1:]
297 dummy_cols = dummy_cols[1:]
298 for col, ixs in zip(dummy_cols, sp_indices):
299 sarr = SparseArray(
300 np.ones(len(ixs), dtype=dtype),
301 sparse_index=IntIndex(N, ixs),
302 fill_value=fill_value,
303 dtype=dtype,
304 )
305 sparse_series.append(Series(data=sarr, index=index, name=col))
307 return concat(sparse_series, axis=1, copy=False)
309 else:
310 # take on axis=1 + transpose to ensure ndarray layout is column-major
311 dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T
313 if not dummy_na:
314 # reset NaN GH4446
315 dummy_mat[codes == -1] = 0
317 if drop_first:
318 # remove first GH12042
319 dummy_mat = dummy_mat[:, 1:]
320 dummy_cols = dummy_cols[1:]
321 return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Dummy-coded data: integer columns of 1's and 0's (or booleans).
    sep : str, default None
        Separator that splits each column name into prefix and category,
        e.g. ``sep='_'`` turns ``'prefix_A'`` into prefix ``'prefix'`` and
        category ``'A'``.
    default_category : None, Hashable or dict of Hashables, default None
        Category implied when an entire row of dummies for a prefix is
        zero. Either one value for all variables, or a dict mapping each
        prefix to its default.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        For NA values, separator mismatches, an incomplete
        ``default_category`` dict, multi-assigned rows, or unassigned rows
        when ``default_category`` is None.
    TypeError
        When ``data``, ``sep``, or ``default_category`` has a wrong type,
        or ``data`` contains non-dummy values.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    if data.isna().any().any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{data.isna().any().idxmax()}'"
        )

    # Boolean view of the dummies; failure here means non-dummy content.
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError:
        raise TypeError("Passed DataFrame contains non-dummy data")

    # Group the dummy columns by prefix (a single "" group when no sep).
    variables_slice = defaultdict(list)
    if sep is None:
        variables_slice[""] = list(data.columns)
    elif not isinstance(sep, str):
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )
    else:
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)

    if default_category is not None:
        if isinstance(default_category, dict):
            if len(default_category) != len(variables_slice):
                raise ValueError(
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
        elif isinstance(default_category, Hashable):
            # Fan a single default out to every prefix.
            default_category = {
                var: default_category for var in variables_slice
            }
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        # Category labels: the column names, stripped of prefix+sep.
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]

        counts = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if (counts > 1).any():
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {counts.idxmax()}"
            )
        if (counts == 0).any():
            if not isinstance(default_category, dict):
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {counts.idxmin()}"
                )
            # Append the implied category as an extra virtual column that
            # is True exactly where no real dummy was set.
            cats.append(default_category[prefix])
            decode_slice = concat(
                (data_to_decode.loc[:, prefix_slice], counts == 0), axis=1
            )
        else:
            decode_slice = data_to_decode.loc[:, prefix_slice]

        cats_array = np.array(cats, dtype="object")
        # Column index of the single True entry in each row picks the label.
        cat_data[prefix] = cats_array[decode_slice.to_numpy().nonzero()[1]]

    return DataFrame(cat_data)