Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/melt.py: 11%
138 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from __future__ import annotations
3import re
4from typing import TYPE_CHECKING
5import warnings
7import numpy as np
9from pandas.util._decorators import (
10 Appender,
11 deprecate_kwarg,
12)
13from pandas.util._exceptions import find_stack_level
15from pandas.core.dtypes.common import (
16 is_extension_array_dtype,
17 is_list_like,
18)
19from pandas.core.dtypes.concat import concat_compat
20from pandas.core.dtypes.missing import notna
22import pandas.core.algorithms as algos
23from pandas.core.arrays import Categorical
24import pandas.core.common as com
25from pandas.core.indexes.api import (
26 Index,
27 MultiIndex,
28)
29from pandas.core.reshape.concat import concat
30from pandas.core.reshape.util import tile_compat
31from pandas.core.shared_docs import _shared_docs
32from pandas.core.tools.numeric import to_numeric
34if TYPE_CHECKING: 34 ↛ 35line 34 didn't jump to line 35, because the condition on line 34 was never true
35 from pandas import DataFrame
38@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
39def melt(
40 frame: DataFrame,
41 id_vars=None,
42 value_vars=None,
43 var_name=None,
44 value_name="value",
45 col_level=None,
46 ignore_index: bool = True,
47) -> DataFrame:
48 # If multiindex, gather names of columns on all level for checking presence
49 # of `id_vars` and `value_vars`
50 if isinstance(frame.columns, MultiIndex):
51 cols = [x for c in frame.columns for x in c]
52 else:
53 cols = list(frame.columns)
55 if value_name in frame.columns:
56 warnings.warn(
57 "This dataframe has a column name that matches the 'value_name' column "
58 "name of the resulting Dataframe. "
59 "In the future this will raise an error, please set the 'value_name' "
60 "parameter of DataFrame.melt to a unique name.",
61 FutureWarning,
62 stacklevel=find_stack_level(),
63 )
65 if id_vars is not None:
66 if not is_list_like(id_vars):
67 id_vars = [id_vars]
68 elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
69 raise ValueError(
70 "id_vars must be a list of tuples when columns are a MultiIndex"
71 )
72 else:
73 # Check that `id_vars` are in frame
74 id_vars = list(id_vars)
75 missing = Index(com.flatten(id_vars)).difference(cols)
76 if not missing.empty:
77 raise KeyError(
78 "The following 'id_vars' are not present "
79 f"in the DataFrame: {list(missing)}"
80 )
81 else:
82 id_vars = []
84 if value_vars is not None:
85 if not is_list_like(value_vars):
86 value_vars = [value_vars]
87 elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
88 raise ValueError(
89 "value_vars must be a list of tuples when columns are a MultiIndex"
90 )
91 else:
92 value_vars = list(value_vars)
93 # Check that `value_vars` are in frame
94 missing = Index(com.flatten(value_vars)).difference(cols)
95 if not missing.empty:
96 raise KeyError(
97 "The following 'value_vars' are not present in "
98 f"the DataFrame: {list(missing)}"
99 )
100 if col_level is not None:
101 idx = frame.columns.get_level_values(col_level).get_indexer(
102 id_vars + value_vars
103 )
104 else:
105 idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
106 frame = frame.iloc[:, idx]
107 else:
108 frame = frame.copy()
110 if col_level is not None: # allow list or other?
111 # frame is a copy
112 frame.columns = frame.columns.get_level_values(col_level)
114 if var_name is None:
115 if isinstance(frame.columns, MultiIndex):
116 if len(frame.columns.names) == len(set(frame.columns.names)):
117 var_name = frame.columns.names
118 else:
119 var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
120 else:
121 var_name = [
122 frame.columns.name if frame.columns.name is not None else "variable"
123 ]
124 if isinstance(var_name, str):
125 var_name = [var_name]
127 N, K = frame.shape
128 K -= len(id_vars)
130 mdata = {}
131 for col in id_vars:
132 id_data = frame.pop(col)
133 if is_extension_array_dtype(id_data):
134 if K > 0:
135 id_data = concat([id_data] * K, ignore_index=True)
136 else:
137 # We can't concat empty list. (GH 46044)
138 id_data = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
139 else:
140 # error: Incompatible types in assignment (expression has type
141 # "ndarray[Any, dtype[Any]]", variable has type "Series")
142 id_data = np.tile(id_data._values, K) # type: ignore[assignment]
143 mdata[col] = id_data
145 mcolumns = id_vars + var_name + [value_name]
147 # error: Incompatible types in assignment (expression has type "ndarray",
148 # target has type "Series")
149 mdata[value_name] = frame._values.ravel("F") # type: ignore[assignment]
150 for i, col in enumerate(var_name):
151 # asanyarray will keep the columns as an Index
153 # error: Incompatible types in assignment (expression has type "ndarray", target
154 # has type "Series")
155 mdata[col] = np.asanyarray( # type: ignore[assignment]
156 frame.columns._get_level_values(i)
157 ).repeat(N)
159 result = frame._constructor(mdata, columns=mcolumns)
161 if not ignore_index:
162 result.index = tile_compat(frame.index, K)
164 return result
167@deprecate_kwarg(old_arg_name="label", new_arg_name=None)
168def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame:
169 """
170 Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.
172 Accepts a dictionary, ``groups``, in which each key is a new column name
173 and each value is a list of old column names that will be "melted" under
174 the new column name as part of the reshape.
176 Parameters
177 ----------
178 data : DataFrame
179 The wide-format DataFrame.
180 groups : dict
181 {new_name : list_of_columns}.
182 dropna : bool, default True
183 Do not include columns whose entries are all NaN.
184 label : None
185 Not used.
187 .. deprecated:: 1.0.0
189 Returns
190 -------
191 DataFrame
192 Reshaped DataFrame.
194 See Also
195 --------
196 melt : Unpivot a DataFrame from wide to long format, optionally leaving
197 identifiers set.
198 pivot : Create a spreadsheet-style pivot table as a DataFrame.
199 DataFrame.pivot : Pivot without aggregation that can handle
200 non-numeric data.
201 DataFrame.pivot_table : Generalization of pivot that can handle
202 duplicate values for one index/column pair.
203 DataFrame.unstack : Pivot based on the index values instead of a
204 column.
205 wide_to_long : Wide panel to long format. Less flexible but more
206 user-friendly than melt.
208 Examples
209 --------
210 >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
211 ... 'team': ['Red Sox', 'Yankees'],
212 ... 'year1': [2007, 2007], 'year2': [2008, 2008]})
213 >>> data
214 hr1 hr2 team year1 year2
215 0 514 545 Red Sox 2007 2008
216 1 573 526 Yankees 2007 2008
218 >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
219 team year hr
220 0 Red Sox 2007 514
221 1 Yankees 2007 573
222 2 Red Sox 2008 545
223 3 Yankees 2008 526
224 """
225 if isinstance(groups, dict):
226 keys = list(groups.keys())
227 values = list(groups.values())
228 else:
229 keys, values = zip(*groups)
231 all_cols = list(set.union(*(set(x) for x in values)))
232 id_cols = list(data.columns.difference(all_cols))
234 K = len(values[0])
236 for seq in values:
237 if len(seq) != K:
238 raise ValueError("All column lists must be same length")
240 mdata = {}
241 pivot_cols = []
243 for target, names in zip(keys, values):
244 to_concat = [data[col]._values for col in names]
246 mdata[target] = concat_compat(to_concat)
247 pivot_cols.append(target)
249 for col in id_cols:
250 mdata[col] = np.tile(data[col]._values, K)
252 if dropna:
253 mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
254 for c in pivot_cols:
255 mask &= notna(mdata[c])
256 if not mask.all():
257 mdata = {k: v[mask] for k, v in mdata.items()}
259 return data._constructor(mdata, columns=id_cols + pivot_cols)
262def wide_to_long(
263 df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
264) -> DataFrame:
265 r"""
266 Unpivot a DataFrame from wide to long format.
268 Less flexible but more user-friendly than melt.
270 With stubnames ['A', 'B'], this function expects to find one or more
271 group of columns with format
272 A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
273 You specify what you want to call this suffix in the resulting long format
274 with `j` (for example `j='year'`)
276 Each row of these wide variables are assumed to be uniquely identified by
277 `i` (can be a single column name or a list of column names)
279 All remaining variables in the data frame are left intact.
281 Parameters
282 ----------
283 df : DataFrame
284 The wide-format DataFrame.
285 stubnames : str or list-like
286 The stub name(s). The wide format variables are assumed to
287 start with the stub names.
288 i : str or list-like
289 Column(s) to use as id variable(s).
290 j : str
291 The name of the sub-observation variable. What you wish to name your
292 suffix in the long format.
293 sep : str, default ""
294 A character indicating the separation of the variable names
295 in the wide format, to be stripped from the names in the long format.
296 For example, if your column names are A-suffix1, A-suffix2, you
297 can strip the hyphen by specifying `sep='-'`.
298 suffix : str, default '\\d+'
299 A regular expression capturing the wanted suffixes. '\\d+' captures
300 numeric suffixes. Suffixes with no numbers could be specified with the
301 negated character class '\\D+'. You can also further disambiguate
302 suffixes, for example, if your wide variables are of the form A-one,
303 B-two,.., and you have an unrelated column A-rating, you can ignore the
304 last one by specifying `suffix='(!?one|two)'`. When all suffixes are
305 numeric, they are cast to int64/float64.
307 Returns
308 -------
309 DataFrame
310 A DataFrame that contains each stub name as a variable, with new index
311 (i, j).
313 See Also
314 --------
315 melt : Unpivot a DataFrame from wide to long format, optionally leaving
316 identifiers set.
317 pivot : Create a spreadsheet-style pivot table as a DataFrame.
318 DataFrame.pivot : Pivot without aggregation that can handle
319 non-numeric data.
320 DataFrame.pivot_table : Generalization of pivot that can handle
321 duplicate values for one index/column pair.
322 DataFrame.unstack : Pivot based on the index values instead of a
323 column.
325 Notes
326 -----
327 All extra variables are left untouched. This simply uses
328 `pandas.melt` under the hood, but is hard-coded to "do the right thing"
329 in a typical case.
331 Examples
332 --------
333 >>> np.random.seed(123)
334 >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
335 ... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
336 ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
337 ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
338 ... "X" : dict(zip(range(3), np.random.randn(3)))
339 ... })
340 >>> df["id"] = df.index
341 >>> df
342 A1970 A1980 B1970 B1980 X id
343 0 a d 2.5 3.2 -1.085631 0
344 1 b e 1.2 1.3 0.997345 1
345 2 c f 0.7 0.1 0.282978 2
346 >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
347 ... # doctest: +NORMALIZE_WHITESPACE
348 X A B
349 id year
350 0 1970 -1.085631 a 2.5
351 1 1970 0.997345 b 1.2
352 2 1970 0.282978 c 0.7
353 0 1980 -1.085631 d 3.2
354 1 1980 0.997345 e 1.3
355 2 1980 0.282978 f 0.1
357 With multiple id columns
359 >>> df = pd.DataFrame({
360 ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
361 ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
362 ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
363 ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
364 ... })
365 >>> df
366 famid birth ht1 ht2
367 0 1 1 2.8 3.4
368 1 1 2 2.9 3.8
369 2 1 3 2.2 2.9
370 3 2 1 2.0 3.2
371 4 2 2 1.8 2.8
372 5 2 3 1.9 2.4
373 6 3 1 2.2 3.3
374 7 3 2 2.3 3.4
375 8 3 3 2.1 2.9
376 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
377 >>> l
378 ... # doctest: +NORMALIZE_WHITESPACE
379 ht
380 famid birth age
381 1 1 1 2.8
382 2 3.4
383 2 1 2.9
384 2 3.8
385 3 1 2.2
386 2 2.9
387 2 1 1 2.0
388 2 3.2
389 2 1 1.8
390 2 2.8
391 3 1 1.9
392 2 2.4
393 3 1 1 2.2
394 2 3.3
395 2 1 2.3
396 2 3.4
397 3 1 2.1
398 2 2.9
400 Going from long back to wide just takes some creative use of `unstack`
402 >>> w = l.unstack()
403 >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
404 >>> w.reset_index()
405 famid birth ht1 ht2
406 0 1 1 2.8 3.4
407 1 1 2 2.9 3.8
408 2 1 3 2.2 2.9
409 3 2 1 2.0 3.2
410 4 2 2 1.8 2.8
411 5 2 3 1.9 2.4
412 6 3 1 2.2 3.3
413 7 3 2 2.3 3.4
414 8 3 3 2.1 2.9
416 Less wieldy column names are also handled
418 >>> np.random.seed(0)
419 >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
420 ... 'A(weekly)-2011': np.random.rand(3),
421 ... 'B(weekly)-2010': np.random.rand(3),
422 ... 'B(weekly)-2011': np.random.rand(3),
423 ... 'X' : np.random.randint(3, size=3)})
424 >>> df['id'] = df.index
425 >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
426 A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id
427 0 0.548814 0.544883 0.437587 0.383442 0 0
428 1 0.715189 0.423655 0.891773 0.791725 1 1
429 2 0.602763 0.645894 0.963663 0.528895 1 2
431 >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
432 ... j='year', sep='-')
433 ... # doctest: +NORMALIZE_WHITESPACE
434 X A(weekly) B(weekly)
435 id year
436 0 2010 0 0.548814 0.437587
437 1 2010 1 0.715189 0.891773
438 2 2010 1 0.602763 0.963663
439 0 2011 0 0.544883 0.383442
440 1 2011 1 0.423655 0.791725
441 2 2011 1 0.645894 0.528895
443 If we have many columns, we could also use a regex to find our
444 stubnames and pass that list on to wide_to_long
446 >>> stubnames = sorted(
447 ... set([match[0] for match in df.columns.str.findall(
448 ... r'[A-B]\(.*\)').values if match != []])
449 ... )
450 >>> list(stubnames)
451 ['A(weekly)', 'B(weekly)']
453 All of the above examples have integers as suffixes. It is possible to
454 have non-integers as suffixes.
456 >>> df = pd.DataFrame({
457 ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
458 ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
459 ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
460 ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
461 ... })
462 >>> df
463 famid birth ht_one ht_two
464 0 1 1 2.8 3.4
465 1 1 2 2.9 3.8
466 2 1 3 2.2 2.9
467 3 2 1 2.0 3.2
468 4 2 2 1.8 2.8
469 5 2 3 1.9 2.4
470 6 3 1 2.2 3.3
471 7 3 2 2.3 3.4
472 8 3 3 2.1 2.9
474 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
475 ... sep='_', suffix=r'\w+')
476 >>> l
477 ... # doctest: +NORMALIZE_WHITESPACE
478 ht
479 famid birth age
480 1 1 one 2.8
481 two 3.4
482 2 one 2.9
483 two 3.8
484 3 one 2.2
485 two 2.9
486 2 1 one 2.0
487 two 3.2
488 2 one 1.8
489 two 2.8
490 3 one 1.9
491 two 2.4
492 3 1 one 2.2
493 two 3.3
494 2 one 2.3
495 two 3.4
496 3 one 2.1
497 two 2.9
498 """
500 def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]:
501 regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
502 pattern = re.compile(regex)
503 return [col for col in df.columns if pattern.match(col)]
505 def melt_stub(df, stub: str, i, j, value_vars, sep: str):
506 newdf = melt(
507 df,
508 id_vars=i,
509 value_vars=value_vars,
510 value_name=stub.rstrip(sep),
511 var_name=j,
512 )
513 newdf[j] = Categorical(newdf[j])
514 newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)
516 # GH17627 Cast numerics suffixes to int/float
517 newdf[j] = to_numeric(newdf[j], errors="ignore")
519 return newdf.set_index(i + [j])
521 if not is_list_like(stubnames):
522 stubnames = [stubnames]
523 else:
524 stubnames = list(stubnames)
526 if any(col in stubnames for col in df.columns):
527 raise ValueError("stubname can't be identical to a column name")
529 if not is_list_like(i):
530 i = [i]
531 else:
532 i = list(i)
534 if df[i].duplicated().any():
535 raise ValueError("the id variables need to uniquely identify each row")
537 value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
539 value_vars_flattened = [e for sublist in value_vars for e in sublist]
540 id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
542 _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
543 melted = _melted[0].join(_melted[1:], how="outer")
545 if len(i) == 1:
546 new = df[id_vars].set_index(i).join(melted)
547 return new
549 new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
551 return new