Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/melt.py: 11%

138 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3import re 

4from typing import TYPE_CHECKING 

5import warnings 

6 

7import numpy as np 

8 

9from pandas.util._decorators import ( 

10 Appender, 

11 deprecate_kwarg, 

12) 

13from pandas.util._exceptions import find_stack_level 

14 

15from pandas.core.dtypes.common import ( 

16 is_extension_array_dtype, 

17 is_list_like, 

18) 

19from pandas.core.dtypes.concat import concat_compat 

20from pandas.core.dtypes.missing import notna 

21 

22import pandas.core.algorithms as algos 

23from pandas.core.arrays import Categorical 

24import pandas.core.common as com 

25from pandas.core.indexes.api import ( 

26 Index, 

27 MultiIndex, 

28) 

29from pandas.core.reshape.concat import concat 

30from pandas.core.reshape.util import tile_compat 

31from pandas.core.shared_docs import _shared_docs 

32from pandas.core.tools.numeric import to_numeric 

33 

34if TYPE_CHECKING: 34 ↛ 35line 34 didn't jump to line 35, because the condition on line 34 was never true

35 from pandas import DataFrame 

36 

37 

@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # With a MultiIndex, collect the labels from every level so that
    # `id_vars`/`value_vars` membership can be checked at any level.
    if isinstance(frame.columns, MultiIndex):
        flat_cols = [label for tup in frame.columns for label in tup]
    else:
        flat_cols = list(frame.columns)

    if value_name in frame.columns:
        warnings.warn(
            "This dataframe has a column name that matches the 'value_name' column "
            "name of the resulting Dataframe. "
            "In the future this will raise an error, please set the 'value_name' "
            "parameter of DataFrame.melt to a unique name.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if id_vars is None:
        id_vars = []
    elif not is_list_like(id_vars):
        id_vars = [id_vars]
    elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
        raise ValueError(
            "id_vars must be a list of tuples when columns are a MultiIndex"
        )
    else:
        id_vars = list(id_vars)
        # Reject id_vars that are not actual columns of `frame`.
        absent = Index(com.flatten(id_vars)).difference(flat_cols)
        if not absent.empty:
            raise KeyError(
                "The following 'id_vars' are not present "
                f"in the DataFrame: {list(absent)}"
            )

    if value_vars is None:
        frame = frame.copy()
    else:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(
            value_vars, list
        ):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Reject value_vars that are not actual columns of `frame`.
            absent = Index(com.flatten(value_vars)).difference(flat_cols)
            if not absent.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(absent)}"
                )
        # Restrict the frame to the requested columns (copy via iloc).
        if col_level is not None:
            keep = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            keep = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, keep]

    if col_level is not None:  # allow list or other?
        # frame is a copy already, so this does not mutate the caller's frame
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                # Duplicate level names: synthesize unique variable names.
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    num_rows, num_value_cols = frame.shape
    num_value_cols -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            if num_value_cols > 0:
                id_data = concat([id_data] * num_value_cols, ignore_index=True)
            else:
                # We can't concat an empty list (GH 46044); build an empty
                # object of the same type/dtype instead.
                id_data = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[Any]]", variable has type "Series")
            id_data = np.tile(id_data._values, num_value_cols)  # type: ignore[assignment]
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # Column-major ravel stacks the value columns one after another.
    # error: Incompatible types in assignment (expression has type "ndarray",
    # target has type "Series")
    mdata[value_name] = frame._values.ravel("F")  # type: ignore[assignment]
    for level, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        # error: Incompatible types in assignment (expression has type "ndarray",
        # target has type "Series")
        mdata[col] = np.asanyarray(  # type: ignore[assignment]
            frame.columns._get_level_values(level)
        ).repeat(num_rows)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        result.index = tile_compat(frame.index, num_value_cols)

    return result

165 

166 

@deprecate_kwarg(old_arg_name="label", new_arg_name=None)
def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Accepts a dictionary, ``groups``, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        {new_name : list_of_columns}.
    dropna : bool, default True
        Do not include columns whose entries are all NaN.
    label : None
        Not used.

        .. deprecated:: 1.0.0

    Returns
    -------
    DataFrame
        Reshaped DataFrame.

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    # Accept either a mapping or an iterable of (new_name, columns) pairs.
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    # Columns not mentioned in any group are carried along as identifiers.
    all_cols = list(set.union(*(set(group) for group in values)))
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])
    if any(len(group) != K for group in values):
        raise ValueError("All column lists must be same length")

    mdata = {}
    pivot_cols = []

    # Stack each group's columns end-to-end under its new name.
    for target, names in zip(keys, values):
        mdata[target] = concat_compat([data[col]._values for col in names])
        pivot_cols.append(target)

    # Repeat the identifier columns once per melted column.
    for col in id_cols:
        mdata[col] = np.tile(data[col]._values, K)

    if dropna:
        # Keep only rows where every melted value is non-null.
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notna(mdata[c])
        if not mask.all():
            mdata = {k: v[mask] for k, v in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)

260 

261 

def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Unpivot a DataFrame from wide to long format.

    Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`).

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names).

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form A-one,
        B-two,.., and you have an unrelated column A-rating, you can ignore the
        last one by specifying `suffix='(!?one|two)'`. When all suffixes are
        numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X" : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    Non-integer suffixes are supported via ``suffix``:

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix=r'\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def _matching_columns(df, stub: str, sep: str, suffix: str) -> list[str]:
        # All columns of the form <stub><sep><suffix>, with stub/sep taken
        # literally and `suffix` interpreted as a regex.
        pattern = re.compile(rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$")
        return [col for col in df.columns if pattern.match(col)]

    def _melt_one_stub(df, stub: str, i, j, value_vars, sep: str):
        long_df = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        long_df[j] = Categorical(long_df[j])
        # Strip the stub (and separator) so only the suffix remains in `j`.
        long_df[j] = long_df[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numeric suffixes to int/float
        long_df[j] = to_numeric(long_df[j], errors="ignore")

        return long_df.set_index(i + [j])

    stubnames = list(stubnames) if is_list_like(stubnames) else [stubnames]

    if any(col in stubnames for col in df.columns):
        raise ValueError("stubname can't be identical to a column name")

    i = list(i) if is_list_like(i) else [i]

    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    value_vars = [_matching_columns(df, stub, sep, suffix) for stub in stubnames]

    flattened = [col for group in value_vars for col in group]
    id_vars = list(set(df.columns.tolist()).difference(flattened))

    parts = [
        _melt_one_stub(df, stub, i, j, cols, sep)
        for stub, cols in zip(stubnames, value_vars)
    ]
    melted = parts[0].join(parts[1:], how="outer")

    if len(i) == 1:
        return df[id_vars].set_index(i).join(melted)

    return df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])