Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/melt.py: 11%

138 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1from __future__ import annotations 

2 

3import re 

4from typing import TYPE_CHECKING 

5import warnings 

6 

7import numpy as np 

8 

9from pandas.util._decorators import ( 

10 Appender, 

11 deprecate_kwarg, 

12) 

13from pandas.util._exceptions import find_stack_level 

14 

15from pandas.core.dtypes.common import ( 

16 is_extension_array_dtype, 

17 is_list_like, 

18) 

19from pandas.core.dtypes.concat import concat_compat 

20from pandas.core.dtypes.missing import notna 

21 

22import pandas.core.algorithms as algos 

23from pandas.core.arrays import Categorical 

24import pandas.core.common as com 

25from pandas.core.indexes.api import ( 

26 Index, 

27 MultiIndex, 

28) 

29from pandas.core.reshape.concat import concat 

30from pandas.core.reshape.util import tile_compat 

31from pandas.core.shared_docs import _shared_docs 

32from pandas.core.tools.numeric import to_numeric 

33 

34if TYPE_CHECKING: 34 ↛ 35line 34 didn't jump to line 35, because the condition on line 34 was never true

35 from pandas import DataFrame 

36 

37 

@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # With a MultiIndex, collect the labels from every level so that
    # `id_vars`/`value_vars` membership can be checked at any level.
    if isinstance(frame.columns, MultiIndex):
        flat_cols = [label for tup in frame.columns for label in tup]
    else:
        flat_cols = list(frame.columns)

    if value_name in frame.columns:
        warnings.warn(
            "This dataframe has a column name that matches the 'value_name' column "
            "name of the resulting Dataframe. "
            "In the future this will raise an error, please set the 'value_name' "
            "parameter of DataFrame.melt to a unique name.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if id_vars is None:
        id_vars = []
    elif not is_list_like(id_vars):
        id_vars = [id_vars]
    elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
        raise ValueError(
            "id_vars must be a list of tuples when columns are a MultiIndex"
        )
    else:
        id_vars = list(id_vars)
        # Reject id_vars that are not actual columns of `frame`.
        absent = Index(com.flatten(id_vars)).difference(flat_cols)
        if not absent.empty:
            raise KeyError(
                "The following 'id_vars' are not present "
                f"in the DataFrame: {list(absent)}"
            )

    if value_vars is None:
        frame = frame.copy()
    else:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(
            value_vars, list
        ):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Reject value_vars that are not actual columns of `frame`.
            absent = Index(com.flatten(value_vars)).difference(flat_cols)
            if not absent.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(absent)}"
                )
        # Restrict the frame to the requested columns (copy via iloc).
        if col_level is not None:
            keep = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            keep = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, keep]

    if col_level is not None:  # allow list or other?
        # frame is a copy already, so this does not mutate the caller's frame
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                # Duplicate level names: synthesize unique variable names.
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    num_rows, num_value_cols = frame.shape
    num_value_cols -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            if num_value_cols > 0:
                id_data = concat([id_data] * num_value_cols, ignore_index=True)
            else:
                # We can't concat an empty list (GH 46044); build an empty
                # object of the same type/dtype instead.
                id_data = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[Any]]", variable has type "Series")
            id_data = np.tile(id_data._values, num_value_cols)  # type: ignore[assignment]
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # Column-major ravel stacks the value columns one after another.
    # error: Incompatible types in assignment (expression has type "ndarray",
    # target has type "Series")
    mdata[value_name] = frame._values.ravel("F")  # type: ignore[assignment]
    for level, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        # error: Incompatible types in assignment (expression has type "ndarray",
        # target has type "Series")
        mdata[col] = np.asanyarray(  # type: ignore[assignment]
            frame.columns._get_level_values(level)
        ).repeat(num_rows)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        result.index = tile_compat(frame.index, num_value_cols)

    return result

165 

166 

@deprecate_kwarg(old_arg_name="label", new_arg_name=None)
def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Accepts a dictionary, ``groups``, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        {new_name : list_of_columns}.
    dropna : bool, default True
        Do not include columns whose entries are all NaN.
    label : None
        Not used.

        .. deprecated:: 1.0.0

    Returns
    -------
    DataFrame
        Reshaped DataFrame.

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    # Accept either a mapping or an iterable of (new_name, columns) pairs.
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    # Columns not mentioned in any group are carried along as identifiers.
    all_cols = list(set.union(*(set(group) for group in values)))
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])
    if any(len(group) != K for group in values):
        raise ValueError("All column lists must be same length")

    mdata = {}
    pivot_cols = []

    # Stack each group's columns end-to-end under its new name.
    for target, names in zip(keys, values):
        mdata[target] = concat_compat([data[col]._values for col in names])
        pivot_cols.append(target)

    # Repeat the identifier columns once per melted column.
    for col in id_cols:
        mdata[col] = np.tile(data[col]._values, K)

    if dropna:
        # Keep only rows where every melted value is non-null.
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notna(mdata[c])
        if not mask.all():
            mdata = {k: v[mask] for k, v in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)

260 

261 

def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Unpivot a DataFrame from wide to long format.

    Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`).

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names).

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form A-one,
        B-two,.., and you have an unrelated column A-rating, you can ignore the
        last one by specifying `suffix='(!?one|two)'`. When all suffixes are
        numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X" : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    Non-integer suffixes are supported via ``suffix``:

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix=r'\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def _matching_columns(df, stub: str, sep: str, suffix: str) -> list[str]:
        # All columns of the form <stub><sep><suffix>, with stub/sep taken
        # literally and `suffix` interpreted as a regex.
        pattern = re.compile(rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$")
        return [col for col in df.columns if pattern.match(col)]

    def _melt_one_stub(df, stub: str, i, j, value_vars, sep: str):
        long_df = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        long_df[j] = Categorical(long_df[j])
        # Strip the stub (and separator) so only the suffix remains in `j`.
        long_df[j] = long_df[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numeric suffixes to int/float
        long_df[j] = to_numeric(long_df[j], errors="ignore")

        return long_df.set_index(i + [j])

    stubnames = list(stubnames) if is_list_like(stubnames) else [stubnames]

    if any(col in stubnames for col in df.columns):
        raise ValueError("stubname can't be identical to a column name")

    i = list(i) if is_list_like(i) else [i]

    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    value_vars = [_matching_columns(df, stub, sep, suffix) for stub in stubnames]

    flattened = [col for group in value_vars for col in group]
    id_vars = list(set(df.columns.tolist()).difference(flattened))

    parts = [
        _melt_one_stub(df, stub, i, j, cols, sep)
        for stub, cols in zip(stubnames, value_vars)
    ]
    melted = parts[0].join(parts[1:], how="outer")

    if len(i) == 1:
        return df[id_vars].set_index(i).join(melted)

    return df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])