# Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/merge.py:
# 7% of 867 statements covered (coverage.py v6.4.4, created at 2023-07-17 14:22 -0600)

1""" 

2SQL-style merge routines 

3""" 

4from __future__ import annotations 

5 

6import copy 

7import datetime 

8from functools import partial 

9import string 

10from typing import ( 

11 TYPE_CHECKING, 

12 Hashable, 

13 Sequence, 

14 cast, 

15) 

16import uuid 

17import warnings 

18 

19import numpy as np 

20 

21from pandas._libs import ( 

22 Timedelta, 

23 hashtable as libhashtable, 

24 join as libjoin, 

25 lib, 

26) 

27from pandas._typing import ( 

28 AnyArrayLike, 

29 ArrayLike, 

30 DtypeObj, 

31 IndexLabel, 

32 Suffixes, 

33 npt, 

34) 

35from pandas.errors import MergeError 

36from pandas.util._decorators import ( 

37 Appender, 

38 Substitution, 

39 cache_readonly, 

40) 

41from pandas.util._exceptions import find_stack_level 

42 

43from pandas.core.dtypes.cast import find_common_type 

44from pandas.core.dtypes.common import ( 

45 ensure_float64, 

46 ensure_int64, 

47 ensure_object, 

48 is_array_like, 

49 is_bool, 

50 is_bool_dtype, 

51 is_categorical_dtype, 

52 is_dtype_equal, 

53 is_extension_array_dtype, 

54 is_float_dtype, 

55 is_integer, 

56 is_integer_dtype, 

57 is_list_like, 

58 is_number, 

59 is_numeric_dtype, 

60 is_object_dtype, 

61 needs_i8_conversion, 

62) 

63from pandas.core.dtypes.dtypes import DatetimeTZDtype 

64from pandas.core.dtypes.generic import ( 

65 ABCDataFrame, 

66 ABCSeries, 

67) 

68from pandas.core.dtypes.missing import ( 

69 isna, 

70 na_value_for_dtype, 

71) 

72 

73from pandas import ( 

74 Categorical, 

75 Index, 

76 MultiIndex, 

77 Series, 

78) 

79import pandas.core.algorithms as algos 

80from pandas.core.arrays import ExtensionArray 

81from pandas.core.arrays._mixins import NDArrayBackedExtensionArray 

82import pandas.core.common as com 

83from pandas.core.construction import extract_array 

84from pandas.core.frame import _merge_doc 

85from pandas.core.sorting import is_int64_overflow_possible 

86 

if TYPE_CHECKING:
    from pandas import DataFrame
    from pandas.core import groupby
    from pandas.core.arrays import DatetimeArray

@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: str = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool = True,
    indicator: bool = False,
    validate: str | None = None,
) -> DataFrame:
    op = _MergeOperation(
        left,
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
    )
    return op.get_result(copy=copy)
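
# Illustrative usage (editor's addition, not part of the pandas source):
# merge() is a thin wrapper around _MergeOperation.
#
#   >>> import pandas as pd
#   >>> left = pd.DataFrame({"key": ["a", "b"], "lval": [1, 2]})
#   >>> right = pd.DataFrame({"key": ["b", "c"], "rval": [3, 4]})
#   >>> pd.merge(left, right, on="key", how="outer", indicator=True)
#     key  lval  rval      _merge
#   0   a   1.0   NaN   left_only
#   1   b   2.0   3.0        both
#   2   c   NaN   4.0  right_only
#
# how="outer" keeps keys from both frames; indicator=True appends the
# "_merge" Categorical produced by _indicator_post_merge() below.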

if __debug__:
    merge.__doc__ = _merge_doc % "\nleft : DataFrame"


def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
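
# Illustrative sketch (editor's addition, not part of the pandas source):
# _groupby_and_merge is what gives merge_ordered(..., left_by=...) its
# per-group semantics. A call such as
#
#   >>> pd.merge_ordered(df1, df2, fill_method="ffill", left_by="group")
#
# behaves as if df1 were split into one sub-frame per "group" value, each
# sub-frame merged against df2 (or the matching group of df2 when the
# by-columns also exist on the right), and the pieces concatenated back
# with a fresh RangeIndex.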


def merge_ordered(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: str = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

        .. versionchanged:: 0.25.0
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    elif left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result


def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

    - A "backward" search selects the last row in the right DataFrame whose
      'on' key is less than or equal to the left's key.

    - A "forward" search selects the first row in the right DataFrame whose
      'on' key is greater than or equal to the left's key.

    - A "nearest" search selects the row in the right DataFrame whose 'on'
      key is closest in absolute distance to the left's key.

    The default is "backward" and is compatible with versions below 0.20.0.
    The direction parameter was added in version 0.20.0 and introduces
    "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    merged : DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity    bid    ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75    NaN    NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155  51.97  51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100    NaN    NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100    NaN    NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100    NaN    NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"
    how: str
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    # get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    axis: int
    bm_axis: int
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: bool
    validate: str | None

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: str = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        axis: int = 1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: bool = False,
        validate: str | None = None,
    ) -> None:
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis -> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # warn user when merging between different levels
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "merging between different levels is deprecated and will be removed "
                f"in a future version. ({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            # stacklevel chosen to be correct when this is reached via pd.merge
            # (and not DataFrame.join)
            warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)

    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None:
            # Pinning the index here (and in the right code just below) is not
            # necessary, but makes the `.take` more performant if we have e.g.
            # a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor(lmgr)
            left.index = join_index

        if right_indexer is not None:
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor(rmgr)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result

    def get_result(self, copy: bool = True) -> DataFrame:
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")

    def _maybe_drop_cross_column(
        self, result: DataFrame, cross_col: str | None
    ) -> None:
        if cross_col is not None:
            del result[cross_col]

    @cache_readonly
    def _indicator_name(self) -> str | None:
        if isinstance(self.indicator, str):
            return self.indicator
        elif isinstance(self.indicator, bool):
            return "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:

        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right

    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:

        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result
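
    # Editor's gloss (not part of the pandas source): the indicator is an
    # additive flag scheme. Left rows carry int8 value 1, right rows carry 2,
    # and a side missing from the join fills with 0, so after the merge each
    # row sums to 1 (left_only), 2 (right_only), or 3 (both), which is
    # exactly the Categorical mapping applied in _indicator_post_merge().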

    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):

                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)

    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: np.ndarray | None,
        right_indexer: np.ndarray | None,
    ) -> None:

        left_has_missing = None
        right_has_missing = None

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None and is_array_like(self.left_join_keys[i]):
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    # TODO: can we pin down take_right's type earlier?
                    take_right = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(take_right.dtype)
                    rvals = algos.take_nd(take_right, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
                if mask_left.all():  # type: ignore[union-attr]
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])

                if result._is_label_reference(name):
                    result[name] = Series(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""
        return get_join_indexers(
            self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
        )

    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:

        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
            else:
                join_index = Index(np.arange(len(left_indexer)))

        if len(join_index) == 0:
            join_index = join_index.astype(object)
        return join_index, left_indexer, right_indexer

    def _create_join_index(
        self,
        index: Index,
        other_index: Index,
        indexer: npt.NDArray[np.intp],
        how: str = "left",
    ) -> Index:
        """
        Create a join index by rearranging one index to match another

        Parameters
        ----------
        index : Index being rearranged
        other_index : Index used to supply values not found in index
        indexer : np.ndarray[np.intp] how to rearrange index
        how : str
            Replacement is only necessary if indexer based on other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        return index.take(indexer)
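
    # Illustrative example (editor's addition, not part of the pandas
    # source) of the -1 trick above: Index.take() treats -1 as "last
    # element", so appending the fill value first makes every missing slot
    # resolve to it:
    #
    #   >>> idx = pd.Index([10, 20, 30]).append(pd.Index([np.nan]))
    #   >>> idx.take(np.array([0, -1, 2]))
    #   Float64Index([10.0, nan, 30.0], dtype='float64')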

    def _get_merge_keys(self):
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys, join_names
        """
        # left_keys, right_keys entries can actually be anything listlike
        # with a 'dtype' attr
        left_keys: list[AnyArrayLike] = []
        right_keys: list[AnyArrayLike] = []
        join_names: list[Hashable] = []
        right_drop = []
        left_drop = []

        left, right = self.left, self.right

        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    lk = cast(AnyArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            # avoid key upcast in corner case (length-0)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = cast(AnyArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    k = cast(AnyArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names

    def _maybe_coerce_merge_keys(self) -> None:
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk_is_cat = is_categorical_dtype(lk.dtype)
            rk_is_cat = is_categorical_dtype(rk.dtype)
            lk_is_object = is_object_dtype(lk.dtype)
            rk_is_object = is_object_dtype(rk.dtype)

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and "
                f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                elif is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                elif is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                # let's infer and see if we are ok
                elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = lk.categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = rk.categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)

    def _create_cross_configuration(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame, str, str]:
        """
        Creates the configuration to dispatch the cross operation to inner join,
        e.g. adding a join column and resetting parameters. Join column is added
        to a new object, no inplace modification

        Parameters
        ----------
        left : DataFrame
        right : DataFrame

        Returns
        -------
        a tuple (left, right, how, cross_col) representing the adjusted
        DataFrames with cross_col, the merge operation set to inner and the column
        to join over.
        """
        cross_col = f"_cross_{uuid.uuid4()}"
        how = "inner"
        return (
            left.assign(**{cross_col: 1}),
            right.assign(**{cross_col: 1}),
            how,
            cross_col,
        )
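
    # Illustrative sketch (editor's addition, not part of the pandas
    # source): a cross join is dispatched as an inner join on a constant
    # throwaway key, so every left row matches every right row:
    #
    #   >>> left = pd.DataFrame({"a": [1, 2]})
    #   >>> right = pd.DataFrame({"b": ["x", "y"]})
    #   >>> pd.merge(left, right, how="cross")
    #      a  b
    #   0  1  x
    #   1  1  y
    #   2  2  x
    #   3  2  y
    #
    # The temporary _cross_<uuid> column is removed again by
    # _maybe_drop_cross_column() after the merge.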

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        if self.how == "cross":
            if (
                self.left_index
                or self.right_index
                or right_on is not None
                or left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
        # Hm, any way to make this logic less complicated??
        elif self.on is None and left_on is None and right_on is None:

            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n
        if self.how != "cross" and len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on

    def _validate(self, validate: str) -> None:

        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            elif not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            elif not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError("Not a valid argument for validate")
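
# Illustrative example (editor's addition, not part of the pandas source):
# the validate argument maps onto the uniqueness checks in _validate() above.
#
#   >>> left = pd.DataFrame({"k": [1, 1], "v": [10, 20]})
#   >>> right = pd.DataFrame({"k": [1, 2], "w": [30, 40]})
#   >>> pd.merge(left, right, on="k", validate="one_to_one")
#   Traceback (most recent call last):
#     ...
#   pandas.errors.MergeError: Merge keys are not unique in left dataset;
#   not a one-to-one merge
#
# The same call with validate="many_to_one" succeeds, because only the
# right keys must be unique.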


def get_join_indexers(
    left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Return the indexers that align left_keys with right_keys for the given join.

    Parameters
    ----------
    left_keys : ndarray, Index, Series
    right_keys : ndarray, Index, Series
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp]
        Indexer into the left_keys.
    np.ndarray[np.intp]
        Indexer into the right_keys.
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_keys and right_keys must be the same length"

    # fast-path for empty left/right
    left_n = len(left_keys[0])
    right_n = len(right_keys[0])
    if left_n == 0:
        if how in ["left", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["right", "outer"]:
            return _get_no_sort_one_missing_indexer(right_n, True)
    elif right_n == 0:
        if how in ["right", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["left", "outer"]:
            return _get_no_sort_one_missing_indexer(left_n, False)

    # get left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
        for n in range(len(left_keys))
    )
    zipped = zip(*mapped)
    llab, rlab, shape = (list(x) for x in zipped)

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)

    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
    # preserve left frame order if how == 'left' and sort == False
    kwargs = copy.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort
    join_func = {
        "inner": libjoin.inner_join,
        "left": libjoin.left_outer_join,
        "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
            y, x, count, **kwargs
        )[::-1],
        "outer": libjoin.full_outer_join,
    }[how]

    # error: Cannot call function of unknown type
    return join_func(lkey, rkey, count, **kwargs)  # type: ignore[operator]
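
# Illustrative sketch (editor's addition, not part of the pandas source):
# for an inner join, the returned pair of intp arrays lines the matching
# rows up positionally. Joining left keys [0, 1, 2] against right keys
# [1, 2, 3] yields, up to ordering,
#
#   left_indexer  = array([1, 2])   # positions of 1, 2 in the left keys
#   right_indexer = array([0, 1])   # positions of 1, 2 in the right keys
#
# Taking left rows with left_indexer and right rows with right_indexer
# produces the aligned rows of the merge; in left/right/outer joins a -1
# entry marks a row with no match on that side.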

1611 

1612 

1613def restore_dropped_levels_multijoin( 

1614 left: MultiIndex, 

1615 right: MultiIndex, 

1616 dropped_level_names, 

1617 join_index: Index, 

1618 lindexer: npt.NDArray[np.intp], 

1619 rindexer: npt.NDArray[np.intp], 

1620) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]: 

1621 """ 

1622 *this is an internal non-public method* 

1623 

1624 Returns the levels, labels and names of a multi-index to multi-index join. 

1625 Depending on the type of join, this method restores the appropriate 

1626 dropped levels of the joined multi-index. 

1627 The method relies on lidx, rindexer which hold the index positions of 

1628 left and right, where a join was feasible 

1629 

1630 Parameters 

1631 ---------- 

1632 left : MultiIndex 

1633 left index 

1634 right : MultiIndex 

1635 right index 

1636 dropped_level_names : str array 

1637 list of non-common level names 

1638 join_index : Index 

1639 the index of the join between the 

1640 common levels of left and right 

1641 lindexer : np.ndarray[np.intp] 

1642 left indexer 

1643 rindexer : np.ndarray[np.intp] 

1644 right indexer 

1645 

1646 Returns 

1647 ------- 

1648 levels : list of Index 

1649 levels of combined multiindexes 

1650 labels : np.ndarray[np.intp] 

1651 labels of combined multiindexes 

1652 names : List[Hashable] 

1653 names of combined multiindex levels 

1654 

1655 """ 

1656 

1657 def _convert_to_multiindex(index: Index) -> MultiIndex: 

1658 if isinstance(index, MultiIndex): 

1659 return index 

1660 else: 

1661 return MultiIndex.from_arrays([index._values], names=[index.name]) 

1662 

1663 # For multi-multi joins with one overlapping level, 

1664 # the returned index if of type Index 

1665 # Assure that join_index is of type MultiIndex 

1666 # so that dropped levels can be appended 

1667 join_index = _convert_to_multiindex(join_index) 

1668 

1669 join_levels = join_index.levels 

1670 join_codes = join_index.codes 

1671 join_names = join_index.names 

1672 

1673 # lindexer and rindexer hold the indexes where the join occurred 

1674 # for left and right respectively. If left/right is None then 

1675 # the join occurred on all indices of left/right 

1676 if lindexer is None: 

1677 lindexer = range(left.size) 

1678 

1679 if rindexer is None: 

1680 rindexer = range(right.size) 

1681 

1682 # Iterate through the levels that must be restored 

1683 for dropped_level_name in dropped_level_names: 

1684 if dropped_level_name in left.names: 

1685 idx = left 

1686 indexer = lindexer 

1687 else: 

1688 idx = right 

1689 indexer = rindexer 

1690 

1691 # The index of the level name to be restored 

1692 name_idx = idx.names.index(dropped_level_name) 

1693 

1694 restore_levels = idx.levels[name_idx] 

1695 # Inject -1 in the codes list where a join was not possible 

1696 # IOW indexer[i]=-1 

1697 codes = idx.codes[name_idx] 

1698 restore_codes = algos.take_nd(codes, indexer, fill_value=-1) 

1699 

1700 join_levels = join_levels + [restore_levels] 

1701 join_codes = join_codes + [restore_codes] 

1702 join_names = join_names + [dropped_level_name] 

1703 

1704 return join_levels, join_codes, join_names 

1705 

1706 

1707class _OrderedMerge(_MergeOperation): 

1708 _merge_type = "ordered_merge" 

1709 

1710 def __init__( 

1711 self, 

1712 left: DataFrame | Series, 

1713 right: DataFrame | Series, 

1714 on: IndexLabel | None = None, 

1715 left_on: IndexLabel | None = None, 

1716 right_on: IndexLabel | None = None, 

1717 left_index: bool = False, 

1718 right_index: bool = False, 

1719 axis: int = 1, 

1720 suffixes: Suffixes = ("_x", "_y"), 

1721 fill_method: str | None = None, 

1722 how: str = "outer", 

1723 ) -> None: 

1724 

1725 self.fill_method = fill_method 

1726 _MergeOperation.__init__( 

1727 self, 

1728 left, 

1729 right, 

1730 on=on, 

1731 left_on=left_on, 

1732 left_index=left_index, 

1733 right_index=right_index, 

1734 right_on=right_on, 

1735 axis=axis, 

1736 how=how, 

1737 suffixes=suffixes, 

1738 sort=True, # factorize sorts 

1739 ) 

1740 

1741 def get_result(self, copy: bool = True) -> DataFrame: 

1742 join_index, left_indexer, right_indexer = self._get_join_info() 

1743 

1744 llabels, rlabels = _items_overlap_with_suffix( 

1745 self.left._info_axis, self.right._info_axis, self.suffixes 

1746 ) 

1747 

1748 left_join_indexer: np.ndarray | None 

1749 right_join_indexer: np.ndarray | None 

1750 

1751 if self.fill_method == "ffill": 

1752 if left_indexer is None: 

1753 raise TypeError("left_indexer cannot be None") 

1754 left_indexer, right_indexer = cast(np.ndarray, left_indexer), cast( 

1755 np.ndarray, right_indexer 

1756 ) 

1757 left_join_indexer = libjoin.ffill_indexer(left_indexer) 

1758 right_join_indexer = libjoin.ffill_indexer(right_indexer) 

1759 else: 

1760 left_join_indexer = left_indexer 

1761 right_join_indexer = right_indexer 

1762 

1763 result = self._reindex_and_concat( 

1764 join_index, left_join_indexer, right_join_indexer, copy=copy 

1765 ) 

1766 self._maybe_add_join_keys(result, left_indexer, right_indexer) 

1767 

1768 return result 

1769 

1770 
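# _OrderedMerge backs the public pd.merge_ordered. A small usage sketch:
# with fill_method="ffill", the join indexers computed above are
# forward-filled before reindexing, so gaps inherit the previous row
# instead of NaN.
import pandas as pd

left = pd.DataFrame({"key": [1, 3, 5], "lval": [1.0, 2.0, 3.0]})
right = pd.DataFrame({"key": [2, 3, 6], "rval": [10, 20, 30]})
print(pd.merge_ordered(left, right, on="key", fill_method="ffill"))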

1771def _asof_by_function(direction: str): 

1772 name = f"asof_join_{direction}_on_X_by_Y" 

1773 return getattr(libjoin, name, None) 

1774 

1775 

1776_type_casters = { 

1777 "int64_t": ensure_int64, 

1778 "double": ensure_float64, 

1779 "object": ensure_object, 

1780} 

1781 

1782 

1783def _get_cython_type_upcast(dtype: DtypeObj) -> str: 

1784 """Upcast a dtype to 'int64_t', 'double', or 'object'""" 

1785 if is_integer_dtype(dtype): 

1786 return "int64_t" 

1787 elif is_float_dtype(dtype): 

1788 return "double" 

1789 else: 

1790 return "object" 

1791 

1792 
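# A quick illustration of the upcast mapping: the 'by' keys are funneled
# into one of three hashtable-friendly representations before matching.
import numpy as np
from pandas.core.reshape.merge import _get_cython_type_upcast

for dt in (np.dtype("int8"), np.dtype("float32"), np.dtype("O")):
    print(dt, "->", _get_cython_type_upcast(dt))
# int8 -> int64_t, float32 -> double, object -> object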

1793class _AsOfMerge(_OrderedMerge): 

1794 _merge_type = "asof_merge" 

1795 

1796 def __init__( 

1797 self, 

1798 left: DataFrame | Series, 

1799 right: DataFrame | Series, 

1800 on: IndexLabel | None = None, 

1801 left_on: IndexLabel | None = None, 

1802 right_on: IndexLabel | None = None, 

1803 left_index: bool = False, 

1804 right_index: bool = False, 

1805 by=None, 

1806 left_by=None, 

1807 right_by=None, 

1808 axis: int = 1, 

1809 suffixes: Suffixes = ("_x", "_y"), 

1810 copy: bool = True, 

1811 fill_method: str | None = None, 

1812 how: str = "asof", 

1813 tolerance=None, 

1814 allow_exact_matches: bool = True, 

1815 direction: str = "backward", 

1816 ) -> None: 

1817 

1818 self.by = by 

1819 self.left_by = left_by 

1820 self.right_by = right_by 

1821 self.tolerance = tolerance 

1822 self.allow_exact_matches = allow_exact_matches 

1823 self.direction = direction 

1824 

1825 _OrderedMerge.__init__( 

1826 self, 

1827 left, 

1828 right, 

1829 on=on, 

1830 left_on=left_on, 

1831 right_on=right_on, 

1832 left_index=left_index, 

1833 right_index=right_index, 

1834 axis=axis, 

1835 how=how, 

1836 suffixes=suffixes, 

1837 fill_method=fill_method, 

1838 ) 

1839 

1840 def _validate_left_right_on(self, left_on, right_on): 

1841 left_on, right_on = super()._validate_left_right_on(left_on, right_on) 

1842 

1843 # we only allow 'on' to be a single item 

1844 if len(left_on) != 1 and not self.left_index: 

1845 raise MergeError("can only asof on a key for left") 

1846 

1847 if len(right_on) != 1 and not self.right_index: 

1848 raise MergeError("can only asof on a key for right") 

1849 

1850 if self.left_index and isinstance(self.left.index, MultiIndex): 

1851 raise MergeError("left can only have one index") 

1852 

1853 if self.right_index and isinstance(self.right.index, MultiIndex): 

1854 raise MergeError("right can only have one index") 

1855 

1856 # set 'by' columns 

1857 if self.by is not None: 

1858 if self.left_by is not None or self.right_by is not None: 

1859 raise MergeError("Can only pass by OR left_by and right_by") 

1860 self.left_by = self.right_by = self.by 

1861 if self.left_by is None and self.right_by is not None: 

1862 raise MergeError("missing left_by") 

1863 if self.left_by is not None and self.right_by is None: 

1864 raise MergeError("missing right_by") 

1865 

1866 # GH#29130 Check that merge keys do not have dtype object 

1867 if not self.left_index: 

1868 left_on_0 = left_on[0] 

1869 if is_array_like(left_on_0): 

1870 lo_dtype = left_on_0.dtype 

1871 else: 

1872 lo_dtype = ( 

1873 self.left[left_on_0].dtype 

1874 if left_on_0 in self.left.columns 

1875 else self.left.index.get_level_values(left_on_0) 

1876 ) 

1877 else: 

1878 lo_dtype = self.left.index.dtype 

1879 

1880 if not self.right_index: 

1881 right_on_0 = right_on[0] 

1882 if is_array_like(right_on_0): 

1883 ro_dtype = right_on_0.dtype 

1884 else: 

1885 ro_dtype = ( 

1886 self.right[right_on_0].dtype 

1887 if right_on_0 in self.right.columns 

1888 else self.right.index.get_level_values(right_on_0) 

1889 ) 

1890 else: 

1891 ro_dtype = self.right.index.dtype 

1892 

1893 if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): 

1894 raise MergeError( 

1895 f"Incompatible merge dtype, {repr(ro_dtype)} and " 

1896 f"{repr(lo_dtype)}, both sides must have numeric dtype" 

1897 ) 

1898 

1899 # add 'by' to our key-list so we can have it in the 

1900 # output as a key 

1901 if self.left_by is not None: 

1902 if not is_list_like(self.left_by): 

1903 self.left_by = [self.left_by] 

1904 if not is_list_like(self.right_by): 

1905 self.right_by = [self.right_by] 

1906 

1907 if len(self.left_by) != len(self.right_by): 

1908 raise MergeError("left_by and right_by must be same length") 

1909 

1910 left_on = self.left_by + list(left_on) 

1911 right_on = self.right_by + list(right_on) 

1912 

1913 # check 'direction' is valid 

1914 if self.direction not in ["backward", "forward", "nearest"]: 

1915 raise MergeError(f"direction invalid: {self.direction}") 

1916 

1917 return left_on, right_on 

1918 
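# These constraints surface through the public pd.merge_asof: exactly one
# ordered "on" key per side, optional equality "by" keys, and a direction
# in {"backward", "forward", "nearest"}. Example usage:
import pandas as pd

trades = pd.DataFrame({"time": pd.to_datetime(["10:00:01", "10:00:03"]),
                       "ticker": ["A", "A"], "price": [100.0, 101.0]})
quotes = pd.DataFrame({"time": pd.to_datetime(["10:00:00", "10:00:02"]),
                       "ticker": ["A", "A"], "bid": [99.5, 100.5]})
print(pd.merge_asof(trades, quotes, on="time", by="ticker", direction="backward"))
# each trade picks up the most recent quote at or before its own time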

1919 def _get_merge_keys(self): 

1920 

1921 # note this function has side effects 

1922 (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() 

1923 

1924 # validate index types are the same 

1925 for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): 

1926 if not is_dtype_equal(lk.dtype, rk.dtype): 

1927 if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): 

1928 # The generic error message is confusing for categoricals. 

1929 # 

1930 # In this function, the join keys include both the original 

1931 # ones of the merge_asof() call, and also the keys passed 

1932 # to its by= argument. Unordered but equal categories 

1933 # are not supported for the former, but will fail 

1934 # later with a ValueError, so we don't *need* to check 

1935 # for them here. 

1936 msg = ( 

1937 f"incompatible merge keys [{i}] {repr(lk.dtype)} and " 

1938 f"{repr(rk.dtype)}, both sides category, but not equal ones" 

1939 ) 

1940 else: 

1941 msg = ( 

1942 f"incompatible merge keys [{i}] {repr(lk.dtype)} and " 

1943 f"{repr(rk.dtype)}, must be the same type" 

1944 ) 

1945 raise MergeError(msg) 

1946 

1947 # validate tolerance; must be a datetime.timedelta or Timedelta if we have a DTI 

1948 if self.tolerance is not None: 

1949 

1950 if self.left_index: 

1951 lt = self.left.index 

1952 else: 

1953 lt = left_join_keys[-1] 

1954 

1955 msg = ( 

1956 f"incompatible tolerance {self.tolerance}, must be compat " 

1957 f"with type {repr(lt.dtype)}" 

1958 ) 

1959 

1960 if needs_i8_conversion(lt): 

1961 if not isinstance(self.tolerance, datetime.timedelta): 

1962 raise MergeError(msg) 

1963 if self.tolerance < Timedelta(0): 

1964 raise MergeError("tolerance must be positive") 

1965 

1966 elif is_integer_dtype(lt): 

1967 if not is_integer(self.tolerance): 

1968 raise MergeError(msg) 

1969 if self.tolerance < 0: 

1970 raise MergeError("tolerance must be positive") 

1971 

1972 elif is_float_dtype(lt): 

1973 if not is_number(self.tolerance): 

1974 raise MergeError(msg) 

1975 if self.tolerance < 0: 

1976 raise MergeError("tolerance must be positive") 

1977 

1978 else: 

1979 raise MergeError("key must be integer, timestamp or float") 

1980 

1981 # validate allow_exact_matches 

1982 if not is_bool(self.allow_exact_matches): 

1983 msg = ( 

1984 "allow_exact_matches must be boolean, " 

1985 f"passed {self.allow_exact_matches}" 

1986 ) 

1987 raise MergeError(msg) 

1988 

1989 return left_join_keys, right_join_keys, join_names 

1990 
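# Tolerance must match the key type: Timedelta-like for datetime keys, an
# int for integer keys, any number for float keys, and never negative.
# Through the public API:
import pandas as pd

left = pd.DataFrame({"t": pd.to_datetime(["2023-01-01", "2023-01-03"]), "a": [1, 2]})
right = pd.DataFrame({"t": pd.to_datetime(["2023-01-02"]), "b": [10]})
print(pd.merge_asof(left, right, on="t", tolerance=pd.Timedelta("1D")))
# only the 2023-01-03 row is within one day of a right-side row; the other gets NaN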

1991 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

1992 """return the join indexers""" 

1993 

1994 def flip(xs) -> np.ndarray: 

1995 """unlike np.transpose, this returns an array of tuples""" 

1996 

1997 def injection(obj): 

1998 if not is_extension_array_dtype(obj): 

1999 # ndarray 

2000 return obj 

2001 obj = extract_array(obj) 

2002 if isinstance(obj, NDArrayBackedExtensionArray): 

2003 # fastpath for e.g. dt64tz, categorical 

2004 return obj._ndarray 

2005 # FIXME: returning obj._values_for_argsort() here doesn't 

2006 # break in any existing test cases, but I (@jbrockmendel) 

2007 # am pretty sure it should! 

2008 # e.g. 

2009 # arr = pd.array([0, pd.NA, 255], dtype="UInt8") 

2010 # will have values_for_argsort (before GH#45434) 

2011 # np.array([0, 255, 255], dtype=np.uint8) 

2012 # and the non-injectivity should make a difference somehow 

2013 # shouldn't it? 

2014 return np.asarray(obj) 

2015 

2016 xs = [injection(x) for x in xs] 

2017 labels = list(string.ascii_lowercase[: len(xs)]) 

2018 dtypes = [x.dtype for x in xs] 

2019 labeled_dtypes = list(zip(labels, dtypes)) 

2020 return np.array(list(zip(*xs)), labeled_dtypes) 

2021 

2022 # values to compare 

2023 left_values = ( 

2024 self.left.index._values if self.left_index else self.left_join_keys[-1] 

2025 ) 

2026 right_values = ( 

2027 self.right.index._values if self.right_index else self.right_join_keys[-1] 

2028 ) 

2029 tolerance = self.tolerance 

2030 

2031 # we require sortedness and non-null values in the join keys 

2032 if not Index(left_values).is_monotonic_increasing: 

2033 side = "left" 

2034 if isna(left_values).any(): 

2035 raise ValueError(f"Merge keys contain null values on {side} side") 

2036 else: 

2037 raise ValueError(f"{side} keys must be sorted") 

2038 

2039 if not Index(right_values).is_monotonic_increasing: 

2040 side = "right" 

2041 if isna(right_values).any(): 

2042 raise ValueError(f"Merge keys contain null values on {side} side") 

2043 else: 

2044 raise ValueError(f"{side} keys must be sorted") 

2045 

2046 # initial type conversion as needed 

2047 if needs_i8_conversion(left_values): 

2048 left_values = left_values.view("i8") 

2049 right_values = right_values.view("i8") 

2050 if tolerance is not None: 

2051 tolerance = Timedelta(tolerance) 

2052 tolerance = tolerance.value 

2053 

2054 # a "by" parameter requires special handling 

2055 if self.left_by is not None: 

2056 # remove 'on' parameter from values if one existed 

2057 if self.left_index and self.right_index: 

2058 left_by_values = self.left_join_keys 

2059 right_by_values = self.right_join_keys 

2060 else: 

2061 left_by_values = self.left_join_keys[0:-1] 

2062 right_by_values = self.right_join_keys[0:-1] 

2063 

2064 # get tuple representation of values if more than one 

2065 if len(left_by_values) == 1: 

2066 left_by_values = left_by_values[0] 

2067 right_by_values = right_by_values[0] 

2068 else: 

2069 # We get here with non-ndarrays in test_merge_by_col_tz_aware 

2070 # and test_merge_groupby_multiple_column_with_categorical_column 

2071 left_by_values = flip(left_by_values) 

2072 right_by_values = flip(right_by_values) 

2073 

2074 # upcast 'by' parameter because HashTable is limited 

2075 by_type = _get_cython_type_upcast(left_by_values.dtype) 

2076 by_type_caster = _type_casters[by_type] 

2077 # error: Cannot call function of unknown type 

2078 left_by_values = by_type_caster(left_by_values) # type: ignore[operator] 

2079 # error: Cannot call function of unknown type 

2080 right_by_values = by_type_caster(right_by_values) # type: ignore[operator] 

2081 

2082 # choose appropriate function by type 

2083 func = _asof_by_function(self.direction) 

2084 return func( 

2085 left_values, 

2086 right_values, 

2087 left_by_values, 

2088 right_by_values, 

2089 self.allow_exact_matches, 

2090 tolerance, 

2091 ) 

2092 else: 

2093 # choose appropriate function by type 

2094 func = _asof_by_function(self.direction) 

2095 return func( 

2096 left_values, 

2097 right_values, 

2098 None, 

2099 None, 

2100 self.allow_exact_matches, 

2101 tolerance, 

2102 False, 

2103 ) 

2104 

2105 
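# Sketch of the flip() trick used above: several "by" columns are packed
# into one structured ndarray so that each row acts as a single composite
# key for the hashtable-based matching.
import numpy as np

xs = [np.array([1, 2]), np.array(["x", "y"], dtype=object)]
labeled_dtypes = [("a", xs[0].dtype), ("b", xs[1].dtype)]
packed = np.array(list(zip(*xs)), labeled_dtypes)
print(packed)  # [(1, 'x') (2, 'y')] with dtype [('a', '<i8'), ('b', 'O')]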

2106def _get_multiindex_indexer( 

2107 join_keys, index: MultiIndex, sort: bool 

2108) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2109 

2110 # left & right join labels and num. of levels at each location 

2111 mapped = ( 

2112 _factorize_keys(index.levels[n], join_keys[n], sort=sort) 

2113 for n in range(index.nlevels) 

2114 ) 

2115 zipped = zip(*mapped) 

2116 rcodes, lcodes, shape = (list(x) for x in zipped) 

2117 if sort: 

2118 rcodes = list(map(np.take, rcodes, index.codes)) 

2119 else: 

2120 i8copy = lambda a: a.astype("i8", subok=False, copy=True) 

2121 rcodes = list(map(i8copy, index.codes)) 

2122 

2123 # fix right labels if there were any nulls 

2124 for i in range(len(join_keys)): 

2125 mask = index.codes[i] == -1 

2126 if mask.any(): 

2127 # check if there were already any nulls at this location 

2128 # if there was, it is factorized to `shape[i] - 1` 

2129 a = join_keys[i][lcodes[i] == shape[i] - 1] 

2130 if a.size == 0 or not a[0] != a[0]:  # a[0] != a[0] only for NaN 

2131 shape[i] += 1 

2132 

2133 rcodes[i][mask] = shape[i] - 1 

2134 

2135 # get flat i8 join keys 

2136 lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort) 

2137 

2138 # factorize keys to a dense i8 space 

2139 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) 

2140 

2141 return libjoin.left_outer_join(lkey, rkey, count, sort=sort) 

2142 

2143 

2144def _get_single_indexer( 

2145 join_key, index: Index, sort: bool = False 

2146) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2147 left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort) 

2148 

2149 return libjoin.left_outer_join(left_key, right_key, count, sort=sort) 

2150 

2151 
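# The same left/right indexer pairing is visible through the public
# Index.join; unmatched left rows get -1 in the right indexer.
import pandas as pd

left = pd.Index(["a", "b", "d"])
right = pd.Index(["a", "c", "d"])
joined, lidx, ridx = left.join(right, how="left", return_indexers=True)
print(ridx)  # [ 0 -1  2] -- 'b' has no match on the right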

2152def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2153 """Return empty join indexers.""" 

2154 return ( 

2155 np.array([], dtype=np.intp), 

2156 np.array([], dtype=np.intp), 

2157 ) 

2158 

2159 

2160def _get_no_sort_one_missing_indexer( 

2161 n: int, left_missing: bool 

2162) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2163 """ 

2164 Return join indexers where all of one side is selected without sorting 

2165 and none of the other side is selected. 

2166 

2167 Parameters 

2168 ---------- 

2169 n : int 

2170 Length of indexers to create. 

2171 left_missing : bool 

2172 If True, the left indexer will contain only -1's. 

2173 If False, the right indexer will contain only -1's. 

2174 

2175 Returns 

2176 ------- 

2177 np.ndarray[np.intp] 

2178 Left indexer 

2179 np.ndarray[np.intp] 

2180 Right indexer 

2181 """ 

2182 idx = np.arange(n, dtype=np.intp) 

2183 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp) 

2184 if left_missing: 

2185 return idx_missing, idx 

2186 return idx, idx_missing 

2187 

2188 
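# Shape check of the helper above (a standalone rerun of the same two lines):
import numpy as np

n = 3
print(np.arange(n, dtype=np.intp), np.full(shape=n, fill_value=-1, dtype=np.intp))
# [0 1 2] [-1 -1 -1]; which side gets the -1s depends on left_missing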

2189def _left_join_on_index( 

2190 left_ax: Index, right_ax: Index, join_keys, sort: bool = False 

2191) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: 

2192 if len(join_keys) > 1: 

2193 if not ( 

2194 isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels 

2195 ): 

2196 raise AssertionError( 

2197 "If more than one join key is given then " 

2198 "'right_ax' must be a MultiIndex and the " 

2199 "number of join keys must be the number of levels in right_ax" 

2200 ) 

2201 

2202 left_indexer, right_indexer = _get_multiindex_indexer( 

2203 join_keys, right_ax, sort=sort 

2204 ) 

2205 else: 

2206 jkey = join_keys[0] 

2207 

2208 left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort) 

2209 

2210 if sort or len(left_ax) != len(left_indexer): 

2211 # if asked to sort or there are 1-to-many matches 

2212 join_index = left_ax.take(left_indexer) 

2213 return join_index, left_indexer, right_indexer 

2214 

2215 # left frame preserves order & length of its index 

2216 return left_ax, None, right_indexer 

2217 

2218 
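# The fast path above (returning left_ax with a None indexer) is what lets
# a 1-to-1 left join keep the left index untouched. Public-API view:
import pandas as pd

left = pd.DataFrame({"v": [1, 2, 3]}, index=["b", "a", "c"])
right = pd.DataFrame({"w": [10, 20]}, index=["a", "b"])
print(left.join(right, how="left").index.tolist())
# ['b', 'a', 'c'] -- left order and length preserved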

2219def _factorize_keys( 

2220 lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" 

2221) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: 

2222 """ 

2223 Encode left and right keys as enumerated types. 

2224 

2225 This is used to get the join indexers to be used when merging DataFrames. 

2226 

2227 Parameters 

2228 ---------- 

2229 lk : array-like 

2230 Left key. 

2231 rk : array-like 

2232 Right key. 

2233 sort : bool, defaults to True 

2234 If True, the encoding is done such that the unique elements in the 

2235 keys are sorted. 

2236 how : {'left', 'right', 'outer', 'inner'}, default 'inner' 

2237 Type of merge. 

2238 

2239 Returns 

2240 ------- 

2241 np.ndarray[np.intp] 

2242 Left (resp. right if called with `how='right'`) labels, as enumerated type. 

2243 np.ndarray[np.intp] 

2244 Right (resp. left if called with `how='right'`) labels, as enumerated type. 

2245 int 

2246 Number of unique elements in union of left and right labels. 

2247 

2248 See Also 

2249 -------- 

2250 merge : Merge DataFrame or named Series objects 

2251 with a database-style join. 

2252 algorithms.factorize : Encode the object as an enumerated type 

2253 or categorical variable. 

2254 

2255 Examples 

2256 -------- 

2257 >>> lk = np.array(["a", "c", "b"]) 

2258 >>> rk = np.array(["a", "c"]) 

2259 

2260 Here, the unique values are `'a', 'b', 'c'`. With the default 

2261 `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`: 

2262 

2263 >>> pd.core.reshape.merge._factorize_keys(lk, rk) 

2264 (array([0, 2, 1]), array([0, 2]), 3) 

2265 

2266 With `sort=False`, the encoding will correspond to the order 

2267 in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`: 

2268 

2269 >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False) 

2270 (array([0, 1, 2]), array([0, 1]), 3) 

2271 """ 

2272 # Some pre-processing for non-ndarray lk / rk 

2273 lk = extract_array(lk, extract_numpy=True, extract_range=True) 

2274 rk = extract_array(rk, extract_numpy=True, extract_range=True) 

2275 # TODO: if either is a RangeIndex, we can likely factorize more efficiently? 

2276 

2277 if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype): 

2278 # Extract the ndarray (UTC-localized) values 

2279 # Note: we don't need the dtypes to match, as these can still be compared 

2280 # TODO(non-nano): need to make sure resolutions match 

2281 lk = cast("DatetimeArray", lk)._ndarray 

2282 rk = cast("DatetimeArray", rk)._ndarray 

2283 

2284 elif ( 

2285 is_categorical_dtype(lk.dtype) 

2286 and is_categorical_dtype(rk.dtype) 

2287 and is_dtype_equal(lk.dtype, rk.dtype) 

2288 ): 

2289 assert isinstance(lk, Categorical) 

2290 assert isinstance(rk, Categorical) 

2291 # Cast rk to encoding so we can compare codes with lk 

2292 

2293 rk = lk._encode_with_my_categories(rk) 

2294 

2295 lk = ensure_int64(lk.codes) 

2296 rk = ensure_int64(rk.codes) 

2297 

2298 elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): 

2299 lk, _ = lk._values_for_factorize() 

2300 

2301 # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute 

2302 # "_values_for_factorize" 

2303 rk, _ = rk._values_for_factorize() # type: ignore[union-attr] 

2304 

2305 klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer] 

2306 if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype): 

2307 # GH#23917 TODO: needs tests for case where lk is integer-dtype 

2308 # and rk is datetime-dtype 

2309 klass = libhashtable.Int64Factorizer 

2310 lk = ensure_int64(np.asarray(lk)) 

2311 rk = ensure_int64(np.asarray(rk)) 

2312 

2313 elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): 

2314 # GH#23917 TODO: Needs tests for non-matching dtypes 

2315 klass = libhashtable.Int64Factorizer 

2316 lk = ensure_int64(np.asarray(lk, dtype=np.int64)) 

2317 rk = ensure_int64(np.asarray(rk, dtype=np.int64)) 

2318 

2319 else: 

2320 klass = libhashtable.ObjectFactorizer 

2321 lk = ensure_object(lk) 

2322 rk = ensure_object(rk) 

2323 

2324 rizer = klass(max(len(lk), len(rk))) 

2325 

2326 # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type 

2327 # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], 

2328 # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" 

2329 llab = rizer.factorize(lk) # type: ignore[arg-type] 

2330 # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type 

2331 # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], 

2332 # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" 

2333 rlab = rizer.factorize(rk) # type: ignore[arg-type] 

2334 assert llab.dtype == np.dtype(np.intp), llab.dtype 

2335 assert rlab.dtype == np.dtype(np.intp), rlab.dtype 

2336 

2337 count = rizer.get_count() 

2338 

2339 if sort: 

2340 uniques = rizer.uniques.to_array() 

2341 llab, rlab = _sort_labels(uniques, llab, rlab) 

2342 

2343 # NA group 

2344 lmask = llab == -1 

2345 lany = lmask.any() 

2346 rmask = rlab == -1 

2347 rany = rmask.any() 

2348 

2349 if lany or rany: 

2350 if lany: 

2351 np.putmask(llab, lmask, count) 

2352 if rany: 

2353 np.putmask(rlab, rmask, count) 

2354 count += 1 

2355 

2356 if how == "right": 

2357 return rlab, llab, count 

2358 return llab, rlab, count 

2359 

2360 
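# The NA-group step above maps nulls on either side to one shared trailing
# code, so missing keys compare equal across sides. Standalone check:
import numpy as np
from pandas.core.reshape.merge import _factorize_keys

llab, rlab, count = _factorize_keys(np.array([1.0, np.nan]), np.array([np.nan, 2.0]))
print(llab, rlab, count)  # both NaNs receive the same code, count - 1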

2361def _sort_labels( 

2362 uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp] 

2363) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2364 

2365 llength = len(left) 

2366 labels = np.concatenate([left, right]) 

2367 

2368 _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) 

2369 new_left, new_right = new_labels[:llength], new_labels[llength:] 

2370 

2371 return new_left, new_right 

2372 

2373 
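# Sketch of the relabeling (an assumed numpy re-derivation, not the
# internal safe_sort call): codes assigned in first-appearance order are
# remapped so that code order follows sorted(uniques).
import numpy as np

uniques = np.array(["c", "a", "b"], dtype=object)  # first-appearance order
remap = np.empty(len(uniques), dtype=np.intp)
remap[np.argsort(uniques)] = np.arange(len(uniques))
print(remap.take(np.array([0, 1, 2])))  # [2 0 1]: 'c'->2, 'a'->0, 'b'->1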

2374def _get_join_keys(llab, rlab, shape, sort: bool): 

2375 

2376 # how many levels can be done without overflow 

2377 nlev = next( 

2378 lev 

2379 for lev in range(len(shape), 0, -1) 

2380 if not is_int64_overflow_possible(shape[:lev]) 

2381 ) 

2382 

2383 # get keys for the first `nlev` levels 

2384 stride = np.prod(shape[1:nlev], dtype="i8") 

2385 lkey = stride * llab[0].astype("i8", subok=False, copy=False) 

2386 rkey = stride * rlab[0].astype("i8", subok=False, copy=False) 

2387 

2388 for i in range(1, nlev): 

2389 with np.errstate(divide="ignore"): 

2390 stride //= shape[i] 

2391 lkey += llab[i] * stride 

2392 rkey += rlab[i] * stride 

2393 

2394 if nlev == len(shape): # all done! 

2395 return lkey, rkey 

2396 

2397 # densify current keys to avoid overflow 

2398 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) 

2399 

2400 llab = [lkey] + llab[nlev:] 

2401 rlab = [rkey] + rlab[nlev:] 

2402 shape = [count] + shape[nlev:] 

2403 

2404 return _get_join_keys(llab, rlab, shape, sort) 

2405 

2406 
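# The stride arithmetic above is a mixed-radix encoding: with shape
# (s0, s1) the flat key is code0 * s1 + code1, collision-free as long as
# prod(shape) fits in int64 (hence the overflow check and the recursion).
import numpy as np

shape = [3, 4]
llab = [np.array([0, 1, 2], dtype=np.intp), np.array([3, 0, 1], dtype=np.intp)]
stride = np.prod(shape[1:], dtype="i8")  # 4
print(stride * llab[0].astype("i8") + llab[1])  # [3 4 9], unique per pair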

2407def _should_fill(lname, rname) -> bool: 

2408 if not isinstance(lname, str) or not isinstance(rname, str): 

2409 return True 

2410 return lname == rname 

2411 

2412 

2413def _any(x) -> bool: 

2414 return x is not None and com.any_not_none(*x) 

2415 

2416 

2417def _validate_operand(obj: DataFrame | Series) -> DataFrame: 

2418 if isinstance(obj, ABCDataFrame): 

2419 return obj 

2420 elif isinstance(obj, ABCSeries): 

2421 if obj.name is None: 

2422 raise ValueError("Cannot merge a Series without a name") 

2423 else: 

2424 return obj.to_frame() 

2425 else: 

2426 raise TypeError( 

2427 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed" 

2428 ) 

2429 

2430 
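# Behavior check through the public API: a named Series merges like a
# one-column frame, while an unnamed Series is rejected.
import pandas as pd

df = pd.DataFrame({"k": [1, 2]})
print(pd.merge(df, pd.Series([1, 2], name="k"), on="k").shape)  # (2, 1)
try:
    pd.merge(df, pd.Series([1, 2]))
except ValueError as err:
    print(err)  # Cannot merge a Series without a name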

2431def _items_overlap_with_suffix( 

2432 left: Index, right: Index, suffixes: Suffixes 

2433) -> tuple[Index, Index]: 

2434 """ 

2435 Suffixes type validation. 

2436 

2437 If two indices overlap, add suffixes to overlapping entries. 

2438 

2439 If the corresponding suffix is empty, the entry is simply converted to string. 

2440 

2441 """ 

2442 if not is_list_like(suffixes, allow_sets=False): 

2443 warnings.warn( 

2444 f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " 

2445 "unexpected results. Provide 'suffixes' as a tuple instead. In the " 

2446 "future a 'TypeError' will be raised.", 

2447 FutureWarning, 

2448 stacklevel=find_stack_level(), 

2449 ) 

2450 

2451 to_rename = left.intersection(right) 

2452 if len(to_rename) == 0: 

2453 return left, right 

2454 

2455 lsuffix, rsuffix = suffixes 

2456 

2457 if not lsuffix and not rsuffix: 

2458 raise ValueError(f"columns overlap but no suffix specified: {to_rename}") 

2459 

2460 def renamer(x, suffix): 

2461 """ 

2462 Rename the left and right indices. 

2463 

2464 If there is overlap and suffix is not None, add 

2465 the suffix; otherwise, leave it as-is. 

2466 

2467 Parameters 

2468 ---------- 

2469 x : original column name 

2470 suffix : str or None 

2471 

2472 Returns 

2473 ------- 

2474 x : renamed column name 

2475 """ 

2476 if x in to_rename and suffix is not None: 

2477 return f"{x}{suffix}" 

2478 return x 

2479 

2480 lrenamer = partial(renamer, suffix=lsuffix) 

2481 rrenamer = partial(renamer, suffix=rsuffix) 

2482 

2483 llabels = left._transform_index(lrenamer) 

2484 rlabels = right._transform_index(rrenamer) 

2485 

2486 dups = [] 

2487 if not llabels.is_unique: 

2488 # Only warn when duplicates are caused by the suffixes; columns that were 

2489 # already duplicated in the input should not warn 

2490 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() 

2491 if not rlabels.is_unique: 

2492 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) 

2493 if dups: 

2494 warnings.warn( 

2495 f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the " 

2496 f"result is deprecated and will raise a MergeError in a future version.", 

2497 FutureWarning, 

2498 stacklevel=find_stack_level(), 

2499 ) 

2500 

2501 return llabels, rlabels
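# Suffix behavior seen from the public API: only overlapping non-key
# columns are renamed, and an all-empty suffix pair raises.
import pandas as pd

left = pd.DataFrame({"k": [1], "v": [1]})
right = pd.DataFrame({"k": [1], "v": [2]})
print(pd.merge(left, right, on="k").columns.tolist())  # ['k', 'v_x', 'v_y']
try:
    pd.merge(left, right, on="k", suffixes=("", ""))
except ValueError as err:
    print(err)  # columns overlap but no suffix specified: ...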