Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/merge.py: 7%
867 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2SQL-style merge routines
3"""
4from __future__ import annotations
6import copy
7import datetime
8from functools import partial
9import string
10from typing import (
11 TYPE_CHECKING,
12 Hashable,
13 Sequence,
14 cast,
15)
16import uuid
17import warnings
19import numpy as np
21from pandas._libs import (
22 Timedelta,
23 hashtable as libhashtable,
24 join as libjoin,
25 lib,
26)
27from pandas._typing import (
28 AnyArrayLike,
29 ArrayLike,
30 DtypeObj,
31 IndexLabel,
32 Suffixes,
33 npt,
34)
35from pandas.errors import MergeError
36from pandas.util._decorators import (
37 Appender,
38 Substitution,
39 cache_readonly,
40)
41from pandas.util._exceptions import find_stack_level
43from pandas.core.dtypes.cast import find_common_type
44from pandas.core.dtypes.common import (
45 ensure_float64,
46 ensure_int64,
47 ensure_object,
48 is_array_like,
49 is_bool,
50 is_bool_dtype,
51 is_categorical_dtype,
52 is_dtype_equal,
53 is_extension_array_dtype,
54 is_float_dtype,
55 is_integer,
56 is_integer_dtype,
57 is_list_like,
58 is_number,
59 is_numeric_dtype,
60 is_object_dtype,
61 needs_i8_conversion,
62)
63from pandas.core.dtypes.dtypes import DatetimeTZDtype
64from pandas.core.dtypes.generic import (
65 ABCDataFrame,
66 ABCSeries,
67)
68from pandas.core.dtypes.missing import (
69 isna,
70 na_value_for_dtype,
71)
73from pandas import (
74 Categorical,
75 Index,
76 MultiIndex,
77 Series,
78)
79import pandas.core.algorithms as algos
80from pandas.core.arrays import ExtensionArray
81from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
82import pandas.core.common as com
83from pandas.core.construction import extract_array
84from pandas.core.frame import _merge_doc
85from pandas.core.sorting import is_int64_overflow_possible
if TYPE_CHECKING:
88 from pandas import DataFrame
89 from pandas.core import groupby
90 from pandas.core.arrays import DatetimeArray
@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: str = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool = True,
    indicator: bool = False,
    validate: str | None = None,
) -> DataFrame:
    # Thin public entry point: all key resolution, dtype coercion, and
    # validation happen inside _MergeOperation.__init__; `copy` is consumed
    # only when the result frame is materialized.
    operation = _MergeOperation(
        left,
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
    )
    return operation.get_result(copy=copy)
if __debug__:
    # Interpolate the shared merge docstring for the function form.
    # NOTE(review): guarded by __debug__, presumably so this is skipped when
    # docstrings may be stripped under `python -OO` — confirm.
    merge.__doc__ = _merge_doc % "\nleft : DataFrame"
def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging

    Returns
    -------
    tuple of (merged DataFrame, the left groupby object)
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in left
                # build an all-NaN right side with the left rows preserved,
                # keeping left columns first and appending the new right ones
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    # align every piece to the first piece's column order
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
def merge_ordered(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: str = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

        .. versionchanged:: 0.25.0
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    elif left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        # note the argument swap: pieces come from grouping `right`, but the
        # merge itself still treats the caller's `left` as the left frame
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result
def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and is compatible in versions below 0.20.0.
    The direction parameter was added in version 0.20.0 and introduces
    "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    merged : DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world times-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    # Delegate to the asof-specialized merge operation; `how` is pinned to
    # the internal "asof" mode.
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()
# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    # Class-level annotations describe the attributes each instance carries
    # once __init__ has run.
    _merge_type = "merge"
    how: str
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    # get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    # axis on the Series/DataFrame; 0 when the left operand is 1-dimensional
    axis: int
    # axis on the underlying BlockManager (transposed relative to `axis`)
    bm_axis: int
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: bool
    validate: str | None
    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: str = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        axis: int = 1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: bool = False,
        validate: str | None = None,
    ) -> None:
        """
        Validate inputs and resolve/coerce the merge keys.

        Note: construction has side effects on `self.left`/`self.right`
        (see _get_merge_keys and the "cross" rewrite below); the original
        operands are preserved as `orig_left`/`orig_right`.
        """
        # _validate_operand rejects non-frame inputs; behavior for Series
        # operands depends on its implementation elsewhere in this module.
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis --> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # warn user when merging between different levels
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "merging between different levels is deprecated and will be removed "
                f"in a future version. ({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            # stacklevel chosen to be correct when this is reached via pd.merge
            # (and not DataFrame.join)
            warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        # For how="cross", rewrite both operands with a synthetic join-key
        # column; _cross remembers its name so it can be dropped afterwards.
        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)
    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.

        A None indexer means that side is already aligned with `join_index`
        and is taken as-is; -1 entries in an indexer introduce missing rows
        (via the manager's NA proxy).
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        # Resolve overlapping column labels using the configured suffixes.
        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None:
            # Pinning the index here (and in the right code just below) is not
            # necessary, but makes the `.take` more performant if we have e.g.
            # a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor(lmgr)
            left.index = join_index

        if right_indexer is not None:
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor(rmgr)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result
    def get_result(self, copy: bool = True) -> DataFrame:
        """
        Execute the merge and return the result DataFrame.

        Pipeline: optional indicator columns -> compute join index/indexers
        -> row-align and column-concat both sides -> indicator post-processing
        -> restore join-key columns / index levels -> drop any synthetic
        cross-join column.
        """
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        # The following three calls modify `result` in place.
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")
791 def _maybe_drop_cross_column(
792 self, result: DataFrame, cross_col: str | None
793 ) -> None:
794 if cross_col is not None:
795 del result[cross_col]
797 @cache_readonly
798 def _indicator_name(self) -> str | None:
799 if isinstance(self.indicator, str):
800 return self.indicator
801 elif isinstance(self.indicator, bool):
802 return "_merge" if self.indicator else None
803 else:
804 raise ValueError(
805 "indicator option can only accept boolean or string arguments"
806 )
    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:
        """
        Return copies of both operands with int8 marker columns
        ("_left_indicator" = 1, "_right_indicator" = 2) added, after checking
        that neither marker name nor the indicator column name collides with
        an existing column.
        """

        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        # Copy so the markers never touch the caller's frames.
        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right
    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
        """
        Convert the marker columns into a single categorical indicator column
        and drop the markers.

        After the merge, a missing side leaves NaN in its marker; filling with
        0 makes the sum 1 (left only), 2 (right only) or 3 (both), which maps
        onto the categories below.
        """

        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result
    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):

                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)
    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: np.ndarray | None,
        right_indexer: np.ndarray | None,
    ) -> None:
        """
        Fill (or insert) the join-key columns of `result` in place.

        For an outer-style merge a key column can have rows that exist on only
        one side; here the values from both sides are combined so the key
        column is complete. `left_indexer`/`right_indexer` are the row
        indexers produced by the join, with -1 marking a missing row.
        """

        # Lazily-computed flags: whether either indexer contains -1.
        left_has_missing = None
        right_has_missing = None

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            # take_left/take_right hold the source values to re-take for this
            # key, when the values already in `result` are not sufficient.
            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None and is_array_like(self.left_join_keys[i]):
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    # TODO: can we pin down take_right's type earlier?
                    take_right = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(take_right.dtype)
                    rvals = algos.take_nd(take_right, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
                if mask_left.all():  # type: ignore[union-attr]
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])

                if result._is_label_reference(name):
                    result[name] = Series(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        # Rebuild the MultiIndex with this level replaced.
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)
998 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
999 """return the join indexers"""
1000 return get_join_indexers(
1001 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
1002 )
    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        """
        Compute the result index and the row indexers for both sides.

        Returns (join_index, left_indexer, right_indexer); a None indexer
        means that side needs no re-take.
        """

        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            # index-on-index join: delegate entirely to Index.join
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            # mirror of the case above, with the operands (and indexers) swapped
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
            else:
                # neither side joins on its index: result gets a fresh RangeIndex-like
                join_index = Index(np.arange(len(left_indexer)))

        if len(join_index) == 0:
            join_index = join_index.astype(object)
        return join_index, left_indexer, right_indexer
1064 def _create_join_index(
1065 self,
1066 index: Index,
1067 other_index: Index,
1068 indexer: npt.NDArray[np.intp],
1069 how: str = "left",
1070 ) -> Index:
1071 """
1072 Create a join index by rearranging one index to match another
1074 Parameters
1075 ----------
1076 index : Index being rearranged
1077 other_index : Index used to supply values not found in index
1078 indexer : np.ndarray[np.intp] how to rearrange index
1079 how : str
1080 Replacement is only necessary if indexer based on other_index.
1082 Returns
1083 -------
1084 Index
1085 """
1086 if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
1087 # if final index requires values in other_index but not target
1088 # index, indexer may hold missing (-1) values, causing Index.take
1089 # to take the final value in target index. So, we set the last
1090 # element to be the desired fill value. We do not use allow_fill
1091 # and fill_value because it throws a ValueError on integer indices
1092 mask = indexer == -1
1093 if np.any(mask):
1094 fill_value = na_value_for_dtype(index.dtype, compat=False)
1095 index = index.append(Index([fill_value]))
1096 return index.take(indexer)
    def _get_merge_keys(self):
        """
        Resolve left_on/right_on/indexes into parallel lists of key arrays.

        Note: has side effects (copy/delete key columns); when the same label
        is used on both sides it is dropped from one frame (see
        ``left_drop``/``right_drop`` below) so it does not appear twice in the
        output.

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys, join_names
        """
        # left_keys, right_keys entries can actually be anything listlike
        # with a 'dtype' attr
        left_keys: list[AnyArrayLike] = []
        right_keys: list[AnyArrayLike] = []
        join_names: list[Hashable] = []
        right_drop = []
        left_drop = []

        left, right = self.left, self.right

        # a "key" passed directly as an array must match the frame's length;
        # anything else is treated as a column/level label
        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            # keys specified on both sides; pair them up position by position
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    lk = cast(AnyArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            # same label on both sides: keep only one copy to
                            # avoid key upcast in corner case (length-0)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            # keys only on the left; the right side joins on its index
            for k in self.left_on:
                if is_lkey(k):
                    k = cast(AnyArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    #  the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                # materialize each level of the right MultiIndex as an array
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            # keys only on the right; the left side joins on its index
            for k in self.right_on:
                if is_rkey(k):
                    k = cast(AnyArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    #  the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                # materialize each level of the left MultiIndex as an array
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        # side effect: drop the duplicated key column(s) recorded above
        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names
    def _maybe_coerce_merge_keys(self) -> None:
        """
        Coerce left/right key columns to compatible dtypes, or raise.

        Mutates ``self.left`` / ``self.right`` in place (on a copy) when a
        key column must be cast; raises ``ValueError`` for obviously
        incompatible dtype pairs.
        """
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            # length-0 on one side only: nothing to compare, leave as-is
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk_is_cat = is_categorical_dtype(lk.dtype)
            rk_is_cat = is_categorical_dtype(rk.dtype)
            lk_is_object = is_object_dtype(lk.dtype)
            rk_is_object = is_object_dtype(rk.dtype)

            # if either left or right is a categorical
            # then the must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                # only one side categorical: fall through to the cast at the
                # bottom of the loop
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and "
                f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                elif is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # warn when float keys would lose information if treated
                    # as ints (the merge itself still proceeds)
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                elif is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # mirror of the branch above with the sides swapped
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                # let's infer and see if we are ok
                elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = lk.categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = rk.categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)
1375 def _create_cross_configuration(
1376 self, left: DataFrame, right: DataFrame
1377 ) -> tuple[DataFrame, DataFrame, str, str]:
1378 """
1379 Creates the configuration to dispatch the cross operation to inner join,
1380 e.g. adding a join column and resetting parameters. Join column is added
1381 to a new object, no inplace modification
1383 Parameters
1384 ----------
1385 left : DataFrame
1386 right : DataFrame
1388 Returns
1389 -------
1390 a tuple (left, right, how, cross_col) representing the adjusted
1391 DataFrames with cross_col, the merge operation set to inner and the column
1392 to join over.
1393 """
1394 cross_col = f"_cross_{uuid.uuid4()}"
1395 how = "inner"
1396 return (
1397 left.assign(**{cross_col: 1}),
1398 right.assign(**{cross_col: 1}),
1399 how,
1400 cross_col,
1401 )
    def _validate_left_right_on(self, left_on, right_on):
        """
        Normalize and cross-check the on/left_on/right_on/index arguments.

        Returns the resolved ``(left_on, right_on)`` lists (possibly padded
        with ``None`` placeholders when one side joins on its index) or
        raises ``MergeError``/``ValueError`` on inconsistent combinations.
        """
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        if self.how == "cross":
            # a cross join takes no keys at all
            if (
                self.left_index
                or self.right_index
                or right_on is not None
                or left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
        # Hm, any way to make this logic less complicated??
        elif self.on is None and left_on is None and right_on is None:

            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            # 'on' is mutually exclusive with left_on/right_on and the indexes
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                # placeholder keys; the right side joins on its index levels
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                # placeholder keys; the left side joins on its index levels
                left_on = [None] * n
        if self.how != "cross" and len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on
1494 def _validate(self, validate: str) -> None:
1496 # Check uniqueness of each
1497 if self.left_index:
1498 left_unique = self.orig_left.index.is_unique
1499 else:
1500 left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
1502 if self.right_index:
1503 right_unique = self.orig_right.index.is_unique
1504 else:
1505 right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
1507 # Check data integrity
1508 if validate in ["one_to_one", "1:1"]:
1509 if not left_unique and not right_unique:
1510 raise MergeError(
1511 "Merge keys are not unique in either left "
1512 "or right dataset; not a one-to-one merge"
1513 )
1514 elif not left_unique:
1515 raise MergeError(
1516 "Merge keys are not unique in left dataset; not a one-to-one merge"
1517 )
1518 elif not right_unique:
1519 raise MergeError(
1520 "Merge keys are not unique in right dataset; not a one-to-one merge"
1521 )
1523 elif validate in ["one_to_many", "1:m"]:
1524 if not left_unique:
1525 raise MergeError(
1526 "Merge keys are not unique in left dataset; not a one-to-many merge"
1527 )
1529 elif validate in ["many_to_one", "m:1"]:
1530 if not right_unique:
1531 raise MergeError(
1532 "Merge keys are not unique in right dataset; "
1533 "not a many-to-one merge"
1534 )
1536 elif validate in ["many_to_many", "m:m"]:
1537 pass
1539 else:
1540 raise ValueError("Not a valid argument for validate")
def get_join_indexers(
    left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Compute the row indexers realizing a join of ``left_keys`` with
    ``right_keys``.

    Parameters
    ----------
    left_keys : ndarray, Index, Series
    right_keys : ndarray, Index, Series
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp]
        Indexer into the left_keys.
    np.ndarray[np.intp]
        Indexer into the right_keys.
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_key and right_keys must be the same length"

    # fast-path for empty left/right
    n_left = len(left_keys[0])
    n_right = len(right_keys[0])
    if n_left == 0:
        if how in ("left", "inner", "cross"):
            return _get_empty_indexer()
        if not sort and how in ("right", "outer"):
            return _get_no_sort_one_missing_indexer(n_right, True)
    elif n_right == 0:
        if how in ("right", "inner", "cross"):
            return _get_empty_indexer()
        if not sort and how in ("left", "outer"):
            return _get_no_sort_one_missing_indexer(n_left, False)

    # factorize each key pair into integer codes plus the number of
    # distinct values at each location
    factorized = [
        _factorize_keys(lk, rk, sort=sort, how=how)
        for lk, rk in zip(left_keys, right_keys)
    ]
    llab, rlab, shape = (list(x) for x in zip(*factorized))

    # collapse the per-level codes into one flat i8 key per row
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize the flat keys to a dense i8 space; `count` is the number of
    # unique keys, i.e. set(lkey) | set(rkey) == range(count)
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)

    # preserve frame order for left/right joins when sort == False
    kwargs = copy.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort

    if how == "inner":
        return libjoin.inner_join(lkey, rkey, count, **kwargs)
    elif how == "left":
        return libjoin.left_outer_join(lkey, rkey, count, **kwargs)
    elif how == "right":
        # a right join is a left join with the operands swapped
        ridx, lidx = libjoin.left_outer_join(rkey, lkey, count, **kwargs)
        return lidx, ridx
    elif how == "outer":
        return libjoin.full_outer_join(lkey, rkey, count, **kwargs)
    else:
        # match the KeyError previously raised by the dict dispatch
        raise KeyError(how)
def restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index: Index,
    lindexer: npt.NDArray[np.intp],
    rindexer: npt.NDArray[np.intp],
) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]:
    """
    *this is an internal non-public method*

    Rebuild the levels, codes and names of a multi-index to multi-index
    join by re-attaching the non-common ("dropped") levels to the join
    result. ``lindexer``/``rindexer`` give the positions of ``left`` and
    ``right`` that participated in the join.

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : Index
        the index of the join between the
        common levels of left and right
    lindexer : np.ndarray[np.intp]
        left indexer
    rindexer : np.ndarray[np.intp]
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : np.ndarray[np.intp]
        labels of combined multiindexes
    names : List[Hashable]
        names of combined multiindex levels
    """

    def _as_multiindex(idx: Index) -> MultiIndex:
        if isinstance(idx, MultiIndex):
            return idx
        return MultiIndex.from_arrays([idx._values], names=[idx.name])

    # joins over a single overlapping level come back as a flat Index;
    # promote it so the dropped levels can be appended below
    join_index = _as_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # a None indexer means that side participated with every row, in order
    if lindexer is None:
        lindexer = range(left.size)
    if rindexer is None:
        rindexer = range(right.size)

    for level_name in dropped_level_names:
        # each dropped level lives on exactly one side
        if level_name in left.names:
            source, indexer = left, lindexer
        else:
            source, indexer = right, rindexer

        pos = source.names.index(level_name)
        restored_level = source.levels[pos]
        # a -1 in the indexer marks a row with no match; propagate it into
        # the restored codes as a missing value
        restored_codes = algos.take_nd(source.codes[pos], indexer, fill_value=-1)

        join_levels = join_levels + [restored_level]
        join_codes = join_codes + [restored_codes]
        join_names = join_names + [level_name]

    return join_levels, join_codes, join_names
class _OrderedMerge(_MergeOperation):
    """
    Merge operation backing ``pd.merge_ordered``: an (outer by default)
    merge that can forward-fill the row indexers so gaps are filled from
    the previous matched row.
    """

    _merge_type = "ordered_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        axis: int = 1,
        suffixes: Suffixes = ("_x", "_y"),
        fill_method: str | None = None,
        how: str = "outer",
    ) -> None:

        # stored for get_result; only "ffill" triggers special handling
        self.fill_method = fill_method
        super().__init__(
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self, copy: bool = True) -> DataFrame:
        """Execute the ordered merge and return the combined DataFrame."""
        join_index, left_indexer, right_indexer = self._get_join_info()

        # validates/resolves overlapping column labels via the suffixes
        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        left_join_indexer: np.ndarray | None
        right_join_indexer: np.ndarray | None

        if self.fill_method == "ffill":
            # forward-fill the -1 gaps in both indexers before reindexing
            if left_indexer is None:
                raise TypeError("left_indexer cannot be None")
            left_indexer = cast(np.ndarray, left_indexer)
            right_indexer = cast(np.ndarray, right_indexer)
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        result = self._reindex_and_concat(
            join_index, left_join_indexer, right_join_indexer, copy=copy
        )
        # materialize the join keys as columns where required; note this uses
        # the un-filled indexers on purpose
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result
1771def _asof_by_function(direction: str):
1772 name = f"asof_join_{direction}_on_X_by_Y"
1773 return getattr(libjoin, name, None)
# Map a cython type name (as produced by _get_cython_type_upcast) to the
# ensure_* helper that coerces an array to that representation.
_type_casters = {
    "int64_t": ensure_int64,
    "double": ensure_float64,
    "object": ensure_object,
}
1783def _get_cython_type_upcast(dtype: DtypeObj) -> str:
1784 """Upcast a dtype to 'int64_t', 'double', or 'object'"""
1785 if is_integer_dtype(dtype):
1786 return "int64_t"
1787 elif is_float_dtype(dtype):
1788 return "double"
1789 else:
1790 return "object"
class _AsOfMerge(_OrderedMerge):
    """
    Merge operation backing ``pd.merge_asof``: for each left row, take the
    nearest right key (per ``direction``), optionally grouped by the ``by``
    columns and limited by ``tolerance``.
    """

    _merge_type = "asof_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        by=None,
        left_by=None,
        right_by=None,
        axis: int = 1,
        suffixes: Suffixes = ("_x", "_y"),
        copy: bool = True,
        fill_method: str | None = None,
        how: str = "asof",
        tolerance=None,
        allow_exact_matches: bool = True,
        direction: str = "backward",
    ) -> None:

        # asof-specific options; validated in _validate_left_right_on and
        # _get_merge_keys below
        self.by = by
        self.left_by = left_by
        self.right_by = right_by
        self.tolerance = tolerance
        self.allow_exact_matches = allow_exact_matches
        self.direction = direction

        _OrderedMerge.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            axis=axis,
            how=how,
            suffixes=suffixes,
            fill_method=fill_method,
        )

    def _validate_left_right_on(self, left_on, right_on):
        left_on, right_on = super()._validate_left_right_on(left_on, right_on)

        # we only allow on to be a single item for on
        if len(left_on) != 1 and not self.left_index:
            raise MergeError("can only asof on a key for left")

        if len(right_on) != 1 and not self.right_index:
            raise MergeError("can only asof on a key for right")

        if self.left_index and isinstance(self.left.index, MultiIndex):
            raise MergeError("left can only have one index")

        if self.right_index and isinstance(self.right.index, MultiIndex):
            raise MergeError("right can only have one index")

        # set 'by' columns: 'by' is shorthand for identical left_by/right_by
        if self.by is not None:
            if self.left_by is not None or self.right_by is not None:
                raise MergeError("Can only pass by OR left_by and right_by")
            self.left_by = self.right_by = self.by
        if self.left_by is None and self.right_by is not None:
            raise MergeError("missing left_by")
        if self.left_by is not None and self.right_by is None:
            raise MergeError("missing right_by")

        # GH#29130 Check that merge keys do not have dtype object
        if not self.left_index:
            left_on_0 = left_on[0]
            if is_array_like(left_on_0):
                lo_dtype = left_on_0.dtype
            else:
                # label may name a column or an index level
                lo_dtype = (
                    self.left[left_on_0].dtype
                    if left_on_0 in self.left.columns
                    else self.left.index.get_level_values(left_on_0)
                )
        else:
            lo_dtype = self.left.index.dtype

        if not self.right_index:
            right_on_0 = right_on[0]
            if is_array_like(right_on_0):
                ro_dtype = right_on_0.dtype
            else:
                # label may name a column or an index level
                ro_dtype = (
                    self.right[right_on_0].dtype
                    if right_on_0 in self.right.columns
                    else self.right.index.get_level_values(right_on_0)
                )
        else:
            ro_dtype = self.right.index.dtype

        if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype):
            raise MergeError(
                f"Incompatible merge dtype, {repr(ro_dtype)} and "
                f"{repr(lo_dtype)}, both sides must have numeric dtype"
            )

        # add 'by' to our key-list so we can have it in the
        # output as a key
        if self.left_by is not None:
            if not is_list_like(self.left_by):
                self.left_by = [self.left_by]
            if not is_list_like(self.right_by):
                self.right_by = [self.right_by]

            if len(self.left_by) != len(self.right_by):
                raise MergeError("left_by and right_by must be same length")

            # the 'by' keys come first; the asof key stays last
            left_on = self.left_by + list(left_on)
            right_on = self.right_by + list(right_on)

        # check 'direction' is valid
        if self.direction not in ["backward", "forward", "nearest"]:
            raise MergeError(f"direction invalid: {self.direction}")

        return left_on, right_on

    def _get_merge_keys(self):
        """Delegate to the base implementation, then validate asof options."""

        # note this function has side effects
        (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()

        # validate index types are the same
        for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
            if not is_dtype_equal(lk.dtype, rk.dtype):
                if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
                    # The generic error message is confusing for categoricals.
                    #
                    # In this function, the join keys include both the original
                    # ones of the merge_asof() call, and also the keys passed
                    # to its by= argument. Unordered but equal categories
                    # are not supported for the former, but will fail
                    # later with a ValueError, so we don't *need* to check
                    # for them here.
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, both sides category, but not equal ones"
                    )
                else:
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, must be the same type"
                    )
                raise MergeError(msg)

        # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
        if self.tolerance is not None:

            if self.left_index:
                lt = self.left.index
            else:
                # tolerance applies to the asof key, which is last in the list
                lt = left_join_keys[-1]

            msg = (
                f"incompatible tolerance {self.tolerance}, must be compat "
                f"with type {repr(lt.dtype)}"
            )

            if needs_i8_conversion(lt):
                # datetimelike key: tolerance must be a timedelta
                if not isinstance(self.tolerance, datetime.timedelta):
                    raise MergeError(msg)
                if self.tolerance < Timedelta(0):
                    raise MergeError("tolerance must be positive")

            elif is_integer_dtype(lt):
                if not is_integer(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            elif is_float_dtype(lt):
                if not is_number(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            else:
                raise MergeError("key must be integer, timestamp or float")

        # validate allow_exact_matches
        if not is_bool(self.allow_exact_matches):
            msg = (
                "allow_exact_matches must be boolean, "
                f"passed {self.allow_exact_matches}"
            )
            raise MergeError(msg)

        return left_join_keys, right_join_keys, join_names

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""

        def flip(xs) -> np.ndarray:
            """unlike np.transpose, this returns an array of tuples"""

            def injection(obj):
                # map each column to a plain ndarray usable in a structured
                # array constructor
                if not is_extension_array_dtype(obj):
                    # ndarray
                    return obj
                obj = extract_array(obj)
                if isinstance(obj, NDArrayBackedExtensionArray):
                    # fastpath for e.g. dt64tz, categorical
                    return obj._ndarray
                # FIXME: returning obj._values_for_argsort() here doesn't
                #  break in any existing test cases, but i (@jbrockmendel)
                #  am pretty sure it should!
                # e.g.
                #  arr = pd.array([0, pd.NA, 255], dtype="UInt8")
                #  will have values_for_argsort (before GH#45434)
                #  np.array([0, 255, 255], dtype=np.uint8)
                #  and the non-injectivity should make a difference somehow
                #  shouldn't it?
                return np.asarray(obj)

            xs = [injection(x) for x in xs]
            # build a structured dtype with one field per input column so
            # each row becomes a single comparable record
            labels = list(string.ascii_lowercase[: len(xs)])
            dtypes = [x.dtype for x in xs]
            labeled_dtypes = list(zip(labels, dtypes))
            return np.array(list(zip(*xs)), labeled_dtypes)

        # values to compare
        left_values = (
            self.left.index._values if self.left_index else self.left_join_keys[-1]
        )
        right_values = (
            self.right.index._values if self.right_index else self.right_join_keys[-1]
        )
        tolerance = self.tolerance

        # we require sortedness and non-null values in the join keys
        if not Index(left_values).is_monotonic_increasing:
            side = "left"
            if isna(left_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            else:
                raise ValueError(f"{side} keys must be sorted")

        if not Index(right_values).is_monotonic_increasing:
            side = "right"
            if isna(right_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            else:
                raise ValueError(f"{side} keys must be sorted")

        # initial type conversion as needed: datetimelikes are compared as
        # raw i8 values, with the tolerance converted to nanoseconds
        if needs_i8_conversion(left_values):
            left_values = left_values.view("i8")
            right_values = right_values.view("i8")
            if tolerance is not None:
                tolerance = Timedelta(tolerance)
                tolerance = tolerance.value

        # a "by" parameter requires special handling
        if self.left_by is not None:
            # remove 'on' parameter from values if one existed
            if self.left_index and self.right_index:
                left_by_values = self.left_join_keys
                right_by_values = self.right_join_keys
            else:
                left_by_values = self.left_join_keys[0:-1]
                right_by_values = self.right_join_keys[0:-1]

            # get tuple representation of values if more than one
            if len(left_by_values) == 1:
                left_by_values = left_by_values[0]
                right_by_values = right_by_values[0]
            else:
                # We get here with non-ndarrays in test_merge_by_col_tz_aware
                #  and test_merge_groupby_multiple_column_with_categorical_column
                left_by_values = flip(left_by_values)
                right_by_values = flip(right_by_values)

            # upcast 'by' parameter because HashTable is limited
            by_type = _get_cython_type_upcast(left_by_values.dtype)
            by_type_caster = _type_casters[by_type]
            # error: Cannot call function of unknown type
            left_by_values = by_type_caster(left_by_values)  # type: ignore[operator]
            # error: Cannot call function of unknown type
            right_by_values = by_type_caster(right_by_values)  # type: ignore[operator]

            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(
                left_values,
                right_values,
                left_by_values,
                right_by_values,
                self.allow_exact_matches,
                tolerance,
            )
        else:
            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(
                left_values,
                right_values,
                None,
                None,
                self.allow_exact_matches,
                tolerance,
                False,
            )
def _get_multiindex_indexer(
    join_keys, index: MultiIndex, sort: bool
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Compute left-outer-join indexers when the right side is a MultiIndex.

    Each level of ``index`` is factorized against the corresponding array in
    ``join_keys``; the per-level codes are then collapsed into flat int64
    keys which are joined with ``libjoin.left_outer_join``.

    Parameters
    ----------
    join_keys : list of array-like
        One key array per level of ``index`` (the left side's keys).
    index : MultiIndex
        The right side's index.
    sort : bool
        Whether the join keys should be sorted before joining.

    Returns
    -------
    np.ndarray[np.intp]
        Left indexer.
    np.ndarray[np.intp]
        Right indexer.
    """
    # left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(index.levels[n], join_keys[n], sort=sort)
        for n in range(index.nlevels)
    )
    zipped = zip(*mapped)
    rcodes, lcodes, shape = (list(x) for x in zipped)
    if sort:
        # re-map the right codes through the sorted factorization
        rcodes = list(map(np.take, rcodes, index.codes))
    else:
        # unsorted: the MultiIndex codes are already the right labels;
        # copy to i8 so the in-place null fix-up below is safe
        i8copy = lambda a: a.astype("i8", subok=False, copy=True)
        rcodes = list(map(i8copy, index.codes))

    # fix right labels if there were any nulls
    for i in range(len(join_keys)):
        mask = index.codes[i] == -1
        if mask.any():
            # check if there already was any nulls at this location
            # if there was, it is factorized to `shape[i] - 1`
            a = join_keys[i][lcodes[i] == shape[i] - 1]
            # `not a[0] != a[0]` is True when a[0] is NOT NaN, i.e. the
            # last group is a real value, so the nulls need a new group
            if a.size == 0 or not a[0] != a[0]:
                shape[i] += 1

            rcodes[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    return libjoin.left_outer_join(lkey, rkey, count, sort=sort)
def _get_single_indexer(
    join_key, index: Index, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Compute left-outer-join indexers for a single (non-MultiIndex) key.

    Parameters
    ----------
    join_key : array-like
        The left side's key values.
    index : Index
        The right side's index.
    sort : bool, default False
        Whether the join keys should be sorted before joining.

    Returns
    -------
    np.ndarray[np.intp]
        Left indexer.
    np.ndarray[np.intp]
        Right indexer.
    """
    lkey, rkey, n_groups = _factorize_keys(join_key, index._values, sort=sort)
    return libjoin.left_outer_join(lkey, rkey, n_groups, sort=sort)
2152def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2153 """Return empty join indexers."""
2154 return (
2155 np.array([], dtype=np.intp),
2156 np.array([], dtype=np.intp),
2157 )
2160def _get_no_sort_one_missing_indexer(
2161 n: int, left_missing: bool
2162) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2163 """
2164 Return join indexers where all of one side is selected without sorting
2165 and none of the other side is selected.
2167 Parameters
2168 ----------
2169 n : int
2170 Length of indexers to create.
2171 left_missing : bool
2172 If True, the left indexer will contain only -1's.
2173 If False, the right indexer will contain only -1's.
2175 Returns
2176 -------
2177 np.ndarray[np.intp]
2178 Left indexer
2179 np.ndarray[np.intp]
2180 Right indexer
2181 """
2182 idx = np.arange(n, dtype=np.intp)
2183 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
2184 if left_missing:
2185 return idx_missing, idx
2186 return idx, idx_missing
def _left_join_on_index(
    left_ax: Index, right_ax: Index, join_keys, sort: bool = False
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
    """
    Perform a left join of ``left_ax`` against ``right_ax`` on ``join_keys``.

    Returns
    -------
    Index
        The resulting join index.
    np.ndarray[np.intp] or None
        Left indexer; None when the left axis is preserved unchanged.
    np.ndarray[np.intp]
        Right indexer.
    """
    if len(join_keys) > 1:
        is_valid_multi = (
            isinstance(right_ax, MultiIndex) and right_ax.nlevels == len(join_keys)
        )
        if not is_valid_multi:
            raise AssertionError(
                "If more than one join key is given then "
                "'right_ax' must be a MultiIndex and the "
                "number of join keys must be the number of levels in right_ax"
            )

        left_indexer, right_indexer = _get_multiindex_indexer(
            join_keys, right_ax, sort=sort
        )
    else:
        left_indexer, right_indexer = _get_single_indexer(
            join_keys[0], right_ax, sort=sort
        )

    if not sort and len(left_indexer) == len(left_ax):
        # left frame preserves order & length of its index
        return left_ax, None, right_indexer

    # asked to sort, or there were 1-to-many matches: materialize the take
    join_index = left_ax.take(left_indexer)
    return join_index, left_indexer, right_indexer
def _factorize_keys(
    lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
    """
    Encode left and right keys as enumerated types.

    This is used to get the join indexers to be used when merging DataFrames.

    Parameters
    ----------
    lk : array-like
        Left key.
    rk : array-like
        Right key.
    sort : bool, defaults to True
        If True, the encoding is done such that the unique elements in the
        keys are sorted.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        Type of merge.

    Returns
    -------
    np.ndarray[np.intp]
        Left (resp. right if called with `how="right"`) labels, as enumerated type.
    np.ndarray[np.intp]
        Right (resp. left if called with `how="right"`) labels, as enumerated type.
    int
        Number of unique elements in union of left and right labels.

    See Also
    --------
    merge : Merge DataFrame or named Series objects
        with a database-style join.
    algorithms.factorize : Encode the object as an enumerated type
        or categorical variable.

    Examples
    --------
    >>> lk = np.array(["a", "c", "b"])
    >>> rk = np.array(["a", "c"])

    Here, the unique values are `'a', 'b', 'c'`. With the default
    `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk)
    (array([0, 2, 1]), array([0, 2]), 3)

    With the `sort=False`, the encoding will correspond to the order
    in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
    (array([0, 1, 2]), array([0, 1]), 3)
    """
    # Some pre-processing for non-ndarray lk / rk
    lk = extract_array(lk, extract_numpy=True, extract_range=True)
    rk = extract_array(rk, extract_numpy=True, extract_range=True)
    # TODO: if either is a RangeIndex, we can likely factorize more efficiently?

    if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype):
        # Extract the ndarray (UTC-localized) values
        # Note: we dont need the dtypes to match, as these can still be compared
        # TODO(non-nano): need to make sure resolutions match
        lk = cast("DatetimeArray", lk)._ndarray
        rk = cast("DatetimeArray", rk)._ndarray

    elif (
        is_categorical_dtype(lk.dtype)
        and is_categorical_dtype(rk.dtype)
        and is_dtype_equal(lk.dtype, rk.dtype)
    ):
        assert isinstance(lk, Categorical)
        assert isinstance(rk, Categorical)
        # Cast rk to encoding so we can compare codes with lk

        rk = lk._encode_with_my_categories(rk)

        # factorize on the (shared) category codes rather than the values
        lk = ensure_int64(lk.codes)
        rk = ensure_int64(rk.codes)

    elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
        lk, _ = lk._values_for_factorize()

        # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
        # "_values_for_factorize"
        rk, _ = rk._values_for_factorize()  # type: ignore[union-attr]

    # pick the hashtable-backed factorizer matching the (possibly converted)
    # key dtype: int64 fast path where possible, object fallback otherwise
    klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer]
    if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
        # GH#23917 TODO: needs tests for case where lk is integer-dtype
        # and rk is datetime-dtype
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(np.asarray(lk))
        rk = ensure_int64(np.asarray(rk))

    elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
        # GH#23917 TODO: Needs tests for non-matching dtypes
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(np.asarray(lk, dtype=np.int64))
        rk = ensure_int64(np.asarray(rk, dtype=np.int64))

    else:
        klass = libhashtable.ObjectFactorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    # the same factorizer is used for both sides so equal keys share codes
    # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
    # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
    # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
    llab = rizer.factorize(lk)  # type: ignore[arg-type]
    # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
    # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
    # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
    rlab = rizer.factorize(rk)  # type: ignore[arg-type]
    assert llab.dtype == np.dtype(np.intp), llab.dtype
    assert rlab.dtype == np.dtype(np.intp), rlab.dtype

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group: -1 sentinels produced by factorization are remapped into
    # their own trailing group id (`count`) so NAs join against each other
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    # for a right merge the join is performed with sides swapped, so the
    # labels are returned swapped to match
    if how == "right":
        return rlab, llab, count
    return llab, rlab, count
def _sort_labels(
    uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Remap left/right codes so they refer to the *sorted* uniques.

    Both code arrays are remapped through a single ``safe_sort`` call by
    concatenating them first, then splitting the result back apart.
    """
    n_left = len(left)
    combined = np.concatenate([left, right])

    _, remapped = algos.safe_sort(uniques, combined, na_sentinel=-1)

    return remapped[:n_left], remapped[n_left:]
def _get_join_keys(llab, rlab, shape, sort: bool):
    """
    Collapse multi-level label arrays into single flat int64 join keys.

    Levels are combined with a mixed-radix encoding in chunks small enough
    not to overflow int64; if not all levels fit at once, the partial keys
    are densified via factorization and the remainder is folded in
    recursively.

    Parameters
    ----------
    llab, rlab : list of np.ndarray
        Per-level label codes for the left and right sides.
    shape : list of int
        Number of distinct labels at each level.
    sort : bool
        Passed through to `_factorize_keys` when densifying.

    Returns
    -------
    tuple of np.ndarray
        Flat int64 keys for the left and right sides.
    """
    # largest prefix of levels whose combined cardinality fits in int64
    nlev = next(
        lev
        for lev in range(len(shape), 0, -1)
        if not is_int64_overflow_possible(shape[:lev])
    )

    # mixed-radix encode the first `nlev` levels
    stride = np.prod(shape[1:nlev], dtype="i8")
    lkey = llab[0].astype("i8", subok=False, copy=False) * stride
    rkey = rlab[0].astype("i8", subok=False, copy=False) * stride

    for lev in range(1, nlev):
        with np.errstate(divide="ignore"):
            stride //= shape[lev]
        lkey += llab[lev] * stride
        rkey += rlab[lev] * stride

    if nlev == len(shape):
        # every level consumed — done
        return lkey, rkey

    # densify the partial keys to avoid overflow, then recurse on the
    # remaining levels with the compressed keys as the new first level
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    return _get_join_keys(
        [lkey] + llab[nlev:],
        [rkey] + rlab[nlev:],
        [count] + shape[nlev:],
        sort,
    )
2407def _should_fill(lname, rname) -> bool:
2408 if not isinstance(lname, str) or not isinstance(rname, str):
2409 return True
2410 return lname == rname
2413def _any(x) -> bool:
2414 return x is not None and com.any_not_none(*x)
2417def _validate_operand(obj: DataFrame | Series) -> DataFrame:
2418 if isinstance(obj, ABCDataFrame):
2419 return obj
2420 elif isinstance(obj, ABCSeries):
2421 if obj.name is None:
2422 raise ValueError("Cannot merge a Series without a name")
2423 else:
2424 return obj.to_frame()
2425 else:
2426 raise TypeError(
2427 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
2428 )
2431def _items_overlap_with_suffix(
2432 left: Index, right: Index, suffixes: Suffixes
2433) -> tuple[Index, Index]:
2434 """
2435 Suffixes type validation.
2437 If two indices overlap, add suffixes to overlapping entries.
2439 If corresponding suffix is empty, the entry is simply converted to string.
2441 """
2442 if not is_list_like(suffixes, allow_sets=False):
2443 warnings.warn(
2444 f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give "
2445 "unexpected results. Provide 'suffixes' as a tuple instead. In the "
2446 "future a 'TypeError' will be raised.",
2447 FutureWarning,
2448 stacklevel=find_stack_level(),
2449 )
2451 to_rename = left.intersection(right)
2452 if len(to_rename) == 0:
2453 return left, right
2455 lsuffix, rsuffix = suffixes
2457 if not lsuffix and not rsuffix:
2458 raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
2460 def renamer(x, suffix):
2461 """
2462 Rename the left and right indices.
2464 If there is overlap, and suffix is not None, add
2465 suffix, otherwise, leave it as-is.
2467 Parameters
2468 ----------
2469 x : original column name
2470 suffix : str or None
2472 Returns
2473 -------
2474 x : renamed column name
2475 """
2476 if x in to_rename and suffix is not None:
2477 return f"{x}{suffix}"
2478 return x
2480 lrenamer = partial(renamer, suffix=lsuffix)
2481 rrenamer = partial(renamer, suffix=rsuffix)
2483 llabels = left._transform_index(lrenamer)
2484 rlabels = right._transform_index(rrenamer)
2486 dups = []
2487 if not llabels.is_unique:
2488 # Only warn when duplicates are caused because of suffixes, already duplicated
2489 # columns in origin should not warn
2490 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
2491 if not rlabels.is_unique:
2492 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
2493 if dups:
2494 warnings.warn(
2495 f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
2496 f"result is deprecated and will raise a MergeError in a future version.",
2497 FutureWarning,
2498 stacklevel=find_stack_level(),
2499 )
2501 return llabels, rlabels