Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/reshape/merge.py: 7%
867 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2SQL-style merge routines
3"""
4from __future__ import annotations
6import copy
7import datetime
8from functools import partial
9import string
10from typing import (
11 TYPE_CHECKING,
12 Hashable,
13 Sequence,
14 cast,
15)
16import uuid
17import warnings
19import numpy as np
21from pandas._libs import (
22 Timedelta,
23 hashtable as libhashtable,
24 join as libjoin,
25 lib,
26)
27from pandas._typing import (
28 AnyArrayLike,
29 ArrayLike,
30 DtypeObj,
31 IndexLabel,
32 Suffixes,
33 npt,
34)
35from pandas.errors import MergeError
36from pandas.util._decorators import (
37 Appender,
38 Substitution,
39 cache_readonly,
40)
41from pandas.util._exceptions import find_stack_level
43from pandas.core.dtypes.cast import find_common_type
44from pandas.core.dtypes.common import (
45 ensure_float64,
46 ensure_int64,
47 ensure_object,
48 is_array_like,
49 is_bool,
50 is_bool_dtype,
51 is_categorical_dtype,
52 is_dtype_equal,
53 is_extension_array_dtype,
54 is_float_dtype,
55 is_integer,
56 is_integer_dtype,
57 is_list_like,
58 is_number,
59 is_numeric_dtype,
60 is_object_dtype,
61 needs_i8_conversion,
62)
63from pandas.core.dtypes.dtypes import DatetimeTZDtype
64from pandas.core.dtypes.generic import (
65 ABCDataFrame,
66 ABCSeries,
67)
68from pandas.core.dtypes.missing import (
69 isna,
70 na_value_for_dtype,
71)
73from pandas import (
74 Categorical,
75 Index,
76 MultiIndex,
77 Series,
78)
79import pandas.core.algorithms as algos
80from pandas.core.arrays import ExtensionArray
81from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
82import pandas.core.common as com
83from pandas.core.construction import extract_array
84from pandas.core.frame import _merge_doc
85from pandas.core.sorting import is_int64_overflow_possible
if TYPE_CHECKING:
88 from pandas import DataFrame
89 from pandas.core import groupby
90 from pandas.core.arrays import DatetimeArray
@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: str = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool = True,
    indicator: bool = False,
    validate: str | None = None,
) -> DataFrame:
    # Thin public entry point: all key resolution, dtype coercion, and
    # validation happen inside _MergeOperation.__init__; `copy` is consumed
    # only when the result frame is materialized.
    operation = _MergeOperation(
        left,
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
    )
    return operation.get_result(copy=copy)
if __debug__:
    # Interpolate the shared merge docstring for the function form.
    # NOTE(review): guarded by __debug__, presumably so this is skipped when
    # docstrings may be stripped under `python -OO` — confirm.
    merge.__doc__ = _merge_doc % "\nleft : DataFrame"
def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging

    Returns
    -------
    tuple of (merged DataFrame, the left groupby object)
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in left
                # build an all-NaN right side with the left rows preserved,
                # keeping left columns first and appending the new right ones
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    # align every piece to the first piece's column order
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
def merge_ordered(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: str = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

        .. versionchanged:: 0.25.0
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    elif left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        # note the argument swap: pieces come from grouping `right`, but the
        # merge itself still treats the caller's `left` as the left frame
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result
def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and is compatible in versions below 0.20.0.
    The direction parameter was added in version 0.20.0 and introduces
    "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    merged : DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world times-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    # Delegate to the asof-specialized merge operation; `how` is pinned to
    # the internal "asof" mode.
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()
# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    # Class-level annotations describe the attributes each instance carries
    # once __init__ has run.
    _merge_type = "merge"
    how: str
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    # get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    # axis on the Series/DataFrame; 0 when the left operand is 1-dimensional
    axis: int
    # axis on the underlying BlockManager (transposed relative to `axis`)
    bm_axis: int
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: bool
    validate: str | None
    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: str = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        axis: int = 1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: bool = False,
        validate: str | None = None,
    ) -> None:
        """
        Validate inputs and resolve/coerce the merge keys.

        Note: construction has side effects on `self.left`/`self.right`
        (see _get_merge_keys and the "cross" rewrite below); the original
        operands are preserved as `orig_left`/`orig_right`.
        """
        # _validate_operand rejects non-frame inputs; behavior for Series
        # operands depends on its implementation elsewhere in this module.
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis --> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # warn user when merging between different levels
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "merging between different levels is deprecated and will be removed "
                f"in a future version. ({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            # stacklevel chosen to be correct when this is reached via pd.merge
            # (and not DataFrame.join)
            warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        # For how="cross", rewrite both operands with a synthetic join-key
        # column; _cross remembers its name so it can be dropped afterwards.
        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)
    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.

        A None indexer means that side is already aligned with `join_index`
        and is taken as-is; -1 entries in an indexer introduce missing rows
        (via the manager's NA proxy).
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        # Resolve overlapping column labels using the configured suffixes.
        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None:
            # Pinning the index here (and in the right code just below) is not
            # necessary, but makes the `.take` more performant if we have e.g.
            # a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor(lmgr)
            left.index = join_index

        if right_indexer is not None:
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor(rmgr)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result
    def get_result(self, copy: bool = True) -> DataFrame:
        """
        Execute the merge and return the result DataFrame.

        Pipeline: optional indicator columns -> compute join index/indexers
        -> row-align and column-concat both sides -> indicator post-processing
        -> restore join-key columns / index levels -> drop any synthetic
        cross-join column.
        """
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        # The following three calls modify `result` in place.
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")
791 def _maybe_drop_cross_column(
792 self, result: DataFrame, cross_col: str | None
793 ) -> None:
794 if cross_col is not None:
795 del result[cross_col]
797 @cache_readonly
798 def _indicator_name(self) -> str | None:
799 if isinstance(self.indicator, str):
800 return self.indicator
801 elif isinstance(self.indicator, bool):
802 return "_merge" if self.indicator else None
803 else:
804 raise ValueError(
805 "indicator option can only accept boolean or string arguments"
806 )
    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:
        """
        Return copies of both operands with int8 marker columns
        ("_left_indicator" = 1, "_right_indicator" = 2) added, after checking
        that neither marker name nor the indicator column name collides with
        an existing column.
        """

        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        # Copy so the markers never touch the caller's frames.
        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right
    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
        """
        Convert the marker columns into a single categorical indicator column
        and drop the markers.

        After the merge, a missing side leaves NaN in its marker; filling with
        0 makes the sum 1 (left only), 2 (right only) or 3 (both), which maps
        onto the categories below.
        """

        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result
    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):

                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)
    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: np.ndarray | None,
        right_indexer: np.ndarray | None,
    ) -> None:
        """
        Fill (or insert) the join-key columns of `result` in place.

        For an outer-style merge a key column can have rows that exist on only
        one side; here the values from both sides are combined so the key
        column is complete. `left_indexer`/`right_indexer` are the row
        indexers produced by the join, with -1 marking a missing row.
        """

        # Lazily-computed flags: whether either indexer contains -1.
        left_has_missing = None
        right_has_missing = None

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            # take_left/take_right hold the source values to re-take for this
            # key, when the values already in `result` are not sufficient.
            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None and is_array_like(self.left_join_keys[i]):
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    # TODO: can we pin down take_right's type earlier?
                    take_right = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(take_right.dtype)
                    rvals = algos.take_nd(take_right, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
                if mask_left.all():  # type: ignore[union-attr]
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])

                if result._is_label_reference(name):
                    result[name] = Series(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        # Rebuild the MultiIndex with this level replaced.
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)
998 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
999 """return the join indexers"""
1000 return get_join_indexers(
1001 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
1002 )
    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        """
        Compute the result index and the row indexers for both sides.

        Returns (join_index, left_indexer, right_indexer); a None indexer
        means that side needs no re-take.
        """

        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            # index-on-index join: delegate entirely to Index.join
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            # mirror of the case above, with the operands (and indexers) swapped
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
            else:
                # neither side joins on its index: result gets a fresh RangeIndex-like
                join_index = Index(np.arange(len(left_indexer)))

        if len(join_index) == 0:
            join_index = join_index.astype(object)
        return join_index, left_indexer, right_indexer
1064 def _create_join_index(
1065 self,
1066 index: Index,
1067 other_index: Index,
1068 indexer: npt.NDArray[np.intp],
1069 how: str = "left",
1070 ) -> Index:
1071 """
1072 Create a join index by rearranging one index to match another
1074 Parameters
1075 ----------
1076 index : Index being rearranged
1077 other_index : Index used to supply values not found in index
1078 indexer : np.ndarray[np.intp] how to rearrange index
1079 how : str
1080 Replacement is only necessary if indexer based on other_index.
1082 Returns
1083 -------
1084 Index
1085 """
1086 if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
1087 # if final index requires values in other_index but not target
1088 # index, indexer may hold missing (-1) values, causing Index.take
1089 # to take the final value in target index. So, we set the last
1090 # element to be the desired fill value. We do not use allow_fill
1091 # and fill_value because it throws a ValueError on integer indices
1092 mask = indexer == -1
1093 if np.any(mask):
1094 fill_value = na_value_for_dtype(index.dtype, compat=False)
1095 index = index.append(Index([fill_value]))
1096 return index.take(indexer)
    def _get_merge_keys(self):
        """
        Resolve left_on/right_on/indexes into parallel lists of key arrays.

        Note: has side effects (copy/delete key columns); when the same label
        is used on both sides it is dropped from one frame (see
        ``left_drop``/``right_drop`` below) so it does not appear twice in the
        output.

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys, join_names
        """
        # left_keys, right_keys entries can actually be anything listlike
        # with a 'dtype' attr
        left_keys: list[AnyArrayLike] = []
        right_keys: list[AnyArrayLike] = []
        join_names: list[Hashable] = []
        right_drop = []
        left_drop = []

        left, right = self.left, self.right

        # a "key" passed directly as an array must match the frame's length;
        # anything else is treated as a column/level label
        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            # keys specified on both sides; pair them up position by position
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    lk = cast(AnyArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            # same label on both sides: keep only one copy to
                            # avoid key upcast in corner case (length-0)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            # keys only on the left; the right side joins on its index
            for k in self.left_on:
                if is_lkey(k):
                    k = cast(AnyArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    #  the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                # materialize each level of the right MultiIndex as an array
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            # keys only on the right; the left side joins on its index
            for k in self.right_on:
                if is_rkey(k):
                    k = cast(AnyArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    #  the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                # materialize each level of the left MultiIndex as an array
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        # side effect: drop the duplicated key column(s) recorded above
        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names
    def _maybe_coerce_merge_keys(self) -> None:
        """
        Coerce left/right key columns to compatible dtypes, or raise.

        Mutates ``self.left`` / ``self.right`` in place (on a copy) when a
        key column must be cast; raises ``ValueError`` for obviously
        incompatible dtype pairs.
        """
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            # length-0 on one side only: nothing to compare, leave as-is
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk_is_cat = is_categorical_dtype(lk.dtype)
            rk_is_cat = is_categorical_dtype(rk.dtype)
            lk_is_object = is_object_dtype(lk.dtype)
            rk_is_object = is_object_dtype(rk.dtype)

            # if either left or right is a categorical
            # then the must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                # only one side categorical: fall through to the cast at the
                # bottom of the loop
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and "
                f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                elif is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # warn when float keys would lose information if treated
                    # as ints (the merge itself still proceeds)
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                elif is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # mirror of the branch above with the sides swapped
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                # let's infer and see if we are ok
                elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = lk.categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = rk.categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)
1375 def _create_cross_configuration(
1376 self, left: DataFrame, right: DataFrame
1377 ) -> tuple[DataFrame, DataFrame, str, str]:
1378 """
1379 Creates the configuration to dispatch the cross operation to inner join,
1380 e.g. adding a join column and resetting parameters. Join column is added
1381 to a new object, no inplace modification
1383 Parameters
1384 ----------
1385 left : DataFrame
1386 right : DataFrame
1388 Returns
1389 -------
1390 a tuple (left, right, how, cross_col) representing the adjusted
1391 DataFrames with cross_col, the merge operation set to inner and the column
1392 to join over.
1393 """
1394 cross_col = f"_cross_{uuid.uuid4()}"
1395 how = "inner"
1396 return (
1397 left.assign(**{cross_col: 1}),
1398 right.assign(**{cross_col: 1}),
1399 how,
1400 cross_col,
1401 )
    def _validate_left_right_on(self, left_on, right_on):
        """
        Normalize and cross-check the on/left_on/right_on/index arguments.

        Returns the resolved ``(left_on, right_on)`` lists (possibly padded
        with ``None`` placeholders when one side joins on its index) or
        raises ``MergeError``/``ValueError`` on inconsistent combinations.
        """
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        if self.how == "cross":
            # a cross join takes no keys at all
            if (
                self.left_index
                or self.right_index
                or right_on is not None
                or left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
        # Hm, any way to make this logic less complicated??
        elif self.on is None and left_on is None and right_on is None:

            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            # 'on' is mutually exclusive with left_on/right_on and the indexes
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                # placeholder keys; the right side joins on its index levels
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                # placeholder keys; the left side joins on its index levels
                left_on = [None] * n
        if self.how != "cross" and len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on
1494 def _validate(self, validate: str) -> None:
1496 # Check uniqueness of each
1497 if self.left_index:
1498 left_unique = self.orig_left.index.is_unique
1499 else:
1500 left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
1502 if self.right_index:
1503 right_unique = self.orig_right.index.is_unique
1504 else:
1505 right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
1507 # Check data integrity
1508 if validate in ["one_to_one", "1:1"]:
1509 if not left_unique and not right_unique:
1510 raise MergeError(
1511 "Merge keys are not unique in either left "
1512 "or right dataset; not a one-to-one merge"
1513 )
1514 elif not left_unique:
1515 raise MergeError(
1516 "Merge keys are not unique in left dataset; not a one-to-one merge"
1517 )
1518 elif not right_unique:
1519 raise MergeError(
1520 "Merge keys are not unique in right dataset; not a one-to-one merge"
1521 )
1523 elif validate in ["one_to_many", "1:m"]:
1524 if not left_unique:
1525 raise MergeError(
1526 "Merge keys are not unique in left dataset; not a one-to-many merge"
1527 )
1529 elif validate in ["many_to_one", "m:1"]:
1530 if not right_unique:
1531 raise MergeError(
1532 "Merge keys are not unique in right dataset; "
1533 "not a many-to-one merge"
1534 )
1536 elif validate in ["many_to_many", "m:m"]:
1537 pass
1539 else:
1540 raise ValueError("Not a valid argument for validate")
def get_join_indexers(
    left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Compute the row indexers realizing a join of ``left_keys`` with
    ``right_keys``.

    Parameters
    ----------
    left_keys : ndarray, Index, Series
    right_keys : ndarray, Index, Series
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp]
        Indexer into the left_keys.
    np.ndarray[np.intp]
        Indexer into the right_keys.
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_key and right_keys must be the same length"

    # fast-path for empty left/right
    n_left = len(left_keys[0])
    n_right = len(right_keys[0])
    if n_left == 0:
        if how in ("left", "inner", "cross"):
            return _get_empty_indexer()
        if not sort and how in ("right", "outer"):
            return _get_no_sort_one_missing_indexer(n_right, True)
    elif n_right == 0:
        if how in ("right", "inner", "cross"):
            return _get_empty_indexer()
        if not sort and how in ("left", "outer"):
            return _get_no_sort_one_missing_indexer(n_left, False)

    # factorize each key pair into integer codes plus the number of
    # distinct values at each location
    factorized = [
        _factorize_keys(lk, rk, sort=sort, how=how)
        for lk, rk in zip(left_keys, right_keys)
    ]
    llab, rlab, shape = (list(x) for x in zip(*factorized))

    # collapse the per-level codes into one flat i8 key per row
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize the flat keys to a dense i8 space; `count` is the number of
    # unique keys, i.e. set(lkey) | set(rkey) == range(count)
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)

    # preserve frame order for left/right joins when sort == False
    kwargs = copy.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort

    if how == "inner":
        return libjoin.inner_join(lkey, rkey, count, **kwargs)
    elif how == "left":
        return libjoin.left_outer_join(lkey, rkey, count, **kwargs)
    elif how == "right":
        # a right join is a left join with the operands swapped
        ridx, lidx = libjoin.left_outer_join(rkey, lkey, count, **kwargs)
        return lidx, ridx
    elif how == "outer":
        return libjoin.full_outer_join(lkey, rkey, count, **kwargs)
    else:
        # match the KeyError previously raised by the dict dispatch
        raise KeyError(how)
def restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index: Index,
    lindexer: npt.NDArray[np.intp],
    rindexer: npt.NDArray[np.intp],
) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]:
    """
    *this is an internal non-public method*

    Rebuild the levels, codes and names of a multi-index to multi-index
    join by re-attaching the non-common ("dropped") levels to the join
    result. ``lindexer``/``rindexer`` give the positions of ``left`` and
    ``right`` that participated in the join.

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : Index
        the index of the join between the
        common levels of left and right
    lindexer : np.ndarray[np.intp]
        left indexer
    rindexer : np.ndarray[np.intp]
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : np.ndarray[np.intp]
        labels of combined multiindexes
    names : List[Hashable]
        names of combined multiindex levels
    """

    def _as_multiindex(idx: Index) -> MultiIndex:
        if isinstance(idx, MultiIndex):
            return idx
        return MultiIndex.from_arrays([idx._values], names=[idx.name])

    # joins over a single overlapping level come back as a flat Index;
    # promote it so the dropped levels can be appended below
    join_index = _as_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # a None indexer means that side participated with every row, in order
    if lindexer is None:
        lindexer = range(left.size)
    if rindexer is None:
        rindexer = range(right.size)

    for level_name in dropped_level_names:
        # each dropped level lives on exactly one side
        if level_name in left.names:
            source, indexer = left, lindexer
        else:
            source, indexer = right, rindexer

        pos = source.names.index(level_name)
        restored_level = source.levels[pos]
        # a -1 in the indexer marks a row with no match; propagate it into
        # the restored codes as a missing value
        restored_codes = algos.take_nd(source.codes[pos], indexer, fill_value=-1)

        join_levels = join_levels + [restored_level]
        join_codes = join_codes + [restored_codes]
        join_names = join_names + [level_name]

    return join_levels, join_codes, join_names
class _OrderedMerge(_MergeOperation):
    """
    Merge operation backing ``pd.merge_ordered``: an (outer by default)
    merge that can forward-fill the row indexers so gaps are filled from
    the previous matched row.
    """

    _merge_type = "ordered_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        axis: int = 1,
        suffixes: Suffixes = ("_x", "_y"),
        fill_method: str | None = None,
        how: str = "outer",
    ) -> None:

        # stored for get_result; only "ffill" triggers special handling
        self.fill_method = fill_method
        super().__init__(
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self, copy: bool = True) -> DataFrame:
        """Execute the ordered merge and return the combined DataFrame."""
        join_index, left_indexer, right_indexer = self._get_join_info()

        # validates/resolves overlapping column labels via the suffixes
        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        left_join_indexer: np.ndarray | None
        right_join_indexer: np.ndarray | None

        if self.fill_method == "ffill":
            # forward-fill the -1 gaps in both indexers before reindexing
            if left_indexer is None:
                raise TypeError("left_indexer cannot be None")
            left_indexer = cast(np.ndarray, left_indexer)
            right_indexer = cast(np.ndarray, right_indexer)
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        result = self._reindex_and_concat(
            join_index, left_join_indexer, right_join_indexer, copy=copy
        )
        # materialize the join keys as columns where required; note this uses
        # the un-filled indexers on purpose
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result
1771def _asof_by_function(direction: str):
1772 name = f"asof_join_{direction}_on_X_by_Y"
1773 return getattr(libjoin, name, None)
# Map a cython type name (as produced by _get_cython_type_upcast) to the
# ensure_* helper that coerces an array to that representation.
_type_casters = {
    "int64_t": ensure_int64,
    "double": ensure_float64,
    "object": ensure_object,
}
1783def _get_cython_type_upcast(dtype: DtypeObj) -> str:
1784 """Upcast a dtype to 'int64_t', 'double', or 'object'"""
1785 if is_integer_dtype(dtype):
1786 return "int64_t"
1787 elif is_float_dtype(dtype):
1788 return "double"
1789 else:
1790 return "object"
class _AsOfMerge(_OrderedMerge):
    """
    Merge operation backing ``pd.merge_asof``: for each left row, take the
    nearest right key (per ``direction``), optionally grouped by the ``by``
    columns and limited by ``tolerance``.
    """

    _merge_type = "asof_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        by=None,
        left_by=None,
        right_by=None,
        axis: int = 1,
        suffixes: Suffixes = ("_x", "_y"),
        copy: bool = True,
        fill_method: str | None = None,
        how: str = "asof",
        tolerance=None,
        allow_exact_matches: bool = True,
        direction: str = "backward",
    ) -> None:

        # asof-specific options; validated in _validate_left_right_on and
        # _get_merge_keys below
        self.by = by
        self.left_by = left_by
        self.right_by = right_by
        self.tolerance = tolerance
        self.allow_exact_matches = allow_exact_matches
        self.direction = direction

        _OrderedMerge.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            axis=axis,
            how=how,
            suffixes=suffixes,
            fill_method=fill_method,
        )

    def _validate_left_right_on(self, left_on, right_on):
        left_on, right_on = super()._validate_left_right_on(left_on, right_on)

        # we only allow on to be a single item for on
        if len(left_on) != 1 and not self.left_index:
            raise MergeError("can only asof on a key for left")

        if len(right_on) != 1 and not self.right_index:
            raise MergeError("can only asof on a key for right")

        if self.left_index and isinstance(self.left.index, MultiIndex):
            raise MergeError("left can only have one index")

        if self.right_index and isinstance(self.right.index, MultiIndex):
            raise MergeError("right can only have one index")

        # set 'by' columns: 'by' is shorthand for identical left_by/right_by
        if self.by is not None:
            if self.left_by is not None or self.right_by is not None:
                raise MergeError("Can only pass by OR left_by and right_by")
            self.left_by = self.right_by = self.by
        if self.left_by is None and self.right_by is not None:
            raise MergeError("missing left_by")
        if self.left_by is not None and self.right_by is None:
            raise MergeError("missing right_by")

        # GH#29130 Check that merge keys do not have dtype object
        if not self.left_index:
            left_on_0 = left_on[0]
            if is_array_like(left_on_0):
                lo_dtype = left_on_0.dtype
            else:
                # label may name a column or an index level
                lo_dtype = (
                    self.left[left_on_0].dtype
                    if left_on_0 in self.left.columns
                    else self.left.index.get_level_values(left_on_0)
                )
        else:
            lo_dtype = self.left.index.dtype

        if not self.right_index:
            right_on_0 = right_on[0]
            if is_array_like(right_on_0):
                ro_dtype = right_on_0.dtype
            else:
                # label may name a column or an index level
                ro_dtype = (
                    self.right[right_on_0].dtype
                    if right_on_0 in self.right.columns
                    else self.right.index.get_level_values(right_on_0)
                )
        else:
            ro_dtype = self.right.index.dtype

        if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype):
            raise MergeError(
                f"Incompatible merge dtype, {repr(ro_dtype)} and "
                f"{repr(lo_dtype)}, both sides must have numeric dtype"
            )

        # add 'by' to our key-list so we can have it in the
        # output as a key
        if self.left_by is not None:
            if not is_list_like(self.left_by):
                self.left_by = [self.left_by]
            if not is_list_like(self.right_by):
                self.right_by = [self.right_by]

            if len(self.left_by) != len(self.right_by):
                raise MergeError("left_by and right_by must be same length")

            # the 'by' keys come first; the asof key stays last
            left_on = self.left_by + list(left_on)
            right_on = self.right_by + list(right_on)

        # check 'direction' is valid
        if self.direction not in ["backward", "forward", "nearest"]:
            raise MergeError(f"direction invalid: {self.direction}")

        return left_on, right_on

    def _get_merge_keys(self):
        """Delegate to the base implementation, then validate asof options."""

        # note this function has side effects
        (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()

        # validate index types are the same
        for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
            if not is_dtype_equal(lk.dtype, rk.dtype):
                if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
                    # The generic error message is confusing for categoricals.
                    #
                    # In this function, the join keys include both the original
                    # ones of the merge_asof() call, and also the keys passed
                    # to its by= argument. Unordered but equal categories
                    # are not supported for the former, but will fail
                    # later with a ValueError, so we don't *need* to check
                    # for them here.
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, both sides category, but not equal ones"
                    )
                else:
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, must be the same type"
                    )
                raise MergeError(msg)

        # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
        if self.tolerance is not None:

            if self.left_index:
                lt = self.left.index
            else:
                # tolerance applies to the asof key, which is last in the list
                lt = left_join_keys[-1]

            msg = (
                f"incompatible tolerance {self.tolerance}, must be compat "
                f"with type {repr(lt.dtype)}"
            )

            if needs_i8_conversion(lt):
                # datetimelike key: tolerance must be a timedelta
                if not isinstance(self.tolerance, datetime.timedelta):
                    raise MergeError(msg)
                if self.tolerance < Timedelta(0):
                    raise MergeError("tolerance must be positive")

            elif is_integer_dtype(lt):
                if not is_integer(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            elif is_float_dtype(lt):
                if not is_number(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            else:
                raise MergeError("key must be integer, timestamp or float")

        # validate allow_exact_matches
        if not is_bool(self.allow_exact_matches):
            msg = (
                "allow_exact_matches must be boolean, "
                f"passed {self.allow_exact_matches}"
            )
            raise MergeError(msg)

        return left_join_keys, right_join_keys, join_names

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""

        def flip(xs) -> np.ndarray:
            """unlike np.transpose, this returns an array of tuples"""

            def injection(obj):
                # map each column to a plain ndarray usable in a structured
                # array constructor
                if not is_extension_array_dtype(obj):
                    # ndarray
                    return obj
                obj = extract_array(obj)
                if isinstance(obj, NDArrayBackedExtensionArray):
                    # fastpath for e.g. dt64tz, categorical
                    return obj._ndarray
                # FIXME: returning obj._values_for_argsort() here doesn't
                #  break in any existing test cases, but i (@jbrockmendel)
                #  am pretty sure it should!
                # e.g.
                #  arr = pd.array([0, pd.NA, 255], dtype="UInt8")
                #  will have values_for_argsort (before GH#45434)
                #  np.array([0, 255, 255], dtype=np.uint8)
                #  and the non-injectivity should make a difference somehow
                #  shouldn't it?
                return np.asarray(obj)

            xs = [injection(x) for x in xs]
            # build a structured dtype with one field per input column so
            # each row becomes a single comparable record
            labels = list(string.ascii_lowercase[: len(xs)])
            dtypes = [x.dtype for x in xs]
            labeled_dtypes = list(zip(labels, dtypes))
            return np.array(list(zip(*xs)), labeled_dtypes)

        # values to compare
        left_values = (
            self.left.index._values if self.left_index else self.left_join_keys[-1]
        )
        right_values = (
            self.right.index._values if self.right_index else self.right_join_keys[-1]
        )
        tolerance = self.tolerance

        # we require sortedness and non-null values in the join keys
        if not Index(left_values).is_monotonic_increasing:
            side = "left"
            if isna(left_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            else:
                raise ValueError(f"{side} keys must be sorted")

        if not Index(right_values).is_monotonic_increasing:
            side = "right"
            if isna(right_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            else:
                raise ValueError(f"{side} keys must be sorted")

        # initial type conversion as needed: datetimelikes are compared as
        # raw i8 values, with the tolerance converted to nanoseconds
        if needs_i8_conversion(left_values):
            left_values = left_values.view("i8")
            right_values = right_values.view("i8")
            if tolerance is not None:
                tolerance = Timedelta(tolerance)
                tolerance = tolerance.value

        # a "by" parameter requires special handling
        if self.left_by is not None:
            # remove 'on' parameter from values if one existed
            if self.left_index and self.right_index:
                left_by_values = self.left_join_keys
                right_by_values = self.right_join_keys
            else:
                left_by_values = self.left_join_keys[0:-1]
                right_by_values = self.right_join_keys[0:-1]

            # get tuple representation of values if more than one
            if len(left_by_values) == 1:
                left_by_values = left_by_values[0]
                right_by_values = right_by_values[0]
            else:
                # We get here with non-ndarrays in test_merge_by_col_tz_aware
                #  and test_merge_groupby_multiple_column_with_categorical_column
                left_by_values = flip(left_by_values)
                right_by_values = flip(right_by_values)

            # upcast 'by' parameter because HashTable is limited
            by_type = _get_cython_type_upcast(left_by_values.dtype)
            by_type_caster = _type_casters[by_type]
            # error: Cannot call function of unknown type
            left_by_values = by_type_caster(left_by_values)  # type: ignore[operator]
            # error: Cannot call function of unknown type
            right_by_values = by_type_caster(right_by_values)  # type: ignore[operator]

            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(
                left_values,
                right_values,
                left_by_values,
                right_by_values,
                self.allow_exact_matches,
                tolerance,
            )
        else:
            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(
                left_values,
                right_values,
                None,
                None,
                self.allow_exact_matches,
                tolerance,
                False,
            )
def _get_multiindex_indexer(
    join_keys, index: MultiIndex, sort: bool
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Compute left-outer-join indexers when the right side is a MultiIndex.

    Each level of ``index`` is factorized against the corresponding array in
    ``join_keys``; the per-level codes are then collapsed into flat int64
    keys which are joined with ``libjoin.left_outer_join``.

    Parameters
    ----------
    join_keys : list of array-like
        One key array per level of ``index`` (the left side's keys).
    index : MultiIndex
        The right side's index.
    sort : bool
        Whether the join keys should be sorted before joining.

    Returns
    -------
    np.ndarray[np.intp]
        Left indexer.
    np.ndarray[np.intp]
        Right indexer.
    """
    # left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(index.levels[n], join_keys[n], sort=sort)
        for n in range(index.nlevels)
    )
    zipped = zip(*mapped)
    rcodes, lcodes, shape = (list(x) for x in zipped)
    if sort:
        # re-map the right codes through the sorted factorization
        rcodes = list(map(np.take, rcodes, index.codes))
    else:
        # unsorted: the MultiIndex codes are already the right labels;
        # copy to i8 so the in-place null fix-up below is safe
        i8copy = lambda a: a.astype("i8", subok=False, copy=True)
        rcodes = list(map(i8copy, index.codes))

    # fix right labels if there were any nulls
    for i in range(len(join_keys)):
        mask = index.codes[i] == -1
        if mask.any():
            # check if there already was any nulls at this location
            # if there was, it is factorized to `shape[i] - 1`
            a = join_keys[i][lcodes[i] == shape[i] - 1]
            # `not a[0] != a[0]` is True when a[0] is NOT NaN, i.e. the
            # last group is a real value, so the nulls need a new group
            if a.size == 0 or not a[0] != a[0]:
                shape[i] += 1

            rcodes[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    return libjoin.left_outer_join(lkey, rkey, count, sort=sort)
def _get_single_indexer(
    join_key, index: Index, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Compute left-outer-join indexers for a single (non-MultiIndex) key.

    Parameters
    ----------
    join_key : array-like
        The left side's key values.
    index : Index
        The right side's index.
    sort : bool, default False
        Whether the join keys should be sorted before joining.

    Returns
    -------
    np.ndarray[np.intp]
        Left indexer.
    np.ndarray[np.intp]
        Right indexer.
    """
    lkey, rkey, n_groups = _factorize_keys(join_key, index._values, sort=sort)
    return libjoin.left_outer_join(lkey, rkey, n_groups, sort=sort)
2152def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2153 """Return empty join indexers."""
2154 return (
2155 np.array([], dtype=np.intp),
2156 np.array([], dtype=np.intp),
2157 )
2160def _get_no_sort_one_missing_indexer(
2161 n: int, left_missing: bool
2162) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2163 """
2164 Return join indexers where all of one side is selected without sorting
2165 and none of the other side is selected.
2167 Parameters
2168 ----------
2169 n : int
2170 Length of indexers to create.
2171 left_missing : bool
2172 If True, the left indexer will contain only -1's.
2173 If False, the right indexer will contain only -1's.
2175 Returns
2176 -------
2177 np.ndarray[np.intp]
2178 Left indexer
2179 np.ndarray[np.intp]
2180 Right indexer
2181 """
2182 idx = np.arange(n, dtype=np.intp)
2183 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
2184 if left_missing:
2185 return idx_missing, idx
2186 return idx, idx_missing
def _left_join_on_index(
    left_ax: Index, right_ax: Index, join_keys, sort: bool = False
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
    """
    Perform a left join of ``left_ax`` against ``right_ax`` on ``join_keys``.

    Returns
    -------
    Index
        The resulting join index.
    np.ndarray[np.intp] or None
        Left indexer; None when the left axis is preserved unchanged.
    np.ndarray[np.intp]
        Right indexer.
    """
    if len(join_keys) > 1:
        is_valid_multi = (
            isinstance(right_ax, MultiIndex) and right_ax.nlevels == len(join_keys)
        )
        if not is_valid_multi:
            raise AssertionError(
                "If more than one join key is given then "
                "'right_ax' must be a MultiIndex and the "
                "number of join keys must be the number of levels in right_ax"
            )

        left_indexer, right_indexer = _get_multiindex_indexer(
            join_keys, right_ax, sort=sort
        )
    else:
        left_indexer, right_indexer = _get_single_indexer(
            join_keys[0], right_ax, sort=sort
        )

    if not sort and len(left_indexer) == len(left_ax):
        # left frame preserves order & length of its index
        return left_ax, None, right_indexer

    # asked to sort, or there were 1-to-many matches: materialize the take
    join_index = left_ax.take(left_indexer)
    return join_index, left_indexer, right_indexer
def _factorize_keys(
    lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
    """
    Encode left and right keys as enumerated types.

    This is used to get the join indexers to be used when merging DataFrames.

    Parameters
    ----------
    lk : array-like
        Left key.
    rk : array-like
        Right key.
    sort : bool, defaults to True
        If True, the encoding is done such that the unique elements in the
        keys are sorted.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        Type of merge.

    Returns
    -------
    np.ndarray[np.intp]
        Left (resp. right if called with `how="right"`) labels, as enumerated type.
    np.ndarray[np.intp]
        Right (resp. left if called with `how="right"`) labels, as enumerated type.
    int
        Number of unique elements in union of left and right labels.

    See Also
    --------
    merge : Merge DataFrame or named Series objects
        with a database-style join.
    algorithms.factorize : Encode the object as an enumerated type
        or categorical variable.

    Examples
    --------
    >>> lk = np.array(["a", "c", "b"])
    >>> rk = np.array(["a", "c"])

    Here, the unique values are `'a', 'b', 'c'`. With the default
    `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk)
    (array([0, 2, 1]), array([0, 2]), 3)

    With the `sort=False`, the encoding will correspond to the order
    in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
    (array([0, 1, 2]), array([0, 1]), 3)
    """
    # Some pre-processing for non-ndarray lk / rk
    lk = extract_array(lk, extract_numpy=True, extract_range=True)
    rk = extract_array(rk, extract_numpy=True, extract_range=True)
    # TODO: if either is a RangeIndex, we can likely factorize more efficiently?

    if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype):
        # Extract the ndarray (UTC-localized) values
        # Note: we dont need the dtypes to match, as these can still be compared
        # TODO(non-nano): need to make sure resolutions match
        lk = cast("DatetimeArray", lk)._ndarray
        rk = cast("DatetimeArray", rk)._ndarray

    elif (
        is_categorical_dtype(lk.dtype)
        and is_categorical_dtype(rk.dtype)
        and is_dtype_equal(lk.dtype, rk.dtype)
    ):
        assert isinstance(lk, Categorical)
        assert isinstance(rk, Categorical)
        # Cast rk to encoding so we can compare codes with lk

        rk = lk._encode_with_my_categories(rk)

        # factorize on the (shared) category codes rather than the values
        lk = ensure_int64(lk.codes)
        rk = ensure_int64(rk.codes)

    elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
        lk, _ = lk._values_for_factorize()

        # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
        # "_values_for_factorize"
        rk, _ = rk._values_for_factorize()  # type: ignore[union-attr]

    # pick the hashtable-backed factorizer matching the (possibly converted)
    # key dtype: int64 fast path where possible, object fallback otherwise
    klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer]
    if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
        # GH#23917 TODO: needs tests for case where lk is integer-dtype
        # and rk is datetime-dtype
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(np.asarray(lk))
        rk = ensure_int64(np.asarray(rk))

    elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
        # GH#23917 TODO: Needs tests for non-matching dtypes
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(np.asarray(lk, dtype=np.int64))
        rk = ensure_int64(np.asarray(rk, dtype=np.int64))

    else:
        klass = libhashtable.ObjectFactorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    # the same factorizer is used for both sides so equal keys share codes
    # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
    # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
    # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
    llab = rizer.factorize(lk)  # type: ignore[arg-type]
    # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
    # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
    # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
    rlab = rizer.factorize(rk)  # type: ignore[arg-type]
    assert llab.dtype == np.dtype(np.intp), llab.dtype
    assert rlab.dtype == np.dtype(np.intp), rlab.dtype

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group: -1 sentinels produced by factorization are remapped into
    # their own trailing group id (`count`) so NAs join against each other
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    # for a right merge the join is performed with sides swapped, so the
    # labels are returned swapped to match
    if how == "right":
        return rlab, llab, count
    return llab, rlab, count
def _sort_labels(
    uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Remap left/right codes so they refer to the *sorted* uniques.

    Both code arrays are remapped through a single ``safe_sort`` call by
    concatenating them first, then splitting the result back apart.
    """
    n_left = len(left)
    combined = np.concatenate([left, right])

    _, remapped = algos.safe_sort(uniques, combined, na_sentinel=-1)

    return remapped[:n_left], remapped[n_left:]
def _get_join_keys(llab, rlab, shape, sort: bool):
    """
    Collapse multi-level label arrays into single flat int64 join keys.

    Levels are combined with a mixed-radix encoding in chunks small enough
    not to overflow int64; if not all levels fit at once, the partial keys
    are densified via factorization and the remainder is folded in
    recursively.

    Parameters
    ----------
    llab, rlab : list of np.ndarray
        Per-level label codes for the left and right sides.
    shape : list of int
        Number of distinct labels at each level.
    sort : bool
        Passed through to `_factorize_keys` when densifying.

    Returns
    -------
    tuple of np.ndarray
        Flat int64 keys for the left and right sides.
    """
    # largest prefix of levels whose combined cardinality fits in int64
    nlev = next(
        lev
        for lev in range(len(shape), 0, -1)
        if not is_int64_overflow_possible(shape[:lev])
    )

    # mixed-radix encode the first `nlev` levels
    stride = np.prod(shape[1:nlev], dtype="i8")
    lkey = llab[0].astype("i8", subok=False, copy=False) * stride
    rkey = rlab[0].astype("i8", subok=False, copy=False) * stride

    for lev in range(1, nlev):
        with np.errstate(divide="ignore"):
            stride //= shape[lev]
        lkey += llab[lev] * stride
        rkey += rlab[lev] * stride

    if nlev == len(shape):
        # every level consumed — done
        return lkey, rkey

    # densify the partial keys to avoid overflow, then recurse on the
    # remaining levels with the compressed keys as the new first level
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    return _get_join_keys(
        [lkey] + llab[nlev:],
        [rkey] + rlab[nlev:],
        [count] + shape[nlev:],
        sort,
    )
2407def _should_fill(lname, rname) -> bool:
2408 if not isinstance(lname, str) or not isinstance(rname, str):
2409 return True
2410 return lname == rname
2413def _any(x) -> bool:
2414 return x is not None and com.any_not_none(*x)
2417def _validate_operand(obj: DataFrame | Series) -> DataFrame:
2418 if isinstance(obj, ABCDataFrame):
2419 return obj
2420 elif isinstance(obj, ABCSeries):
2421 if obj.name is None:
2422 raise ValueError("Cannot merge a Series without a name")
2423 else:
2424 return obj.to_frame()
2425 else:
2426 raise TypeError(
2427 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
2428 )
2431def _items_overlap_with_suffix(
2432 left: Index, right: Index, suffixes: Suffixes
2433) -> tuple[Index, Index]:
2434 """
2435 Suffixes type validation.
2437 If two indices overlap, add suffixes to overlapping entries.
2439 If corresponding suffix is empty, the entry is simply converted to string.
2441 """
2442 if not is_list_like(suffixes, allow_sets=False):
2443 warnings.warn(
2444 f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give "
2445 "unexpected results. Provide 'suffixes' as a tuple instead. In the "
2446 "future a 'TypeError' will be raised.",
2447 FutureWarning,
2448 stacklevel=find_stack_level(),
2449 )
2451 to_rename = left.intersection(right)
2452 if len(to_rename) == 0:
2453 return left, right
2455 lsuffix, rsuffix = suffixes
2457 if not lsuffix and not rsuffix:
2458 raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
2460 def renamer(x, suffix):
2461 """
2462 Rename the left and right indices.
2464 If there is overlap, and suffix is not None, add
2465 suffix, otherwise, leave it as-is.
2467 Parameters
2468 ----------
2469 x : original column name
2470 suffix : str or None
2472 Returns
2473 -------
2474 x : renamed column name
2475 """
2476 if x in to_rename and suffix is not None:
2477 return f"{x}{suffix}"
2478 return x
2480 lrenamer = partial(renamer, suffix=lsuffix)
2481 rrenamer = partial(renamer, suffix=rsuffix)
2483 llabels = left._transform_index(lrenamer)
2484 rlabels = right._transform_index(rrenamer)
2486 dups = []
2487 if not llabels.is_unique:
2488 # Only warn when duplicates are caused because of suffixes, already duplicated
2489 # columns in origin should not warn
2490 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
2491 if not rlabels.is_unique:
2492 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
2493 if dups:
2494 warnings.warn(
2495 f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
2496 f"result is deprecated and will raise a MergeError in a future version.",
2497 FutureWarning,
2498 stacklevel=find_stack_level(),
2499 )
2501 return llabels, rlabels