Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/util/hashing.py: 11%
114 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2data hash pandas / numpy objects
3"""
4from __future__ import annotations
6import itertools
7from typing import (
8 TYPE_CHECKING,
9 Hashable,
10 Iterable,
11 Iterator,
12 cast,
13)
15import numpy as np
17from pandas._libs import lib
18from pandas._libs.hashing import hash_object_array
19from pandas._typing import (
20 ArrayLike,
21 npt,
22)
24from pandas.core.dtypes.common import (
25 is_categorical_dtype,
26 is_list_like,
27)
28from pandas.core.dtypes.generic import (
29 ABCDataFrame,
30 ABCExtensionArray,
31 ABCIndex,
32 ABCMultiIndex,
33 ABCSeries,
34)
if TYPE_CHECKING:
37 from pandas import (
38 Categorical,
39 DataFrame,
40 Index,
41 MultiIndex,
42 Series,
43 )
# 16 byte long hashing key; used as the default ``hash_key`` throughout this
# module (it is the fallback whenever a caller passes ``hash_key=None``).
_default_hash_key = "0123456789123456"
def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Fold an iterator of uint64 hash arrays into one combined hash array.

    Mirrors the tuple-hashing scheme in CPython's tupleobject.c.

    Parameters
    ----------
    arrays : Iterator[np.ndarray]
        Iterator of equal-length uint64 arrays.
    num_items : int
        Expected number of arrays; asserted against the actual count.

    Returns
    -------
    np.ndarray[uint64]
    """
    try:
        head = next(arrays)
    except StopIteration:
        # Nothing to combine: yield an empty uint64 result.
        return np.array([], dtype=np.uint64)

    multiplier = np.uint64(1000003)
    combined = np.zeros_like(head) + np.uint64(0x345678)
    consumed = 0
    for arr in itertools.chain([head], arrays):
        remaining = num_items - consumed
        combined ^= arr
        combined *= multiplier
        multiplier += np.uint64(82520 + remaining + remaining)
        consumed += 1
    assert consumed == num_items, "Fed in wrong num_items"
    combined += np.uint64(97531)
    return combined
def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    def _index_hashes():
        # Lazily yield the single hash array for obj.index; it is only
        # computed if combine_hash_arrays actually consumes it.
        yield hash_pandas_object(
            obj.index,
            index=False,
            encoding=encoding,
            hash_key=hash_key,
            categorize=categorize,
        )._values

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    if isinstance(obj, ABCIndex):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        return Series(h, index=obj, dtype="uint64", copy=False)

    if isinstance(obj, ABCSeries):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # Combine the value hash with the index hash (2 items total).
            h = combine_hash_arrays(itertools.chain([h], _index_hashes()), 2)
        return Series(h, index=obj.index, dtype="uint64", copy=False)

    if isinstance(obj, ABCDataFrame):
        column_hashes = (
            hash_array(col._values, encoding, hash_key, categorize)
            for _, col in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            column_hashes = itertools.chain(column_hashes, _index_hashes())
            num_items += 1
        # iter(...) keeps this specifically a generator/iterator for mypy.
        h = combine_hash_arrays(iter(column_hashes), num_items)
        return Series(h, index=obj.index, dtype="uint64", copy=False)

    raise TypeError(f"Unexpected type for hashing {type(obj)}")
def hash_tuples(
    vals: MultiIndex | Iterable[tuple[Hashable, ...]],
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
    """
    Hash a MultiIndex / listlike-of-tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex or listlike-of-tuples
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray[np.uint64] of hashed values
    """
    if not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import (
        Categorical,
        MultiIndex,
    )

    mi = vals if isinstance(vals, ABCMultiIndex) else MultiIndex.from_tuples(vals)

    # One Categorical per level, built straight from the codes/levels
    # (fastpath skips re-validation).
    level_cats = [
        Categorical(mi.codes[lev], mi.levels[lev], ordered=False, fastpath=True)
        for lev in range(mi.nlevels)
    ]

    # Hash each level's Categorical, then fold the per-level hashes together.
    level_hashes = (
        _hash_categorical(cat, encoding=encoding, hash_key=hash_key)
        for cat in level_cats
    )
    return combine_hash_arrays(level_hashes, len(level_cats))
def _hash_categorical(
    cat: Categorical, encoding: str, hash_key: str
) -> npt.NDArray[np.uint64]:
    """
    Hash a Categorical by hashing its categories, then mapping each code to
    its category's hash.

    Parameters
    ----------
    cat : Categorical
    encoding : str
    hash_key : str

    Returns
    -------
    ndarray[np.uint64] of hashed values, same size as len(cat)
    """
    # ExtensionArray categories are converted to plain ndarrays first.
    category_values = np.asarray(cat.categories._values)
    hashed_categories = hash_array(
        category_values, encoding, hash_key, categorize=False
    )

    # uint64 cannot represent missing values directly, and take_nd would
    # coerce to float — so construct the result by hand and use
    # max(np.uint64) as the missing-value sentinel.
    #
    # TODO: GH 15362
    mask = cat.isna()
    if len(hashed_categories):
        result = hashed_categories.take(cat.codes)
    else:
        # No categories at all: everything is missing.
        result = np.zeros(len(mask), dtype="uint64")

    if mask.any():
        result[mask] = lib.u8max

    return result
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    # Categoricals hash their categories once and remap the codes. This check
    # must come before any complex-dtype inspection: asking numpy whether a
    # categorical dtype is a subdtype of complex chokes.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(cast("Categorical", vals), encoding, hash_key)

    if isinstance(vals, ABCExtensionArray):
        vals, _ = vals._values_for_factorize()
    elif not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)
def _hash_ndarray(
    vals: np.ndarray,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    See hash_array.__doc__.
    """
    dtype = vals.dtype

    # 128-bit complex values cannot go through the 64-bit pipeline below,
    # so split into real/imag parts and combine their hashes.
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # Reinterpret everything we can as unsigned 64-bit integers.
    if dtype == bool:
        h = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        h = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        h = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    elif categorize:
        # With repeated values, it's MUCH faster to categorize object dtypes
        # first (hash each distinct value once, then remap). Callers can skip
        # this when the values are known/likely to be unique.
        from pandas import (
            Categorical,
            Index,
            factorize,
        )

        codes, categories = factorize(vals, sort=False)
        cat = Categorical(
            codes, Index._with_infer(categories), ordered=False, fastpath=True
        )
        return _hash_categorical(cat, encoding, hash_key)
    else:
        try:
            h = hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types: stringify, then hash the strings
            h = hash_object_array(vals.astype(str).astype(object), hash_key, encoding)

    # Redistribute these 64-bit ints within the space of 64-bit ints.
    h ^= h >> 30
    h *= np.uint64(0xBF58476D1CE4E5B9)
    h ^= h >> 27
    h *= np.uint64(0x94D049BB133111EB)
    h ^= h >> 31
    return h