Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/util/hashing.py: 11%
114 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2data hash pandas / numpy objects
3"""
4from __future__ import annotations
6import itertools
7from typing import (
8 TYPE_CHECKING,
9 Hashable,
10 Iterable,
11 Iterator,
12 cast,
13)
15import numpy as np
17from pandas._libs import lib
18from pandas._libs.hashing import hash_object_array
19from pandas._typing import (
20 ArrayLike,
21 npt,
22)
24from pandas.core.dtypes.common import (
25 is_categorical_dtype,
26 is_list_like,
27)
28from pandas.core.dtypes.generic import (
29 ABCDataFrame,
30 ABCExtensionArray,
31 ABCIndex,
32 ABCMultiIndex,
33 ABCSeries,
34)
if TYPE_CHECKING:
37 from pandas import (
38 Categorical,
39 DataFrame,
40 Index,
41 MultiIndex,
42 Series,
43 )
# 16 byte long hashing key; used as the default ``hash_key`` throughout this
# module (it is the fallback whenever a caller passes ``hash_key=None``).
_default_hash_key = "0123456789123456"
def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Fold an iterator of uint64 hash arrays into one combined hash array.

    Mirrors the tuple-hashing scheme in CPython's tupleobject.c.

    Parameters
    ----------
    arrays : Iterator[np.ndarray]
        Iterator of equal-length uint64 arrays.
    num_items : int
        Expected number of arrays; asserted against the actual count.

    Returns
    -------
    np.ndarray[uint64]
    """
    try:
        head = next(arrays)
    except StopIteration:
        # Nothing to combine: yield an empty uint64 result.
        return np.array([], dtype=np.uint64)

    multiplier = np.uint64(1000003)
    combined = np.zeros_like(head) + np.uint64(0x345678)
    consumed = 0
    for arr in itertools.chain([head], arrays):
        remaining = num_items - consumed
        combined ^= arr
        combined *= multiplier
        multiplier += np.uint64(82520 + remaining + remaining)
        consumed += 1
    assert consumed == num_items, "Fed in wrong num_items"
    combined += np.uint64(97531)
    return combined
def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    def _index_hashes():
        # Lazily yield the single hash array for obj.index; it is only
        # computed if combine_hash_arrays actually consumes it.
        yield hash_pandas_object(
            obj.index,
            index=False,
            encoding=encoding,
            hash_key=hash_key,
            categorize=categorize,
        )._values

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    if isinstance(obj, ABCIndex):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        return Series(h, index=obj, dtype="uint64", copy=False)

    if isinstance(obj, ABCSeries):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # Combine the value hash with the index hash (2 items total).
            h = combine_hash_arrays(itertools.chain([h], _index_hashes()), 2)
        return Series(h, index=obj.index, dtype="uint64", copy=False)

    if isinstance(obj, ABCDataFrame):
        column_hashes = (
            hash_array(col._values, encoding, hash_key, categorize)
            for _, col in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            column_hashes = itertools.chain(column_hashes, _index_hashes())
            num_items += 1
        # iter(...) keeps this specifically a generator/iterator for mypy.
        h = combine_hash_arrays(iter(column_hashes), num_items)
        return Series(h, index=obj.index, dtype="uint64", copy=False)

    raise TypeError(f"Unexpected type for hashing {type(obj)}")
def hash_tuples(
    vals: MultiIndex | Iterable[tuple[Hashable, ...]],
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
    """
    Hash a MultiIndex / listlike-of-tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex or listlike-of-tuples
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray[np.uint64] of hashed values
    """
    if not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import (
        Categorical,
        MultiIndex,
    )

    mi = vals if isinstance(vals, ABCMultiIndex) else MultiIndex.from_tuples(vals)

    # One Categorical per level, built straight from the codes/levels
    # (fastpath skips re-validation).
    level_cats = [
        Categorical(mi.codes[lev], mi.levels[lev], ordered=False, fastpath=True)
        for lev in range(mi.nlevels)
    ]

    # Hash each level's Categorical, then fold the per-level hashes together.
    level_hashes = (
        _hash_categorical(cat, encoding=encoding, hash_key=hash_key)
        for cat in level_cats
    )
    return combine_hash_arrays(level_hashes, len(level_cats))
def _hash_categorical(
    cat: Categorical, encoding: str, hash_key: str
) -> npt.NDArray[np.uint64]:
    """
    Hash a Categorical by hashing its categories, then mapping each code to
    its category's hash.

    Parameters
    ----------
    cat : Categorical
    encoding : str
    hash_key : str

    Returns
    -------
    ndarray[np.uint64] of hashed values, same size as len(cat)
    """
    # ExtensionArray categories are converted to plain ndarrays first.
    category_values = np.asarray(cat.categories._values)
    hashed_categories = hash_array(
        category_values, encoding, hash_key, categorize=False
    )

    # uint64 cannot represent missing values directly, and take_nd would
    # coerce to float — so construct the result by hand and use
    # max(np.uint64) as the missing-value sentinel.
    #
    # TODO: GH 15362
    mask = cat.isna()
    if len(hashed_categories):
        result = hashed_categories.take(cat.codes)
    else:
        # No categories at all: everything is missing.
        result = np.zeros(len(mask), dtype="uint64")

    if mask.any():
        result[mask] = lib.u8max

    return result
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    # Categoricals hash their categories once and remap the codes. This check
    # must come before any complex-dtype inspection: asking numpy whether a
    # categorical dtype is a subdtype of complex chokes.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(cast("Categorical", vals), encoding, hash_key)

    if isinstance(vals, ABCExtensionArray):
        vals, _ = vals._values_for_factorize()
    elif not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)
def _hash_ndarray(
    vals: np.ndarray,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    See hash_array.__doc__.
    """
    dtype = vals.dtype

    # 128-bit complex values cannot go through the 64-bit pipeline below,
    # so split into real/imag parts and combine their hashes.
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # Reinterpret everything we can as unsigned 64-bit integers.
    if dtype == bool:
        h = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        h = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        h = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    elif categorize:
        # With repeated values, it's MUCH faster to categorize object dtypes
        # first (hash each distinct value once, then remap). Callers can skip
        # this when the values are known/likely to be unique.
        from pandas import (
            Categorical,
            Index,
            factorize,
        )

        codes, categories = factorize(vals, sort=False)
        cat = Categorical(
            codes, Index._with_infer(categories), ordered=False, fastpath=True
        )
        return _hash_categorical(cat, encoding, hash_key)
    else:
        try:
            h = hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types: stringify, then hash the strings
            h = hash_object_array(vals.astype(str).astype(object), hash_key, encoding)

    # Redistribute these 64-bit ints within the space of 64-bit ints.
    h ^= h >> 30
    h *= np.uint64(0xBF58476D1CE4E5B9)
    h ^= h >> 27
    h *= np.uint64(0x94D049BB133111EB)
    h ^= h >> 31
    return h