Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/core/util/hashing.py: 11%

114 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2data hash pandas / numpy objects 

3""" 

4from __future__ import annotations 

5 

6import itertools 

7from typing import ( 

8 TYPE_CHECKING, 

9 Hashable, 

10 Iterable, 

11 Iterator, 

12 cast, 

13) 

14 

15import numpy as np 

16 

17from pandas._libs import lib 

18from pandas._libs.hashing import hash_object_array 

19from pandas._typing import ( 

20 ArrayLike, 

21 npt, 

22) 

23 

24from pandas.core.dtypes.common import ( 

25 is_categorical_dtype, 

26 is_list_like, 

27) 

28from pandas.core.dtypes.generic import ( 

29 ABCDataFrame, 

30 ABCExtensionArray, 

31 ABCIndex, 

32 ABCMultiIndex, 

33 ABCSeries, 

34) 

35 

36if TYPE_CHECKING: 36 ↛ 37line 36 didn't jump to line 37, because the condition on line 36 was never true

37 from pandas import ( 

38 Categorical, 

39 DataFrame, 

40 Index, 

41 MultiIndex, 

42 Series, 

43 ) 

44 

45 

# 16-byte key fed to the string-hashing routine in pandas._libs.hashing.
# All hash_* functions in this module take it as their default `hash_key`,
# and hash_pandas_object substitutes it when callers pass hash_key=None.
_default_hash_key = "0123456789123456"

48 

49 

def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Fold an iterator of uint64 hash arrays into one combined hash array.

    Parameters
    ----------
    arrays : Iterator[np.ndarray]
    num_items : int
        Expected number of arrays in the iterator (checked after the fold).

    Returns
    -------
    np.ndarray[uint64]

    Should be the same as CPython's tupleobject.c
    """
    try:
        head = next(arrays)
    except StopIteration:
        # Nothing to combine: empty uint64 result.
        return np.array([], dtype=np.uint64)

    multiplier = np.uint64(1000003)
    combined = np.zeros_like(head) + np.uint64(0x345678)

    # Re-attach the peeked-off first array, then fold all of them in order.
    for pos, chunk in enumerate(itertools.chain([head], arrays)):
        remaining = num_items - pos
        combined ^= chunk
        combined *= multiplier
        # Per-step multiplier update, mirroring the tuple-hash recurrence.
        multiplier += np.uint64(82520 + remaining + remaining)
    assert pos + 1 == num_items, "Fed in wrong num_items"

    combined += np.uint64(97531)
    return combined

82 

83 

def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Raises
    ------
    TypeError
        If ``obj`` is not an Index, Series, or DataFrame.
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    def _index_hash_gen() -> Iterator[np.ndarray]:
        # One-shot lazy generator of the index hash, shared by the Series and
        # DataFrame branches; the recursive call only runs when
        # combine_hash_arrays actually consumes it.
        yield hash_pandas_object(
            obj.index,
            index=False,
            encoding=encoding,
            hash_key=hash_key,
            categorize=categorize,
        )._values

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndex):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        ser = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # Combine the value hashes with the index hash (2 items total).
            arrays = itertools.chain([h], _index_hash_gen())
            h = combine_hash_arrays(arrays, 2)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        hashes = (
            hash_array(series._values, encoding, hash_key, categorize)
            for _, series in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, _index_hash_gen())
            hashes = (x for x in _hashes)
        h = combine_hash_arrays(hashes, num_items)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)
    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")

    return ser

174 

175 

def hash_tuples(
    vals: MultiIndex | Iterable[tuple[Hashable, ...]],
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
    """
    Hash a MultiIndex / listlike-of-tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex or listlike-of-tuples
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray[np.uint64] of hashed values

    Raises
    ------
    TypeError
        If ``vals`` is not list-like.
    """
    if not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import (
        Categorical,
        MultiIndex,
    )

    # Coerce plain listlikes-of-tuples to a MultiIndex so we can work
    # level-by-level.
    mi = vals if isinstance(vals, ABCMultiIndex) else MultiIndex.from_tuples(vals)

    # One Categorical per level, pairing each level's codes with its values.
    level_cats = [
        Categorical(level_codes, level_values, ordered=False, fastpath=True)
        for level_codes, level_values in zip(mi.codes, mi.levels)
    ]

    # Hash each level independently, then combine the per-level hashes.
    level_hashes = (
        _hash_categorical(cat, encoding=encoding, hash_key=hash_key)
        for cat in level_cats
    )
    return combine_hash_arrays(level_hashes, len(level_cats))

220 

221 

222def _hash_categorical( 

223 cat: Categorical, encoding: str, hash_key: str 

224) -> npt.NDArray[np.uint64]: 

225 """ 

226 Hash a Categorical by hashing its categories, and then mapping the codes 

227 to the hashes 

228 

229 Parameters 

230 ---------- 

231 cat : Categorical 

232 encoding : str 

233 hash_key : str 

234 

235 Returns 

236 ------- 

237 ndarray[np.uint64] of hashed values, same size as len(c) 

238 """ 

239 # Convert ExtensionArrays to ndarrays 

240 values = np.asarray(cat.categories._values) 

241 hashed = hash_array(values, encoding, hash_key, categorize=False) 

242 

243 # we have uint64, as we don't directly support missing values 

244 # we don't want to use take_nd which will coerce to float 

245 # instead, directly construct the result with a 

246 # max(np.uint64) as the missing value indicator 

247 # 

248 # TODO: GH 15362 

249 

250 mask = cat.isna() 

251 if len(hashed): 

252 result = hashed.take(cat.codes) 

253 else: 

254 result = np.zeros(len(mask), dtype="uint64") 

255 

256 if mask.any(): 

257 result[mask] = lib.u8max 

258 

259 return result 

260 

261 

def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.

    Raises
    ------
    TypeError
        If ``vals`` has no dtype, or is neither an ndarray nor ExtensionArray.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    # Categoricals hash via their categories + codes. This check runs before
    # any numeric-dtype inspection so numpy is never asked whether a
    # categorical dtype is a subdtype of complex (it would choke).
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(cast("Categorical", vals), encoding, hash_key)

    if isinstance(vals, ABCExtensionArray):
        # Reduce the extension array to its ndarray factorization values.
        vals, _ = vals._values_for_factorize()
    elif not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)

309 

310 

def _hash_ndarray(
    vals: np.ndarray,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Hash a plain ndarray to uint64 values.

    See hash_array.__doc__ for parameter semantics. Callers are expected to
    have already routed Categoricals/ExtensionArrays elsewhere (hash_array
    does this).
    """
    dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        # Hash real and imaginary parts separately (each recurses through
        # hash_array) and combine; the *23 keeps the two parts from cancelling
        # symmetrically.
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif dtype == bool:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        # Reinterpret as the underlying int64 representation, then widen.
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        # View as same-width unsigned first so the exact bit pattern (not the
        # numeric value) is what gets hashed, then widen to uint64.
        vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import (
                Categorical,
                Index,
                factorize,
            )

            # Hash only the unique values, then map codes onto those hashes.
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(
                codes, Index._with_infer(categories), ordered=False, fastpath=True
            )
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    # (xor-shift/multiply finalizer; the constants match splitmix64's mix
    # step). Relies on uint64 wraparound semantics of the in-place ops.
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals