Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/models.py: 32%

191 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1import warnings 

2from collections import Counter 

3from encodings.aliases import aliases 

4from hashlib import sha256 

5from json import dumps 

6from re import sub 

7from typing import ( 

8 Any, 

9 Counter as TypeCounter, 

10 Dict, 

11 Iterator, 

12 List, 

13 Optional, 

14 Tuple, 

15 Union, 

16) 

17 

18from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE 

19from .md import mess_ratio 

20from .utils import iana_name, is_multi_byte_encoding, unicode_range 

21 

22 

class CharsetMatch:
    """
    A single candidate decoding of a byte payload.

    Stores the original bytes, the guessed encoding, the mess ("chaos") ratio,
    the language coherence guesses and, lazily, the decoded text. Other matches
    can be attached to it as sub-matches through add_submatch().
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Name of the encoding used to decode the payload.
        :param mean_mess_ratio: Value exposed through the 'chaos' property.
        :param has_sig_or_bom: Value exposed through 'bom' / 'byte_order_mark'.
        :param languages: (language, ratio) pairs; the first entry is treated as the best guess.
        :param decoded_payload: Already decoded text, if available. When omitted, the payload
            is decoded on first use of str(match).
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache filled by the 'alphabets' property.
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache of the last payload produced by output() and the encoding it was produced for.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share the same encoding and the same fingerprint.
        Raise TypeError when compared with anything that is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        "Lower" means "more probable". Raise ValueError if other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01:
            if coherence_difference > 0.02:
                return self.coherence > other.coherence
            # When having a tough decision (exact tie on both metrics), use the result that
            # decoded as many multi-byte as possible. This check used to be nested under the
            # "coherence differs" branch above, where it could never be reached.
            if chaos_difference == 0.0 and self.coherence == other.coherence:
                return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1 - (number of decoded characters / number of raw bytes).
        Return 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - len(str(self)) / raw_length

    @property
    def chaos_secondary_pass(self) -> float:
        """
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "chaos_secondary_pass is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the code reading the property
        )
        return mess_ratio(str(self), 1.0)

    @property
    def coherence_non_latin(self) -> float:
        """
        Coherence ratio on the first non-latin language detected if ANY.
        Always 0.0 in this implementation.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "coherence_non_latin is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,
        )
        return 0.0

    @property
    def w_counter(self) -> TypeCounter[str]:
        """
        Word counter instance on decoded text (lower-cased, NOT_PRINTABLE_PATTERN hits replaced
        by a space, then split on whitespace).
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "w_counter is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,
        )

        string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

        return Counter(string_printable_only.split())

    def __str__(self) -> str:
        """
        Return the decoded text. Decoding is strict and performed only once, on first access.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Attach another match as a sub-match of this one.
        Raise ValueError if other is not a CharsetMatch or is equal to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # The alias table maps alias -> name; collect whichever side is not our encoding.
        for alias, name in aliases.items():
            if self.encoding == alias:
                also_known_as.append(name)
            elif self.encoding == name:
                also_known_as.append(alias)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if not languages or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Ratio attached to the first language guess, or 0.0 when there is no guess.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated list of the unicode ranges found in the decoded text.
        Computed once, then served from cache.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges, dropping characters for which no range was returned
        detected_ranges = {unicode_range(char) for char in str(self)}
        self._unicode_ranges = sorted(r for r in detected_ranges if r)
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def first(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def best(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be encoded are substituted by the encoder ("replace" error handler).
        The result is cached for the most recently requested encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()

284 

285 

class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches; they are sorted on the way in. None or an empty list
            yields an empty container.
        """
        self._results: List["CharsetMatch"] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon out-of-range position (as the underlying list does).
        Raise KeyError, carrying the requested key, when the encoding is not present in results
        or when the key is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Resolve the given name through iana_name before searching the results.
            encoding_name = iana_name(item, False)
            for result in self._results:
                if encoding_name in result.could_be_from_charset:
                    return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        Raise ValueError if item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if self._results and len(item.raw) <= TOO_BIG_SEQUENCE:
            # The fingerprint is a SHA256 over the re-encoded payload: compute it once for the
            # new item instead of once per existing match.
            item_fingerprint = item.fingerprint
            item_chaos = item.chaos
            for match in self._results:
                if match.fingerprint == item_fingerprint and match.chaos == item_chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

351 

352 

# One language guess: (language name, ratio). CharsetMatch.language reads element 0 and
# CharsetMatch.coherence reads element 1 of the first guess.
CoherenceMatch = Tuple[str, float]
# List of language guesses; CharsetMatch treats the first entry as the best one.
CoherenceMatches = List[CoherenceMatch]

355 

356 

class CliDetectionResult:
    """
    Plain record of one detection result, serialisable to JSON through to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """
        Store every argument, unchanged, on the attribute of the same name.
        """
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Build a fresh mapping of the public fields, in the order used for serialisation.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """
        Return the fields as an ASCII-only JSON document indented by four spaces.
        """
        fields = self.__dict__
        return dumps(fields, ensure_ascii=True, indent=4)