Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/models.py: 32%
191 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1import warnings
2from collections import Counter
3from encodings.aliases import aliases
4from hashlib import sha256
5from json import dumps
6from re import sub
7from typing import (
8 Any,
9 Counter as TypeCounter,
10 Dict,
11 Iterator,
12 List,
13 Optional,
14 Tuple,
15 Union,
16)
18from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
19from .md import mess_ratio
20from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Holds the raw bytes, the guessed encoding and the scores (mess ratio and
    language coherence) that are used to rank candidates against each other.
    Instances are orderable (see ``__lt__``) so a list of them can be sorted
    from most to least probable.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Codec name used to decode ``payload``.
        :param mean_mess_ratio: Mess score, exposed through ``chaos``.
        :param has_sig_or_bom: Exposed through ``bom`` / ``byte_order_mark``.
        :param languages: ``(language, coherence)`` pairs; the first entry is
            the one reported by ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text, if available. When
            omitted the payload is decoded lazily on the first ``str()`` call.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the ``alphabets`` property.
        self._unicode_ranges: Optional[List[str]] = None

        # Other matches registered through ``add_submatch``.
        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for ``output()``: last re-encoded payload and its encoding.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        # Cache for ``__str__``.
        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both encoding and fingerprint are equal.

        Raises TypeError when ``other`` is not a CharsetMatch. Because
        ``__eq__`` is defined without ``__hash__``, instances are unhashable.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        "Less than" means "more probable". Raises ValueError when ``other`` is
        not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # When having a tough decision (both scores strictly identical), use
        # the result that decoded as many multi-byte as possible. This check
        # must come first: it can never be true once the coherence difference
        # is known to exceed 2%. Note that it decodes the payload if needed.
        if chaos_difference == 0.0 and self.coherence == other.coherence:
            return self.multi_byte_usage > other.multi_byte_usage

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Share of the payload saved by decoding: 1 - (characters / bytes).

        Returns 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - len(str(self)) / raw_length

    @property
    def chaos_secondary_pass(self) -> float:
        """
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "chaos_secondary_pass is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,  # report the caller's line, not this property
        )
        return mess_ratio(str(self), 1.0)

    @property
    def coherence_non_latin(self) -> float:
        """
        Coherence ratio on the first non-latin language detected if ANY.
        Always returns 0.0 in this implementation.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "coherence_non_latin is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,  # report the caller's line, not this property
        )
        return 0.0

    @property
    def w_counter(self) -> TypeCounter[str]:
        """
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "w_counter is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,  # report the caller's line, not this property
        )

        # Lower-case the text and blank out whatever the pattern matches
        # before splitting on whitespace.
        string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

        return Counter(string_printable_only.split())

    def __str__(self) -> str:
        """
        Return the decoded payload; decoding is strict and done only once.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register ``other`` as a submatch of this match.

        Raises ValueError when ``other`` is not a CharsetMatch or compares
        equal to this instance.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # The stdlib table maps alias -> codec name; collect both directions.
        for alias, target in aliases.items():
            if self.encoding == alias:
                also_known_as.append(target)
            elif self.encoding == target:
                also_known_as.append(alias)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [entry[0] for entry in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Coherence ratio of the first language entry, 0.0 when there is none.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated unicode range names found in the decoded text.
        The result is computed once and cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # Detect the range of every character, drop falsy results, then sort.
        detected_ranges = {unicode_range(char) for char in str(self)}
        self._unicode_ranges = sorted(r for r in detected_ranges if r)
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def first(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def best(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the encoder
        ("replace" error handler); they are neither dropped nor raise an error.
        The last result is cached and reused while the same encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches. They are copied into a new, sorted
            list; ``None`` or an empty list yields an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon an out-of-range position.
        Raise KeyError when the encoding is not present in results, or when the key is neither int nor str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the requested name before comparing it to the
            # encodings each result could originate from.
            normalized = iana_name(item, False)
            for result in self._results:
                if normalized in result.could_be_from_charset:
                    return result
        # Carry the key as requested by the caller to ease debugging.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        Raise ValueError when item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded output and same mess score: fold the new
                # item into the existing match instead of listing it twice.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
# A detected language paired with its coherence ratio, e.g. ("English", 0.9).
CoherenceMatch = Tuple[str, float]
# Language candidates of one match; CharsetMatch reads the first entry as the
# most probable language and its coherence.
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
    """
    Plain record describing the detection result for one path, serialisable
    to JSON through ``to_json``.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """
        Store every argument, unchanged, on the attribute of the same name.
        """
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of the public fields, in the key order used by ``to_json``.

        Shadows the regular ``__dict__`` attribute lookup, so a fresh dict is
        built on every access.
        """
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        """
        Return the fields as an ASCII-only JSON document indented by 4 spaces.
        """
        return dumps(self.__dict__, ensure_ascii=True, indent=4)