Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/md.py: 19%
318 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1from functools import lru_cache
2from typing import List, Optional
4from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
5from .utils import (
6 is_accentuated,
7 is_ascii,
8 is_case_variable,
9 is_cjk,
10 is_emoticon,
11 is_hangul,
12 is_hiragana,
13 is_katakana,
14 is_latin,
15 is_punctuation,
16 is_separator,
17 is_symbol,
18 is_thai,
19 is_unprintable,
20 remove_accent,
21 unicode_range,
22)
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag content where symbols/punctuation crowd out regular characters.

    Consecutive duplicates and a small set of common safe ASCII characters
    are ignored; symbols weigh twice as much as punctuation.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # A run of the same character (e.g. '!!!') is only counted once, and
        # characters deemed "common safe ASCII" never count against the text.
        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols are weighted twice as heavily as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # BUGFIX: also clear the per-run state; otherwise a reset detector
        # carries duplicate-suppression over from the previous sequence.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Punctuation/symbol density; a share below 0.3 is considered harmless."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag text in which an unusually large share of letters carry accents.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        # Below 35% accentuated letters the text is considered clean.
        if not self._character_count:
            return 0.0
        accent_share: float = self._accentuated_count / self._character_count
        return accent_share if accent_share >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
    """
    Flag sequences containing unprintable (control) characters.

    Each unprintable character is heavily penalized (weight 8), since
    legitimate decoded text very rarely contains them.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        # Every character is inspected, printable or not.
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0
        # BUGFIX: the character counter must be cleared as well, otherwise a
        # reused plugin under-reports the ratio of subsequent sequences.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Flag successive accentuated Latin letters, a pattern that is rare in
    natural text and typical of decoding mistakes.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            # Two accentuated upper-case letters in a row is suspicious.
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if it's the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
    """
    Flag frequent jumps between unrelated Unicode ranges inside a word,
    which suggests the bytes were decoded with the wrong charset.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the chain of
        # range comparisons.
        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        self._last_printable_seen = character
        if previous is None:
            return

        range_previous: Optional[str] = unicode_range(previous)
        range_current: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(range_previous, range_current):
            self._suspicious_successive_range_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        # Each suspicious transition counts double; under 10% is tolerated.
        range_jump_share: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return range_jump_share if range_jump_share >= 0.1 else 0.0
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate characters into word buffers and flag words that look
    implausible: heavily accentuated words, very long words made of
    non-Latin-but-not-CJK/Hangul/Kana/Thai letters, words ending with an
    upper-case accentuated letter, or words containing unexpected symbols.
    """

    def __init__(self) -> None:
        # Totals across the whole sequence.
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # Per-word flags, cleared at each word boundary.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Characters of the word currently being accumulated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Arm the "foreign long word" watch on the first letter that is
            # neither plain Latin nor part of a known long-word script
            # (CJK/Hangul/Katakana/Hiragana/Thai).
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        # Non-letter with no word in progress: nothing to evaluate.
        if not self._buffer:
            return
        # A separator closes the current word; evaluate the buffered word.
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # More than ~1/3 accentuated letters marks the word as bad.
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with a upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            # Very long word in a watched (foreign) script is suspicious.
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            # Reset per-word state for the next word.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        # A symbol inside a word (other than a few tolerated ones) taints it
        # and is kept in the buffer.
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # Too little data (and no foreign-long hits) is inconclusive.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character == "丅" or character == "丄":
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimum amount of CJK content before judging.
        if self._cjk_character_count >= 16:
            return self._wrong_stop_count / self._cjk_character_count
        return 0.0
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Flag unnatural alternation between upper and lower case letters
    (e.g. 'aBcDeF'), evaluated per chunk between separators.
    """

    def __init__(self) -> None:
        # True when one case flip has been seen and a second one is awaited.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Flips counted in the current chunk vs. total committed so far.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        # Tracks whether the current chunk is pure ASCII (ASCII-only chunks
        # are never committed — see feed()).
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        # A separator ends the chunk: commit its flip count (only for short,
        # non-digit-terminated, non-ASCII chunks) and reset per-chunk state.
        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        # Detect a case flip relative to the previous letter; two flips in a
        # row (tracked via _buf) count as one suspicious event of weight 2.
        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    """
    # Unknown ranges are always treated as suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    both_ranges = (unicode_range_a, unicode_range_b)

    if all("Latin" in rng for rng in both_ranges):
        return False

    if any("Emoticons" in rng for rng in both_ranges):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if any("Latin" in rng for rng in both_ranges) and any(
        "Combining" in rng for rng in both_ranges
    ):
        return False

    # Ranges sharing a significant keyword (e.g. a script name) are related.
    keywords_range_a = unicode_range_a.split(" ")
    keywords_range_b = unicode_range_b.split(" ")

    for keyword in keywords_range_a:
        if keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if keyword in keywords_range_b:
            return False

    # Japanese Exception
    japanese_ranges = ("Hiragana", "Katakana")
    range_a_jp_chars = unicode_range_a in japanese_ranges
    range_b_jp_chars = unicode_range_b in japanese_ranges
    has_cjk = "CJK" in unicode_range_a or "CJK" in unicode_range_b

    if (range_a_jp_chars or range_b_jp_chars) and has_cjk:
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if has_cjk:
            return False
        if "Basic Latin" in both_ranges:
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if has_cjk or (range_a_jp_chars and range_b_jp_chars):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False

    return True
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """
    # Instantiate every registered detector (all MessDetectorPlugin subclasses).
    detectors: List[MessDetectorPlugin] = [
        plugin_cls() for plugin_cls in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    # Check the aggregated ratio periodically; shorter payloads are checked
    # more often so the early exit can trigger sooner.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    # The trailing newline guarantees word/chunk based plugins flush state.
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        for dt in detectors:  # pragma: nocover
            print(dt.__class__, dt.ratio)

    return round(mean_mess_ratio, 3)