Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/md.py: 19%

318 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

from functools import lru_cache
from typing import List, Optional

from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
    is_accentuated,
    is_ascii,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine, executed on each eligible character.
        Insert the logic by which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.0; there is no upper restriction.
        """
        raise NotImplementedError  # pragma: nocover

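# Illustrative sketch (not part of the original module): the smallest detector
# that honours the contract above. The class name and the U+FFFD check are
# hypothetical. It is kept commented out because mess_ratio() below discovers
# plugins through MessDetectorPlugin.__subclasses__(), so merely defining a
# subclass would register it.
#
# class TooManyReplacementCharPlugin(MessDetectorPlugin):
#     def __init__(self) -> None:
#         self._character_count: int = 0
#         self._replacement_count: int = 0
#
#     def eligible(self, character: str) -> bool:
#         return character.isprintable()
#
#     def feed(self, character: str) -> None:
#         self._character_count += 1
#         if character == "\ufffd":  # U+FFFD REPLACEMENT CHARACTER
#             self._replacement_count += 1
#
#     def reset(self) -> None:
#         self._character_count = 0
#         self._replacement_count = 0
#
#     @property
#     def ratio(self) -> float:
#         if self._character_count == 0:
#             return 0.0
#         return self._replacement_count / self._character_count
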

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same character duplicated with a different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
        unicode_range_b: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        if ratio_of_suspicious_range_usage < 0.1:
            return 0.0

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Words/buffers ending with an upper-case accentuated letter are so rare
                # that we consider them all suspicious (same weight as a foreign_long hit).
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the content does not fit,
    and this can be easily detected. We search for an overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count

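# Illustrative sketch (not part of the original module): driving the plugin by
# hand. The sample string is made up, and it assumes is_cjk() from .utils
# recognises CJK Unified Ideographs such as 汉 and 字.
#
# plugin = CjkInvalidStopPlugin()
# for character in "汉字" * 8 + "丅":  # 16 ordinary ideographs, one bogus stop
#     if plugin.eligible(character):
#         plugin.feed(character)
# plugin.ratio  # 1 / 16 = 0.0625; with fewer than 16 CJK characters it would stay 0.0
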

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied by a combining diacritical mark,
    # e.g. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False

    return True

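# Illustrative sketch (not part of the original module): how the checks above
# resolve for a few range names in the style returned by unicode_range().
#
# is_suspiciously_successive_range("Basic Latin", "Basic Latin")  # False: identical ranges
# is_suspiciously_successive_range("Hiragana", "Katakana")        # False: Japanese exception
# is_suspiciously_successive_range("Basic Latin", "Cyrillic")     # True: classic mojibake signal
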

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded byte sequence. The maximum threshold stops the computation early.
    """

    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        for dt in detectors:  # pragma: nocover
            print(dt.__class__, dt.ratio)

    return round(mean_mess_ratio, 3)
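

# Illustrative sketch (not part of the original module): typical use of the
# public entry point. The inputs are made up and the exact figures depend on
# the plugin thresholds above and the helpers in .utils, so only the rough
# magnitudes are meaningful.
#
# clean = "This is an ordinary English sentence, nothing fancy."
# noisy = "été « déjà vu »".encode("utf-8").decode("latin-1")  # forced mojibake
#
# mess_ratio(clean)  # expected to stay at or very near 0.0
# mess_ratio(noisy)  # expected to land well above the default 0.2 threshold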