Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/md.py: 19%

318 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

from functools import lru_cache
from typing import List, Optional

from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
    is_accentuated,
    is_ascii,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine, executed on each eligible character.
        Insert the logic by which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.0; there is no upper restriction.
        """
        raise NotImplementedError  # pragma: nocover

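# Illustrative sketch (not part of the original module): the smallest detector
# that honours the contract above. The class name and the U+FFFD check are
# hypothetical. It is kept commented out because mess_ratio() below discovers
# plugins through MessDetectorPlugin.__subclasses__(), so merely defining a
# subclass would register it.
#
# class TooManyReplacementCharPlugin(MessDetectorPlugin):
#     def __init__(self) -> None:
#         self._character_count: int = 0
#         self._replacement_count: int = 0
#
#     def eligible(self, character: str) -> bool:
#         return character.isprintable()
#
#     def feed(self, character: str) -> None:
#         self._character_count += 1
#         if character == "\ufffd":  # U+FFFD REPLACEMENT CHARACTER
#             self._replacement_count += 1
#
#     def reset(self) -> None:
#         self._character_count = 0
#         self._replacement_count = 0
#
#     @property
#     def ratio(self) -> float:
#         if self._character_count == 0:
#             return 0.0
#         return self._replacement_count / self._character_count
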

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same character duplicated with a different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
        unicode_range_b: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        if ratio_of_suspicious_range_usage < 0.1:
            return 0.0

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Words/buffers ending with an upper-case accentuated letter are so rare
                # that we consider them all suspicious (same weight as a foreign_long hit).
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the content does not fit,
    and this can be easily detected. We search for an overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count

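# Illustrative sketch (not part of the original module): driving the plugin by
# hand. The sample string is made up, and it assumes is_cjk() from .utils
# recognises CJK Unified Ideographs such as 汉 and 字.
#
# plugin = CjkInvalidStopPlugin()
# for character in "汉字" * 8 + "丅":  # 16 ordinary ideographs, one bogus stop
#     if plugin.eligible(character):
#         plugin.feed(character)
# plugin.ratio  # 1 / 16 = 0.0625; with fewer than 16 CJK characters it would stay 0.0
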

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied by a combining diacritical mark,
    # e.g. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False

    return True

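# Illustrative sketch (not part of the original module): how the checks above
# resolve for a few range names in the style returned by unicode_range().
#
# is_suspiciously_successive_range("Basic Latin", "Basic Latin")  # False: identical ranges
# is_suspiciously_successive_range("Hiragana", "Katakana")        # False: Japanese exception
# is_suspiciously_successive_range("Basic Latin", "Cyrillic")     # True: classic mojibake signal
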

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded byte sequence. The maximum threshold stops the computation early.
    """

    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        for dt in detectors:  # pragma: nocover
            print(dt.__class__, dt.ratio)

    return round(mean_mess_ratio, 3)
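

# Illustrative sketch (not part of the original module): typical use of the
# public entry point. The inputs are made up and the exact figures depend on
# the plugin thresholds above and the helpers in .utils, so only the rough
# magnitudes are meaningful.
#
# clean = "This is an ordinary English sentence, nothing fancy."
# noisy = "été « déjà vu »".encode("utf-8").decode("latin-1")  # forced mojibake
#
# mess_ratio(clean)  # expected to stay at or very near 0.0
# mess_ratio(noisy)  # expected to land well above the default 0.2 threshold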