Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/utils.py: 20%

217 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

try:
    # WARNING: unicodedata2 support is going to be removed in 3.0
    # Python is quickly catching up.
    import unicodedata2 as unicodedata
except ImportError:
    import unicodedata  # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
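
# Quick sanity sketch for the two helpers above, assuming standard Unicode
# names and canonical decompositions (doctest-style, illustrative only):
#
#     >>> is_accentuated("é")   # "LATIN SMALL LETTER E WITH ACUTE"
#     True
#     >>> is_accentuated("ø")   # "WITH STROKE" is not a tracked keyword
#     False
#     >>> remove_accent("é")    # decomposition "0065 0301" -> chr(0x0065)
#     'e'
#     >>> remove_accent("e")    # no decomposition, returned unchanged
#     'e'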


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None
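
# Illustrative lookups, assuming UNICODE_RANGES_COMBINED carries the standard
# Unicode block names keyed to their code point ranges:
#
#     >>> unicode_range("a")
#     'Basic Latin'
#     >>> unicode_range("Я")
#     'Cyrillic'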


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range
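
# Category-based checks, illustrative; note that is_symbol also returns True
# for numeric characters because it tests the "N" general category:
#
#     >>> is_punctuation(",")   # category "Po"
#     True
#     >>> is_symbol("+")        # category "Sm"
#     True
#     >>> is_symbol("5")        # category "Nd" still matches "N"
#     True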


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


def is_private_use_only(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    return character_category == "Co"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: the Zero Width
        # No-Break Space, located in Arabic Presentation Forms-B (Unicode 1.1),
        # is not acknowledged as space.
    )


def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract, using an ASCII-only decoder, any specified encoding in the first n bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
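
# Illustrative call, assuming RE_POSSIBLE_ENCODING_INDICATION matches common
# declarations such as XML prologs and HTML meta charset attributes:
#
#     >>> any_specified_encoding(b'<?xml version="1.0" encoding="UTF-8"?>')
#     'utf_8'
#     >>> any_specified_encoding(b"no declaration here")   # returns None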


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract a SIG/BOM from a given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}
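
# Illustrative round trip, assuming ENCODING_MARKS maps "utf_8" to the usual
# three-byte UTF-8 BOM; utf_16/utf_32 are exempt from stripping because their
# decoders consume the BOM themselves:
#
#     >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")
#     ('utf_8', b'\xef\xbb\xbf')
#     >>> should_strip_sig_or_bom("utf_8")
#     True
#     >>> should_strip_sig_or_bom("utf_16")
#     False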


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name
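
# Normalization examples against the stdlib alias table (illustrative):
#
#     >>> iana_name("UTF-8")
#     'utf_8'
#     >>> iana_name("ISO-8859-1")
#     'latin_1'
#     >>> iana_name("made-up", strict=False)   # falls back to the cleaned name
#     'made_up'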


def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)
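
# The result order is unspecified (it comes from a set), so sort it when a
# stable output matters (illustrative):
#
#     >>> sorted(range_scan("Hello Привет"))
#     ['Basic Latin', 'Cyrillic']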


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
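
# Typical call (illustrative). Note that each invocation attaches a fresh
# StreamHandler, so calling it repeatedly duplicates log lines:
#
#     >>> set_logging_handler(level=logging.DEBUG)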


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad-cutting detector and adjustment
            # not the cleanest way to perform that fix, but clever enough for now
            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
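
# Illustrative driver, assuming a multi-byte decoder and no BOM: walk the raw
# payload in fixed-size windows and let the generator re-align bad cuts (here
# the input is pure ASCII, so no re-alignment triggers):
#
#     >>> payload = "Bonjour tout le monde".encode("utf_8")
#     >>> for chunk in cut_sequence_chunks(
#     ...     payload,
#     ...     "utf_8",
#     ...     offsets=range(0, len(payload), 8),
#     ...     chunk_size=8,
#     ...     bom_or_sig_available=False,
#     ...     strip_sig_or_bom=False,
#     ...     sig_payload=b"",
#     ...     is_multi_byte_decoder=True,
#     ... ):
#     ...     print(repr(chunk))
#     'Bonjour '
#     'tout le '
#     'monde'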