# Coverage for charset_normalizer/utils.py: 20% (217 statements)
try:
    # WARNING: unicodedata2 support is going to be removed in 3.0
    # Python is quickly catching up.
    import unicodedata2 as unicodedata
except ImportError:
    import unicodedata  # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
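
# Illustrative usage, not part of the original module: a sketch of what the two
# helpers above return for a common accented character, assuming standard
# Unicode data (the decomposition of U+00E9 is "0065 0301").
#
#   >>> is_accentuated("é")   # name is "LATIN SMALL LETTER E WITH ACUTE"
#   True
#   >>> remove_accent("é")    # keeps only the base code point, chr(0x0065)
#   'e'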

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()

def is_private_use_only(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    return character_category == "Co"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: the Zero Width No-Break Space
        # (located in Arabic Presentation Forms-B, Unicode 1.1) is not acknowledged as a space.
    )

def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract any specified encoding found in the first n bytes, using an ASCII-only decoder.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
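
# Illustrative usage, not part of the original module: pulling an encoding
# declaration out of an XML prologue. This assumes RE_POSSIBLE_ENCODING_INDICATION
# matches `encoding="..."` style attributes, which is how it is applied here.
#
#   >>> any_specified_encoding(b'<?xml version="1.0" encoding="utf-8"?>')
#   'utf_8'
#   >>> any_specified_encoding(b"no declaration here")  # returns None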

@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )
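
# Illustrative usage, not part of the original module. UTF family names hit the
# explicit set above; CJK codecs such as "big5" are expected to be caught by the
# MultibyteIncrementalDecoder subclass check, while single-byte code pages like
# "cp1252" are not.
#
#   >>> is_multi_byte_encoding("utf_8")
#   True
#   >>> is_multi_byte_encoding("cp1252")
#   False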

def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract a SIG/BOM from the given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
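
# Illustrative usage, not part of the original module: a UTF-8 "signature" (BOM)
# at the start of a payload should be reported, assuming ENCODING_MARKS maps
# "utf_8" to b"\xef\xbb\xbf".
#
#   >>> identify_sig_or_bom("héllo".encode("utf_8_sig"))
#   ('utf_8', b'\xef\xbb\xbf')
#   >>> identify_sig_or_bom(b"plain ascii")
#   (None, b'')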

def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name
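
# Illustrative usage, not part of the original module: normalizing a few common
# spellings to the names used by Python's encodings.aliases table.
#
#   >>> iana_name("UTF-8")
#   'utf_8'
#   >>> iana_name("latin1")
#   'latin_1'
#   >>> iana_name("not-a-codec", strict=False)
#   'not_a_codec'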

def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)
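
# Illustrative usage, not part of the original module: the exact labels come from
# UNICODE_RANGES_COMBINED, assumed here to follow the official Unicode block
# names, and the order of the returned list is unspecified (it is built from a set).
#
#   >>> sorted(range_scan("Abc мир"))
#   ['Basic Latin', 'Cyrillic']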

def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254
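
# Illustrative usage, not part of the original module: the ratio is roughly the
# share of single-byte values that both decoders map to the same character; any
# multi-byte encoding short-circuits to 0.0.
#
#   >>> cp_similarity("utf_8", "cp1252")   # multi-byte input short-circuits
#   0.0
#   >>> similarity = cp_similarity("cp1252", "cp1250")  # float ratio of matching byte decodings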

def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the cp_similarity function.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )

def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:

    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
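
# Illustrative usage, not part of the original module: attach a stream handler so
# the library's internal logging becomes visible. Note that each call adds another
# handler to the named logger.
#
#   >>> set_logging_handler(level=logging.DEBUG)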

def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:

    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
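
# Illustrative usage, not part of the original module: slicing a UTF-8 payload
# into fixed-size decoded chunks. All argument values below are made up for the
# sketch; only the function signature comes from the code above.
#
#   >>> raw = ("abcdef" * 100).encode("utf_8")
#   >>> chunks = list(
#   ...     cut_sequence_chunks(
#   ...         raw,
#   ...         encoding_iana="utf_8",
#   ...         offsets=range(0, len(raw), 64),
#   ...         chunk_size=64,
#   ...         bom_or_sig_available=False,
#   ...         strip_sig_or_bom=False,
#   ...         sig_payload=b"",
#   ...         is_multi_byte_decoder=True,
#   ...         decoded_payload=None,
#   ...     )
#   ... )
#   >>> all(len(chunk) <= 64 for chunk in chunks)
#   True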