Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/cd.py: 9%

164 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)

def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    # Decode every byte in the upper portion of the code page and tally the
    # primary (non-secondary) Unicode range each character lands in.
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    # Keep only ranges that account for at least 15% of the decoded characters.
    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
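# A minimal usage sketch, assuming "cp1251" (a single-byte Cyrillic code
# page shipped with CPython) as the probed encoding; the exact list depends
# on the 15% share threshold applied above:
#
#   >>> encoding_unicode_range("cp1251")
#   ['Basic Latin', 'Cyrillic']
#
# Multi-byte code pages are rejected up front (IOError is an alias of
# OSError on Python 3):
#
#   >>> encoding_unicode_range("utf_8")
#   Traceback (most recent call last):
#     ...
#   OSError: Function not supported on multi-byte code page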

def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
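# A hedged sketch: any language whose FREQUENCIES entry has at least one
# character inside the given range is reported. The exact list depends on
# the FREQUENCIES table bundled with this version:
#
#   >>> unicode_range_languages("Cyrillic")
#   # e.g. ['Russian', 'Ukrainian', 'Serbian', ...]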

@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function does the correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
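# A usage sketch. "cp1253" is a Greek single-byte code page, so its first
# non-Latin range should map to Greek; a purely Latin page such as "cp1252"
# falls back to the generic marker:
#
#   >>> encoding_languages("cp1253")
#   # expected to include 'Greek'
#   >>> encoding_languages("cp1252")
#   ['Latin Based']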

@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function does the correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
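# The association is keyed purely on the IANA name, so the outcomes follow
# directly from the branches above:
#
#   >>> mb_encoding_languages("shift_jis")
#   ['Japanese']
#   >>> mb_encoding_languages("gb18030")
#   ['Chinese', 'Classical Chinese']
#   >>> mb_encoding_languages("utf_8")  # no specific association
#   []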

@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine the main aspects of a supported language: whether it contains
    accents and whether it is purely Latin-based.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
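# A quick illustration of the (have_accents, pure_latin) pair. The exact
# values reflect the bundled FREQUENCIES table:
#
#   >>> get_target_features("English")   # typically (False, True)
#   >>> get_target_features("French")    # typically (True, True)
#   >>> get_target_features("Russian")   # typically (False, False)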

def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
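# A hedged sketch: candidate languages for a handful of Cyrillic letters.
# Any language matching at least 20% of its frequency table is kept, best
# ratio first; the exact list depends on the FREQUENCIES data:
#
#   >>> alphabet_languages(list("привет"))
#   # e.g. ['Russian', 'Bulgarian', 'Ukrainian', ...]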

def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine whether an ordered list of characters (from most to least
    frequent) matches a particular language.
    The result is a ratio between 0.0 (absolutely no correspondence) and
    1.0 (near-perfect fit).
    Beware that this function is not strict on the match, in order to ease
    detection (meaning a close match counts as 1.0).
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    for character in ordered_characters:
        if character not in FREQUENCIES_language_set:
            continue

        # Compare the character's neighborhood (what precedes/follows it) in
        # the reference table against its neighborhood in the observed order.
        characters_before_source: List[str] = FREQUENCIES[language][
            0 : FREQUENCIES[language].index(character)
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            FREQUENCIES[language].index(character) :
        ]
        characters_before: List[str] = ordered_characters[
            0 : ordered_characters.index(character)
        ]
        characters_after: List[str] = ordered_characters[
            ordered_characters.index(character) :
        ]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
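# A worked sketch: feeding a language its own frequency ordering approves
# every character, so the score should be 1.0; an unknown language raises:
#
#   >>> characters_popularity_compare("English", FREQUENCIES["English"])
#   1.0
#   >>> characters_popularity_compare("Klingon", ["a"])
#   Traceback (most recent call last):
#     ...
#   ValueError: Klingon not available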

def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range /
    alphabet separation.
    E.g. a text containing English/Latin with a bit of Hebrew will return
    two items in the resulting list: one containing the Latin letters and
    the other the Hebrew ones.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

        # Reuse an existing layer if the two ranges plausibly belong together;
        # otherwise open a new layer for this character's range.
        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
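# A small sketch of the range separation described in the docstring. Latin
# and Hebrew count as a suspicious succession, so they land in distinct
# layers; non-alphabetic characters are dropped and letters are lower-cased:
#
#   >>> alpha_unicode_split("Hello שלום 123!")
#   # expected: ['hello', 'שלום']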

def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge results previously produced by coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: Dict[str, List[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
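# The merge is a plain per-language mean, e.g. English averaged over two
# layers while French appears in only one:
#
#   >>> merge_coherence_ratios([[("English", 0.9)], [("English", 0.7), ("French", 0.6)]])
#   [('English', 0.8), ('French', 0.6)]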

@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence will be analysed layer by layer.
    A layer = character extraction by alphabets/ranges.
    """
    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Stop early once enough strong (>= 0.8) matches were found.
            if sufficient_match_count >= 3:
                break

    return sorted(results, key=lambda x: x[1], reverse=True)
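# An end-to-end sketch on a decoded string longer than TOO_SMALL_SEQUENCE.
# The ratios depend on the bundled FREQUENCIES table, so treat the numbers
# as indicative only:
#
#   >>> coherence_ratio("Bonjour tout le monde, ceci est un petit texte en français.")
#   # e.g. [('French', 0.4...), ('English', 0.3...), ...] -- best match first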