Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/models.py: 32%

1import warnings

2from collections import Counter

3from encodings.aliases import aliases

4from hashlib import sha256

5from json import dumps

6from re import sub

7from typing import (

8 Any,

9 Counter as TypeCounter,

10 Dict,

11 Iterator,

12 List,

13 Optional,

14 Tuple,

15 Union,

16)

18from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE

19from .md import mess_ratio

20from .utils import iana_name, is_multi_byte_encoding, unicode_range

class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Holds the original bytes, the guessed encoding, the mean mess ("chaos")
    ratio, the detected languages with their coherence ratios and, lazily,
    the decoded text. Instances are orderable so that sorting a list of them
    puts the most probable match first.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Codec name used to decode the payload.
        :param mean_mess_ratio: Value exposed through the 'chaos' property.
        :param has_sig_or_bom: Value exposed through 'bom' / 'byte_order_mark'.
        :param languages: Sequence of (language, coherence ratio) pairs; the first
            entry is the one reported by 'language' and 'coherence'.
        :param decoded_payload: Already decoded text, if available. When None the
            payload is decoded on first call to str().
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property.
        self._unicode_ranges: Optional[List[str]] = None

        # Matches registered through add_submatch().
        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last requested target encoding and its result.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share both encoding and fingerprint.

        :raises TypeError: if 'other' is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        :raises ValueError: if 'other' is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError(
                "__lt__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            # When having a tough decision, use the result that decoded as many multi-byte as possible.
            # NOTE(review): this inner test cannot succeed here, because the enclosing
            # condition requires the coherences to differ by more than 0.02. It is kept
            # as-is so that the resulting ordering stays unchanged; confirm the intended
            # tie-break before restructuring.
            if chaos_difference == 0.0 and self.coherence == other.coherence:
                return self.multi_byte_usage > other.multi_byte_usage
            return self.coherence > other.coherence

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio of decoded characters to raw bytes.

        Returns 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - len(str(self)) / raw_length

    @property
    def chaos_secondary_pass(self) -> float:
        """
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "chaos_secondary_pass is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the code reading the property
        )
        return mess_ratio(str(self), 1.0)

    @property
    def coherence_non_latin(self) -> float:
        """
        Coherence ratio on the first non-latin language detected if ANY.
        Always returns 0.0.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "coherence_non_latin is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the code reading the property
        )
        return 0.0

    @property
    def w_counter(self) -> TypeCounter[str]:
        """
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "w_counter is deprecated and will be removed in 3.0",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the code reading the property
        )

        # Lower-case the text and blank out everything matched by NOT_PRINTABLE_PATTERN
        # before splitting on whitespace.
        string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

        return Counter(string_printable_only.split())

    def __str__(self) -> str:
        """Return the decoded payload, decoding it (strictly) on first use."""
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register 'other' as a submatch of this match.

        :raises ValueError: if 'other' is not a CharsetMatch or compares equal to self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """The guessed encoding given at construction."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # 'aliases' maps one name to another; collect the counterpart whichever
        # side our encoding appears on.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Same value as 'byte_order_mark'."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """The has_sig_or_bom flag given at construction."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """The mean mess ratio given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Coherence ratio of the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """'chaos' expressed as a percentage, rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """'coherence' expressed as a percentage, rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """Matches registered through add_submatch() (the internal list, not a copy)."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """True when at least one submatch has been registered."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated list of the values returned by unicode_range() for
        every character of the decoded text (falsy values are dropped).
        Computed once, then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def first(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def best(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be represented in the target encoding are replaced
        by the encoder (error handler "replace"); no exception is raised for them.
        The result for the most recently requested encoding is cached.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()

284

285

class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches; they are sorted on construction. The given
            list itself is not modified. None or an empty list yields an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: when an integer position is out of range.
        :raises KeyError: when the encoding is not present in results, or when
            'item' is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the requested name before comparing it with each match's
            # candidate encodings (presumably non-strict mode -- TODO confirm the
            # meaning of the second argument of iana_name).
            normalized = iana_name(item, False)
            for result in self._results:
                if normalized in result.could_be_from_charset:
                    return result
        # Carry the requested key so the error is actionable.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: if 'item' is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE and self._results:
            # The fingerprint is a SHA-256 over the re-encoded payload: compute it once
            # for the new item rather than once per existing match. It is only computed
            # when there is something to compare against.
            item_fingerprint = item.fingerprint
            item_chaos = item.chaos
            for match in self._results:
                if match.fingerprint == item_fingerprint and match.chaos == item_chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

351

352

# One language guess: (language name, coherence ratio).
CoherenceMatch = Tuple[str, float]

# Sequence of language guesses; CharsetMatch reads the first entry as the most probable one.
CoherenceMatches = List[CoherenceMatch]

355

356

class CliDetectionResult:
    """
    Plain record describing the detection outcome for one path, serialisable to JSON.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """Store every argument on the instance under the same name."""
        self.path: str = path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.unicode_path: Optional[str] = unicode_path
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of the exported fields, in the order used for the JSON output.
        """
        # The order of this tuple is the key order of the resulting mapping.
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """Return the exported fields as an ASCII-only JSON document indented by 4 spaces."""
        exported = self.__dict__
        return dumps(exported, indent=4, ensure_ascii=True)

Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/charset_normalizer/models.py: 32%

191 statements