Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/xlwt/UnicodeUtils.py: 8%
58 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1# -*- coding: windows-1252 -*-
3'''
4From BIFF8 on, strings are always stored using UTF-16LE text encoding. The
5character array is a sequence of 16-bit values. Additionally it is
6possible to use a compressed format, which omits the high bytes of all
7characters, if they are all zero.
9The following tables describe the standard format of the entire string, but
10in many records the strings differ from this format. This will be mentioned
11separately. It is possible (but not required) to store Rich-Text formatting
12information and Asian phonetic information inside a Unicode string. This
13results in four different ways to store a string. The character array
14is not zero-terminated.
16The string consists of the character count (as usual an 8-bit value or
17a 16-bit value), option flags, the character array and optional formatting
18information. If the string is empty, sometimes the option flags field will
19not occur. This is mentioned at the respective place.
21Offset Size Contents
220 1 or 2 Length of the string (character count, ln)
231 or 2 1 Option flags:
24 Bit Mask Contents
25 0 01H Character compression (ccompr):
26 0 = Compressed (8-bit characters)
27 1 = Uncompressed (16-bit characters)
28 2 04H Asian phonetic settings (phonetic):
29 0 = Does not contain Asian phonetic settings
30 1 = Contains Asian phonetic settings
31 3 08H Rich-Text settings (richtext):
32 0 = Does not contain Rich-Text settings
33 1 = Contains Rich-Text settings
34[2 or 3] 2 (optional, only if richtext=1) Number of Rich-Text formatting runs (rt)
35[var.] 4 (optional, only if phonetic=1) Size of Asian phonetic settings block (in bytes, sz)
36var. ln or
37 2·ln Character array (8-bit characters or 16-bit characters, dependent on ccompr)
38[var.] 4·rt (optional, only if richtext=1) List of rt formatting runs
39[var.] sz (optional, only if phonetic=1) Asian Phonetic Settings Block
40'''
42from .compat import unicode, unicode_type
43from struct import pack
def upack2(s, encoding='ascii'):
    """Pack a string with a 2-byte length field and a 1-byte option flag.

    :param s: text to pack; anything that is not already unicode is
        decoded with ``encoding`` first.
    :param encoding: codec used to decode non-unicode input.
    :returns: ``pack('<HB', count, flag)`` followed by the encoded text.
        ``flag`` is 0 when the text fits in latin1 (one byte per
        character) and 1 when it is stored as UTF-16LE.
    :raises Exception: if the text has more than 32767 characters.
    """
    text = s if isinstance(s, unicode_type) else unicode(s, encoding)
    # The cap applies to the number of content characters, not to the
    # number of bytes in the packed result.
    char_count = len(text)
    if char_count > 32767:
        raise Exception('String longer than 32767 characters')
    try:
        payload = text.encode('latin1')
    except UnicodeEncodeError:
        payload = text.encode('utf_16_le')
        option_flag = 1
        # Count 16-bit units (MS C wchars) rather than using len(text):
        # len(u"\U0001D400") is 1 on a wide-unicode build and 2 on a
        # narrow one, and the stored count has to be 2 in that case.
        unit_count = len(payload) // 2
    else:
        # Every character is in U+0000..U+00FF inclusive, so the
        # "compressed format" can be used.
        option_flag = 0
        unit_count = char_count
    header = pack('<HB', unit_count, option_flag)
    return header + payload
def upack2rt(rt, encoding='ascii'):
    """Pack a rich-text string given as ``(text, font index)`` pieces.

    :param rt: iterable of ``(s, fontx)`` pairs. Each ``s`` that is not
        already unicode is decoded with ``encoding``. A ``fontx`` of
        ``None`` means no formatting run is emitted for that piece.
    :param encoding: codec used to decode non-unicode pieces.
    :returns: a 2-tuple ``(packed_string, formatting_runs)``.
        ``packed_string`` is ``pack('<HBH', count, flag, num_runs)``
        followed by the encoded text; ``flag`` always has bit 08H set
        and additionally bit 01H when the text is stored as UTF-16LE.
        ``formatting_runs`` is the concatenation of one
        ``pack('<HH', offset, fontx)`` per piece that carries a font.
    :raises Exception: if the text needs more than 32767 16-bit units.
    """
    pieces = []
    runs = []
    wchar_pos = 0
    for segment, fontx in rt:
        if not isinstance(segment, unicode_type):
            segment = unicode(segment, encoding)
        pieces.append(segment)
        if fontx is not None:
            # The original comment notes that code in Rows.py ensures
            # fontx can be None only for the first piece.
            runs.append(pack('<HH', wchar_pos, fontx))
        # Offsets are measured in MS C wchar units: a character counts
        # as 1 if it is <= U+FFFF and as 2 otherwise.
        wchar_pos += len(segment.encode('utf_16_le')) // 2
    fr = b''.join(runs)
    run_count = len(runs)
    if wchar_pos > 32767:
        raise Exception('String longer than 32767 characters')
    text = u''.join(pieces)
    try:
        payload = text.encode('latin1')
    except UnicodeEncodeError:
        payload = text.encode('utf_16_le')
        option_flag = 1 | 8
        # Two bytes per stored item in the uncompressed form.
        item_count = len(payload) // 2
    else:
        # All characters are in U+0000..U+00FF inclusive, so the
        # "compressed format" applies: one byte per stored item.
        option_flag = 0 | 8
        item_count = len(payload)
    header = pack('<HBH', item_count, option_flag, run_count)
    return header + payload, fr
def upack1(s, encoding='ascii'):
    """Pack a string with a 1-byte length field and a 1-byte option flag.

    Same layout as upack2(), but the length field is a single byte, so
    the stored count must not exceed 255.

    :param s: text to pack; anything that is not already unicode is
        decoded with ``encoding`` first.
    :param encoding: codec used to decode non-unicode input.
    :returns: ``pack('<BB', count, flag)`` followed by the encoded text.
        ``flag`` is 0 when the text fits in latin1 (one byte per
        character) and 1 when it is stored as UTF-16LE.
    :raises Exception: if the text is longer than 255 characters, or if
        its UTF-16LE form needs more than 255 16-bit units.
    """
    us = s if isinstance(s, unicode_type) else unicode(s, encoding)
    # Cheap early rejection before any encoding work is done.
    if len(us) > 255:
        raise Exception('String longer than 255 characters')
    try:
        encs = us.encode('latin1')
    except UnicodeEncodeError:
        encs = us.encode('utf_16_le')
        flag = 1
        # Number of 16-bit units; characters above U+FFFF take two, so
        # this can be larger than len(us).
        n_items = len(encs) // 2
    else:
        flag = 0
        n_items = len(us)
    # Validate the value that is actually packed. Without this check a
    # string of <= 255 code points containing non-BMP characters would
    # overflow the one-byte field and surface as an opaque struct.error.
    if n_items > 255:
        raise Exception('String longer than 255 characters')
    return pack('<BB', n_items, flag) + encs