Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/xlwt/UnicodeUtils.py: 8%

58 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1# -*- coding: windows-1252 -*- 

2 

3''' 

4From BIFF8 on, strings are always stored using UTF-16LE text encoding. The 

5character array is a sequence of 16-bit values. Additionally it is 

6possible to use a compressed format, which omits the high bytes of all 

7characters, if they are all zero. 

8 

9The following tables describe the standard format of the entire string, but 

10in many records the strings differ from this format. This will be mentioned 

11separately. It is possible (but not required) to store Rich-Text formatting 

12information and Asian phonetic information inside a Unicode string. This 

13results in four different ways to store a string. The character array 

14is not zero-terminated. 

15 

16The string consists of the character count (as usual an 8-bit value or 

17a 16-bit value), option flags, the character array and optional formatting 

18information. If the string is empty, sometimes the option flags field will 

19not occur. This is mentioned at the respective place. 

20 

21Offset Size Contents 

220 1 or 2 Length of the string (character count, ln) 

231 or 2 1 Option flags: 

24 Bit Mask Contents 

25 0 01H Character compression (ccompr): 

26 0 = Compressed (8-bit characters) 

27 1 = Uncompressed (16-bit characters) 

28 2 04H Asian phonetic settings (phonetic): 

29 0 = Does not contain Asian phonetic settings 

30 1 = Contains Asian phonetic settings 

31 3 08H Rich-Text settings (richtext): 

32 0 = Does not contain Rich-Text settings 

33 1 = Contains Rich-Text settings 

34[2 or 3] 2 (optional, only if richtext=1) Number of Rich-Text formatting runs (rt) 

35[var.] 4 (optional, only if phonetic=1) Size of Asian phonetic settings block (in bytes, sz) 

36var. ln or  

37 2·ln Character array (8-bit characters or 16-bit characters, dependent on ccompr) 

38[var.] 4·rt (optional, only if richtext=1) List of rt formatting runs  

39[var.] sz (optional, only if phonetic=1) Asian Phonetic Settings Block  

40''' 

41 

42from .compat import unicode, unicode_type 

43from struct import pack 

44 

def upack2(s, encoding='ascii'):
    """Pack *s* as a BIFF8 unicode string with a two-byte length field.

    s        -- text, or a byte string that is decoded with *encoding*.
    encoding -- codec used only when *s* is not already unicode.

    Returns the packed header (16-bit item count followed by an 8-bit
    option flag) immediately followed by the encoded characters.
    Raises Exception when the text has more than 32767 characters.
    """
    # Normalise the input to unicode text before measuring or encoding it.
    text = s if isinstance(s, unicode_type) else unicode(s, encoding)
    # The limit applies to the number of content characters,
    # not to the number of bytes in the packed result.
    char_count = len(text)
    if char_count > 32767:
        raise Exception('String longer than 32767 characters')
    try:
        payload = text.encode('latin1')
    except UnicodeEncodeError:
        # At least one character is above U+00FF: store 16-bit characters.
        payload = text.encode('utf_16_le')
        option_flags = 1
        # Count "double byte characters" (MS C wchars) from the encoded
        # bytes rather than from len(text): a character such as
        # u"\U0001D400" must count as 2 items, whatever len() reports
        # for it on the running interpreter build.
        item_count = len(payload) // 2
    else:
        # Every character lies in U+0000..U+00FF inclusive, so the
        # "compressed format" (one byte per character) can be used.
        option_flags = 0
        item_count = char_count
    header = pack('<HB', item_count, option_flags)
    return header + payload

72 

def upack2rt(rt, encoding='ascii'):
    """Pack a rich-text string and build its formatting runs.

    rt       -- iterable of (text, fontx) pairs; a text piece that is not
                already unicode is decoded with *encoding*.
    encoding -- codec used for pieces that are not already unicode.

    Returns a 2-tuple: the packed string (16-bit item count, 8-bit option
    flag with the rich-text bit set, 16-bit number of formatting runs,
    then the encoded characters) and the packed formatting runs.
    Raises Exception when the combined text exceeds 32767 wchar units.
    """
    pieces = []
    runs = []
    wchar_pos = 0
    for segment, font_index in rt:
        # Convert each piece to unicode if it is not unicode already.
        if not isinstance(segment, unicode_type):
            segment = unicode(segment, encoding)
        pieces.append(segment)
        # Code in Rows.py ensures that the font index can be None only
        # for the first piece; every other piece starts a formatting run.
        if font_index is not None:
            runs.append(pack('<HH', wchar_pos, font_index))
        # Positions are measured in MS C wchar units:
        # 1 for a character <= u'\uFFFF', otherwise 2.
        wchar_pos += len(segment.encode('utf_16_le')) // 2
    text = u''.join(pieces)
    run_bytes = b''.join(runs)
    # Each packed run is one 4-byte '<HH' record, so this is len(bytes) // 4.
    run_count = len(runs)
    if wchar_pos > 32767:
        raise Exception('String longer than 32767 characters')
    try:
        payload = text.encode('latin1')
    except UnicodeEncodeError:
        # 16-bit characters (01H) combined with the rich-text bit (08H).
        payload = text.encode('utf_16_le')
        option_flags = 1 | 8
        # Number of 16-bit units, taken from the encoded bytes.
        item_count = len(payload) // 2
    else:
        # All characters are in U+0000..U+00FF inclusive: compressed
        # 8-bit characters, with only the rich-text bit (08H) set.
        option_flags = 0 | 8
        item_count = len(payload)
    header = pack('<HBH', item_count, option_flags, run_count)
    return header + payload, run_bytes

104 

def upack1(s, encoding='ascii'):
    """Pack *s* as a BIFF8 unicode string with a one-byte length field.

    s        -- text, or a byte string that is decoded with *encoding*.
    encoding -- codec used only when *s* is not already unicode.

    Returns the packed header (8-bit item count followed by an 8-bit
    option flag) immediately followed by the encoded characters.
    Raises Exception when the item count does not fit the one-byte
    length field, i.e. when it exceeds 255.
    """
    # If not unicode, make it so.
    if isinstance(s, unicode_type):
        us = s
    else:
        us = unicode(s, encoding)
    try:
        encs = us.encode('latin1')
    except UnicodeEncodeError:
        # At least one character is above U+00FF: store 16-bit characters.
        encs = us.encode('utf_16_le')
        flag = 1
        # Number of 16-bit units; a character above U+FFFF counts as 2.
        n_items = len(encs) // 2
    else:
        # All characters are in U+0000..U+00FF inclusive, so the
        # "compressed format" (one byte per character) can be used.
        flag = 0
        n_items = len(encs)
    # Validate the value that is actually written to the length byte.
    # Checking len(us) instead lets text containing characters above
    # U+FFFF through with n_items > 255, and pack() then fails with an
    # unhelpful struct.error rather than the documented exception.
    if n_items > 255:
        raise Exception('String longer than 255 characters')
    return pack('<BB', n_items, flag) + encs

122 return pack('<BB', n_items, flag) + encs