Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/openpyxl/formula/tokenizer.py: 19%
216 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2This module contains a tokenizer for Excel formulae.
4The tokenizer is based on the Javascript tokenizer found at
5http://ewbi.blogs.com/develops/2004/12/excel_formula_p.html written by Eric
6Bachtal
7"""
9import re
class TokenizerError(Exception):
    """Raised for any problem encountered while tokenizing a formula."""
class Tokenizer(object):

    """
    A tokenizer for Excel worksheet formulae.

    Converts a str string representing an Excel formula (in A1 notation)
    into a sequence of `Token` objects.

    `formula`: The str string to tokenize

    Tokenizer defines a method `._parse()` to parse the formula into tokens,
    which can then be accessed through the `.items` attribute.

    """

    # Matches the mantissa-plus-exponent-marker prefix of a number in
    # scientific notation (e.g. "1.5E"); used to decide whether a following
    # '+' or '-' belongs to the number rather than being an operator.
    # NOTE(review): this only accepts a leading digit of 1-9, so forms such
    # as "0.5E+10" are not recognized — preserved as-is, confirm upstream.
    SN_RE = re.compile("^[1-9](\\.[0-9]+)?[Ee]$")  # Scientific notation
    WSPACE_RE = re.compile(r"[ \n]+")
    STRING_REGEXES = {
        # Inside a string, all characters are treated as literals, except for
        # the quote character used to start the string. That character, when
        # doubled is treated as a single character in the string. If an
        # unmatched quote appears, the string is terminated.
        '"': re.compile('"(?:[^"]*"")*[^"]*"(?!")'),
        "'": re.compile("'(?:[^']*'')*[^']*'(?!')"),
    }
    ERROR_CODES = ("#NULL!", "#DIV/0!", "#VALUE!", "#REF!", "#NAME?",
                   "#NUM!", "#N/A", "#GETTING_DATA")
    TOKEN_ENDERS = ',;}) +-*/^&=><%'  # Each of these characters, marks the
                                      # end of an operand token

    def __init__(self, formula):
        self.formula = formula
        self.items = []          # Completed Token objects, in formula order
        self.token_stack = []    # Used to keep track of arrays, functions,
                                 # and parentheses
        self.offset = 0          # How many chars have we read
        self.token = []          # Used to build up token values char by char
        self._parse()

    def _parse(self):
        """Populate self.items with the tokens from the formula."""
        if self.offset:
            return  # Already parsed!
        if not self.formula:
            return
        elif self.formula[0] == '=':
            self.offset += 1
        else:
            # Anything not starting with '=' is a plain literal cell value.
            self.items.append(Token(self.formula, Token.LITERAL))
            return
        consumers = (
            ('"\'', self._parse_string),
            ('[', self._parse_brackets),
            ('#', self._parse_error),
            (' ', self._parse_whitespace),
            ('\n', self._parse_whitespace),
            ('+-*/^&=><%', self._parse_operator),
            ('{(', self._parse_opener),
            (')}', self._parse_closer),
            (';,', self._parse_separator),
        )
        dispatcher = {}  # maps chars to the specific parsing function
        for chars, consumer in consumers:
            dispatcher.update(dict.fromkeys(chars, consumer))
        while self.offset < len(self.formula):
            if self.check_scientific_notation():  # May consume one character
                continue
            curr_char = self.formula[self.offset]
            if curr_char in self.TOKEN_ENDERS:
                self.save_token()
            if curr_char in dispatcher:
                # Each consumer returns the number of characters it matched;
                # advancing offset is centralized here.
                self.offset += dispatcher[curr_char]()
            else:
                # TODO: this can probably be sped up using a regex to get to
                # the next interesting character
                self.token.append(curr_char)
                self.offset += 1
        self.save_token()

    def _parse_string(self):
        """
        Parse a "-delimited string or '-delimited link.

        The offset must be pointing to either a single quote ("'") or double
        quote ('"') character. The strings are parsed according to Excel
        rules where to escape the delimiter you just double it up. E.g.,
        "abc""def" in Excel is parsed as 'abc"def' in Python.

        Returns the number of characters matched. (Does not update
        self.offset)

        Raises TokenizerError if the string is unterminated.

        """
        self.assert_empty_token(can_follow=':')
        delim = self.formula[self.offset]
        assert delim in ('"', "'")
        regex = self.STRING_REGEXES[delim]
        match = regex.match(self.formula[self.offset:])
        if match is None:
            subtype = "string" if delim == '"' else 'link'
            raise TokenizerError(f"Reached end of formula while parsing {subtype} in {self.formula}")
        match = match.group(0)
        if delim == '"':
            # A quoted string is a complete operand on its own.
            self.items.append(Token.make_operand(match))
        else:
            # A '-quoted sheet/book link is only part of a larger reference;
            # keep accumulating it in the current token.
            self.token.append(match)
        return len(match)

    def _parse_brackets(self):
        """
        Consume all the text between square brackets [].

        Brackets may nest, so the text is scanned until the bracket depth
        returns to zero.

        Returns the number of characters matched. (Does not update
        self.offset)

        Raises TokenizerError when the opening '[' is never closed.

        """
        assert self.formula[self.offset] == '['
        lefts = [(t.start(), 1) for t in
                 re.finditer(r"\[", self.formula[self.offset:])]
        rights = [(t.start(), -1) for t in
                  re.finditer(r"\]", self.formula[self.offset:])]

        open_count = 0
        # Walk all brackets in positional order, tracking nesting depth.
        for idx, open_close in sorted(lefts + rights):
            open_count += open_close
            if open_count == 0:
                outer_right = idx + 1
                self.token.append(
                    self.formula[self.offset:self.offset + outer_right])
                return outer_right

        raise TokenizerError(f"Encountered unmatched '[' in {self.formula}")

    def _parse_error(self):
        """
        Consume the text following a '#' as an error.

        Looks for a match in self.ERROR_CODES and returns the number of
        characters matched. (Does not update self.offset)

        Raises TokenizerError if the text is not a recognized error code.

        """
        self.assert_empty_token(can_follow='!')
        assert self.formula[self.offset] == '#'
        subformula = self.formula[self.offset:]
        for err in self.ERROR_CODES:
            if subformula.startswith(err):
                # Prepend any pending token text (e.g. a 'Sheet1!' prefix
                # allowed by can_follow='!') to the error operand.
                self.items.append(Token.make_operand(''.join(self.token) + err))
                del self.token[:]
                return len(err)
        raise TokenizerError(f"Invalid error code at position {self.offset} in '{self.formula}'")

    def _parse_whitespace(self):
        """
        Consume a string of consecutive spaces.

        Returns the number of spaces found. (Does not update self.offset).

        """
        assert self.formula[self.offset] in (' ', '\n')
        # A single-character token represents the whole run of whitespace.
        self.items.append(Token(self.formula[self.offset], Token.WSPACE))
        return self.WSPACE_RE.match(self.formula[self.offset:]).end()

    def _parse_operator(self):
        """
        Consume the characters constituting an operator.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        # Two-character comparison operators take precedence.
        if self.formula[self.offset:self.offset + 2] in ('>=', '<=', '<>'):
            self.items.append(Token(
                self.formula[self.offset:self.offset + 2],
                Token.OP_IN
            ))
            return 2
        curr_char = self.formula[self.offset]  # guaranteed to be 1 char
        assert curr_char in '%*/^&=><+-'
        if curr_char == '%':
            token = Token('%', Token.OP_POST)
        elif curr_char in "*/^&=><":
            token = Token(curr_char, Token.OP_IN)
        # From here on, curr_char is guaranteed to be in '+-'
        elif not self.items:
            token = Token(curr_char, Token.OP_PRE)
        else:
            # '+'/'-' is infix only when it follows something that can end
            # an expression: a closer, a postfix operator, or an operand.
            prev = next((i for i in reversed(self.items)
                         if i.type != Token.WSPACE), None)
            is_infix = prev and (
                prev.subtype == Token.CLOSE
                or prev.type == Token.OP_POST
                or prev.type == Token.OPERAND
            )
            if is_infix:
                token = Token(curr_char, Token.OP_IN)
            else:
                token = Token(curr_char, Token.OP_PRE)
        self.items.append(token)
        return 1

    def _parse_opener(self):
        """
        Consumes a ( or { character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        assert self.formula[self.offset] in ('(', '{')
        if self.formula[self.offset] == '{':
            self.assert_empty_token()
            token = Token.make_subexp("{")
        elif self.token:
            # Pending token text followed by '(' means a function call;
            # the function name becomes part of the opener's value.
            token_value = "".join(self.token) + '('
            del self.token[:]
            token = Token.make_subexp(token_value)
        else:
            token = Token.make_subexp("(")
        self.items.append(token)
        self.token_stack.append(token)
        return 1

    def _parse_closer(self):
        """
        Consumes a } or ) character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        Raises TokenizerError when the closer has no matching opener or
        does not match the type of the most recent opener.

        """
        assert self.formula[self.offset] in (')', '}')
        if not self.token_stack:
            # Previously this fell through to list.pop() and leaked an
            # IndexError; report malformed input consistently instead.
            raise TokenizerError(
                "No matching opener for '%s' in '%s'"
                % (self.formula[self.offset], self.formula))
        token = self.token_stack.pop().get_closer()
        if token.value != self.formula[self.offset]:
            raise TokenizerError(
                "Mismatched ( and { pair in '%s'" % self.formula)
        self.items.append(token)
        return 1

    def _parse_separator(self):
        """
        Consumes a ; or , character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        curr_char = self.formula[self.offset]
        assert curr_char in (';', ',')
        if curr_char == ';':
            token = Token.make_separator(";")
        else:
            # ',' is an argument separator inside a function or array, but
            # the range-union operator inside plain parentheses or at the
            # top level.
            try:
                top_type = self.token_stack[-1].type
            except IndexError:
                token = Token(",", Token.OP_IN)  # Range Union operator
            else:
                if top_type == Token.PAREN:
                    token = Token(",", Token.OP_IN)  # Range Union operator
                else:
                    token = Token.make_separator(",")
        self.items.append(token)
        return 1

    def check_scientific_notation(self):
        """
        Consumes a + or - character if part of a number in sci. notation.

        Returns True if the character was consumed and self.offset was
        updated, False otherwise.

        """
        curr_char = self.formula[self.offset]
        if (curr_char in '+-'
                and len(self.token) >= 1
                and self.SN_RE.match("".join(self.token))):
            self.token.append(curr_char)
            self.offset += 1
            return True
        return False

    def assert_empty_token(self, can_follow=()):
        """
        Ensure that there's no token currently being parsed.

        Or if there is a token being parsed, it must end with a character in
        can_follow.

        If there are unconsumed token contents, it means we hit an unexpected
        token transition. In this case, we raise a TokenizerError

        """
        if self.token and self.token[-1] not in can_follow:
            raise TokenizerError(f"Unexpected character at position {self.offset} in '{self.formula}'")

    def save_token(self):
        """If there's a token being parsed, add it to the item list."""
        if self.token:
            self.items.append(Token.make_operand("".join(self.token)))
            del self.token[:]

    def render(self):
        """Convert the parsed tokens back to a string."""
        if not self.items:
            return ""
        elif self.items[0].type == Token.LITERAL:
            return self.items[0].value
        return "=" + "".join(token.value for token in self.items)
class Token(object):

    """
    A token in an Excel formula.

    Tokens have three attributes:

    * `value`: The string value parsed that led to this token
    * `type`: A string identifying the type of token
    * `subtype`: A string identifying subtype of the token (optional, and
      defaults to "")

    """

    __slots__ = ['value', 'type', 'subtype']

    LITERAL = "LITERAL"
    OPERAND = "OPERAND"
    FUNC = "FUNC"
    ARRAY = "ARRAY"
    PAREN = "PAREN"
    SEP = "SEP"
    OP_PRE = "OPERATOR-PREFIX"
    OP_IN = "OPERATOR-INFIX"
    OP_POST = "OPERATOR-POSTFIX"
    WSPACE = "WHITE-SPACE"

    def __init__(self, value, type_, subtype=""):
        self.value = value
        self.type = type_
        self.subtype = subtype

    # Literal operands:
    #
    # Every literal operand has type 'OPERAND'. Its subtype is one of
    # 'TEXT' (quoted strings), 'NUMBER' (anything float() accepts),
    # 'LOGICAL' (TRUE/FALSE), 'ERROR' (literal error values), or 'RANGE'
    # (everything else, i.e. range references).

    TEXT = 'TEXT'
    NUMBER = 'NUMBER'
    LOGICAL = 'LOGICAL'
    ERROR = 'ERROR'
    RANGE = 'RANGE'

    def __repr__(self):
        return f"{self.type} {self.subtype} {self.value}:"

    @classmethod
    def make_operand(cls, value):
        """Create an operand token, classifying `value` into a subtype."""
        if value.startswith('"'):
            return cls(value, cls.OPERAND, cls.TEXT)
        if value.startswith('#'):
            return cls(value, cls.OPERAND, cls.ERROR)
        if value in ('TRUE', 'FALSE'):
            return cls(value, cls.OPERAND, cls.LOGICAL)
        try:
            float(value)
        except ValueError:
            # Not parseable as a number -> treat as a range reference.
            return cls(value, cls.OPERAND, cls.RANGE)
        return cls(value, cls.OPERAND, cls.NUMBER)

    # Subexpresssions
    #
    # There are 3 types of `Subexpressions`: functions, array literals, and
    # parentheticals. Subexpressions have 'OPEN' and 'CLOSE' tokens. 'OPEN'
    # is used when parsing the initial expression token (i.e., '(' or '{')
    # and 'CLOSE' is used when parsing the closing expression token ('}' or
    # ')').

    OPEN = "OPEN"
    CLOSE = "CLOSE"

    @classmethod
    def make_subexp(cls, value, func=False):
        """
        Create a subexpression token.

        `value`: The value of the token
        `func`: If True, force the token to be of type FUNC

        """
        assert value[-1] in ('{', '}', '(', ')')
        if func:
            # A function token is either 'NAME(' or a bare ')'.
            assert re.match('.+\\(|\\)', value)
            kind = Token.FUNC
        else:
            kind = (Token.ARRAY if value in '{}'
                    else Token.PAREN if value in '()'
                    else Token.FUNC)
        return cls(value, kind, cls.CLOSE if value in ')}' else cls.OPEN)

    def get_closer(self):
        """Return a closing token that matches this token's type."""
        assert self.type in (self.FUNC, self.ARRAY, self.PAREN)
        assert self.subtype == self.OPEN
        closer = ")" if self.type != self.ARRAY else "}"
        return self.make_subexp(closer, func=self.type == self.FUNC)

    # Separator tokens
    #
    # Argument separators always have type 'SEP' and can have one of two
    # subtypes: 'ARG', 'ROW'. 'ARG' is used for the ',' token, when used to
    # delimit either function arguments or array elements. 'ROW' is used for
    # the ';' token, which is always used to delimit rows in an array
    # literal.

    ARG = "ARG"
    ROW = "ROW"

    @classmethod
    def make_separator(cls, value):
        """Create a separator token"""
        assert value in (',', ';')
        if value == ',':
            return cls(value, cls.SEP, cls.ARG)
        return cls(value, cls.SEP, cls.ROW)