Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/openpyxl/formula/tokenizer.py: 19% (216 statements)

1""" 

2This module contains a tokenizer for Excel formulae. 

3 

4The tokenizer is based on the Javascript tokenizer found at 

5http://ewbi.blogs.com/develops/2004/12/excel_formula_p.html written by Eric 

6Bachtal 

7""" 

8 

9import re 

10 

11 

class TokenizerError(Exception):
    """Base class for all Tokenizer errors."""

class Tokenizer(object):

    """
    A tokenizer for Excel worksheet formulae.

    Converts a string representing an Excel formula (in A1 notation) into a
    sequence of `Token` objects.

    `formula`: The string to tokenize

    Tokenizer defines a method `._parse()` to parse the formula into tokens,
    which can then be accessed through the `.items` attribute.

    """
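
    # Usage sketch (illustrative addition, not part of the original source):
    # tokenizing '=SUM(A1, 2)' yields FUNC OPEN 'SUM(', OPERAND RANGE 'A1',
    # SEP ARG ',', whitespace, OPERAND NUMBER '2', and FUNC CLOSE ')':
    #
    #     for tok in Tokenizer('=SUM(A1, 2)').items:
    #         print(repr(tok))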

    SN_RE = re.compile("^[1-9](\\.[0-9]+)?[Ee]$")  # Scientific notation
    WSPACE_RE = re.compile(r"[ \n]+")
    STRING_REGEXES = {
        # Inside a string, all characters are treated as literals, except for
        # the quote character used to start the string. That character, when
        # doubled, is treated as a single character in the string. If an
        # unmatched quote appears, the string is terminated.
        '"': re.compile('"(?:[^"]*"")*[^"]*"(?!")'),
        "'": re.compile("'(?:[^']*'')*[^']*'(?!')"),
    }
    ERROR_CODES = ("#NULL!", "#DIV/0!", "#VALUE!", "#REF!", "#NAME?",
                   "#NUM!", "#N/A", "#GETTING_DATA")
    TOKEN_ENDERS = ',;}) +-*/^&=><%'  # Each of these characters marks the
                                      # end of an operand token

    def __init__(self, formula):
        self.formula = formula
        self.items = []
        self.token_stack = []  # Used to keep track of arrays, functions, and
                               # parentheses
        self.offset = 0  # How many chars have we read
        self.token = []  # Used to build up token values char by char
        self._parse()

    def _parse(self):
        """Populate self.items with the tokens from the formula."""
        if self.offset:
            return  # Already parsed!
        if not self.formula:
            return
        elif self.formula[0] == '=':
            self.offset += 1
        else:
            self.items.append(Token(self.formula, Token.LITERAL))
            return
        consumers = (
            ('"\'', self._parse_string),
            ('[', self._parse_brackets),
            ('#', self._parse_error),
            (' ', self._parse_whitespace),
            ('\n', self._parse_whitespace),
            ('+-*/^&=><%', self._parse_operator),
            ('{(', self._parse_opener),
            (')}', self._parse_closer),
            (';,', self._parse_separator),
        )
        dispatcher = {}  # maps chars to the specific parsing function
        for chars, consumer in consumers:
            dispatcher.update(dict.fromkeys(chars, consumer))
        while self.offset < len(self.formula):
            if self.check_scientific_notation():  # May consume one character
                continue
            curr_char = self.formula[self.offset]
            if curr_char in self.TOKEN_ENDERS:
                self.save_token()
            if curr_char in dispatcher:
                self.offset += dispatcher[curr_char]()
            else:
                # TODO: this can probably be sped up using a regex to get to
                # the next interesting character
                self.token.append(curr_char)
                self.offset += 1
        self.save_token()
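
    # Note (illustrative addition, not part of the original source): input
    # without a leading '=' never reaches the dispatch loop above, so
    # Tokenizer('plain text').items is a single LITERAL token holding the
    # whole string.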

    def _parse_string(self):
        """
        Parse a "-delimited string or '-delimited link.

        The offset must be pointing to either a single quote ("'") or double
        quote ('"') character. The strings are parsed according to Excel
        rules, where the delimiter is escaped by doubling it. E.g.,
        "abc""def" in Excel is parsed as 'abc"def' in Python.

        Returns the number of characters matched. (Does not update
        self.offset)

        """
        self.assert_empty_token(can_follow=':')
        delim = self.formula[self.offset]
        assert delim in ('"', "'")
        regex = self.STRING_REGEXES[delim]
        match = regex.match(self.formula[self.offset:])
        if match is None:
            subtype = "string" if delim == '"' else "link"
            raise TokenizerError(
                f"Reached end of formula while parsing {subtype} in {self.formula}")
        match = match.group(0)
        if delim == '"':
            self.items.append(Token.make_operand(match))
        else:
            self.token.append(match)
        return len(match)
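
    # Example (illustrative addition, not part of the original source): in
    # ='My Sheet'!A1 the quoted sheet name is appended to the in-progress
    # token rather than emitted, so 'My Sheet'!A1 is later saved as a single
    # RANGE operand.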

    def _parse_brackets(self):
        """
        Consume all the text between square brackets [].

        Returns the number of characters matched. (Does not update
        self.offset)

        """
        assert self.formula[self.offset] == '['
        lefts = [(t.start(), 1) for t in
                 re.finditer(r"\[", self.formula[self.offset:])]
        rights = [(t.start(), -1) for t in
                  re.finditer(r"\]", self.formula[self.offset:])]

        open_count = 0
        for idx, open_close in sorted(lefts + rights):
            open_count += open_close
            if open_count == 0:
                outer_right = idx + 1
                self.token.append(
                    self.formula[self.offset:self.offset + outer_right])
                return outer_right

        raise TokenizerError(f"Encountered unmatched '[' in {self.formula}")
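
    # Example (illustrative addition, not part of the original source): for a
    # structured reference like Table1[[#This Row],[Sales]], the +1/-1
    # running count pairs nested brackets, so the whole bracketed span up to
    # the matching outer ']' is appended to the current token in one step.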

    def _parse_error(self):
        """
        Consume the text following a '#' as an error.

        Looks for a match in self.ERROR_CODES and returns the number of
        characters matched. (Does not update self.offset)

        """
        self.assert_empty_token(can_follow='!')
        assert self.formula[self.offset] == '#'
        subformula = self.formula[self.offset:]
        for err in self.ERROR_CODES:
            if subformula.startswith(err):
                self.items.append(Token.make_operand(''.join(self.token) + err))
                del self.token[:]
                return len(err)
        raise TokenizerError(
            f"Invalid error code at position {self.offset} in '{self.formula}'")

    def _parse_whitespace(self):
        """
        Consume a run of consecutive spaces or newlines.

        Returns the number of whitespace characters found. (Does not update
        self.offset)

        """
        assert self.formula[self.offset] in (' ', '\n')
        self.items.append(Token(self.formula[self.offset], Token.WSPACE))
        return self.WSPACE_RE.match(self.formula[self.offset:]).end()

    def _parse_operator(self):
        """
        Consume the characters constituting an operator.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        if self.formula[self.offset:self.offset + 2] in ('>=', '<=', '<>'):
            self.items.append(Token(
                self.formula[self.offset:self.offset + 2],
                Token.OP_IN
            ))
            return 2
        curr_char = self.formula[self.offset]  # guaranteed to be 1 char
        assert curr_char in '%*/^&=><+-'
        if curr_char == '%':
            token = Token('%', Token.OP_POST)
        elif curr_char in "*/^&=><":
            token = Token(curr_char, Token.OP_IN)
        # From here on, curr_char is guaranteed to be in '+-'
        elif not self.items:
            token = Token(curr_char, Token.OP_PRE)
        else:
            prev = next((i for i in reversed(self.items)
                         if i.type != Token.WSPACE), None)
            is_infix = prev and (
                prev.subtype == Token.CLOSE
                or prev.type == Token.OP_POST
                or prev.type == Token.OPERAND
            )
            if is_infix:
                token = Token(curr_char, Token.OP_IN)
            else:
                token = Token(curr_char, Token.OP_PRE)
        self.items.append(token)
        return 1
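
    # Example (illustrative addition, not part of the original source): in
    # '=-A1+3' the leading '-' is OP_PRE because no items precede it, while
    # the '+' is OP_IN because the nearest non-whitespace item is an OPERAND
    # ('A1').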

    def _parse_opener(self):
        """
        Consumes a ( or { character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        assert self.formula[self.offset] in ('(', '{')
        if self.formula[self.offset] == '{':
            self.assert_empty_token()
            token = Token.make_subexp("{")
        elif self.token:
            token_value = "".join(self.token) + '('
            del self.token[:]
            token = Token.make_subexp(token_value)
        else:
            token = Token.make_subexp("(")
        self.items.append(token)
        self.token_stack.append(token)
        return 1

    def _parse_closer(self):
        """
        Consumes a } or ) character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        assert self.formula[self.offset] in (')', '}')
        token = self.token_stack.pop().get_closer()
        if token.value != self.formula[self.offset]:
            raise TokenizerError(
                "Mismatched ( and { pair in '%s'" % self.formula)
        self.items.append(token)
        return 1

    def _parse_separator(self):
        """
        Consumes a ; or , character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        curr_char = self.formula[self.offset]
        assert curr_char in (';', ',')
        if curr_char == ';':
            token = Token.make_separator(";")
        else:
            try:
                top_type = self.token_stack[-1].type
            except IndexError:
                token = Token(",", Token.OP_IN)  # Range union operator
            else:
                if top_type == Token.PAREN:
                    token = Token(",", Token.OP_IN)  # Range union operator
                else:
                    token = Token.make_separator(",")
        self.items.append(token)
        return 1
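
    # Example (illustrative addition, not part of the original source): in
    # '=SUM((A1,B2))' the inner ',' sits under a PAREN on token_stack, so it
    # becomes the range-union OP_IN operator; in '=SUM(A1,B2)' the top of the
    # stack is a FUNC, so the ',' becomes an ARG separator instead.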

    def check_scientific_notation(self):
        """
        Consumes a + or - character if part of a number in sci. notation.

        Returns True if the character was consumed and self.offset was
        updated, False otherwise.

        """
        curr_char = self.formula[self.offset]
        if (curr_char in '+-'
                and len(self.token) >= 1
                and self.SN_RE.match("".join(self.token))):
            self.token.append(curr_char)
            self.offset += 1
            return True
        return False
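
    # Example (illustrative addition, not part of the original source): while
    # parsing '=1.5E+10', the '+' is reached with self.token equal to
    # ['1', '.', '5', 'E'], which matches SN_RE, so the '+' is absorbed into
    # the number instead of being treated as an operator.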

    def assert_empty_token(self, can_follow=()):
        """
        Ensure that there's no token currently being parsed.

        If a token is being parsed, it must end with a character in
        can_follow.

        If there are unconsumed token contents, it means we hit an unexpected
        token transition. In this case, we raise a TokenizerError.

        """
        if self.token and self.token[-1] not in can_follow:
            raise TokenizerError(
                f"Unexpected character at position {self.offset} "
                f"in '{self.formula}'")

    def save_token(self):
        """If there's a token being parsed, add it to the item list."""
        if self.token:
            self.items.append(Token.make_operand("".join(self.token)))
            del self.token[:]

    def render(self):
        """Convert the parsed tokens back to a string."""
        if not self.items:
            return ""
        elif self.items[0].type == Token.LITERAL:
            return self.items[0].value
        return "=" + "".join(token.value for token in self.items)
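
    # Round-trip note (illustrative addition, not part of the original
    # source): render() re-joins token values, so Tokenizer('=1+2').render()
    # returns '=1+2', and a non-formula input is returned unchanged via its
    # single LITERAL token.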

class Token(object):

    """
    A token in an Excel formula.

    Tokens have three attributes:

    * `value`: The string value parsed to produce this token
    * `type`: A string identifying the type of the token
    * `subtype`: A string identifying the subtype of the token (optional,
      defaults to "")

    """

    __slots__ = ['value', 'type', 'subtype']

    LITERAL = "LITERAL"
    OPERAND = "OPERAND"
    FUNC = "FUNC"
    ARRAY = "ARRAY"
    PAREN = "PAREN"
    SEP = "SEP"
    OP_PRE = "OPERATOR-PREFIX"
    OP_IN = "OPERATOR-INFIX"
    OP_POST = "OPERATOR-POSTFIX"
    WSPACE = "WHITE-SPACE"

    def __init__(self, value, type_, subtype=""):
        self.value = value
        self.type = type_
        self.subtype = subtype

    # Literal operands:
    #
    # Literal operands are always of type 'OPERAND' and can be of subtype
    # 'TEXT' (for text strings), 'NUMBER' (for all numeric types), 'LOGICAL'
    # (for TRUE and FALSE), 'ERROR' (for literal error values), or 'RANGE'
    # (for all range references).

    TEXT = 'TEXT'
    NUMBER = 'NUMBER'
    LOGICAL = 'LOGICAL'
    ERROR = 'ERROR'
    RANGE = 'RANGE'

    def __repr__(self):
        return u"{0} {1} {2}:".format(self.type, self.subtype, self.value)

    @classmethod
    def make_operand(cls, value):
        """Create an operand token."""
        if value.startswith('"'):
            subtype = cls.TEXT
        elif value.startswith('#'):
            subtype = cls.ERROR
        elif value in ('TRUE', 'FALSE'):
            subtype = cls.LOGICAL
        else:
            try:
                float(value)
                subtype = cls.NUMBER
            except ValueError:
                subtype = cls.RANGE
        return cls(value, cls.OPERAND, subtype)
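
    # Classification examples (illustrative addition, not part of the
    # original source): make_operand('"abc"') -> TEXT,
    # make_operand('#REF!') -> ERROR, make_operand('TRUE') -> LOGICAL,
    # make_operand('3.5') -> NUMBER; anything that float() rejects, e.g.
    # make_operand('A1:B2'), falls through to RANGE.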

    # Subexpressions
    #
    # There are 3 types of subexpressions: functions, array literals, and
    # parentheticals. Subexpressions have 'OPEN' and 'CLOSE' tokens. 'OPEN'
    # is used when parsing the initial expression token (i.e., '(' or '{')
    # and 'CLOSE' is used when parsing the closing expression token ('}' or
    # ')').

    OPEN = "OPEN"
    CLOSE = "CLOSE"

    @classmethod
    def make_subexp(cls, value, func=False):
        """
        Create a subexpression token.

        `value`: The value of the token
        `func`: If True, force the token to be of type FUNC

        """
        assert value[-1] in ('{', '}', '(', ')')
        if func:
            assert re.match('.+\\(|\\)', value)
            type_ = Token.FUNC
        elif value in '{}':
            type_ = Token.ARRAY
        elif value in '()':
            type_ = Token.PAREN
        else:
            type_ = Token.FUNC
        subtype = cls.CLOSE if value in ')}' else cls.OPEN
        return cls(value, type_, subtype)

    def get_closer(self):
        """Return a closing token that matches this token's type."""
        assert self.type in (self.FUNC, self.ARRAY, self.PAREN)
        assert self.subtype == self.OPEN
        value = "}" if self.type == self.ARRAY else ")"
        return self.make_subexp(value, func=self.type == self.FUNC)

    # Separator tokens
    #
    # Argument separators always have type 'SEP' and can have one of two
    # subtypes: 'ARG', 'ROW'. 'ARG' is used for the ',' token, when used to
    # delimit either function arguments or array elements. 'ROW' is used for
    # the ';' token, which is always used to delimit rows in an array
    # literal.

    ARG = "ARG"
    ROW = "ROW"

    @classmethod
    def make_separator(cls, value):
        """Create a separator token."""
        assert value in (',', ';')
        subtype = cls.ARG if value == ',' else cls.ROW
        return cls(value, cls.SEP, subtype)
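
# A minimal, hedged demo (added for illustration; not part of openpyxl):
# running this module directly prints the token stream for a sample formula,
# using only the Tokenizer and Token classes defined above.
if __name__ == "__main__":
    for tok in Tokenizer('=SUM(A1:B2)/2 + "note"').items:
        print(repr(tok))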