Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/openpyxl/formula/tokenizer.py: 19%
216 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2This module contains a tokenizer for Excel formulae.
4The tokenizer is based on the Javascript tokenizer found at
5http://ewbi.blogs.com/develops/2004/12/excel_formula_p.html written by Eric
6Bachtal
7"""
9import re
class TokenizerError(Exception):
    """Raised for any problem encountered while tokenizing a formula."""
class Tokenizer(object):

    """
    A tokenizer for Excel worksheet formulae.

    Converts a str string representing an Excel formula (in A1 notation)
    into a sequence of `Token` objects.

    `formula`: The str string to tokenize

    Tokenizer defines a method `._parse()` to parse the formula into tokens,
    which can then be accessed through the `.items` attribute.

    """

    # Matches the mantissa-plus-exponent-marker prefix of a number in
    # scientific notation (e.g. "1.5E"); used to decide whether a following
    # '+' or '-' belongs to the number rather than being an operator.
    # NOTE(review): this only accepts a leading digit of 1-9, so forms such
    # as "0.5E+10" are not recognized — preserved as-is, confirm upstream.
    SN_RE = re.compile("^[1-9](\\.[0-9]+)?[Ee]$")  # Scientific notation
    WSPACE_RE = re.compile(r"[ \n]+")
    STRING_REGEXES = {
        # Inside a string, all characters are treated as literals, except for
        # the quote character used to start the string. That character, when
        # doubled is treated as a single character in the string. If an
        # unmatched quote appears, the string is terminated.
        '"': re.compile('"(?:[^"]*"")*[^"]*"(?!")'),
        "'": re.compile("'(?:[^']*'')*[^']*'(?!')"),
    }
    ERROR_CODES = ("#NULL!", "#DIV/0!", "#VALUE!", "#REF!", "#NAME?",
                   "#NUM!", "#N/A", "#GETTING_DATA")
    TOKEN_ENDERS = ',;}) +-*/^&=><%'  # Each of these characters, marks the
                                      # end of an operand token

    def __init__(self, formula):
        self.formula = formula
        self.items = []          # Completed Token objects, in formula order
        self.token_stack = []    # Used to keep track of arrays, functions,
                                 # and parentheses
        self.offset = 0          # How many chars have we read
        self.token = []          # Used to build up token values char by char
        self._parse()

    def _parse(self):
        """Populate self.items with the tokens from the formula."""
        if self.offset:
            return  # Already parsed!
        if not self.formula:
            return
        elif self.formula[0] == '=':
            self.offset += 1
        else:
            # Anything not starting with '=' is a plain literal cell value.
            self.items.append(Token(self.formula, Token.LITERAL))
            return
        consumers = (
            ('"\'', self._parse_string),
            ('[', self._parse_brackets),
            ('#', self._parse_error),
            (' ', self._parse_whitespace),
            ('\n', self._parse_whitespace),
            ('+-*/^&=><%', self._parse_operator),
            ('{(', self._parse_opener),
            (')}', self._parse_closer),
            (';,', self._parse_separator),
        )
        dispatcher = {}  # maps chars to the specific parsing function
        for chars, consumer in consumers:
            dispatcher.update(dict.fromkeys(chars, consumer))
        while self.offset < len(self.formula):
            if self.check_scientific_notation():  # May consume one character
                continue
            curr_char = self.formula[self.offset]
            if curr_char in self.TOKEN_ENDERS:
                self.save_token()
            if curr_char in dispatcher:
                # Each consumer returns the number of characters it matched;
                # advancing offset is centralized here.
                self.offset += dispatcher[curr_char]()
            else:
                # TODO: this can probably be sped up using a regex to get to
                # the next interesting character
                self.token.append(curr_char)
                self.offset += 1
        self.save_token()

    def _parse_string(self):
        """
        Parse a "-delimited string or '-delimited link.

        The offset must be pointing to either a single quote ("'") or double
        quote ('"') character. The strings are parsed according to Excel
        rules where to escape the delimiter you just double it up. E.g.,
        "abc""def" in Excel is parsed as 'abc"def' in Python.

        Returns the number of characters matched. (Does not update
        self.offset)

        Raises TokenizerError if the string is unterminated.

        """
        self.assert_empty_token(can_follow=':')
        delim = self.formula[self.offset]
        assert delim in ('"', "'")
        regex = self.STRING_REGEXES[delim]
        match = regex.match(self.formula[self.offset:])
        if match is None:
            subtype = "string" if delim == '"' else 'link'
            raise TokenizerError(f"Reached end of formula while parsing {subtype} in {self.formula}")
        match = match.group(0)
        if delim == '"':
            # A quoted string is a complete operand on its own.
            self.items.append(Token.make_operand(match))
        else:
            # A '-quoted sheet/book link is only part of a larger reference;
            # keep accumulating it in the current token.
            self.token.append(match)
        return len(match)

    def _parse_brackets(self):
        """
        Consume all the text between square brackets [].

        Brackets may nest, so the text is scanned until the bracket depth
        returns to zero.

        Returns the number of characters matched. (Does not update
        self.offset)

        Raises TokenizerError when the opening '[' is never closed.

        """
        assert self.formula[self.offset] == '['
        lefts = [(t.start(), 1) for t in
                 re.finditer(r"\[", self.formula[self.offset:])]
        rights = [(t.start(), -1) for t in
                  re.finditer(r"\]", self.formula[self.offset:])]

        open_count = 0
        # Walk all brackets in positional order, tracking nesting depth.
        for idx, open_close in sorted(lefts + rights):
            open_count += open_close
            if open_count == 0:
                outer_right = idx + 1
                self.token.append(
                    self.formula[self.offset:self.offset + outer_right])
                return outer_right

        raise TokenizerError(f"Encountered unmatched '[' in {self.formula}")

    def _parse_error(self):
        """
        Consume the text following a '#' as an error.

        Looks for a match in self.ERROR_CODES and returns the number of
        characters matched. (Does not update self.offset)

        Raises TokenizerError if the text is not a recognized error code.

        """
        self.assert_empty_token(can_follow='!')
        assert self.formula[self.offset] == '#'
        subformula = self.formula[self.offset:]
        for err in self.ERROR_CODES:
            if subformula.startswith(err):
                # Prepend any pending token text (e.g. a 'Sheet1!' prefix
                # allowed by can_follow='!') to the error operand.
                self.items.append(Token.make_operand(''.join(self.token) + err))
                del self.token[:]
                return len(err)
        raise TokenizerError(f"Invalid error code at position {self.offset} in '{self.formula}'")

    def _parse_whitespace(self):
        """
        Consume a string of consecutive spaces.

        Returns the number of spaces found. (Does not update self.offset).

        """
        assert self.formula[self.offset] in (' ', '\n')
        # A single-character token represents the whole run of whitespace.
        self.items.append(Token(self.formula[self.offset], Token.WSPACE))
        return self.WSPACE_RE.match(self.formula[self.offset:]).end()

    def _parse_operator(self):
        """
        Consume the characters constituting an operator.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        # Two-character comparison operators take precedence.
        if self.formula[self.offset:self.offset + 2] in ('>=', '<=', '<>'):
            self.items.append(Token(
                self.formula[self.offset:self.offset + 2],
                Token.OP_IN
            ))
            return 2
        curr_char = self.formula[self.offset]  # guaranteed to be 1 char
        assert curr_char in '%*/^&=><+-'
        if curr_char == '%':
            token = Token('%', Token.OP_POST)
        elif curr_char in "*/^&=><":
            token = Token(curr_char, Token.OP_IN)
        # From here on, curr_char is guaranteed to be in '+-'
        elif not self.items:
            token = Token(curr_char, Token.OP_PRE)
        else:
            # '+'/'-' is infix only when it follows something that can end
            # an expression: a closer, a postfix operator, or an operand.
            prev = next((i for i in reversed(self.items)
                         if i.type != Token.WSPACE), None)
            is_infix = prev and (
                prev.subtype == Token.CLOSE
                or prev.type == Token.OP_POST
                or prev.type == Token.OPERAND
            )
            if is_infix:
                token = Token(curr_char, Token.OP_IN)
            else:
                token = Token(curr_char, Token.OP_PRE)
        self.items.append(token)
        return 1

    def _parse_opener(self):
        """
        Consumes a ( or { character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        assert self.formula[self.offset] in ('(', '{')
        if self.formula[self.offset] == '{':
            self.assert_empty_token()
            token = Token.make_subexp("{")
        elif self.token:
            # Pending token text followed by '(' means a function call;
            # the function name becomes part of the opener's value.
            token_value = "".join(self.token) + '('
            del self.token[:]
            token = Token.make_subexp(token_value)
        else:
            token = Token.make_subexp("(")
        self.items.append(token)
        self.token_stack.append(token)
        return 1

    def _parse_closer(self):
        """
        Consumes a } or ) character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        Raises TokenizerError when the closer has no matching opener or
        does not match the type of the most recent opener.

        """
        assert self.formula[self.offset] in (')', '}')
        if not self.token_stack:
            # Previously this fell through to list.pop() and leaked an
            # IndexError; report malformed input consistently instead.
            raise TokenizerError(
                "No matching opener for '%s' in '%s'"
                % (self.formula[self.offset], self.formula))
        token = self.token_stack.pop().get_closer()
        if token.value != self.formula[self.offset]:
            raise TokenizerError(
                "Mismatched ( and { pair in '%s'" % self.formula)
        self.items.append(token)
        return 1

    def _parse_separator(self):
        """
        Consumes a ; or , character.

        Returns the number of characters consumed. (Does not update
        self.offset)

        """
        curr_char = self.formula[self.offset]
        assert curr_char in (';', ',')
        if curr_char == ';':
            token = Token.make_separator(";")
        else:
            # ',' is an argument separator inside a function or array, but
            # the range-union operator inside plain parentheses or at the
            # top level.
            try:
                top_type = self.token_stack[-1].type
            except IndexError:
                token = Token(",", Token.OP_IN)  # Range Union operator
            else:
                if top_type == Token.PAREN:
                    token = Token(",", Token.OP_IN)  # Range Union operator
                else:
                    token = Token.make_separator(",")
        self.items.append(token)
        return 1

    def check_scientific_notation(self):
        """
        Consumes a + or - character if part of a number in sci. notation.

        Returns True if the character was consumed and self.offset was
        updated, False otherwise.

        """
        curr_char = self.formula[self.offset]
        if (curr_char in '+-'
                and len(self.token) >= 1
                and self.SN_RE.match("".join(self.token))):
            self.token.append(curr_char)
            self.offset += 1
            return True
        return False

    def assert_empty_token(self, can_follow=()):
        """
        Ensure that there's no token currently being parsed.

        Or if there is a token being parsed, it must end with a character in
        can_follow.

        If there are unconsumed token contents, it means we hit an unexpected
        token transition. In this case, we raise a TokenizerError

        """
        if self.token and self.token[-1] not in can_follow:
            raise TokenizerError(f"Unexpected character at position {self.offset} in '{self.formula}'")

    def save_token(self):
        """If there's a token being parsed, add it to the item list."""
        if self.token:
            self.items.append(Token.make_operand("".join(self.token)))
            del self.token[:]

    def render(self):
        """Convert the parsed tokens back to a string."""
        if not self.items:
            return ""
        elif self.items[0].type == Token.LITERAL:
            return self.items[0].value
        return "=" + "".join(token.value for token in self.items)
class Token(object):

    """
    A token in an Excel formula.

    Tokens have three attributes:

    * `value`: The string value parsed that led to this token
    * `type`: A string identifying the type of token
    * `subtype`: A string identifying subtype of the token (optional, and
      defaults to "")

    """

    __slots__ = ['value', 'type', 'subtype']

    LITERAL = "LITERAL"
    OPERAND = "OPERAND"
    FUNC = "FUNC"
    ARRAY = "ARRAY"
    PAREN = "PAREN"
    SEP = "SEP"
    OP_PRE = "OPERATOR-PREFIX"
    OP_IN = "OPERATOR-INFIX"
    OP_POST = "OPERATOR-POSTFIX"
    WSPACE = "WHITE-SPACE"

    def __init__(self, value, type_, subtype=""):
        self.value = value
        self.type = type_
        self.subtype = subtype

    # Literal operands:
    #
    # Every literal operand has type 'OPERAND'. Its subtype is one of
    # 'TEXT' (quoted strings), 'NUMBER' (anything float() accepts),
    # 'LOGICAL' (TRUE/FALSE), 'ERROR' (literal error values), or 'RANGE'
    # (everything else, i.e. range references).

    TEXT = 'TEXT'
    NUMBER = 'NUMBER'
    LOGICAL = 'LOGICAL'
    ERROR = 'ERROR'
    RANGE = 'RANGE'

    def __repr__(self):
        return f"{self.type} {self.subtype} {self.value}:"

    @classmethod
    def make_operand(cls, value):
        """Create an operand token, classifying `value` into a subtype."""
        if value.startswith('"'):
            return cls(value, cls.OPERAND, cls.TEXT)
        if value.startswith('#'):
            return cls(value, cls.OPERAND, cls.ERROR)
        if value in ('TRUE', 'FALSE'):
            return cls(value, cls.OPERAND, cls.LOGICAL)
        try:
            float(value)
        except ValueError:
            # Not parseable as a number -> treat as a range reference.
            return cls(value, cls.OPERAND, cls.RANGE)
        return cls(value, cls.OPERAND, cls.NUMBER)

    # Subexpresssions
    #
    # There are 3 types of `Subexpressions`: functions, array literals, and
    # parentheticals. Subexpressions have 'OPEN' and 'CLOSE' tokens. 'OPEN'
    # is used when parsing the initial expression token (i.e., '(' or '{')
    # and 'CLOSE' is used when parsing the closing expression token ('}' or
    # ')').

    OPEN = "OPEN"
    CLOSE = "CLOSE"

    @classmethod
    def make_subexp(cls, value, func=False):
        """
        Create a subexpression token.

        `value`: The value of the token
        `func`: If True, force the token to be of type FUNC

        """
        assert value[-1] in ('{', '}', '(', ')')
        if func:
            # A function token is either 'NAME(' or a bare ')'.
            assert re.match('.+\\(|\\)', value)
            kind = Token.FUNC
        else:
            kind = (Token.ARRAY if value in '{}'
                    else Token.PAREN if value in '()'
                    else Token.FUNC)
        return cls(value, kind, cls.CLOSE if value in ')}' else cls.OPEN)

    def get_closer(self):
        """Return a closing token that matches this token's type."""
        assert self.type in (self.FUNC, self.ARRAY, self.PAREN)
        assert self.subtype == self.OPEN
        closer = ")" if self.type != self.ARRAY else "}"
        return self.make_subexp(closer, func=self.type == self.FUNC)

    # Separator tokens
    #
    # Argument separators always have type 'SEP' and can have one of two
    # subtypes: 'ARG', 'ROW'. 'ARG' is used for the ',' token, when used to
    # delimit either function arguments or array elements. 'ROW' is used for
    # the ';' token, which is always used to delimit rows in an array
    # literal.

    ARG = "ARG"
    ROW = "ROW"

    @classmethod
    def make_separator(cls, value):
        """Create a separator token"""
        assert value in (',', ';')
        if value == ',':
            return cls(value, cls.SEP, cls.ARG)
        return cls(value, cls.SEP, cls.ROW)