Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/jinja2/lexer.py: 28%

338 statements  

coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1"""Implements a Jinja / Python combination lexer. The ``Lexer`` class 

2is used to do some preprocessing. It filters out invalid operators like 

3the bitshift operators we don't allow in templates. It separates 

4template code and python code in expressions. 

5""" 

6import re 

7import typing as t 

8from ast import literal_eval 

9from collections import deque 

10from sys import intern 

11 

12from ._identifier import pattern as name_re 

13from .exceptions import TemplateSyntaxError 

14from .utils import LRUCache 

15 

16if t.TYPE_CHECKING:  # coverage: 16 ↛ 17, the condition on line 16 was never true

17 import typing_extensions as te 

18 from .environment import Environment 

19 

20# cache for the lexers. Exists so that multiple environments can

21# share the same lexer

22_lexer_cache: t.MutableMapping[t.Tuple, "Lexer"] = LRUCache(50) # type: ignore 

23 

24# static regular expressions 

25whitespace_re = re.compile(r"\s+") 

26newline_re = re.compile(r"(\r\n|\r|\n)") 

27string_re = re.compile( 

28 r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S 

29) 

30integer_re = re.compile( 

31 r""" 

32 ( 

33 0b(_?[0-1])+ # binary 

34 | 

35 0o(_?[0-7])+ # octal 

36 | 

37 0x(_?[\da-f])+ # hex 

38 | 

39 [1-9](_?\d)* # decimal 

40 | 

41 0(_?0)* # decimal zero 

42 ) 

43 """, 

44 re.IGNORECASE | re.VERBOSE, 

45) 

46float_re = re.compile( 

47 r""" 

48 (?<!\.) # doesn't start with a . 

49 (\d+_)*\d+ # digits, possibly _ separated 

50 ( 

51 (\.(\d+_)*\d+)? # optional fractional part 

52 e[+\-]?(\d+_)*\d+ # exponent part 

53 | 

54 \.(\d+_)*\d+ # required fractional part 

55 ) 

56 """, 

57 re.IGNORECASE | re.VERBOSE, 

58) 

59 
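# A minimal doctest-style sketch (not in the original source) of how the two
# number-literal patterns above divide the work:
# >>> bool(integer_re.fullmatch("0b1010"))
# True
# >>> bool(float_re.fullmatch("2.5e-3"))
# True
# >>> bool(float_re.fullmatch("42"))   # plain integers are left to integer_re
# False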

60# intern the tokens and keep references to them

61TOKEN_ADD = intern("add") 

62TOKEN_ASSIGN = intern("assign") 

63TOKEN_COLON = intern("colon") 

64TOKEN_COMMA = intern("comma") 

65TOKEN_DIV = intern("div") 

66TOKEN_DOT = intern("dot") 

67TOKEN_EQ = intern("eq") 

68TOKEN_FLOORDIV = intern("floordiv") 

69TOKEN_GT = intern("gt") 

70TOKEN_GTEQ = intern("gteq") 

71TOKEN_LBRACE = intern("lbrace") 

72TOKEN_LBRACKET = intern("lbracket") 

73TOKEN_LPAREN = intern("lparen") 

74TOKEN_LT = intern("lt") 

75TOKEN_LTEQ = intern("lteq") 

76TOKEN_MOD = intern("mod") 

77TOKEN_MUL = intern("mul") 

78TOKEN_NE = intern("ne") 

79TOKEN_PIPE = intern("pipe") 

80TOKEN_POW = intern("pow") 

81TOKEN_RBRACE = intern("rbrace") 

82TOKEN_RBRACKET = intern("rbracket") 

83TOKEN_RPAREN = intern("rparen") 

84TOKEN_SEMICOLON = intern("semicolon") 

85TOKEN_SUB = intern("sub") 

86TOKEN_TILDE = intern("tilde") 

87TOKEN_WHITESPACE = intern("whitespace") 

88TOKEN_FLOAT = intern("float") 

89TOKEN_INTEGER = intern("integer") 

90TOKEN_NAME = intern("name") 

91TOKEN_STRING = intern("string") 

92TOKEN_OPERATOR = intern("operator") 

93TOKEN_BLOCK_BEGIN = intern("block_begin") 

94TOKEN_BLOCK_END = intern("block_end") 

95TOKEN_VARIABLE_BEGIN = intern("variable_begin") 

96TOKEN_VARIABLE_END = intern("variable_end") 

97TOKEN_RAW_BEGIN = intern("raw_begin") 

98TOKEN_RAW_END = intern("raw_end") 

99TOKEN_COMMENT_BEGIN = intern("comment_begin") 

100TOKEN_COMMENT_END = intern("comment_end") 

101TOKEN_COMMENT = intern("comment") 

102TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin") 

103TOKEN_LINESTATEMENT_END = intern("linestatement_end") 

104TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin") 

105TOKEN_LINECOMMENT_END = intern("linecomment_end") 

106TOKEN_LINECOMMENT = intern("linecomment") 

107TOKEN_DATA = intern("data") 

108TOKEN_INITIAL = intern("initial") 

109TOKEN_EOF = intern("eof") 

110 

111# bind operators to token types 

112operators = { 

113 "+": TOKEN_ADD, 

114 "-": TOKEN_SUB, 

115 "/": TOKEN_DIV, 

116 "//": TOKEN_FLOORDIV, 

117 "*": TOKEN_MUL, 

118 "%": TOKEN_MOD, 

119 "**": TOKEN_POW, 

120 "~": TOKEN_TILDE, 

121 "[": TOKEN_LBRACKET, 

122 "]": TOKEN_RBRACKET, 

123 "(": TOKEN_LPAREN, 

124 ")": TOKEN_RPAREN, 

125 "{": TOKEN_LBRACE, 

126 "}": TOKEN_RBRACE, 

127 "==": TOKEN_EQ, 

128 "!=": TOKEN_NE, 

129 ">": TOKEN_GT, 

130 ">=": TOKEN_GTEQ, 

131 "<": TOKEN_LT, 

132 "<=": TOKEN_LTEQ, 

133 "=": TOKEN_ASSIGN, 

134 ".": TOKEN_DOT, 

135 ":": TOKEN_COLON, 

136 "|": TOKEN_PIPE, 

137 ",": TOKEN_COMMA, 

138 ";": TOKEN_SEMICOLON, 

139} 

140 

141reverse_operators = {v: k for k, v in operators.items()} 

142assert len(operators) == len(reverse_operators), "operators dropped" 

143operator_re = re.compile( 

144 f"({'|'.join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x)))})" 

145) 

146 
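# A small doctest-style sketch: because the alternation above is built from the
# longest operators first, two-character operators win over their one-character
# prefixes.
# >>> operator_re.match("//").group()
# '//'
# >>> operators["//"]
# 'floordiv'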

147ignored_tokens = frozenset( 

148 [ 

149 TOKEN_COMMENT_BEGIN, 

150 TOKEN_COMMENT, 

151 TOKEN_COMMENT_END, 

152 TOKEN_WHITESPACE, 

153 TOKEN_LINECOMMENT_BEGIN, 

154 TOKEN_LINECOMMENT_END, 

155 TOKEN_LINECOMMENT, 

156 ] 

157) 

158ignore_if_empty = frozenset( 

159 [TOKEN_WHITESPACE, TOKEN_DATA, TOKEN_COMMENT, TOKEN_LINECOMMENT] 

160) 

161 

162 

163def _describe_token_type(token_type: str) -> str: 

164 if token_type in reverse_operators: 

165 return reverse_operators[token_type] 

166 

167 return { 

168 TOKEN_COMMENT_BEGIN: "begin of comment", 

169 TOKEN_COMMENT_END: "end of comment", 

170 TOKEN_COMMENT: "comment", 

171 TOKEN_LINECOMMENT: "comment", 

172 TOKEN_BLOCK_BEGIN: "begin of statement block", 

173 TOKEN_BLOCK_END: "end of statement block", 

174 TOKEN_VARIABLE_BEGIN: "begin of print statement", 

175 TOKEN_VARIABLE_END: "end of print statement", 

176 TOKEN_LINESTATEMENT_BEGIN: "begin of line statement", 

177 TOKEN_LINESTATEMENT_END: "end of line statement", 

178 TOKEN_DATA: "template data / text", 

179 TOKEN_EOF: "end of template", 

180 }.get(token_type, token_type) 

181 

182 

183def describe_token(token: "Token") -> str: 

184 """Returns a description of the token.""" 

185 if token.type == TOKEN_NAME: 

186 return token.value 

187 

188 return _describe_token_type(token.type) 

189 

190 

191def describe_token_expr(expr: str) -> str: 

192 """Like `describe_token` but for token expressions.""" 

193 if ":" in expr: 

194 type, value = expr.split(":", 1) 

195 

196 if type == TOKEN_NAME: 

197 return value 

198 else: 

199 type = expr 

200 

201 return _describe_token_type(type) 

202 
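# Doctest-style sketch of the two describe helpers above:
# >>> describe_token_expr("name:endfor")
# 'endfor'
# >>> describe_token_expr("eq")
# '=='
# >>> describe_token_expr("block_end")
# 'end of statement block'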

203 

204def count_newlines(value: str) -> int: 

205 """Count the number of newline characters in the string. This is 

206 useful for extensions that filter a stream. 

207 """ 

208 return len(newline_re.findall(value)) 

209 
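# A one-line sketch: all three line-break styles recognized by newline_re count.
# >>> count_newlines("a\r\nb\nc\rd")
# 3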

210 

211def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]: 

212 """Compiles all the rules from the environment into a list of rules.""" 

213 e = re.escape 

214 rules = [ 

215 ( 

216 len(environment.comment_start_string), 

217 TOKEN_COMMENT_BEGIN, 

218 e(environment.comment_start_string), 

219 ), 

220 ( 

221 len(environment.block_start_string), 

222 TOKEN_BLOCK_BEGIN, 

223 e(environment.block_start_string), 

224 ), 

225 ( 

226 len(environment.variable_start_string), 

227 TOKEN_VARIABLE_BEGIN, 

228 e(environment.variable_start_string), 

229 ), 

230 ] 

231 

232 if environment.line_statement_prefix is not None: 

233 rules.append( 

234 ( 

235 len(environment.line_statement_prefix), 

236 TOKEN_LINESTATEMENT_BEGIN, 

237 r"^[ \t\v]*" + e(environment.line_statement_prefix), 

238 ) 

239 ) 

240 if environment.line_comment_prefix is not None: 

241 rules.append( 

242 ( 

243 len(environment.line_comment_prefix), 

244 TOKEN_LINECOMMENT_BEGIN, 

245 r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix), 

246 ) 

247 ) 

248 

249 return [x[1:] for x in sorted(rules, reverse=True)] 

250 
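# A rough sketch of the result for a default environment: the rules come back
# ordered longest delimiter first (ties broken by token name, descending), so
# roughly [("variable_begin", re.escape("{{")),
#          ("comment_begin", re.escape("{#")),
#          ("block_begin", re.escape("{%"))].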

251 

252class Failure: 

253 """Class that raises a `TemplateSyntaxError` if called. 

254 Used by the `Lexer` to specify known errors. 

255 """ 

256 

257 def __init__( 

258 self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError 

259 ) -> None: 

260 self.message = message 

261 self.error_class = cls 

262 

263 def __call__(self, lineno: int, filename: str) -> "te.NoReturn": 

264 raise self.error_class(self.message, lineno, filename) 

265 
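# Illustrative sketch: a Failure instance sits inside a lexer rule and only
# raises when the lexer actually reaches the bad input. For example,
# Failure("Missing end of comment tag")(3, "page.html") raises
# TemplateSyntaxError("Missing end of comment tag", 3, "page.html").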

266 

267class Token(t.NamedTuple): 

268 lineno: int 

269 type: str 

270 value: str 

271 

272 def __str__(self) -> str: 

273 return describe_token(self) 

274 

275 def test(self, expr: str) -> bool: 

276 """Test a token against a token expression. This can either be a 

277 token type or ``'token_type:token_value'``. This can only test 

278 against string values and types. 

279 """ 

280 # here we do a regular string equality check as test_any is usually 

281 # passed an iterable of non-interned strings.

282 if self.type == expr: 

283 return True 

284 

285 if ":" in expr: 

286 return expr.split(":", 1) == [self.type, self.value] 

287 

288 return False 

289 

290 def test_any(self, *iterable: str) -> bool: 

291 """Test against multiple token expressions.""" 

292 return any(self.test(expr) for expr in iterable) 

293 
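# Doctest-style sketch of the Token API above:
# >>> tok = Token(1, TOKEN_NAME, "endfor")
# >>> str(tok)
# 'endfor'
# >>> tok.test("name"), tok.test("name:endfor"), tok.test("integer")
# (True, True, False)
# >>> tok.test_any("integer", "name:endfor")
# True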

294 

295class TokenStreamIterator: 

296 """The iterator for tokenstreams. Iterate over the stream 

297 until the eof token is reached. 

298 """ 

299 

300 def __init__(self, stream: "TokenStream") -> None: 

301 self.stream = stream 

302 

303 def __iter__(self) -> "TokenStreamIterator": 

304 return self 

305 

306 def __next__(self) -> Token: 

307 token = self.stream.current 

308 

309 if token.type is TOKEN_EOF: 

310 self.stream.close() 

311 raise StopIteration 

312 

313 next(self.stream) 

314 return token 

315 

316 

317class TokenStream: 

318 """A token stream is an iterable that yields :class:`Token`\\s. The 

319 parser however does not iterate over it but calls :meth:`next` to go 

320 one token ahead. The current active token is stored as :attr:`current`. 

321 """ 

322 

323 def __init__( 

324 self, 

325 generator: t.Iterable[Token], 

326 name: t.Optional[str], 

327 filename: t.Optional[str], 

328 ): 

329 self._iter = iter(generator) 

330 self._pushed: "te.Deque[Token]" = deque() 

331 self.name = name 

332 self.filename = filename 

333 self.closed = False 

334 self.current = Token(1, TOKEN_INITIAL, "") 

335 next(self) 

336 

337 def __iter__(self) -> TokenStreamIterator: 

338 return TokenStreamIterator(self) 

339 

340 def __bool__(self) -> bool: 

341 return bool(self._pushed) or self.current.type is not TOKEN_EOF 

342 

343 @property 

344 def eos(self) -> bool: 

345 """Are we at the end of the stream?""" 

346 return not self 

347 

348 def push(self, token: Token) -> None: 

349 """Push a token back to the stream.""" 

350 self._pushed.append(token) 

351 

352 def look(self) -> Token: 

353 """Look at the next token.""" 

354 old_token = next(self) 

355 result = self.current 

356 self.push(result) 

357 self.current = old_token 

358 return result 

359 

360 def skip(self, n: int = 1) -> None: 

361 """Got n tokens ahead.""" 

362 for _ in range(n): 

363 next(self) 

364 

365 def next_if(self, expr: str) -> t.Optional[Token]: 

366 """Perform the token test and return the token if it matched. 

367 Otherwise the return value is `None`. 

368 """ 

369 if self.current.test(expr): 

370 return next(self) 

371 

372 return None 

373 

374 def skip_if(self, expr: str) -> bool: 

375 """Like :meth:`next_if` but only returns `True` or `False`.""" 

376 return self.next_if(expr) is not None 

377 

378 def __next__(self) -> Token: 

379 """Go one token ahead and return the old one. 

380 

381 Use the built-in :func:`next` instead of calling this directly. 

382 """ 

383 rv = self.current 

384 

385 if self._pushed: 

386 self.current = self._pushed.popleft() 

387 elif self.current.type is not TOKEN_EOF: 

388 try: 

389 self.current = next(self._iter) 

390 except StopIteration: 

391 self.close() 

392 

393 return rv 

394 

395 def close(self) -> None: 

396 """Close the stream.""" 

397 self.current = Token(self.current.lineno, TOKEN_EOF, "") 

398 self._iter = iter(()) 

399 self.closed = True 

400 

401 def expect(self, expr: str) -> Token: 

402 """Expect a given token type and return it. This accepts the same 

403 argument as :meth:`jinja2.lexer.Token.test`. 

404 """ 

405 if not self.current.test(expr): 

406 expr = describe_token_expr(expr) 

407 

408 if self.current.type is TOKEN_EOF: 

409 raise TemplateSyntaxError( 

410 f"unexpected end of template, expected {expr!r}.", 

411 self.current.lineno, 

412 self.name, 

413 self.filename, 

414 ) 

415 

416 raise TemplateSyntaxError( 

417 f"expected token {expr!r}, got {describe_token(self.current)!r}", 

418 self.current.lineno, 

419 self.name, 

420 self.filename, 

421 ) 

422 

423 return next(self) 

424 
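# Doctest-style sketch of the stream API on a hand-built token sequence
# (real streams come out of Lexer.wrap):
# >>> ts = TokenStream([Token(1, TOKEN_NAME, "users"),
# ...                   Token(1, TOKEN_PIPE, "|")], None, None)
# >>> ts.current.value
# 'users'
# >>> ts.look().type           # peek; current stays on the name token
# 'pipe'
# >>> ts.expect("name").value  # consume the name, advance to the pipe
# 'users'
# >>> ts.skip_if("pipe"), ts.eos
# (True, True)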

425 

426def get_lexer(environment: "Environment") -> "Lexer": 

427 """Return a lexer which is probably cached.""" 

428 key = ( 

429 environment.block_start_string, 

430 environment.block_end_string, 

431 environment.variable_start_string, 

432 environment.variable_end_string, 

433 environment.comment_start_string, 

434 environment.comment_end_string, 

435 environment.line_statement_prefix, 

436 environment.line_comment_prefix, 

437 environment.trim_blocks, 

438 environment.lstrip_blocks, 

439 environment.newline_sequence, 

440 environment.keep_trailing_newline, 

441 ) 

442 lexer = _lexer_cache.get(key) 

443 

444 if lexer is None: 

445 _lexer_cache[key] = lexer = Lexer(environment) 

446 

447 return lexer 

448 
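# Sketch (assuming a stock jinja2.Environment with default settings): the cache
# key is built only from delimiter/whitespace options, so identically configured
# environments share one Lexer instance.
# >>> from jinja2 import Environment
# >>> get_lexer(Environment()) is get_lexer(Environment())
# True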

449 

450class OptionalLStrip(tuple): 

451 """A special tuple for marking a point in the state that can have 

452 lstrip applied. 

453 """ 

454 

455 __slots__ = () 

456 

457 # Even though it looks like a no-op, creating instances fails 

458 # without this. 

459 def __new__(cls, *members, **kwargs): # type: ignore 

460 return super().__new__(cls, members) 

461 

462 

463class _Rule(t.NamedTuple): 

464 pattern: t.Pattern[str] 

465 tokens: t.Union[str, t.Tuple[str, ...], t.Tuple[Failure]] 

466 command: t.Optional[str] 

467 

468 

469class Lexer: 

470 """Class that implements a lexer for a given environment. Automatically 

471 created by the environment class, usually you don't have to do that. 

472 

473 Note that the lexer is not automatically bound to an environment. 

474 Multiple environments can share the same lexer. 

475 """ 

476 

477 def __init__(self, environment: "Environment") -> None: 

478 # shortcuts 

479 e = re.escape 

480 

481 def c(x: str) -> t.Pattern[str]: 

482 return re.compile(x, re.M | re.S) 

483 

484 # lexing rules for tags 

485 tag_rules: t.List[_Rule] = [ 

486 _Rule(whitespace_re, TOKEN_WHITESPACE, None), 

487 _Rule(float_re, TOKEN_FLOAT, None), 

488 _Rule(integer_re, TOKEN_INTEGER, None), 

489 _Rule(name_re, TOKEN_NAME, None), 

490 _Rule(string_re, TOKEN_STRING, None), 

491 _Rule(operator_re, TOKEN_OPERATOR, None), 

492 ] 

493 

494 # assemble the root lexing rule. because the "|" alternation tries its

495 # alternatives from left to right, we have to sort by length so that the

496 # lexer continues working as expected when we have parsing rules like

497 # <% for blocks and <%= for variables (if someone wants ASP-like syntax).

498 # variables are just part of the rules if variable processing 

499 # is required. 

500 root_tag_rules = compile_rules(environment) 

501 

502 block_start_re = e(environment.block_start_string) 

503 block_end_re = e(environment.block_end_string) 

504 comment_end_re = e(environment.comment_end_string) 

505 variable_end_re = e(environment.variable_end_string) 

506 

507 # block suffix if trimming is enabled 

508 block_suffix_re = "\\n?" if environment.trim_blocks else "" 

509 

510 self.lstrip_blocks = environment.lstrip_blocks 

511 

512 self.newline_sequence = environment.newline_sequence 

513 self.keep_trailing_newline = environment.keep_trailing_newline 

514 

515 root_raw_re = ( 

516 rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*" 

517 rf"(?:\-{block_end_re}\s*|{block_end_re}))" 

518 ) 

519 root_parts_re = "|".join( 

520 [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules] 

521 ) 

522 

523 # global lexing rules 

524 self.rules: t.Dict[str, t.List[_Rule]] = { 

525 "root": [ 

526 # directives 

527 _Rule( 

528 c(rf"(.*?)(?:{root_parts_re})"), 

529 OptionalLStrip(TOKEN_DATA, "#bygroup"), # type: ignore 

530 "#bygroup", 

531 ), 

532 # data 

533 _Rule(c(".+"), TOKEN_DATA, None), 

534 ], 

535 # comments 

536 TOKEN_COMMENT_BEGIN: [ 

537 _Rule( 

538 c( 

539 rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*" 

540 rf"|{comment_end_re}{block_suffix_re}))" 

541 ), 

542 (TOKEN_COMMENT, TOKEN_COMMENT_END), 

543 "#pop", 

544 ), 

545 _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None), 

546 ], 

547 # blocks 

548 TOKEN_BLOCK_BEGIN: [ 

549 _Rule( 

550 c( 

551 rf"(?:\+{block_end_re}|\-{block_end_re}\s*" 

552 rf"|{block_end_re}{block_suffix_re})" 

553 ), 

554 TOKEN_BLOCK_END, 

555 "#pop", 

556 ), 

557 ] 

558 + tag_rules, 

559 # variables 

560 TOKEN_VARIABLE_BEGIN: [ 

561 _Rule( 

562 c(rf"\-{variable_end_re}\s*|{variable_end_re}"), 

563 TOKEN_VARIABLE_END, 

564 "#pop", 

565 ) 

566 ] 

567 + tag_rules, 

568 # raw block 

569 TOKEN_RAW_BEGIN: [ 

570 _Rule( 

571 c( 

572 rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*" 

573 rf"(?:\+{block_end_re}|\-{block_end_re}\s*" 

574 rf"|{block_end_re}{block_suffix_re}))" 

575 ), 

576 OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END), # type: ignore 

577 "#pop", 

578 ), 

579 _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None), 

580 ], 

581 # line statements 

582 TOKEN_LINESTATEMENT_BEGIN: [ 

583 _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop") 

584 ] 

585 + tag_rules, 

586 # line comments 

587 TOKEN_LINECOMMENT_BEGIN: [ 

588 _Rule( 

589 c(r"(.*?)()(?=\n|$)"), 

590 (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END), 

591 "#pop", 

592 ) 

593 ], 

594 } 

595 

596 def _normalize_newlines(self, value: str) -> str: 

597 """Replace all newlines with the configured sequence in strings 

598 and template data. 

599 """ 

600 return newline_re.sub(self.newline_sequence, value) 

601 

602 def tokenize( 

603 self, 

604 source: str, 

605 name: t.Optional[str] = None, 

606 filename: t.Optional[str] = None, 

607 state: t.Optional[str] = None, 

608 ) -> TokenStream: 

609 """Calls tokeniter + tokenize and wraps it in a token stream.""" 

610 stream = self.tokeniter(source, name, filename, state) 

611 return TokenStream(self.wrap(stream, name, filename), name, filename) 

612 

613 def wrap( 

614 self, 

615 stream: t.Iterable[t.Tuple[int, str, str]], 

616 name: t.Optional[str] = None, 

617 filename: t.Optional[str] = None, 

618 ) -> t.Iterator[Token]: 

619 """This is called with the stream as returned by `tokenize` and wraps 

620 every token in a :class:`Token` and converts the value. 

621 """ 

622 for lineno, token, value_str in stream: 

623 if token in ignored_tokens: 

624 continue 

625 

626 value: t.Any = value_str 

627 

628 if token == TOKEN_LINESTATEMENT_BEGIN: 

629 token = TOKEN_BLOCK_BEGIN 

630 elif token == TOKEN_LINESTATEMENT_END: 

631 token = TOKEN_BLOCK_END 

632 # we are not interested in those tokens in the parser 

633 elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END): 

634 continue 

635 elif token == TOKEN_DATA: 

636 value = self._normalize_newlines(value_str) 

637 elif token == "keyword": 

638 token = value_str 

639 elif token == TOKEN_NAME: 

640 value = value_str 

641 

642 if not value.isidentifier(): 

643 raise TemplateSyntaxError( 

644 "Invalid character in identifier", lineno, name, filename 

645 ) 

646 elif token == TOKEN_STRING: 

647 # try to unescape string 

648 try: 

649 value = ( 

650 self._normalize_newlines(value_str[1:-1]) 

651 .encode("ascii", "backslashreplace") 

652 .decode("unicode-escape") 

653 ) 

654 except Exception as e: 

655 msg = str(e).split(":")[-1].strip() 

656 raise TemplateSyntaxError(msg, lineno, name, filename) from e 

657 elif token == TOKEN_INTEGER: 

658 value = int(value_str.replace("_", ""), 0) 

659 elif token == TOKEN_FLOAT: 

660 # remove all "_" first to support more Python versions 

661 value = literal_eval(value_str.replace("_", "")) 

662 elif token == TOKEN_OPERATOR: 

663 token = operators[value_str] 

664 

665 yield Token(lineno, token, value) 

666 

667 def tokeniter( 

668 self, 

669 source: str, 

670 name: t.Optional[str], 

671 filename: t.Optional[str] = None, 

672 state: t.Optional[str] = None, 

673 ) -> t.Iterator[t.Tuple[int, str, str]]: 

674 """This method tokenizes the text and returns the tokens in a 

675 generator. Use this method if you just want to tokenize a template. 

676 

677 .. versionchanged:: 3.0 

678 Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line 

679 breaks. 

680 """ 

681 lines = newline_re.split(source)[::2] 

682 

683 if not self.keep_trailing_newline and lines[-1] == "": 

684 del lines[-1] 

685 

686 source = "\n".join(lines) 

687 pos = 0 

688 lineno = 1 

689 stack = ["root"] 

690 

691 if state is not None and state != "root": 

692 assert state in ("variable", "block"), "invalid state" 

693 stack.append(state + "_begin") 

694 

695 statetokens = self.rules[stack[-1]] 

696 source_length = len(source) 

697 balancing_stack: t.List[str] = [] 

698 newlines_stripped = 0 

699 line_starting = True 

700 

701 while True: 

702 # tokenizer loop 

703 for regex, tokens, new_state in statetokens: 

704 m = regex.match(source, pos) 

705 

706 # if no match we try again with the next rule 

707 if m is None: 

708 continue 

709 

710 # we only match blocks and variables if braces / parentheses 

711 # are balanced. continue parsing with the lower rule which 

712 # is the operator rule. do this only if the end tags look 

713 # like operators 

714 if balancing_stack and tokens in ( 

715 TOKEN_VARIABLE_END, 

716 TOKEN_BLOCK_END, 

717 TOKEN_LINESTATEMENT_END, 

718 ): 

719 continue 

720 

721 # tuples support more options 

722 if isinstance(tokens, tuple): 

723 groups: t.Sequence[str] = m.groups() 

724 

725 if isinstance(tokens, OptionalLStrip): 

726 # Rule supports lstrip. Match will look like 

727 # text, block type, whitespace control, type, control, ... 

728 text = groups[0] 

729 # Skipping the text and first type, every other group is the 

730 # whitespace control for each type. One of the groups will be 

731 # -, +, or empty string instead of None. 

732 strip_sign = next(g for g in groups[2::2] if g is not None) 

733 

734 if strip_sign == "-": 

735 # Strip all whitespace between the text and the tag. 

736 stripped = text.rstrip() 

737 newlines_stripped = text[len(stripped) :].count("\n") 

738 groups = [stripped, *groups[1:]] 

739 elif ( 

740 # Not marked for preserving whitespace. 

741 strip_sign != "+" 

742 # lstrip is enabled. 

743 and self.lstrip_blocks 

744 # Not a variable expression. 

745 and not m.groupdict().get(TOKEN_VARIABLE_BEGIN) 

746 ): 

747 # The start of text between the last newline and the tag. 

748 l_pos = text.rfind("\n") + 1 

749 

750 if l_pos > 0 or line_starting: 

751 # If there's only whitespace between the newline and the 

752 # tag, strip it. 

753 if whitespace_re.fullmatch(text, l_pos): 

754 groups = [text[:l_pos], *groups[1:]] 

755 

756 for idx, token in enumerate(tokens): 

757 # failure group 

758 if token.__class__ is Failure: 

759 raise token(lineno, filename) 

760 # bygroup is a bit more complex, in that case we 

761 # yield for the current token the first named 

762 # group that matched 

763 elif token == "#bygroup": 

764 for key, value in m.groupdict().items(): 

765 if value is not None: 

766 yield lineno, key, value 

767 lineno += value.count("\n") 

768 break 

769 else: 

770 raise RuntimeError( 

771 f"{regex!r} wanted to resolve the token dynamically" 

772 " but no group matched" 

773 ) 

774 # normal group 

775 else: 

776 data = groups[idx] 

777 

778 if data or token not in ignore_if_empty: 

779 yield lineno, token, data 

780 

781 lineno += data.count("\n") + newlines_stripped 

782 newlines_stripped = 0 

783 

784 # plain string tokens are just yielded as-is.

785 else: 

786 data = m.group() 

787 

788 # update brace/parentheses balance 

789 if tokens == TOKEN_OPERATOR: 

790 if data == "{": 

791 balancing_stack.append("}") 

792 elif data == "(": 

793 balancing_stack.append(")") 

794 elif data == "[": 

795 balancing_stack.append("]") 

796 elif data in ("}", ")", "]"): 

797 if not balancing_stack: 

798 raise TemplateSyntaxError( 

799 f"unexpected '{data}'", lineno, name, filename 

800 ) 

801 

802 expected_op = balancing_stack.pop() 

803 

804 if expected_op != data: 

805 raise TemplateSyntaxError( 

806 f"unexpected '{data}', expected '{expected_op}'", 

807 lineno, 

808 name, 

809 filename, 

810 ) 

811 

812 # yield items 

813 if data or tokens not in ignore_if_empty: 

814 yield lineno, tokens, data 

815 

816 lineno += data.count("\n") 

817 

818 line_starting = m.group()[-1:] == "\n" 

819 # fetch new position into new variable so that we can check 

820 # if there is an internal parsing error which would result

821 # in an infinite loop 

822 pos2 = m.end() 

823 

824 # handle state changes 

825 if new_state is not None: 

826 # remove the uppermost state 

827 if new_state == "#pop": 

828 stack.pop() 

829 # resolve the new state by group checking 

830 elif new_state == "#bygroup": 

831 for key, value in m.groupdict().items(): 

832 if value is not None: 

833 stack.append(key) 

834 break 

835 else: 

836 raise RuntimeError( 

837 f"{regex!r} wanted to resolve the new state dynamically" 

838 f" but no group matched" 

839 ) 

840 # direct state name given 

841 else: 

842 stack.append(new_state) 

843 

844 statetokens = self.rules[stack[-1]] 

845 # we are still at the same position and no stack change. 

846 # this means a loop without break condition, avoid that and 

847 # raise error 

848 elif pos2 == pos: 

849 raise RuntimeError( 

850 f"{regex!r} yielded empty string without stack change" 

851 ) 

852 

853 # publish the new position and start again

854 pos = pos2 

855 break 

856 # if the loop terminated without break we haven't found a single match;

857 # either we are at the end of the file or we have a problem

858 else: 

859 # end of text 

860 if pos >= source_length: 

861 return 

862 

863 # something went wrong 

864 raise TemplateSyntaxError( 

865 f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename 

866 )
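# End-to-end sketch (assuming a stock jinja2.Environment with default
# delimiters): whitespace tokens inside the expression are dropped by wrap(),
# and iteration stops at the synthetic eof token.
# >>> from jinja2 import Environment
# >>> stream = Environment().lexer.tokenize("Hello {{ name }}!")
# >>> [(tok.type, tok.value) for tok in stream]
# [('data', 'Hello '), ('variable_begin', '{{'), ('name', 'name'), ('variable_end', '}}'), ('data', '!')]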