Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/django/test/html.py: 18%
148 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""Compare two HTML documents."""
3from html.parser import HTMLParser
5from django.utils.regex_helper import _lazy_re_compile
# Matches a run of one or more ASCII whitespace characters, i.e. any of
# U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE.
# https://infra.spec.whatwg.org/#ascii-whitespace
ASCII_WHITESPACE = _lazy_re_compile(r"[\t\n\f\r ]+")
# Names of HTML boolean attributes: their presence alone means "true", so
# <input checked> and <input checked="checked"> are equivalent.
# https://html.spec.whatwg.org/#attributes-3
#
# Every entry must be the bare attribute name exactly as the parser reports
# it (no surrounding whitespace), otherwise the membership test in
# normalize_attributes() silently never matches that attribute.
BOOLEAN_ATTRIBUTES = {
    "allowfullscreen",
    "async",
    "autofocus",
    "autoplay",
    "checked",
    "controls",
    "default",
    "defer",
    "disabled",
    "formnovalidate",
    "hidden",
    "ismap",
    "itemscope",
    "loop",
    "multiple",
    "muted",
    "nomodule",
    "novalidate",
    "open",
    "playsinline",
    "readonly",
    "required",
    "reversed",
    "selected",
    # Attributes for deprecated tags.
    "truespeed",
}
def normalize_whitespace(string):
    """Return ``string`` with each run of ASCII whitespace replaced by " "."""
    collapsed = ASCII_WHITESPACE.sub(" ", string)
    return collapsed
def normalize_attributes(attributes):
    """
    Return a list of ``(name, value)`` pairs in a canonical form.

    * The tokens of a non-empty ``class`` value are sorted and joined with
      single spaces, so comparisons ignore the order of classes.
    * A boolean attribute whose value is empty or equal to its own name gets
      the value ``None``, e.g. <input checked> == <input checked="checked">.
    * Any other attribute without a value (``None``) gets the value "".
    """
    result = []
    for attr_name, attr_value in attributes:
        if attr_name == "class" and attr_value:
            # Drop the empty tokens produced by leading/trailing whitespace.
            tokens = [tok for tok in ASCII_WHITESPACE.split(attr_value) if tok]
            tokens.sort()
            attr_value = " ".join(tokens)
        is_boolean = attr_name in BOOLEAN_ATTRIBUTES
        if is_boolean and (not attr_value or attr_value == attr_name):
            attr_value = None
        elif not is_boolean and attr_value is None:
            attr_value = ""
        result.append((attr_name, attr_value))
    return result
class Element:
    """
    A node of the simplified DOM tree used to compare HTML documents.

    ``name`` is the tag name, ``attributes`` is a sorted list of
    ``(name, value)`` pairs (so attribute order in the markup is irrelevant
    when comparing), and ``children`` is a list whose items are either text
    (``str``) or other elements.
    """

    def __init__(self, name, attributes):
        self.name = name
        # A plain sorted() raises TypeError when the same attribute name
        # occurs both without a value (None) and with a string value, e.g.
        # <input checked checked="x">, because None and str cannot be
        # ordered. The key below keeps the exact order sorted() produced for
        # every input it accepted and places valueless duplicates first.
        self.attributes = sorted(attributes, key=self._attribute_sort_key)
        self.children = []

    @staticmethod
    def _attribute_sort_key(attribute):
        """Return a sort key for a ``(name, value)`` pair that tolerates None."""
        attr_name, attr_value = attribute
        has_value = attr_value is not None
        return attr_name, has_value, attr_value if has_value else ""

    def append(self, element):
        """
        Add ``element`` (text or a child element) to this node's children.

        Text is whitespace-normalized and merged into a directly preceding
        text child. Appending a non-text child first drops a preceding text
        child that consists only of whitespace. Falsy values, such as an
        empty string, are not stored.
        """
        if isinstance(element, str):
            element = normalize_whitespace(element)
            if self.children and isinstance(self.children[-1], str):
                self.children[-1] += element
                # Re-normalize: the join may have put two spaces together.
                self.children[-1] = normalize_whitespace(self.children[-1])
                return
        elif self.children:
            # removing last children if it is only whitespace
            # this can result in incorrect dom representations since
            # whitespace between inline tags like <span> is significant
            if isinstance(self.children[-1], str) and self.children[-1].isspace():
                self.children.pop()
        if element:
            self.children.append(element)

    def finalize(self):
        """
        Strip surrounding whitespace from text children, recursively.

        Trailing text children that are empty after right-stripping are
        removed; every remaining text child is stripped on both sides and
        every child that has a ``finalize`` attribute is finalized in turn.
        """
        children = self.children
        # Right-strip the last text child; if nothing is left of it, drop it
        # and repeat with the new last child.
        while children and isinstance(children[-1], str):
            children[-1] = children[-1].rstrip()
            if children[-1]:
                break
            children.pop()

        for i, child in enumerate(children):
            if isinstance(child, str):
                children[i] = child.strip()
            elif hasattr(child, "finalize"):
                child.finalize()

    def __eq__(self, element):
        """Elements are equal when name, attributes and children all match."""
        if not hasattr(element, "name") or self.name != element.name:
            return False
        if self.attributes != element.attributes:
            return False
        return self.children == element.children

    def __hash__(self):
        # Children are left out; equal elements still hash equally because
        # equality requires the same name and attributes.
        return hash((self.name, *self.attributes))

    def _count(self, element, count=True):
        """
        Return the number of occurrences of ``element`` in this subtree.

        ``element`` is either text (matched as a substring of text children)
        or an element. When ``count`` is false the search stops at the first
        match and only the truthiness of the result is meaningful.
        """
        if not isinstance(element, str) and self == element:
            return 1
        if isinstance(element, RootElement) and self.children == element.children:
            return 1
        i = 0
        elem_child_idx = 0
        for child in self.children:
            # child is text content and element is also text content, then
            # make a simple "text" in "text"
            if isinstance(child, str):
                if isinstance(element, str):
                    if count:
                        i += child.count(element)
                    elif element in child:
                        return 1
            else:
                # Look for element wholly within this child.
                i += child._count(element, count=count)
                if not count and i:
                    return i
                # Also look for a sequence of element's children among self's
                # children. self.children == element.children is tested above,
                # but will fail if self has additional children. Ex: '<a/><b/>'
                # is contained in '<a/><b/><c/>'.
                if isinstance(element, RootElement) and element.children:
                    elem_child = element.children[elem_child_idx]
                    # Start or continue match, advance index.
                    if elem_child == child:
                        elem_child_idx += 1
                        # Match found, reset index.
                        if elem_child_idx == len(element.children):
                            i += 1
                            elem_child_idx = 0
                    # No match, reset index.
                    else:
                        elem_child_idx = 0
        return i

    def __contains__(self, element):
        return self._count(element, count=False) > 0

    def count(self, element):
        """Return how many times ``element`` occurs in this subtree."""
        return self._count(element, count=True)

    def __getitem__(self, key):
        return self.children[key]

    def __str__(self):
        """Render the element as markup, children separated by newlines from the tags."""
        parts = ["<%s" % self.name]
        for key, value in self.attributes:
            if value is not None:
                parts.append(' %s="%s"' % (key, value))
            else:
                # Valueless (boolean) attribute: print the bare name.
                parts.append(" %s" % key)
        if self.children:
            parts.append(">\n")
            parts.extend(str(c) for c in self.children)
            parts.append("\n</%s>" % self.name)
        else:
            parts.append(">")
        return "".join(parts)

    def __repr__(self):
        return str(self)
class RootElement(Element):
    """Container element created with the name ``None`` and no attributes."""

    def __init__(self):
        super().__init__(None, ())

    def __str__(self):
        # Unlike Element.__str__, no enclosing tag is printed.
        return "".join(map(str, self.children))
class HTMLParseError(Exception):
    """Error raised by ``Parser.error()`` with a message and a position."""
class Parser(HTMLParser):
    """
    ``HTMLParser`` subclass that builds a tree of ``Element`` objects.

    The tree hangs off ``self.root``; ``self.open_tags`` is the stack of
    elements that have been opened but not yet closed, and
    ``self.element_positions`` maps each element to the parser position
    recorded when its start tag was handled.
    """

    # Tags that never take an end tag.
    # https://html.spec.whatwg.org/#void-elements
    SELF_CLOSING_TAGS = {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
        # Deprecated tags
        "frame",
        "spacer",
    }

    def __init__(self):
        super().__init__()
        self.root = RootElement()
        self.open_tags = []
        self.element_positions = {}

    def error(self, msg):
        """Raise ``HTMLParseError`` carrying ``msg`` and the current position."""
        raise HTMLParseError(msg, self.getpos())

    def format_position(self, position=None, element=None):
        """
        Return a "Line X, Column Y" string.

        The position is taken from ``position`` if given, else from the
        recorded position of ``element`` if given, else from ``getpos()``.
        """
        if not position and element:
            position = self.element_positions[element]
        if position is None:
            position = self.getpos()
        if hasattr(position, "lineno"):
            # Reduce an object with lineno/offset attributes to a pair.
            position = position.lineno, position.offset
        return "Line %d, Column %d" % position

    @property
    def current(self):
        """The innermost open element, or the root when none is open."""
        return self.open_tags[-1] if self.open_tags else self.root

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        # Void tags were never pushed on the stack, so there is nothing to
        # close for them.
        if tag not in self.SELF_CLOSING_TAGS:
            self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        node = Element(tag, normalize_attributes(attrs))
        self.current.append(node)
        if tag not in self.SELF_CLOSING_TAGS:
            self.open_tags.append(node)
        self.element_positions[node] = self.getpos()

    def handle_endtag(self, tag):
        """
        Close open elements up to and including the innermost one named
        ``tag``; report an error if the stack runs out before one is found.
        """
        while True:
            if not self.open_tags:
                self.error(
                    "Unexpected end tag `%s` (%s)" % (tag, self.format_position())
                )
            if self.open_tags.pop().name == tag:
                return

    def handle_data(self, data):
        self.current.append(data)
def parse_html(html):
    """
    Parse the HTML string ``html`` into a tree of Python objects that can be
    compared with another parsed document for semantic equivalence.
    Purely syntactic differences, such as the kind of quotes used around
    attribute values, are ignored.
    """
    parser = Parser()
    parser.feed(html)
    parser.close()
    root = parser.root
    root.finalize()
    top_level = root.children
    # Unwrap the root when it merely holds a single non-text node.
    if len(top_level) == 1 and not isinstance(top_level[0], str):
        return top_level[0]
    return root