Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/django/test/html.py: 18%

148 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1"""Compare two HTML documents.""" 

2 

3from html.parser import HTMLParser 

4 

5from django.utils.regex_helper import _lazy_re_compile 

6 

# Matches one or more consecutive ASCII whitespace characters: U+0009 TAB,
# U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE.
# https://infra.spec.whatwg.org/#ascii-whitespace
ASCII_WHITESPACE = _lazy_re_compile(
    r"[\t\n\f\r ]+",
)

11 

# Names of HTML boolean attributes. For these, a missing/empty value and a
# value equal to the attribute name are treated as the same thing when
# attributes are normalized.
# https://html.spec.whatwg.org/#attributes-3
BOOLEAN_ATTRIBUTES = {
    "allowfullscreen",
    "async",
    "autofocus",
    "autoplay",
    "checked",
    "controls",
    "default",
    # NOTE: this entry must not carry surrounding whitespace, otherwise the
    # real "defer" attribute never matches the set.
    "defer",
    "disabled",
    "formnovalidate",
    "hidden",
    "ismap",
    "itemscope",
    "loop",
    "multiple",
    "muted",
    "nomodule",
    "novalidate",
    "open",
    "playsinline",
    "readonly",
    "required",
    "reversed",
    "selected",
    # Attributes for deprecated tags.
    "truespeed",
}

41 

42 

def normalize_whitespace(string):
    """Return ``string`` with each run of ASCII whitespace replaced by one space."""
    collapsed = ASCII_WHITESPACE.sub(" ", string)
    return collapsed

45 

46 

def normalize_attributes(attributes):
    """
    Return a list of ``(name, value)`` pairs in a canonical form.

    ``attributes`` is an iterable of ``(name, value)`` pairs. The result keeps
    the input order and applies these rules:

    * A non-empty ``class`` value is split on ASCII whitespace, sorted and
      re-joined with single spaces, so the order of classes does not matter.
    * For names in ``BOOLEAN_ATTRIBUTES``, an empty/missing value or a value
      equal to the name becomes ``None``, e.g.
      ``<input checked>`` == ``<input checked="checked">``.
    * For every other attribute, a ``None`` value becomes ``""``.
    """
    normalized = []
    for name, value in attributes:
        if name == "class" and value:
            class_names = [
                token for token in ASCII_WHITESPACE.split(value) if token
            ]
            class_names.sort()
            value = " ".join(class_names)
        if name not in BOOLEAN_ATTRIBUTES:
            if value is None:
                value = ""
        elif not value or value == name:
            value = None
        normalized.append((name, value))
    return normalized

66 

67 

class Element:
    """
    A node of a parsed HTML tree.

    Holds the tag ``name``, a sorted list of ``(name, value)`` attribute pairs
    and a list of ``children``; each child is either another element or a text
    string.
    """

    def __init__(self, name, attributes):
        self.name = name
        # Sorting makes comparisons independent of the attribute order.
        self.attributes = sorted(attributes)
        self.children = []

    def append(self, element):
        """Add ``element`` (a text string or a child element) to the children."""
        siblings = self.children
        if isinstance(element, str):
            element = normalize_whitespace(element)
            if siblings and isinstance(siblings[-1], str):
                # Merge consecutive text nodes into a single one.
                siblings[-1] = normalize_whitespace(siblings[-1] + element)
                return
        elif siblings:
            # Drop a preceding whitespace-only text node. This can produce an
            # inexact DOM representation, because whitespace between inline
            # tags such as <span> is significant.
            previous = siblings[-1]
            if isinstance(previous, str) and previous.isspace():
                siblings.pop()
        if element:
            siblings.append(element)

    def finalize(self):
        """Strip surrounding whitespace from text children, recursively."""
        nodes = self.children
        # Right-strip the trailing text node; if it becomes empty, remove it
        # and repeat with the new last child.
        while nodes and isinstance(nodes[-1], str):
            trimmed = nodes[-1].rstrip()
            nodes[-1] = trimmed
            if trimmed:
                break
            nodes.pop()

        for index, node in enumerate(nodes):
            if isinstance(node, str):
                nodes[index] = node.strip()
                continue
            if hasattr(node, "finalize"):
                node.finalize()

    def __eq__(self, element):
        if not hasattr(element, "name"):
            return False
        if self.name != element.name or self.attributes != element.attributes:
            return False
        return self.children == element.children

    def __hash__(self):
        return hash((self.name,) + tuple(self.attributes))

    def _count(self, element, count=True):
        """
        Count occurrences of ``element`` (text or element) inside this node.

        With ``count=False`` the search stops at the first hit and a truthy
        value is returned instead of a full count.
        """
        needle_is_text = isinstance(element, str)
        if not needle_is_text and self == element:
            return 1
        needle_is_root = isinstance(element, RootElement)
        if needle_is_root and self.children == element.children:
            return 1

        total = 0
        # Number of element's children matched consecutively so far.
        matched = 0
        for child in self.children:
            if isinstance(child, str):
                # Text child: only a text needle can match, via a plain
                # substring search.
                if not needle_is_text:
                    continue
                if count:
                    total += child.count(element)
                elif element in child:
                    return 1
                continue

            # Look for element wholly within this child.
            total += child._count(element, count=count)
            if total and not count:
                return total

            # Also look for a sequence of element's children among self's
            # children. The self.children == element.children test above
            # fails if self has additional children. Ex: '<a/><b/>' is
            # contained in '<a/><b/><c/>'.
            if not (needle_is_root and element.children):
                continue
            if element.children[matched] == child:
                # Start or continue a match.
                matched += 1
                if matched == len(element.children):
                    # Complete match found; start over.
                    total += 1
                    matched = 0
            else:
                # No match; start over.
                matched = 0
        return total

    def __contains__(self, element):
        return self._count(element, count=False) > 0

    def count(self, element):
        """Return how many times ``element`` occurs inside this node."""
        return self._count(element, count=True)

    def __getitem__(self, key):
        return self.children[key]

    def __str__(self):
        parts = ["<%s" % self.name]
        for key, value in self.attributes:
            if value is None:
                parts.append(" %s" % key)
            else:
                parts.append(' %s="%s"' % (key, value))
        if not self.children:
            parts.append(">")
            return "".join(parts)
        parts.append(">\n")
        parts.extend(str(c) for c in self.children)
        parts.append("\n</%s>" % self.name)
        return "".join(parts)

    def __repr__(self):
        return str(self)

181 

182 

class RootElement(Element):
    """Nameless, attribute-less container for the top-level nodes."""

    def __init__(self):
        super().__init__(None, ())

    def __str__(self):
        # The root has no tag of its own; render only its children.
        return "".join(map(str, self.children))

189 

190 

class HTMLParseError(Exception):
    """Raised by ``Parser.error()`` with a message and the parser position."""

193 

194 

class Parser(HTMLParser):
    """
    ``HTMLParser`` subclass that builds a tree of ``Element`` nodes.

    Parsed nodes are attached under ``self.root``; ``self.open_tags`` is the
    stack of elements that have been opened but not yet closed, and
    ``self.element_positions`` maps each element to the parser position
    recorded when its start tag was handled.
    """

    # Tags that never receive an end tag.
    # https://html.spec.whatwg.org/#void-elements
    SELF_CLOSING_TAGS = {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
        # Deprecated tags
        "frame",
        "spacer",
    }

    def __init__(self):
        super().__init__()
        self.root = RootElement()
        self.open_tags = []
        self.element_positions = {}

    def error(self, msg):
        """Raise ``HTMLParseError`` carrying ``msg`` and the current position."""
        raise HTMLParseError(msg, self.getpos())

    def format_position(self, position=None, element=None):
        """
        Return a ``"Line N, Column M"`` string.

        The position is taken from ``position`` if given, otherwise from the
        position recorded for ``element``, otherwise from the parser's current
        position.
        """
        if not position and element:
            position = self.element_positions[element]
        if position is None:
            position = self.getpos()
        if hasattr(position, "lineno"):
            position = (position.lineno, position.offset)
        return "Line %d, Column %d" % position

    @property
    def current(self):
        """The innermost open element, or the root when nothing is open."""
        return self.open_tags[-1] if self.open_tags else self.root

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        if tag in self.SELF_CLOSING_TAGS:
            # Void elements were never pushed on the stack; nothing to close.
            return
        self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        node = Element(tag, normalize_attributes(attrs))
        self.current.append(node)
        if tag not in self.SELF_CLOSING_TAGS:
            self.open_tags.append(node)
        self.element_positions[node] = self.getpos()

    def handle_endtag(self, tag):
        # Pop open elements until the one named ``tag`` has been closed;
        # running out of open elements first is reported as an error.
        while True:
            if not self.open_tags:
                self.error(
                    "Unexpected end tag `%s` (%s)" % (tag, self.format_position())
                )
            closed = self.open_tags.pop()
            if closed.name == tag:
                break

    def handle_data(self, data):
        self.current.append(data)

268 

269 

def parse_html(html):
    """
    Parse a string of HTML into a Python object structure that can be easily
    compared against other HTML for semantic equivalence. Purely syntactical
    differences, such as which quotation marks surround attribute values, are
    ignored.

    Return the single top-level element when the document consists of exactly
    one non-text node; otherwise return the root container.
    """
    html_parser = Parser()
    html_parser.feed(html)
    html_parser.close()
    root = html_parser.root
    root.finalize()
    # Unwrap the root when it is not needed.
    top_level = root.children
    if len(top_level) != 1:
        return root
    only_child = top_level[0]
    return root if isinstance(only_child, str) else only_child