1
2
3 """
4 HTML parser test cases for etree
5 """
6
7 import unittest
8 import tempfile, os, os.path, sys
9
# Put the directory that contains this file on sys.path (once), presumably
# so the sibling `common_imports` module imported below can be resolved.
10 this_dir = os.path.dirname(__file__)
11 if this_dir not in sys.path:
12 sys.path.insert(0, this_dir)
13
14 from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str
15 from common_imports import SillyFileLike, HelperTestCase, write_to_file
16
# Compatibility shim: when the name `unicode` is not defined (NameError),
# bind it to `str` so the rest of the module can refer to `unicode`.
17 try:
18 unicode
19 except NameError:
20 unicode = str
21
23 """HTML parser test cases
24 """
# NOTE(review): the enclosing class header is not visible in this view;
# the code below reads `self.etree`, so these names look like class
# attributes of the test case -- confirm against the full file.
# Expose the imported `etree` module under the same name in this scope.
25 etree = etree
26
# Well-formed HTML document as a single-line byte string.
27 html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
# The same document spread over several lines.
28 html_str_pretty = _bytes("""\
29 <html>
30 <head><title>test</title></head>
31 <body><h1>page title</h1></body>
32 </html>
33 """)
# Deliberately malformed markup: <title> is never closed, <h1> is closed
# by </h3>, and there is a stray </p>.
34 broken_html_str = _bytes("<html><head><title>test<body><h1>page title</h3></p></html>")
# Text (not bytes) variant containing non-ASCII characters.
35 uhtml_str = _str("<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>")
36
40
45
51
56
63
# Fragment of a test method (its `def` line is not visible here).
# Elements are created through the HTML parser's own factory.
65 parser = self.etree.HTMLParser()
66 Element = parser.makeelement
67
# A tag that consists only of an empty namespace part must raise
# ValueError, both when creating an element and when assigning `.tag`.
68 el = Element('name')
69 self.assertRaises(ValueError, Element, '{}')
70 self.assertRaises(ValueError, setattr, el, 'tag', '{}')
71
# Same expectation for a namespace part with no local name after it.
72 self.assertRaises(ValueError, Element, '{test}')
73 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
74
88
# Fragment of a test method (its `def` line is not visible here).
90 parser = self.etree.HTMLParser()
91 Element = parser.makeelement
92
# Single or double quote characters anywhere in the tag name, with or
# without a namespace prefix, must be rejected with ValueError.
93 self.assertRaises(ValueError, Element, 'p"name')
94 self.assertRaises(ValueError, Element, "na'me")
95 self.assertRaises(ValueError, Element, '{test}"name')
96 self.assertRaises(ValueError, Element, "{test}name'")
97
# Assigning such a name to an existing element must also fail ...
98 el = Element('name')
99 self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
100 self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
# ... and the failed assignments must leave the original tag untouched.
101 self.assertEqual(el.tag, "name")
102
# Fragment of a test method (its `def` line is not visible here).
104 parser = self.etree.HTMLParser()
105 Element = parser.makeelement
106
# Leading, trailing or embedded spaces in a tag name (also after a
# namespace part) must be rejected with ValueError.
107 self.assertRaises(ValueError, Element, ' name ')
108 self.assertRaises(ValueError, Element, 'na me')
109 self.assertRaises(ValueError, Element, '{test} name')
110
# A rejected `.tag` assignment must not modify the element.
111 el = Element('name')
112 self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
113 self.assertEqual(el.tag, "name")
114
124
136
# Fragment of a test method (its `def` line is not visible here).
# The parent comes from the HTML parser's factory; children are requested
# through the module-level SubElement function.
138 parser = self.etree.HTMLParser()
139 Element = parser.makeelement
140 SubElement = self.etree.SubElement
141
# Quote characters in the child's tag name, with or without a namespace
# part, must make SubElement raise ValueError.
142 el = Element('name')
143 self.assertRaises(ValueError, SubElement, el, "name'")
144 self.assertRaises(ValueError, SubElement, el, 'na"me')
145 self.assertRaises(ValueError, SubElement, el, "{test}na'me")
146 self.assertRaises(ValueError, SubElement, el, '{test}"name')
147
157
164
174
# Fragment of a test method (its `def` line is not visible here).
# Build a document whose <meta> header announces UTF-8 although the bytes
# are actually produced with the iso-8859-1 codec.
176 text = _str('Søk på nettet')
177 wrong_head = _str('''
178 <head>
179 <meta http-equiv="Content-Type"
180 content="text/html; charset=UTF-8" />
181 </head>''')
182 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
183 text)
184 ).encode('iso-8859-1')
185
# Parsing without passing a parser is expected to fail with ParseError.
186 self.assertRaises(self.etree.ParseError,
187 self.etree.parse,
188 BytesIO(html_latin1))
189
# With an HTMLParser that is told the real encoding explicitly, parsing
# is expected to succeed and the paragraph text to round-trip unchanged.
190 tree = self.etree.parse(
191 BytesIO(html_latin1),
192 self.etree.HTMLParser(encoding="iso-8859-1"))
193 p = tree.find("//p")
194 self.assertEqual(p.text, text)
195
200
202
# Fragment of a test method (its `def` line is not visible here).
# The content of <style> must be available as the `.text` of the first
# grandchild of the parsed root.
203 html = _bytes('<html><head><style>foo</style></head></html>')
204 element = self.etree.HTML(html)
205 self.assertEqual(element[0][0].text, "foo")
206
210
223
231
232
233
234
235
236
237
238
245
260
# Fragment of a test method (its `def` line is not visible here).
262 iterparse = self.etree.iterparse
# NOTE(review): a text literal is handed to BytesIO here, while other
# places in this file wrap literals in _bytes(); this only works if the
# BytesIO imported from common_imports accepts text -- confirm.
263 f = BytesIO(
264 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
265
# No root is exposed before the iterator has been consumed.
266 iterator = iterparse(f, html=True)
267 self.assertEqual(None, iterator.root)
268
# Draining the iterator runs the parse; afterwards the root is set.
269 events = list(iterator)
270 root = iterator.root
271 self.assertTrue(root is not None)
# With no `events` argument only 'end' events are expected, each child
# reported before its parent and the root last.
272 self.assertEqual(
273 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
274 ('end', root[1]), ('end', root)],
275 events)
276
# Fragment of a test method (its `def` line is not visible here).
# Same check as for in-memory input, but reading the "shakespeare.html"
# fixture located via fileInTestDir().
278 iterparse = self.etree.iterparse
279 iterator = iterparse(fileInTestDir("shakespeare.html"),
280 html=True)
281
282 self.assertEqual(None, iterator.root)
283 events = list(iterator)
284 root = iterator.root
285 self.assertTrue(root is not None)
# The expected event count is tied to the fixture file's content.
286 self.assertEqual(249, len(events))
# Every reported event must be an 'end' event.
287 self.assertEqual(
288 [],
289 [ event for (event, element) in events if event != 'end' ])
290
# Fragment of a test method (its `def` line is not visible here).
292 iterparse = self.etree.iterparse
# NOTE(review): a text literal is handed to BytesIO here; this relies on
# the BytesIO from common_imports accepting text -- confirm.
293 f = BytesIO(
294 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
295
# Request 'start' events only; the root is still unset before iteration.
296 iterator = iterparse(f, html=True, events=('start',))
297 self.assertEqual(None, iterator.root)
298
299 events = list(iterator)
300 root = iterator.root
301 self.assertTrue(root is not None)
# 'start' events are expected in document order, parents before children.
302 self.assertEqual(
303 [('start', root), ('start', root[0]), ('start', root[0][0]),
304 ('start', root[1]), ('start', root[1][0])],
305 events)
306
# Fragment: tail of a parser-target class followed by the test that uses
# it. The class header, the creation of `events`, and whatever records the
# ("start", tag) entries asserted below are outside the visible lines.
# Callback recording the name of each closed tag.
314 def end(self, tag):
315 events.append(("end", tag))
# Callback whose return value is expected to come back from parser.close().
316 def close(self):
317 return "DONE"
318
# Feed markup to a parser that delivers its events to the target object.
319 parser = self.etree.HTMLParser(target=Target())
320
321 parser.feed("<html><body></body></html>")
322 done = parser.close()
323
# close() must hand back the target's result, and the recorded events
# must show properly nested start/end pairs for html and body.
324 self.assertEqual("DONE", done)
325 self.assertEqual([
326 ("start", "html"), ("start", "body"),
327 ("end", "body"), ("end", "html")], events)
328
# Fragment: tail of a parser-target class followed by the test that uses
# it. The class header, the creation of `events`, and whatever records the
# ("start", tag) entries asserted below are outside the visible lines.
336 def end(self, tag):
337 events.append(("end", tag))
# Records whatever positional arguments the parser passes for the DOCTYPE.
338 def doctype(self, *args):
339 events.append(("doctype", args))
340 def close(self):
341 return "DONE"
342
# Input starts with a DOCTYPE declaration that carries no name or ids.
343 parser = self.etree.HTMLParser(target=Target())
344 parser.feed("<!DOCTYPE><html><body></body></html>")
345 done = parser.close()
346
# The doctype callback is expected first, with three None values,
# followed by the element start/end events.
347 self.assertEqual("DONE", done)
348 self.assertEqual([
349 ("doctype", (None, None, None)),
350 ("start", "html"), ("start", "body"),
351 ("end", "body"), ("end", "html")], events)
352
# Fragment: tail of a parser-target class followed by the test that uses
# it. The class header, the creation of `events`, and whatever records the
# ("start", tag) entries asserted below are outside the visible lines.
360 def end(self, tag):
361 events.append(("end", tag))
# Records whatever positional arguments the parser passes for the DOCTYPE.
362 def doctype(self, *args):
363 events.append(("doctype", args))
364 def close(self):
365 return "DONE"
366
# Input starts with a DOCTYPE that names the root element but gives no ids.
367 parser = self.etree.HTMLParser(target=Target())
368 parser.feed("<!DOCTYPE html><html><body></body></html>")
369 done = parser.close()
370
# The doctype callback is expected to receive the name "html" and None
# for the two remaining values.
371 self.assertEqual("DONE", done)
372 self.assertEqual([
373 ("doctype", ("html", None, None)),
374 ("start", "html"), ("start", "body"),
375 ("end", "body"), ("end", "html")], events)
376
# Fragment: tail of a parser-target class followed by the test that uses
# it. The class header, the creation of `events`, and whatever records the
# ("start", tag) entries asserted below are outside the visible lines.
384 def end(self, tag):
385 events.append(("end", tag))
# Records whatever positional arguments the parser passes for the DOCTYPE.
386 def doctype(self, *args):
387 events.append(("doctype", args))
388 def close(self):
389 return "DONE"
390
# Input starts with a full DOCTYPE: root name, PUBLIC identifier and a
# system identifier.
391 parser = self.etree.HTMLParser(target=Target())
392 parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
393 '<html><body></body></html>')
394 done = parser.close()
395
# All three parts of the declaration are expected in the callback
# arguments, in the order name, public id, system id.
396 self.assertEqual("DONE", done)
397 self.assertEqual([
398 ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
399 ("start", "html"), ("start", "body"),
400 ("end", "body"), ("end", "html")], events)
401
402
# Body of a suite-building function (its `def` line is not visible here):
# collect the tests of HtmlParserTestCase into a TestSuite and return it.
# NOTE(review): unittest.makeSuite is deprecated in recent Python versions
# (the documented replacement is TestLoader.loadTestsFromTestCase) --
# check which Python versions this file has to support.
404 suite = unittest.TestSuite()
405 suite.addTests([unittest.makeSuite(HtmlParserTestCase)])
406 return suite
407
# Running this file directly does not execute the tests; it only prints a
# hint that they are meant to be run through test.py.
408 if __name__ == '__main__':
409 print('to test use test.py %s' % __file__)
410