1
2
"""
HTML parser test cases for etree
"""
6
7 import unittest
8 import tempfile, os, os.path, sys
9
# Make the directory that holds this test module importable so that the
# ``common_imports`` import below resolves no matter where the tests are
# launched from.  The path is made absolute because ``__file__`` may be a
# bare relative name, in which case ``dirname`` would yield ``''``.
this_dir = os.path.dirname(os.path.abspath(__file__))
if this_dir not in sys.path:
    sys.path.insert(0, this_dir)
13
14 from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str
15 from common_imports import SillyFileLike, HelperTestCase, write_to_file, next
16
# Python 3 has no separate ``unicode`` builtin; alias it to ``str`` there so
# the name can be used on either major version.
try:
    unicode
except NameError:
    unicode = str
21
# NOTE(review): the ``class`` statement that owns this docstring and these
# shared fixtures is not part of the extract under review.
"""HTML parser test cases
"""
# Re-bind the imported module so the tests can reach it as ``self.etree``.
etree = etree

# Well-formed document on a single line.
html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
# The same document spread over several lines.
html_str_pretty = _bytes("""\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>
""")
# Deliberately malformed: <title>/<head> are never closed, <h1> is closed
# with </h3>, and there is a stray </p>.
broken_html_str = _bytes("<html><head><title>test"
                         "<body><h1>page title</h3></p></html>")
# Document with non-ASCII text, decoded from UTF-8 into a text string.
uhtml_str = _bytes(
    "<html><head><title>test á</title></head>"
    "<body><h1>page á title</h1></body></html>").decode('utf8')
39
43
48
56
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# Only meaningful when the interpreter can hold the full Unicode range
# (sys.maxunicode == 0x10FFFF == 1114111); on narrower builds nothing is
# checked.
if sys.maxunicode >= 1114111:
    # U+26007 lies outside the Basic Multilingual Plane.  It is written as
    # an escape and decoded at runtime so the source file stays ASCII here.
    wide_char = _bytes('\\U00026007').decode('unicode_escape')
    element = self.etree.HTML(_bytes(
        '<html><body><p>\\U00026007</p></body></html>'
        ).decode('unicode_escape'))
    p_text = element.findtext('.//p')
    # The character must survive parsing as exactly one code point.
    self.assertEqual(1, len(p_text))
    self.assertEqual(wide_char, p_text)
67
72
79
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# Tags that consist only of a ``{...}`` part with nothing after the closing
# brace must be rejected, both at creation time and on assignment to ``tag``.
parser = self.etree.HTMLParser()
Element = parser.makeelement

el = Element('name')
self.assertRaises(ValueError, Element, '{}')
self.assertRaises(ValueError, setattr, el, 'tag', '{}')

self.assertRaises(ValueError, Element, '{test}')
self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
90
104
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# Tag names containing a single or double quote must raise ValueError.
parser = self.etree.HTMLParser()
Element = parser.makeelement

self.assertRaises(ValueError, Element, 'p"name')
self.assertRaises(ValueError, Element, "na'me")
self.assertRaises(ValueError, Element, '{test}"name')
self.assertRaises(ValueError, Element, "{test}name'")

el = Element('name')
self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
# The rejected assignments must leave the existing tag untouched.
self.assertEqual(el.tag, "name")
118
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# Tag names containing spaces (leading, trailing or embedded) must raise
# ValueError.
parser = self.etree.HTMLParser()
Element = parser.makeelement

self.assertRaises(ValueError, Element, ' name ')
self.assertRaises(ValueError, Element, 'na me')
self.assertRaises(ValueError, Element, '{test} name')

el = Element('name')
self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
# The rejected assignment must leave the existing tag untouched.
self.assertEqual(el.tag, "name")
130
140
152
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# ``SubElement`` must reject tag names containing quote characters, just
# like direct element creation does.
parser = self.etree.HTMLParser()
Element = parser.makeelement
SubElement = self.etree.SubElement

el = Element('name')
self.assertRaises(ValueError, SubElement, el, "name'")
self.assertRaises(ValueError, SubElement, el, 'na"me')
self.assertRaises(ValueError, SubElement, el, "{test}na'me")
self.assertRaises(ValueError, SubElement, el, '{test}"name')
163
173
180
190
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# The document below *claims* UTF-8 in its <meta> tag but is actually
# encoded as ISO-8859-1.
text = _str('Søk på nettet')
wrong_head = _str('''
<head>
<meta http-equiv="Content-Type"
content="text/html; charset=UTF-8" />
</head>''')
html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
                                                                text)
               ).encode('iso-8859-1')

# Parsing the bytes without passing a parser must fail with ParseError.
self.assertRaises(self.etree.ParseError,
                  self.etree.parse,
                  BytesIO(html_latin1))

# An HTML parser created with an explicit encoding must recover the text
# despite the misleading <meta> declaration.
tree = self.etree.parse(
    BytesIO(html_latin1),
    self.etree.HTMLParser(encoding="iso-8859-1"))
# Relative form of the descendant search, matching ``findtext('.//p')``
# used elsewhere in this file.
p = tree.find(".//p")
self.assertEqual(p.text, text)
211
216
218
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# The content of a <style> element must be exposed as the element's text.
html = _bytes('<html><head><style>foo</style></head></html>')
element = self.etree.HTML(html)
# element[0] is <head>, element[0][0] is <style>.
self.assertEqual(element[0][0].text, "foo")
222
226
239
247
248
249
250
251
252
253
254
261
276
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# Full run of ``iterparse`` in HTML mode with the default event selection.
iterparse = self.etree.iterparse
f = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')

iterator = iterparse(f, html=True)
# No root is exposed before iteration starts.
self.assertEqual(None, iterator.root)

events = list(iterator)
root = iterator.root
self.assertTrue(root is not None)
# Only 'end' events are expected: title, head, p, body, then html.
self.assertEqual(
    [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
     ('end', root[1]), ('end', root)],
    events)
292
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# Consume only the first two events and then drop the iterator; the root
# must stay unset while the document is only partially processed.
# NOTE(review): presumably this also guards against crashes when an
# unfinished iterator is discarded -- confirm against the original test name.
iterparse = self.etree.iterparse
f = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')

iterator = iterparse(f, html=True)
self.assertEqual(None, iterator.root)

event, element = next(iterator)
self.assertEqual('end', event)
self.assertEqual('title', element.tag)
self.assertEqual(None, iterator.root)
del element

event, element = next(iterator)
self.assertEqual('end', event)
self.assertEqual('head', element.tag)
self.assertEqual(None, iterator.root)
del element
del iterator
313
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# Malformed input: no <html>/<body>, an unterminated <title>, and a stray
# </div>.  ``iterparse`` in HTML mode must still deliver a complete tree.
iterparse = self.etree.iterparse
f = BytesIO('<head><title>TEST></head><p>P<br></div>')

iterator = iterparse(f, html=True)
self.assertEqual(None, iterator.root)

events = list(iterator)
root = iterator.root
self.assertTrue(root is not None)
# Expected recovered structure: html > (head, body > p > br).
self.assertEqual('html', root.tag)
self.assertEqual('head', root[0].tag)
self.assertEqual('body', root[1].tag)
self.assertEqual('p', root[1][0].tag)
self.assertEqual('br', root[1][0][0].tag)
self.assertEqual(
    [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]),
     ('end', root[1][0]), ('end', root[1]), ('end', root)],
    events)
333
339
352
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# ``iterparse`` in HTML mode restricted to 'start' events.
iterparse = self.etree.iterparse
f = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')

iterator = iterparse(f, html=True, events=('start',))
self.assertEqual(None, iterator.root)

events = list(iterator)
root = iterator.root
# Identity check, matching the other iterparse tests in this file.
self.assertTrue(root is not None)
# 'start' events arrive in document order: html, head, title, body, p.
self.assertEqual(
    [('start', root), ('start', root[0]), ('start', root[0][0]),
     ('start', root[1]), ('start', root[1][0])],
    events)
368
379
398
413
# NOTE(review): these are the trailing handlers of a parser-target class
# whose ``class`` line, and the ``events`` list they append to, are not part
# of this extract.
def end(self, tag):
    """Record an ``("end", tag)`` entry in the shared ``events`` list."""
    events.append(("end", tag))
def close(self):
    """Return the marker value that ``parser.close()`` is expected to yield."""
    return "DONE"
425
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# Feed a minimal document through a parser that reports to a custom target
# object instead of building a tree.
parser = self.etree.HTMLParser(target=Target())

parser.feed("<html><body></body></html>")
done = parser.close()

# ``close()`` must hand back whatever the target's ``close()`` returned.
self.assertEqual("DONE", done)
self.assertEqual([
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html")], events)
435
# NOTE(review): these are the trailing handlers of a parser-target class
# whose ``class`` line, and the ``events`` list they append to, are not part
# of this extract.
def end(self, tag):
    """Record an ``("end", tag)`` entry in the shared ``events`` list."""
    events.append(("end", tag))
def doctype(self, *args):
    """Record a ``("doctype", args)`` entry with all arguments as a tuple."""
    events.append(("doctype", args))
def close(self):
    """Return the marker value that ``parser.close()`` is expected to yield."""
    return "DONE"
449
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# An empty ``<!DOCTYPE>`` declaration must still reach the target's
# ``doctype`` handler, with all three arguments set to None.
parser = self.etree.HTMLParser(target=Target())
parser.feed("<!DOCTYPE><html><body></body></html>")
done = parser.close()

self.assertEqual("DONE", done)
self.assertEqual([
    ("doctype", (None, None, None)),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html")], events)
459
# NOTE(review): these are the trailing handlers of a parser-target class
# whose ``class`` line, and the ``events`` list they append to, are not part
# of this extract.
def end(self, tag):
    """Record an ``("end", tag)`` entry in the shared ``events`` list."""
    events.append(("end", tag))
def doctype(self, *args):
    """Record a ``("doctype", args)`` entry with all arguments as a tuple."""
    events.append(("doctype", args))
def close(self):
    """Return the marker value that ``parser.close()`` is expected to yield."""
    return "DONE"
473
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# ``<!DOCTYPE html>`` carries only a name: the first ``doctype`` argument is
# "html" and the remaining two are None.
parser = self.etree.HTMLParser(target=Target())
parser.feed("<!DOCTYPE html><html><body></body></html>")
done = parser.close()

self.assertEqual("DONE", done)
self.assertEqual([
    ("doctype", ("html", None, None)),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html")], events)
483
# NOTE(review): these are the trailing handlers of a parser-target class
# whose ``class`` line, and the ``events`` list they append to, are not part
# of this extract.
def end(self, tag):
    """Record an ``("end", tag)`` entry in the shared ``events`` list."""
    events.append(("end", tag))
def doctype(self, *args):
    """Record a ``("doctype", args)`` entry with all arguments as a tuple."""
    events.append(("doctype", args))
def close(self):
    """Return the marker value that ``parser.close()`` is expected to yield."""
    return "DONE"
497
# NOTE(review): the enclosing ``def`` line is not part of this extract.
# A full DOCTYPE declaration: the ``doctype`` handler must receive the name,
# the PUBLIC identifier and the system identifier, in that order.
parser = self.etree.HTMLParser(target=Target())
parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
            '<html><body></body></html>')
done = parser.close()

self.assertEqual("DONE", done)
self.assertEqual([
    ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html")], events)
508
509
511 suite = unittest.TestSuite()
512 suite.addTests([unittest.makeSuite(HtmlParserTestCase)])
513 return suite
514
if __name__ == '__main__':
    # Running this module directly does not run any tests; it only prints a
    # hint pointing at the ``test.py`` runner.
    print('to test use test.py %s' % __file__)
517