1
2
3 """
4 HTML parser test cases for etree
5 """
6
7 import unittest
8 import tempfile, os, os.path, sys
9
# Put the directory containing this file at the front of sys.path (unless
# it is already listed) before the ``common_imports`` imports that follow.
10 this_dir = os.path.dirname(__file__)
11 if this_dir not in sys.path:
12 sys.path.insert(0, this_dir)
13
14 from common_imports import etree, html, StringIO, BytesIO, fileInTestDir, _bytes, _str
15 from common_imports import SillyFileLike, HelperTestCase, write_to_file, next
16
# Probe for the ``unicode`` builtin: merely referencing the name raises
# NameError where it does not exist, in which case it is aliased to ``str``.
17 try:
18 unicode
19 except NameError:
20 unicode = str
21
22
24 """HTML parser test cases
25 """
# Rebinds the imported ``etree`` name in this scope.  The statements further
# down reach it as ``self.etree``, so this is presumably a class attribute.
26 etree = etree
27
# NOTE(review): ``_bytes`` comes from common_imports; presumably it turns
# the literal into a byte string -- confirm there.
# Well-formed sample document written on a single line.
28 html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
# The same markup laid out over several lines.
29 html_str_pretty = _bytes("""\
30 <html>
31 <head><title>test</title></head>
32 <body><h1>page title</h1></body>
33 </html>
34 """)
# Malformed sample: <title> is never closed, <h1> is closed by </h3>, and
# there is a stray </p>.
35 broken_html_str = _bytes("<html><head><title>test"
36 "<body><h1>page title</h3></p></html>")
# Sample with a non-ASCII character in the title and heading; the result of
# ``_bytes`` is decoded from UTF-8.
37 uhtml_str = _bytes(
38 "<html><head><title>test á</title></head>"
39 "<body><h1>page á title</h1></body></html>").decode('utf8')
40
44
49
57
59 if sys.maxunicode < 1114111:
60 return
61 element = self.etree.HTML(_bytes(
62 '<html><body><p>\\U00026007</p></body></html>'
63 ).decode('unicode_escape'))
64 p_text = element.findtext('.//p')
65 self.assertEqual(1, len(p_text))
66 self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
67 p_text)
68
76
84
89
96
# A tag whose local name is empty -- with an empty or a non-empty
# namespace part -- must raise ValueError, both when creating an element
# and when assigning to ``.tag`` on an existing one.
make_element = self.etree.HTMLParser().makeelement
node = make_element('name')

for bad_tag in ('{}', '{test}'):
    self.assertRaises(ValueError, make_element, bad_tag)
    self.assertRaises(ValueError, setattr, node, 'tag', bad_tag)
107
121
# Tag names containing a single or double quote must raise ValueError.
make_element = self.etree.HTMLParser().makeelement

for bad_name in ('p"name', "na'me", '{test}"name', "{test}name'"):
    self.assertRaises(ValueError, make_element, bad_name)

node = make_element('name')
for bad_name in ("pname'", '"pname'):
    self.assertRaises(ValueError, setattr, node, 'tag', bad_name)

# The rejected assignments must leave the original tag untouched.
self.assertEqual(node.tag, "name")
135
# Tag names containing spaces must raise ValueError.
make_element = self.etree.HTMLParser().makeelement

for bad_name in (' name ', 'na me', '{test} name'):
    self.assertRaises(ValueError, make_element, bad_name)

node = make_element('name')
self.assertRaises(ValueError, setattr, node, 'tag', ' name ')

# The rejected assignment must leave the original tag untouched.
self.assertEqual(node.tag, "name")
147
157
169
# SubElement() must raise ValueError for child tag names that contain
# a quote character.
make_child = self.etree.SubElement
parent = self.etree.HTMLParser().makeelement('name')

for bad_name in ("name'", 'na"me', "{test}na'me", '{test}"name'):
    self.assertRaises(ValueError, make_child, parent, bad_name)
180
190
197
205
215
# Build a document that is encoded as ISO-8859-1 although its <meta> tag
# declares charset=UTF-8.
217 text = _str('Søk på nettet')
218 wrong_head = _str('''
219 <head>
220 <meta http-equiv="Content-Type"
221 content="text/html; charset=UTF-8" />
222 </head>''')
223 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
224 text)
225 ).encode('iso-8859-1')
226
# Parsing the bytes without passing a parser is expected to raise ParseError.
227 self.assertRaises(self.etree.ParseError,
228 self.etree.parse,
229 BytesIO(html_latin1))
230
# With an HTMLParser that is told the real encoding explicitly, the
# paragraph text must come back equal to the original ``text``.
231 tree = self.etree.parse(
232 BytesIO(html_latin1),
233 self.etree.HTMLParser(encoding="iso-8859-1"))
234 p = tree.find("//p")
235 self.assertEqual(p.text, text)
236
241
247
251
264
272
273
274
275
276
277
278
279
286
301
# iterparse(html=True) with default events: expect one 'end' event per
# element, children before their parents.
stream = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')
iterator = self.etree.iterparse(stream, html=True)

# Before iterating, the root is expected to be None.
self.assertEqual(None, iterator.root)

events = list(iterator)
root = iterator.root
self.assertTrue(root is not None)

head, body = root[0], root[1]
expected = [
    ('end', head[0]), ('end', head),
    ('end', body[0]), ('end', body),
    ('end', root),
]
self.assertEqual(expected, events)
317
# Pull only the first two events from the iterator and then drop it.
# The root is expected to stay None while parsing is incomplete.
stream = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')
iterator = self.etree.iterparse(stream, html=True)
self.assertEqual(None, iterator.root)

for expected_tag in ('title', 'head'):
    event, element = next(iterator)
    self.assertEqual('end', event)
    self.assertEqual(expected_tag, element.tag)
    self.assertEqual(None, iterator.root)
    # release the element reference before moving on
    del element

# NOTE(review): presumably exercises cleanup of a partially consumed
# iterator -- confirm intent.
del iterator
338
# Feed malformed markup through iterparse(html=True) and check the
# resulting tree: html > (head, body > p > br).
stream = BytesIO('<head><title>TEST></head><p>P<br></div>')
iterator = self.etree.iterparse(stream, html=True)
self.assertEqual(None, iterator.root)

events = list(iterator)
root = iterator.root
self.assertTrue(root is not None)

self.assertEqual('html', root.tag)
head = root[0]
self.assertEqual('head', head.tag)
body = root[1]
self.assertEqual('body', body.tag)
paragraph = body[0]
self.assertEqual('p', paragraph.tag)
line_break = paragraph[0]
self.assertEqual('br', line_break.tag)

expected = [
    ('end', head[0]), ('end', head), ('end', line_break),
    ('end', paragraph), ('end', body), ('end', root),
]
self.assertEqual(expected, events)
358
364
# Run iterparse(html=True) over the shakespeare.html fixture file and
# check the number and kind of events it yields.
source_path = fileInTestDir("shakespeare.html")
iterator = self.etree.iterparse(source_path, html=True)
self.assertEqual(None, iterator.root)

events = list(iterator)
root = iterator.root
self.assertTrue(root is not None)
self.assertEqual(249, len(events))

# Only 'end' events are expected with the default settings.
non_end_events = [kind for (kind, _element) in events if kind != 'end']
self.assertFalse(non_end_events)
377
# iterparse(html=True, events=('start',)): expect one 'start' event per
# element, parents before their children.
stream = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')
iterator = self.etree.iterparse(stream, html=True, events=('start',))
self.assertEqual(None, iterator.root)

events = list(iterator)
root = iterator.root
self.assertNotEqual(None, root)

head, body = root[0], root[1]
expected = [
    ('start', root),
    ('start', head), ('start', head[0]),
    ('start', body), ('start', body[0]),
]
self.assertEqual(expected, events)
393
404
423
438
# Drive an HTMLParser with a custom target object and check the callbacks
# it receives, plus the value handed back from close().
events = []
expect_no_attributes = self.assertFalse

class RecordingTarget(object):
    # Appends every start/end callback to the enclosing ``events`` list.
    def start(self, tag, attrib):
        events.append(("start", tag))
        expect_no_attributes(attrib)

    def end(self, tag):
        events.append(("end", tag))

    def close(self):
        return "DONE"

parser = self.etree.HTMLParser(target=RecordingTarget())
parser.feed("<html><body></body></html>")
result = parser.close()

# parser.close() must return whatever the target's close() returned.
self.assertEqual("DONE", result)
expected = [
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html"),
]
self.assertEqual(expected, events)
460
# A bare "<!DOCTYPE>" is expected to reach the target's doctype()
# callback with three None arguments.
events = []
expect_no_attributes = self.assertFalse

class RecordingTarget(object):
    # Appends every callback to the enclosing ``events`` list.
    def start(self, tag, attrib):
        events.append(("start", tag))
        expect_no_attributes(attrib)

    def end(self, tag):
        events.append(("end", tag))

    def doctype(self, *args):
        events.append(("doctype", args))

    def close(self):
        return "DONE"

parser = self.etree.HTMLParser(target=RecordingTarget())
parser.feed("<!DOCTYPE><html><body></body></html>")
result = parser.close()

self.assertEqual("DONE", result)
expected = [
    ("doctype", (None, None, None)),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html"),
]
self.assertEqual(expected, events)
484
# "<!DOCTYPE html>" is expected to reach the target's doctype() callback
# as ("html", None, None).
events = []
expect_no_attributes = self.assertFalse

class RecordingTarget(object):
    # Appends every callback to the enclosing ``events`` list.
    def start(self, tag, attrib):
        events.append(("start", tag))
        expect_no_attributes(attrib)

    def end(self, tag):
        events.append(("end", tag))

    def doctype(self, *args):
        events.append(("doctype", args))

    def close(self):
        return "DONE"

parser = self.etree.HTMLParser(target=RecordingTarget())
parser.feed("<!DOCTYPE html><html><body></body></html>")
result = parser.close()

self.assertEqual("DONE", result)
expected = [
    ("doctype", ("html", None, None)),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html"),
]
self.assertEqual(expected, events)
508
# A DOCTYPE carrying both a public ID and a system URL is expected to
# reach the target's doctype() callback with all three values filled in.
events = []
expect_no_attributes = self.assertFalse

class RecordingTarget(object):
    # Appends every callback to the enclosing ``events`` list.
    def start(self, tag, attrib):
        events.append(("start", tag))
        expect_no_attributes(attrib)

    def end(self, tag):
        events.append(("end", tag))

    def doctype(self, *args):
        events.append(("doctype", args))

    def close(self):
        return "DONE"

parser = self.etree.HTMLParser(target=RecordingTarget())
document = ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
            '<html><body></body></html>')
parser.feed(document)
result = parser.close()

self.assertEqual("DONE", result)
expected = [
    ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html"),
]
self.assertEqual(expected, events)
533
# An exception raised inside a target callback must propagate out of
# parser.feed().  The target raises ValueError from start() and TypeError
# from end(); the ValueError from the first start() is the one expected.
events = []

class RaisingTarget(object):
    # Records each callback, then raises from it.
    def start(self, tag, attrib):
        events.append(("start", tag))
        raise ValueError("START")

    def end(self, tag):
        events.append(("end", tag))
        raise TypeError("END")

    def close(self):
        return "DONE"

parser = self.etree.HTMLParser(target=RaisingTarget())
try:
    parser.feed('<html><body>')
    parser.feed('</body></html>')
except ValueError as exc:
    # unittest assertions instead of bare ``assert`` so the check is not
    # compiled away when Python runs with -O.
    self.assertTrue("START" in str(exc), str(exc))
except TypeError as exc:
    self.assertTrue("END" in str(exc), str(exc))
    self.fail("wrong exception raised")
else:
    self.fail("no exception raised")

# The first start() call was recorded; end() for <html> was never reached.
self.assertTrue(("start", "html") in events, events)
self.assertTrue(("end", "html") not in events, events)
560
# An exception raised inside a target callback must propagate out of
# etree.fromstring().  The target raises ValueError from start() and
# TypeError from end(); the ValueError from the first start() is the one
# expected.
events = []

class RaisingTarget(object):
    # Records each callback, then raises from it.
    def start(self, tag, attrib):
        events.append(("start", tag))
        raise ValueError("START")

    def end(self, tag):
        events.append(("end", tag))
        raise TypeError("END")

    def close(self):
        return "DONE"

parser = self.etree.HTMLParser(target=RaisingTarget())
try:
    self.etree.fromstring('<html><body></body></html>', parser)
except ValueError as exc:
    # unittest assertions instead of bare ``assert`` so the check is not
    # compiled away when Python runs with -O.
    self.assertTrue("START" in str(exc), str(exc))
except TypeError as exc:
    self.assertTrue("END" in str(exc), str(exc))
    self.fail("wrong exception raised")
else:
    self.fail("no exception raised")

# The first start() call was recorded; end() for <html> was never reached.
self.assertTrue(("start", "html") in events, events)
self.assertTrue(("end", "html") not in events, events)
586
# Create a lone <html> element, take its tree, and assign a public ID and
# a system URL through the tree's docinfo.
588 doc = html.Element('html').getroottree()
589 doc.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
590 doc.docinfo.system_url = \
591 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
# docinfo.doctype is expected to render both values in one declaration ...
592 self.assertEqual(doc.docinfo.doctype,
593 '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">')
# ... and serialising the tree is expected to emit that declaration on the
# line before the root element.
594 self.assertEqual(self.etree.tostring(doc),
595 _bytes('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
596 <html xmlns="http://www.w3.org/1999/xhtml"></html>'''))
597
608
618
628
634
640
641
643 suite = unittest.TestSuite()
644 suite.addTests([unittest.makeSuite(HtmlParserTestCase)])
645 return suite
646
647
# Running this file directly only prints a hint pointing at test.py.
if __name__ == '__main__':
    usage_hint = 'to test use test.py %s' % __file__
    print(usage_hint)
650