1
2
3 """
4 HTML parser test cases for etree
5 """
6
7 from __future__ import absolute_import
8
9 import unittest
10 import tempfile, os, os.path, sys
11
12 from .common_imports import etree, html, BytesIO, fileInTestDir, _bytes, _str
13 from .common_imports import SillyFileLike, HelperTestCase, write_to_file, next
14
# Make the name ``unicode`` usable everywhere: when the builtin does not
# exist (the lookup raises NameError), alias it to ``str``.
try:
    unicode
except NameError:
    unicode = str
19
20
22 """HTML parser test cases
23 """
# Re-bind the imported parser module under the same name; the test bodies
# below reach it as ``self.etree``.
etree = etree

# Compact, well-formed document.
html_str = _bytes(
    "<html><head><title>test</title></head>"
    "<body><h1>page title</h1></body></html>")

# The same markup spread over several lines, ending with a newline.
html_str_pretty = _bytes(
    "<html>\n"
    "<head><title>test</title></head>\n"
    "<body><h1>page title</h1></body>\n"
    "</html>\n")

# Malformed markup: <title> is never closed, <h1> is closed by </h3>,
# and a stray </p> follows.
broken_html_str = _bytes(
    "<html><head><title>test"
    "<body><h1>page title</h3></p></html>")

# Document containing a non-ASCII character, decoded from UTF-8.
uhtml_str = _bytes(
    "<html><head><title>test á</title></head>"
    "<body><h1>page á title</h1></body></html>"
).decode('utf8')
38
42
47
55
# The checks only run when sys.maxunicode reaches 0x10FFFF (1114111);
# on smaller builds nothing is asserted.
if sys.maxunicode >= 1114111:
    markup = _bytes(
        '<html><body><p>\\U00026007</p></body></html>'
    ).decode('unicode_escape')
    root = self.etree.HTML(markup)
    found = root.findtext('.//p')
    # The paragraph must hold exactly one character ...
    self.assertEqual(1, len(found))
    # ... and it must be the escaped code point itself.
    expected = _bytes('\\U00026007').decode('unicode_escape')
    self.assertEqual(expected, found)
66
75
84
89
96
# Tag names consisting only of a "{...}" part, with nothing after the
# closing brace, must raise ValueError on creation and on assignment.
make = self.etree.HTMLParser().makeelement

node = make('name')
for bad_tag in ('{}', '{test}'):
    self.assertRaises(ValueError, make, bad_tag)
    self.assertRaises(ValueError, setattr, node, 'tag', bad_tag)
107
121
# Single or double quote characters anywhere in a tag name must raise
# ValueError when creating an element.
make = self.etree.HTMLParser().makeelement
for bad_tag in ('p"name', "na'me", '{test}"name', "{test}name'"):
    self.assertRaises(ValueError, make, bad_tag)

# Assigning such a name is refused as well and leaves the tag untouched.
node = make('name')
for bad_tag in ("pname'", '"pname'):
    self.assertRaises(ValueError, setattr, node, 'tag', bad_tag)
self.assertEqual(node.tag, "name")
135
# Leading, trailing or embedded spaces in a tag name must raise
# ValueError when creating an element.
make = self.etree.HTMLParser().makeelement
for bad_tag in (' name ', 'na me', '{test} name'):
    self.assertRaises(ValueError, make, bad_tag)

# A rejected assignment must not modify the existing tag.
node = make('name')
self.assertRaises(ValueError, setattr, node, 'tag', ' name ')
self.assertEqual(node.tag, "name")
147
157
169
# SubElement() must refuse child tag names containing quote characters.
make = self.etree.HTMLParser().makeelement
add_child = self.etree.SubElement

parent = make('name')
for bad_tag in ("name'", 'na"me', "{test}na'me", '{test}"name'):
    self.assertRaises(ValueError, add_child, parent, bad_tag)
180
190
197
205
215
text = _str('Søk på nettet')
# The <meta> element announces UTF-8, but the document is encoded as
# ISO-8859-1 below, so the declaration is deliberately wrong.
wrong_head = _str('''
<head>
<meta http-equiv="Content-Type"
content="text/html; charset=UTF-8" />
</head>''')
template = _str('<html>%s<body><p>%s</p></body></html>')
html_latin1 = (template % (wrong_head, text)).encode('iso-8859-1')

# Parsing without an explicit parser must fail with ParseError.
self.assertRaises(
    self.etree.ParseError, self.etree.parse, BytesIO(html_latin1))

# An HTML parser given the real encoding must recover the text.
latin1_parser = self.etree.HTMLParser(encoding="iso-8859-1")
tree = self.etree.parse(BytesIO(html_latin1), latin1_parser)
paragraph = tree.find("//p")
self.assertEqual(paragraph.text, text)
236
241
247
251
263
271
272
273
274
275
276
277
278
285
300
source = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')
walker = self.etree.iterparse(source, html=True)
# Before any event has been consumed the root is still None.
self.assertEqual(None, walker.root)

seen = list(walker)
root = walker.root
self.assertTrue(root is not None)

# Default events are 'end' only, children before their parents.
head, body = root[0], root[1]
expected = [('end', node) for node in (head[0], head, body[0], body, root)]
self.assertEqual(expected, seen)
316
source = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')
# Restrict reported events to the <p> and <title> elements.
walker = self.etree.iterparse(source, html=True, tag=["p", "title"])
self.assertEqual(None, walker.root)

seen = list(walker)
root = walker.root
self.assertTrue(root is not None)

title, paragraph = root[0][0], root[1][0]
self.assertEqual([('end', title), ('end', paragraph)], seen)
331
source = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')
walker = self.etree.iterparse(source, html=True)
self.assertEqual(None, walker.root)

# Consume only the first two events; the root must still be None after
# each of them. Element and iterator references are dropped explicitly
# while the iteration is unfinished.
for expected_tag in ('title', 'head'):
    event, element = next(walker)
    self.assertEqual('end', event)
    self.assertEqual(expected_tag, element.tag)
    self.assertEqual(None, walker.root)
    del element
del walker
352
# Input without an <html> element, with an unterminated <title> and a
# closing </div> that was never opened.
source = BytesIO('<head><title>TEST></head><p>P<br></div>')
walker = self.etree.iterparse(source, html=True)
self.assertEqual(None, walker.root)

seen = list(walker)
root = walker.root
self.assertTrue(root is not None)

# The resulting tree is expected to be html > (head, body > p > br).
self.assertEqual('html', root.tag)
head = root[0]
self.assertEqual('head', head.tag)
body = root[1]
self.assertEqual('body', body.tag)
paragraph = body[0]
self.assertEqual('p', paragraph.tag)
line_break = paragraph[0]
self.assertEqual('br', line_break.tag)

expected = [
    ('end', node)
    for node in (head[0], head, line_break, paragraph, body, root)]
self.assertEqual(expected, seen)
372
378
# Iterate over a document read from the test directory by file name.
walker = self.etree.iterparse(fileInTestDir("shakespeare.html"), html=True)
self.assertEqual(None, walker.root)

seen = list(walker)
root = walker.root
self.assertTrue(root is not None)
self.assertEqual(249, len(seen))

# Every reported event must be an 'end' event.
unexpected = [kind for (kind, _node) in seen if kind != 'end']
self.assertFalse(unexpected)
391
source = BytesIO(
    '<html><head><title>TITLE</title><body><p>P</p></body></html>')
# Ask for 'start' events only.
walker = self.etree.iterparse(source, html=True, events=('start',))
self.assertEqual(None, walker.root)

seen = list(walker)
root = walker.root
self.assertNotEqual(None, root)

# 'start' events arrive in document order, parents before children.
head, body = root[0], root[1]
expected = [
    ('start', node) for node in (root, head, head[0], body, body[0])]
self.assertEqual(expected, seen)
407
418
437
452
check_empty = self.assertFalse
log = []


class Target(object):
    """Parser target that records start/end events in ``log``."""

    def start(self, tag, attrib):
        log.append(("start", tag))
        # None of the fed elements carries attributes.
        check_empty(attrib)

    def end(self, tag):
        log.append(("end", tag))

    def close(self):
        return "DONE"


parser = self.etree.HTMLParser(target=Target())
parser.feed("<html><body></body></html>")
result = parser.close()

# close() on the parser hands back whatever the target's close() returned.
self.assertEqual("DONE", result)
expected = [
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html")]
self.assertEqual(expected, log)
474
check_empty = self.assertFalse
log = []


class Target(object):
    """Parser target that also records doctype notifications."""

    def start(self, tag, attrib):
        log.append(("start", tag))
        check_empty(attrib)

    def end(self, tag):
        log.append(("end", tag))

    def doctype(self, *args):
        log.append(("doctype", args))

    def close(self):
        return "DONE"


parser = self.etree.HTMLParser(target=Target())
parser.feed("<!DOCTYPE><html><body></body></html>")
result = parser.close()

self.assertEqual("DONE", result)
# A bare <!DOCTYPE> is expected to be reported with three None values.
expected = [
    ("doctype", (None, None, None)),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html")]
self.assertEqual(expected, log)
498
check_empty = self.assertFalse
log = []


class Target(object):
    """Parser target that also records doctype notifications."""

    def start(self, tag, attrib):
        log.append(("start", tag))
        check_empty(attrib)

    def end(self, tag):
        log.append(("end", tag))

    def doctype(self, *args):
        log.append(("doctype", args))

    def close(self):
        return "DONE"


parser = self.etree.HTMLParser(target=Target())
parser.feed("<!DOCTYPE html><html><body></body></html>")
result = parser.close()

self.assertEqual("DONE", result)
# Only the first doctype value is filled in; the other two stay None.
expected = [
    ("doctype", ("html", None, None)),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html")]
self.assertEqual(expected, log)
522
check_empty = self.assertFalse
log = []


class Target(object):
    """Parser target that also records doctype notifications."""

    def start(self, tag, attrib):
        log.append(("start", tag))
        check_empty(attrib)

    def end(self, tag):
        log.append(("end", tag))

    def doctype(self, *args):
        log.append(("doctype", args))

    def close(self):
        return "DONE"


parser = self.etree.HTMLParser(target=Target())
parser.feed(
    '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
    '<html><body></body></html>')
result = parser.close()

self.assertEqual("DONE", result)
# All three doctype values are reported: the name, the quoted PUBLIC
# string and the quoted string that follows it.
expected = [
    ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
    ("start", "html"), ("start", "body"),
    ("end", "body"), ("end", "html")]
self.assertEqual(expected, log)
547
events = []


class Target(object):
    """Parser target whose start() and end() callbacks always raise."""

    def start(self, tag, attrib):
        events.append(("start", tag))
        raise ValueError("START")

    def end(self, tag):
        events.append(("end", tag))
        raise TypeError("END")

    def close(self):
        return "DONE"


parser = self.etree.HTMLParser(target=Target())
try:
    parser.feed('<html><body>')
    parser.feed('</body></html>')
except ValueError as exc:
    # Expected path: the error raised by start() reaches the caller.
    # unittest assertions are used so the check also runs under ``-O``.
    self.assertTrue("START" in str(exc), str(exc))
except TypeError as exc:
    # The end() error must never be the one that surfaces.
    self.assertTrue("END" in str(exc), str(exc))
    self.fail("wrong exception raised")
else:
    self.fail("no exception raised")

# The first start event was recorded; the closing </html> was never
# reported to the target.
self.assertTrue(("start", "html") in events, events)
self.assertTrue(("end", "html") not in events, events)
574
events = []


class Target(object):
    """Parser target whose start() and end() callbacks always raise."""

    def start(self, tag, attrib):
        events.append(("start", tag))
        raise ValueError("START")

    def end(self, tag):
        events.append(("end", tag))
        raise TypeError("END")

    def close(self):
        return "DONE"


parser = self.etree.HTMLParser(target=Target())
try:
    # Same scenario as incremental feeding, driven through fromstring().
    self.etree.fromstring('<html><body></body></html>', parser)
except ValueError as exc:
    # Expected path: the error raised by start() reaches the caller.
    # unittest assertions are used so the check also runs under ``-O``.
    self.assertTrue("START" in str(exc), str(exc))
except TypeError as exc:
    # The end() error must never be the one that surfaces.
    self.assertTrue("END" in str(exc), str(exc))
    self.fail("wrong exception raised")
else:
    self.fail("no exception raised")

# The first start event was recorded; the closing </html> was never
# reported to the target.
self.assertTrue(("start", "html") in events, events)
self.assertTrue(("end", "html") not in events, events)
600
# Expected DOCTYPE declaration once both identifiers have been assigned.
doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'

tree = html.Element('html').getroottree()
tree.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
tree.docinfo.system_url = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"

# The declaration is visible through docinfo ...
self.assertEqual(tree.docinfo.doctype, doctype)
# ... and is emitted on its own line ahead of the serialized root.
serialized = _bytes(
    doctype + '\n<html xmlns="http://www.w3.org/1999/xhtml"></html>')
self.assertEqual(self.etree.tostring(tree), serialized)
611
622
632
642
648
654
655
657 suite = unittest.TestSuite()
658 suite.addTests([unittest.makeSuite(HtmlParserTestCase)])
659 return suite
660
661
# Running this file directly executes no tests; it only prints a hint
# naming the runner script to use.
if __name__ == '__main__':
    hint = 'to test use test.py %s' % __file__
    print(hint)
664