1
2
3 """
4 HTML parser test cases for etree
5 """
6
7 import unittest
8 import tempfile, os, os.path, sys
9
10 this_dir = os.path.dirname(__file__)
11 if this_dir not in sys.path:
12 sys.path.insert(0, this_dir)
13
14 from common_imports import etree, html, StringIO, BytesIO, fileInTestDir, _bytes, _str
15 from common_imports import SillyFileLike, HelperTestCase, write_to_file, next
16
# Python 3 has no ``unicode`` builtin; alias the name to ``str`` there so the
# rest of this module can refer to ``unicode`` on either major version.
# On Python 2 the builtin is left untouched.
if sys.version_info[0] >= 3:
    unicode = str
21
22
24 """HTML parser test cases
25 """
26 etree = etree
27
28 html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
29 html_str_pretty = _bytes("""\
30 <html>
31 <head><title>test</title></head>
32 <body><h1>page title</h1></body>
33 </html>
34 """)
35 broken_html_str = _bytes("<html><head><title>test"
36 "<body><h1>page title</h3></p></html>")
37 uhtml_str = _bytes(
38 "<html><head><title>test á</title></head>"
39 "<body><h1>page á title</h1></body></html>").decode('utf8')
40
44
49
57
59 if sys.maxunicode < 1114111:
60 return
61 element = self.etree.HTML(_bytes(
62 '<html><body><p>\\U00026007</p></body></html>'
63 ).decode('unicode_escape'))
64 p_text = element.findtext('.//p')
65 self.assertEqual(1, len(p_text))
66 self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
67 p_text)
68
73
80
82 parser = self.etree.HTMLParser()
83 Element = parser.makeelement
84
85 el = Element('name')
86 self.assertRaises(ValueError, Element, '{}')
87 self.assertRaises(ValueError, setattr, el, 'tag', '{}')
88
89 self.assertRaises(ValueError, Element, '{test}')
90 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
91
105
107 parser = self.etree.HTMLParser()
108 Element = parser.makeelement
109
110 self.assertRaises(ValueError, Element, 'p"name')
111 self.assertRaises(ValueError, Element, "na'me")
112 self.assertRaises(ValueError, Element, '{test}"name')
113 self.assertRaises(ValueError, Element, "{test}name'")
114
115 el = Element('name')
116 self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
117 self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
118 self.assertEqual(el.tag, "name")
119
121 parser = self.etree.HTMLParser()
122 Element = parser.makeelement
123
124 self.assertRaises(ValueError, Element, ' name ')
125 self.assertRaises(ValueError, Element, 'na me')
126 self.assertRaises(ValueError, Element, '{test} name')
127
128 el = Element('name')
129 self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
130 self.assertEqual(el.tag, "name")
131
141
153
155 parser = self.etree.HTMLParser()
156 Element = parser.makeelement
157 SubElement = self.etree.SubElement
158
159 el = Element('name')
160 self.assertRaises(ValueError, SubElement, el, "name'")
161 self.assertRaises(ValueError, SubElement, el, 'na"me')
162 self.assertRaises(ValueError, SubElement, el, "{test}na'me")
163 self.assertRaises(ValueError, SubElement, el, '{test}"name')
164
174
181
191
193 text = _str('Søk på nettet')
194 wrong_head = _str('''
195 <head>
196 <meta http-equiv="Content-Type"
197 content="text/html; charset=UTF-8" />
198 </head>''')
199 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
200 text)
201 ).encode('iso-8859-1')
202
203 self.assertRaises(self.etree.ParseError,
204 self.etree.parse,
205 BytesIO(html_latin1))
206
207 tree = self.etree.parse(
208 BytesIO(html_latin1),
209 self.etree.HTMLParser(encoding="iso-8859-1"))
210 p = tree.find("//p")
211 self.assertEqual(p.text, text)
212
217
223
227
240
248
249
250
251
252
253
254
255
262
277
279 iterparse = self.etree.iterparse
280 f = BytesIO(
281 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
282
283 iterator = iterparse(f, html=True)
284 self.assertEqual(None, iterator.root)
285
286 events = list(iterator)
287 root = iterator.root
288 self.assertTrue(root is not None)
289 self.assertEqual(
290 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
291 ('end', root[1]), ('end', root)],
292 events)
293
295 iterparse = self.etree.iterparse
296 f = BytesIO(
297 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
298
299 iterator = iterparse(f, html=True)
300 self.assertEqual(None, iterator.root)
301
302 event, element = next(iterator)
303 self.assertEqual('end', event)
304 self.assertEqual('title', element.tag)
305 self.assertEqual(None, iterator.root)
306 del element
307
308 event, element = next(iterator)
309 self.assertEqual('end', event)
310 self.assertEqual('head', element.tag)
311 self.assertEqual(None, iterator.root)
312 del element
313 del iterator
314
316 iterparse = self.etree.iterparse
317 f = BytesIO('<head><title>TEST></head><p>P<br></div>')
318
319 iterator = iterparse(f, html=True)
320 self.assertEqual(None, iterator.root)
321
322 events = list(iterator)
323 root = iterator.root
324 self.assertTrue(root is not None)
325 self.assertEqual('html', root.tag)
326 self.assertEqual('head', root[0].tag)
327 self.assertEqual('body', root[1].tag)
328 self.assertEqual('p', root[1][0].tag)
329 self.assertEqual('br', root[1][0][0].tag)
330 self.assertEqual(
331 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]),
332 ('end', root[1][0]), ('end', root[1]), ('end', root)],
333 events)
334
340
353
355 iterparse = self.etree.iterparse
356 f = BytesIO(
357 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
358
359 iterator = iterparse(f, html=True, events=('start',))
360 self.assertEqual(None, iterator.root)
361
362 events = list(iterator)
363 root = iterator.root
364 self.assertNotEqual(None, root)
365 self.assertEqual(
366 [('start', root), ('start', root[0]), ('start', root[0][0]),
367 ('start', root[1]), ('start', root[1][0])],
368 events)
369
380
399
414
422 def end(self, tag):
423 events.append(("end", tag))
424 def close(self):
425 return "DONE"
426
427 parser = self.etree.HTMLParser(target=Target())
428
429 parser.feed("<html><body></body></html>")
430 done = parser.close()
431
432 self.assertEqual("DONE", done)
433 self.assertEqual([
434 ("start", "html"), ("start", "body"),
435 ("end", "body"), ("end", "html")], events)
436
444 def end(self, tag):
445 events.append(("end", tag))
446 def doctype(self, *args):
447 events.append(("doctype", args))
448 def close(self):
449 return "DONE"
450
451 parser = self.etree.HTMLParser(target=Target())
452 parser.feed("<!DOCTYPE><html><body></body></html>")
453 done = parser.close()
454
455 self.assertEqual("DONE", done)
456 self.assertEqual([
457 ("doctype", (None, None, None)),
458 ("start", "html"), ("start", "body"),
459 ("end", "body"), ("end", "html")], events)
460
468 def end(self, tag):
469 events.append(("end", tag))
470 def doctype(self, *args):
471 events.append(("doctype", args))
472 def close(self):
473 return "DONE"
474
475 parser = self.etree.HTMLParser(target=Target())
476 parser.feed("<!DOCTYPE html><html><body></body></html>")
477 done = parser.close()
478
479 self.assertEqual("DONE", done)
480 self.assertEqual([
481 ("doctype", ("html", None, None)),
482 ("start", "html"), ("start", "body"),
483 ("end", "body"), ("end", "html")], events)
484
492 def end(self, tag):
493 events.append(("end", tag))
494 def doctype(self, *args):
495 events.append(("doctype", args))
496 def close(self):
497 return "DONE"
498
499 parser = self.etree.HTMLParser(target=Target())
500 parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
501 '<html><body></body></html>')
502 done = parser.close()
503
504 self.assertEqual("DONE", done)
505 self.assertEqual([
506 ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
507 ("start", "html"), ("start", "body"),
508 ("end", "body"), ("end", "html")], events)
509
511 events = []
class Target(object):
    """Parser target whose ``start``/``end`` callbacks record, then raise.

    Each callback appends an ``(event, tag)`` tuple to the enclosing
    ``events`` list *before* raising, so the caller can inspect which
    callbacks were reached even though they failed.
    """

    def start(self, tag, attrib):
        """Record ``("start", tag)`` and raise ``ValueError("START")``."""
        entry = ("start", tag)
        events.append(entry)
        raise ValueError("START")

    def end(self, tag):
        """Record ``("end", tag)`` and raise ``TypeError("END")``."""
        entry = ("end", tag)
        events.append(entry)
        raise TypeError("END")

    def close(self):
        """Return the fixed result string ``"DONE"``."""
        return "DONE"
521
522 parser = self.etree.HTMLParser(target=Target())
523 try:
524 parser.feed('<html><body>')
525 parser.feed('</body></html>')
526 except ValueError as exc:
527 assert "START" in str(exc)
528 except TypeError as exc:
529 assert "END" in str(exc)
530 self.assertTrue(False, "wrong exception raised")
531 else:
532 self.assertTrue(False, "no exception raised")
533
534 self.assertTrue(("start", "html") in events, events)
535 self.assertTrue(("end", "html") not in events, events)
536
538 events = []
class Target(object):
    """Parser target that logs every callback and then fails on purpose.

    ``start`` and ``end`` push an ``(event, tag)`` tuple onto the enclosing
    ``events`` list and then raise, each with its own exception type and
    message, which lets the caller tell which callback failed.
    """

    def start(self, tag, attrib):
        """Log the start event, then raise ``ValueError("START")``."""
        seen = ("start", tag)
        events.append(seen)
        raise ValueError("START")

    def end(self, tag):
        """Log the end event, then raise ``TypeError("END")``."""
        seen = ("end", tag)
        events.append(seen)
        raise TypeError("END")

    def close(self):
        """Return the constant ``"DONE"``."""
        return "DONE"
548
549 parser = self.etree.HTMLParser(target=Target())
550 try:
551 self.etree.fromstring('<html><body></body></html>', parser)
552 except ValueError as exc:
553 assert "START" in str(exc), str(exc)
554 except TypeError as exc:
555 assert "END" in str(exc), str(exc)
556 self.assertTrue(False, "wrong exception raised")
557 else:
558 self.assertTrue(False, "no exception raised")
559
560 self.assertTrue(("start", "html") in events, events)
561 self.assertTrue(("end", "html") not in events, events)
562
564 doc = html.Element('html').getroottree()
565 doc.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
566 doc.docinfo.system_url = \
567 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
568 self.assertEqual(doc.docinfo.doctype,
569 '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">')
570 self.assertEqual(self.etree.tostring(doc),
571 _bytes('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
572 <html xmlns="http://www.w3.org/1999/xhtml"></html>'''))
573
584
594
595
597 suite = unittest.TestSuite()
598 suite.addTests([unittest.makeSuite(HtmlParserTestCase)])
599 return suite
600
601
if __name__ == '__main__':
    # Running this module directly only prints a hint naming test.py and
    # this file's path; no tests are executed from here.
    hint = 'to test use test.py %s' % __file__
    print(hint)
604