Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames] | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  from __future__ import absolute_import 
  8   
  9  import unittest 
 10  import tempfile, os, os.path, sys 
 11   
 12  from .common_imports import etree, html, BytesIO, fileInTestDir, _bytes, _str 
 13  from .common_imports import SillyFileLike, HelperTestCase, write_to_file, next 
 14   
 15  try: 
 16      unicode 
 17  except NameError: 
 18      unicode = str 
 19   
 20   
class HtmlParserTestCase(HelperTestCase):
    """Test cases for the HTML parser exposed through etree."""

    # Module under test, reached as ``self.etree`` from the test methods.
    etree = etree

    # Well-formed reference document, serialised on a single line.
    html_str = _bytes(
        "<html><head><title>test</title></head>"
        "<body><h1>page title</h1></body></html>")

    # The same document as expected from a pretty-printed HTML serialisation.
    html_str_pretty = _bytes(
        "<html>\n"
        "<head><title>test</title></head>\n"
        "<body><h1>page title</h1></body>\n"
        "</html>\n")

    # Malformed input: unclosed <title>, no </head>, mismatched </h3>,
    # stray </p>.
    broken_html_str = _bytes(
        "<html><head><title>test"
        "<body><h1>page title</h3></p></html>")

    # Unicode variant of the reference document with a non-ASCII character.
    uhtml_str = _bytes(
        "<html><head><title>test á</title></head>"
        "<body><h1>page á title</h1></body></html>"
    ).decode('utf8')
def tearDown(self):
    # Run the base-class cleanup first, then call set_default_parser()
    # without arguments so a parser installed by one test is not left
    # in place for the next one.
    super(HtmlParserTestCase, self).tearDown()
    etree_module = self.etree
    etree_module.set_default_parser()
42
def test_module_HTML(self):
    # Parsing the well-formed document and serialising it as HTML must
    # give back the original bytes.
    root = self.etree.HTML(self.html_str)
    serialised = self.etree.tostring(root, method="html")
    self.assertEqual(serialised, self.html_str)
47
def test_module_HTML_unicode(self):
    # Round-trip the unicode document and check that the non-ASCII
    # character survives in the element text.
    root = self.etree.HTML(self.uhtml_str)
    as_text = self.etree.tostring(root, method="html", encoding='unicode')
    self.assertEqual(as_text, self.uhtml_str)

    heading_text = root.findtext('.//h1')
    self.assertEqual(heading_text, _bytes("page á title").decode('utf8'))
55
def test_wide_unicode_xml(self):
    # Only meaningful when the interpreter's maxunicode covers the full
    # range; otherwise return early (the test then passes vacuously).
    if sys.maxunicode < 1114111:
        return  # skip test

    markup = _bytes(
        '<html><body><p>\\U00026007</p></body></html>'
    ).decode('unicode_escape')
    paragraph_text = self.etree.HTML(markup).findtext('.//p')

    # The character must come back as one code point, unchanged.
    self.assertEqual(1, len(paragraph_text))
    self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
                     paragraph_text)
66
def test_html_ids(self):
    # Elements carrying id attributes must be reachable by attribute
    # predicate through both xpath() and findall().
    strict_parser = self.etree.HTMLParser(recover=False)
    root = self.etree.fromstring('''
            <html><body id="bodyID"><p id="pID"></p></body></html>
        ''', parser=strict_parser)

    via_xpath = root.xpath('//p[@id="pID"]')
    self.assertEqual(len(via_xpath), 1)
    via_findall = root.findall('.//p[@id="pID"]')
    self.assertEqual(len(via_findall), 1)
75
77 parser = self.etree.HTMLParser(recover=False, collect_ids=False) 78 fromstring = self.etree.fromstring 79 html = fromstring(''' 80 <html><body id="bodyID"><p id="pID"></p></body></html> 81 ''', parser=parser) 82 self.assertEqual(len(html.xpath('//p[@id="pID"]')), 1) 83 self.assertEqual(len(html.findall('.//p[@id="pID"]')), 1)
84
86 element = self.etree.HTML(self.html_str) 87 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 88 self.html_str_pretty)
89
91 parser = self.etree.HTMLParser(recover=False) 92 parse = self.etree.parse 93 f = BytesIO("<html></body>") 94 self.assertRaises(self.etree.XMLSyntaxError, 95 parse, f, parser)
96
98 parser = self.etree.HTMLParser() 99 Element = parser.makeelement 100 101 el = Element('name') 102 self.assertRaises(ValueError, Element, '{}') 103 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 104 105 self.assertRaises(ValueError, Element, '{test}') 106 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
107
109 parser = self.etree.HTMLParser() 110 Element = parser.makeelement 111 112 pname = Element('p:name') 113 self.assertEqual(pname.tag, 'p:name') 114 115 pname = Element('{test}p:name') 116 self.assertEqual(pname.tag, '{test}p:name') 117 118 pname = Element('name') 119 pname.tag = 'p:name' 120 self.assertEqual(pname.tag, 'p:name')
121
123 parser = self.etree.HTMLParser() 124 Element = parser.makeelement 125 126 self.assertRaises(ValueError, Element, 'p"name') 127 self.assertRaises(ValueError, Element, "na'me") 128 self.assertRaises(ValueError, Element, '{test}"name') 129 self.assertRaises(ValueError, Element, "{test}name'") 130 131 el = Element('name') 132 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 133 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 134 self.assertEqual(el.tag, "name")
135
137 parser = self.etree.HTMLParser() 138 Element = parser.makeelement 139 140 self.assertRaises(ValueError, Element, ' name ') 141 self.assertRaises(ValueError, Element, 'na me') 142 self.assertRaises(ValueError, Element, '{test} name') 143 144 el = Element('name') 145 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 146 self.assertEqual(el.tag, "name")
147
149 parser = self.etree.HTMLParser() 150 Element = parser.makeelement 151 152 SubElement = self.etree.SubElement 153 154 el = Element('name') 155 self.assertRaises(ValueError, SubElement, el, '{}') 156 self.assertRaises(ValueError, SubElement, el, '{test}')
157
159 parser = self.etree.HTMLParser() 160 Element = parser.makeelement 161 SubElement = self.etree.SubElement 162 163 el = Element('name') 164 pname = SubElement(el, 'p:name') 165 self.assertEqual(pname.tag, 'p:name') 166 167 pname = SubElement(el, '{test}p:name') 168 self.assertEqual(pname.tag, '{test}p:name')
169
171 parser = self.etree.HTMLParser() 172 Element = parser.makeelement 173 SubElement = self.etree.SubElement 174 175 el = Element('name') 176 self.assertRaises(ValueError, SubElement, el, "name'") 177 self.assertRaises(ValueError, SubElement, el, 'na"me') 178 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 179 self.assertRaises(ValueError, SubElement, el, '{test}"name')
180
182 parser = self.etree.HTMLParser() 183 Element = parser.makeelement 184 SubElement = self.etree.SubElement 185 186 el = Element('name') 187 self.assertRaises(ValueError, SubElement, el, ' name ') 188 self.assertRaises(ValueError, SubElement, el, 'na me') 189 self.assertRaises(ValueError, SubElement, el, '{test} name')
190
192 parser = self.etree.HTMLParser(recover=False) 193 parse = self.etree.parse 194 f = BytesIO(self.broken_html_str) 195 self.assertRaises(self.etree.XMLSyntaxError, 196 parse, f, parser)
197
199 parser = self.etree.HTMLParser(default_doctype=False) 200 d = html.fromstring('<!DOCTYPE html><h1>S</h1></html>', parser=parser) 201 self.assertEqual(d.getroottree().docinfo.doctype, '<!DOCTYPE html>') 202 203 d = html.fromstring('<html><h1>S</h1></html>', parser=parser) 204 self.assertEqual(d.getroottree().docinfo.doctype, '')
205
207 text = _str('Søk på nettet') 208 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 209 210 tree = self.etree.parse( 211 BytesIO(html_latin1), 212 self.etree.HTMLParser(encoding="iso-8859-1")) 213 p = tree.find("//p") 214 self.assertEqual(p.text, text)
215
217 text = _str('Søk på nettet') 218 wrong_head = _str(''' 219 <head> 220 <meta http-equiv="Content-Type" 221 content="text/html; charset=UTF-8" /> 222 </head>''') 223 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 224 text) 225 ).encode('iso-8859-1') 226 227 self.assertRaises(self.etree.ParseError, 228 self.etree.parse, 229 BytesIO(html_latin1)) 230 231 tree = self.etree.parse( 232 BytesIO(html_latin1), 233 self.etree.HTMLParser(encoding="iso-8859-1")) 234 p = tree.find("//p") 235 self.assertEqual(p.text, text)
236
def test_module_HTML_broken(self):
    # The malformed input is expected to serialise to exactly the same
    # bytes as the well-formed reference document.
    repaired = self.etree.HTML(self.broken_html_str)
    serialised = self.etree.tostring(repaired, method="html")
    self.assertEqual(serialised, self.html_str)
241
def test_module_HTML_cdata(self):
    # by default, libxml2 generates CDATA nodes for <script> content
    # (this test exercises a <style> element and checks that its content
    # is exposed as plain text)
    markup = _bytes('<html><head><style>foo</style></head></html>')
    root = self.etree.HTML(markup)
    style_element = root[0][0]
    self.assertEqual(style_element.text, "foo")
247
def test_module_HTML_access(self):
    # Index-based child access: first child of the root, then its first
    # child, must be the <title> element.
    root = self.etree.HTML(self.html_str)
    first_child = root[0]
    self.assertEqual(first_child[0].tag, 'title')
251
def test_module_parse_html(self):
    # Parse the reference document from a real file on disk and check
    # that it serialises back to the same bytes.
    parser = self.etree.HTMLParser()

    # mkstemp() creates the file itself, whereas mktemp() only returns a
    # name and leaves a window in which another process can claim the
    # path.  Only the name is needed here, so the descriptor is closed
    # right away.
    fd, filename = tempfile.mkstemp(suffix=".html")
    os.close(fd)
    try:
        # Writing happens inside the try block so the temporary file is
        # removed even if the write itself fails.
        write_to_file(filename, self.html_str, 'wb')
        with open(filename, 'rb') as f:
            tree = self.etree.parse(f, parser)
        self.assertEqual(
            self.etree.tostring(tree.getroot(), method="html"),
            self.html_str)
    finally:
        os.remove(filename)
263
265 parser = self.etree.HTMLParser() 266 f = SillyFileLike(self.html_str) 267 tree = self.etree.parse(f, parser) 268 html = self.etree.tostring(tree.getroot(), 269 method="html", encoding='UTF-8') 270 self.assertEqual(html, self.html_str)
271 272 ## def test_module_parse_html_filelike_unicode(self): 273 ## parser = self.etree.HTMLParser() 274 ## f = SillyFileLike(self.uhtml_str) 275 ## tree = self.etree.parse(f, parser) 276 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 277 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 278
def test_html_file_error(self):
    # Parsing a path that should not exist must raise IOError.
    html_parser = self.etree.HTMLParser()
    missing_path = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, self.etree.parse, missing_path, html_parser)
285
287 self.assertRaises(self.etree.XMLSyntaxError, 288 self.etree.parse, BytesIO(self.broken_html_str)) 289 290 self.etree.set_default_parser( self.etree.HTMLParser() ) 291 292 tree = self.etree.parse(BytesIO(self.broken_html_str)) 293 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 294 self.html_str) 295 296 self.etree.set_default_parser() 297 298 self.assertRaises(self.etree.XMLSyntaxError, 299 self.etree.parse, BytesIO(self.broken_html_str))
300
def test_html_iterparse(self):
    # With default events, iterparse(html=True) must report one 'end'
    # event per element, in closing order.
    source = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')
    context = self.etree.iterparse(source, html=True)

    # The root is unset before iteration and set afterwards.
    self.assertEqual(None, context.root)
    seen = list(context)
    root = context.root
    self.assertTrue(root is not None)

    closing_order = (root[0][0], root[0], root[1][0], root[1], root)
    expected = [('end', node) for node in closing_order]
    self.assertEqual(expected, seen)
316
def test_html_iterparse_tag(self):
    # With a tag filter, only the listed tags ("p" and "title") are
    # expected to produce events.
    source = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')
    context = self.etree.iterparse(source, html=True, tag=["p", "title"])

    self.assertEqual(None, context.root)
    seen = list(context)
    root = context.root
    self.assertTrue(root is not None)

    title, paragraph = root[0][0], root[1][0]
    self.assertEqual([('end', title), ('end', paragraph)], seen)
331
333 iterparse = self.etree.iterparse 334 f = BytesIO( 335 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 336 337 iterator = iterparse(f, html=True) 338 self.assertEqual(None, iterator.root) 339 340 event, element = next(iterator) 341 self.assertEqual('end', event) 342 self.assertEqual('title', element.tag) 343 self.assertEqual(None, iterator.root) 344 del element 345 346 event, element = next(iterator) 347 self.assertEqual('end', event) 348 self.assertEqual('head', element.tag) 349 self.assertEqual(None, iterator.root) 350 del element 351 del iterator
352
354 iterparse = self.etree.iterparse 355 f = BytesIO('<head><title>TEST></head><p>P<br></div>') 356 357 iterator = iterparse(f, html=True) 358 self.assertEqual(None, iterator.root) 359 360 events = list(iterator) 361 root = iterator.root 362 self.assertTrue(root is not None) 363 self.assertEqual('html', root.tag) 364 self.assertEqual('head', root[0].tag) 365 self.assertEqual('body', root[1].tag) 366 self.assertEqual('p', root[1][0].tag) 367 self.assertEqual('br', root[1][0][0].tag) 368 self.assertEqual( 369 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]), 370 ('end', root[1][0]), ('end', root[1]), ('end', root)], 371 events)
372
374 iterparse = self.etree.iterparse 375 f = BytesIO('<p>P<br></div>') 376 iterator = iterparse(f, html=True, recover=False) 377 self.assertRaises(self.etree.XMLSyntaxError, list, iterator)
378
def test_html_iterparse_file(self):
    # Iterparse a fixture file by path: expect 249 events, all 'end'.
    path = fileInTestDir("shakespeare.html")
    context = self.etree.iterparse(path, html=True)

    self.assertEqual(None, context.root)
    seen = list(context)
    root = context.root
    self.assertTrue(root is not None)
    self.assertEqual(249, len(seen))

    unexpected_kinds = [kind for (kind, _) in seen if kind != 'end']
    self.assertFalse(unexpected_kinds)
391
393 iterparse = self.etree.iterparse 394 f = BytesIO( 395 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 396 397 iterator = iterparse(f, html=True, events=('start',)) 398 self.assertEqual(None, iterator.root) 399 400 events = list(iterator) 401 root = iterator.root 402 self.assertNotEqual(None, root) 403 self.assertEqual( 404 [('start', root), ('start', root[0]), ('start', root[0][0]), 405 ('start', root[1]), ('start', root[1][0])], 406 events)
407
def test_html_feed_parser(self):
    # Feed the document in two chunks, split in the middle of a closing
    # tag, and check the resulting tree.
    feed_parser = self.etree.HTMLParser()
    for chunk in ("<html><body></", "body></html>"):
        feed_parser.feed(chunk)
    root = feed_parser.close()

    self.assertEqual('html', root.tag)
    # test that we find all names in the parser dict
    self.assertEqual([root], list(root.iter('html')))
    body = root[0]
    self.assertEqual([body], list(root.iter('body')))
418
420 parser = self.etree.HTMLParser() 421 parser.feed("<htm") 422 parser.feed("l><body") 423 parser.feed("><") 424 parser.feed("p><") 425 parser.feed("strong") 426 parser.feed(">some ") 427 parser.feed("text</strong></p><") 428 parser.feed("/body></html>") 429 root = parser.close() 430 431 self.assertEqual('html', root.tag) 432 # test that we find all names in the parser dict 433 self.assertEqual([root], list(root.iter('html'))) 434 self.assertEqual([root[0]], list(root.iter('body'))) 435 self.assertEqual([root[0][0]], list(root.iter('p'))) 436 self.assertEqual([root[0][0][0]], list(root.iter('strong')))
437
439 parser = self.etree.HTMLParser() 440 parser.feed('<html><head>') 441 parser.feed('<title>TITLE</title><body><p>P</p></body><') 442 parser.feed("/html>") 443 root = parser.close() 444 445 self.assertEqual('html', root.tag) 446 # test that we find all names in the parser dict 447 self.assertEqual([root], list(root.iter('html'))) 448 self.assertEqual([root[0]], list(root.iter('head'))) 449 self.assertEqual([root[0][0]], list(root.iter('title'))) 450 self.assertEqual([root[1]], list(root.iter('body'))) 451 self.assertEqual([root[1][0]], list(root.iter('p')))
452
454 assertFalse = self.assertFalse 455 events = [] 456 class Target(object): 457 def start(self, tag, attrib): 458 events.append(("start", tag)) 459 assertFalse(attrib)
460 def end(self, tag): 461 events.append(("end", tag))
462 def close(self): 463 return "DONE" 464 465 parser = self.etree.HTMLParser(target=Target()) 466 467 parser.feed("<html><body></body></html>") 468 done = parser.close() 469 470 self.assertEqual("DONE", done) 471 self.assertEqual([ 472 ("start", "html"), ("start", "body"), 473 ("end", "body"), ("end", "html")], events) 474
def test_html_parser_target_doctype_empty(self):
    # An empty "<!DOCTYPE>" must be reported to the target's doctype()
    # callback with all three arguments set to None.
    recorded = []
    check_no_attributes = self.assertFalse

    class Target(object):
        def start(self, tag, attrib):
            recorded.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            recorded.append(("end", tag))

        def doctype(self, *args):
            recorded.append(("doctype", args))

        def close(self):
            return "DONE"

    target_parser = self.etree.HTMLParser(target=Target())
    target_parser.feed("<!DOCTYPE><html><body></body></html>")
    result = target_parser.close()

    # close() on the parser hands back whatever the target's close() returned.
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", (None, None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, recorded)
def test_html_parser_target_doctype_html(self):
    # "<!DOCTYPE html>" must be reported with the name "html" and no
    # public or system identifier.
    recorded = []
    check_no_attributes = self.assertFalse

    class Target(object):
        def start(self, tag, attrib):
            recorded.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            recorded.append(("end", tag))

        def doctype(self, *args):
            recorded.append(("doctype", args))

        def close(self):
            return "DONE"

    target_parser = self.etree.HTMLParser(target=Target())
    target_parser.feed("<!DOCTYPE html><html><body></body></html>")
    result = target_parser.close()

    self.assertEqual("DONE", result)
    expected = [
        ("doctype", ("html", None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, recorded)
def test_html_parser_target_doctype_html_full(self):
    # A DOCTYPE with both a public and a system identifier must pass all
    # three values through to the target's doctype() callback.
    recorded = []
    check_no_attributes = self.assertFalse

    class Target(object):
        def start(self, tag, attrib):
            recorded.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            recorded.append(("end", tag))

        def doctype(self, *args):
            recorded.append(("doctype", args))

        def close(self):
            return "DONE"

    target_parser = self.etree.HTMLParser(target=Target())
    document = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
        '<html><body></body></html>')
    target_parser.feed(document)
    result = target_parser.close()

    self.assertEqual("DONE", result)
    expected = [
        ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, recorded)
def test_html_parser_target_exceptions(self):
    # An exception raised inside a target callback must propagate out of
    # feed().  The first callback to fire is start(), so ValueError is
    # the expected exception and end() must never run for <html>.
    events = []

    class Target(object):
        def start(self, tag, attrib):
            events.append(("start", tag))
            raise ValueError("START")

        def end(self, tag):
            events.append(("end", tag))
            raise TypeError("END")

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    try:
        parser.feed('<html><body>')
        parser.feed('</body></html>')
    except ValueError as exc:
        # unittest assertions instead of bare ``assert`` so the check is
        # not stripped when Python runs with -O.
        self.assertTrue("START" in str(exc), str(exc))
    except TypeError as exc:
        self.assertTrue("END" in str(exc), str(exc))
        self.fail("wrong exception raised")
    else:
        self.fail("no exception raised")

    self.assertTrue(("start", "html") in events, events)
    self.assertTrue(("end", "html") not in events, events)
def test_html_fromstring_target_exceptions(self):
    # Same contract as the feed-parser variant, but going through
    # etree.fromstring(): the ValueError raised by start() must propagate
    # and end() must never run for <html>.
    events = []

    class Target(object):
        def start(self, tag, attrib):
            events.append(("start", tag))
            raise ValueError("START")

        def end(self, tag):
            events.append(("end", tag))
            raise TypeError("END")

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    try:
        self.etree.fromstring('<html><body></body></html>', parser)
    except ValueError as exc:
        # unittest assertions instead of bare ``assert`` so the check is
        # not stripped when Python runs with -O.
        self.assertTrue("START" in str(exc), str(exc))
    except TypeError as exc:
        self.assertTrue("END" in str(exc), str(exc))
        self.fail("wrong exception raised")
    else:
        self.fail("no exception raised")

    self.assertTrue(("start", "html") in events, events)
    self.assertTrue(("end", "html") not in events, events)
def test_set_decl_html(self):
    # Setting both public id and system url on docinfo must show up in
    # the doctype property and in the serialised document.
    tree = html.Element('html').getroottree()
    tree.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
    tree.docinfo.system_url = \
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"

    expected_doctype = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
        ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">')
    self.assertEqual(tree.docinfo.doctype, expected_doctype)

    expected_document = _bytes(
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
        ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
        '<html xmlns="http://www.w3.org/1999/xhtml"></html>')
    self.assertEqual(self.etree.tostring(tree), expected_document)
611
def test_html5_doctype(self):
    # document type declaration with neither public id nor system url
    tree = html.Element('html').getroottree()
    tree.docinfo.public_id = None
    tree.docinfo.system_url = None

    self.assertEqual(tree.docinfo.doctype, '<!DOCTYPE html>')
    self.assertTrue(tree.docinfo.public_id is None)

    serialised = self.etree.tostring(tree)
    self.assertEqual(serialised, _bytes('<!DOCTYPE html>\n<html/>'))
622
def test_ietf_decl(self):
    # legacy declaration with public id, no system url
    tree = html.Element('html').getroottree()
    tree.docinfo.public_id = '-//IETF//DTD HTML//EN'
    tree.docinfo.system_url = None

    self.assertEqual(tree.docinfo.doctype,
                     '<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">')

    serialised = self.etree.tostring(tree)
    self.assertEqual(
        serialised,
        _bytes('<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">\n<html/>'))
632
def test_boolean_attribute(self):
    # ability to serialize boolean attribute by setting value to None
    form_element = html.Element('form')

    # Explicit None as the value.
    form_element.set('novalidate', None)
    after_first = html.tostring(form_element)
    self.assertEqual(after_first, _bytes('<form novalidate></form>'))

    # Value argument omitted entirely.
    form_element.set('custom')
    after_second = html.tostring(form_element)
    self.assertEqual(after_second,
                     _bytes('<form novalidate custom></form>'))
642
def test_boolean_attribute_round_trip(self):
    # ability to pass boolean attributes unmodified
    fragment = '<tag attribute></tag>'
    parsed = html.fragment_fromstring(fragment)
    self.assertEqual(html.tostring(parsed), _bytes(fragment))
648
def test_boolean_attribute_xml_adds_empty_string(self):
    # html serialized as xml converts boolean attributes to empty strings
    fragment = '<tag attribute></tag>'
    parsed = html.fragment_fromstring(fragment)
    as_xml = self.etree.tostring(parsed)
    self.assertEqual(as_xml, _bytes('<tag attribute=""/>'))
654 655
def test_suite():
    """Return a TestSuite holding every test of HtmlParserTestCase."""
    suite = unittest.TestSuite()
    # unittest.makeSuite() is deprecated and was removed in Python 3.13.
    # A fresh TestLoader collects the same ``test*`` methods.
    loader = unittest.TestLoader()
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite


if __name__ == '__main__':
    # This module is not run directly; print how to invoke it instead.
    usage_hint = 'to test use test.py %s' % __file__
    print(usage_hint)