Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir) # needed for Py3 
 13   
 14  from common_imports import etree, html, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase, write_to_file, next 
 16   
 17  try: 
 18      unicode 
 19  except NameError: 
 20      unicode = str 
 21   
 22   
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases.

    The class attributes below are shared fixtures that the individual
    test methods read through ``self``.
    """
    # Module under test; the test methods reach it as ``self.etree``.
    etree = etree

    # Well-formed reference document as a byte string.
    html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
    # The same document with one element per line (compared against
    # pretty-printed HTML serialisation).
    html_str_pretty = _bytes("""\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>
""")
    # Deliberately malformed markup: <title>/<head> are never closed,
    # <h1> is closed by </h3>, and there is a stray </p>.
    broken_html_str = _bytes("<html><head><title>test"
                             "<body><h1>page title</h3></p></html>")
    # Decoded (unicode) variant of the document containing a non-ASCII
    # character in both the title and the heading.
    uhtml_str = _bytes(
        "<html><head><title>test á</title></head>"
        "<body><h1>page á title</h1></body></html>").decode('utf8')

def tearDown(self):
    """Run the base-class cleanup, then call ``set_default_parser()``.

    ``set_default_parser`` is invoked without arguments; presumably this
    restores the library default after tests that installed their own
    parser (NOTE(review): confirm against the lxml documentation).
    """
    parent = super(HtmlParserTestCase, self)
    parent.tearDown()
    reset_default_parser = self.etree.set_default_parser
    reset_default_parser()
44
def test_module_HTML(self):
    """Parsing ``html_str`` with ``etree.HTML`` and serialising it back
    as HTML must reproduce the original bytes."""
    root = self.etree.HTML(self.html_str)
    serialized = self.etree.tostring(root, method="html")
    self.assertEqual(serialized, self.html_str)
49
def test_module_HTML_unicode(self):
    """Parse the unicode fixture, then check both its unicode
    serialisation and the text of its ``<h1>`` element."""
    root = self.etree.HTML(self.uhtml_str)

    # Serialising with encoding='unicode' must give back the input text.
    as_unicode = self.etree.tostring(
        root, method="html", encoding='unicode')
    self.assertEqual(as_unicode, self.uhtml_str)

    # The non-ASCII character must survive in the element text as well.
    heading = root.findtext('.//h1')
    expected_heading = _bytes("page á title").decode('utf8')
    self.assertEqual(heading, expected_heading)
57
def test_wide_unicode_xml(self):
    """A character outside the 16-bit range must come back from the
    parser as exactly one character of element text."""
    if sys.maxunicode < 0x10FFFF:
        # The interpreter cannot represent the full code point range:
        # return early, which effectively skips the test.
        return

    markup = _bytes(
        '<html><body><p>\\U00026007</p></body></html>'
    ).decode('unicode_escape')
    paragraph_text = self.etree.HTML(markup).findtext('.//p')

    self.assertEqual(1, len(paragraph_text))
    expected_char = _bytes('\\U00026007').decode('unicode_escape')
    self.assertEqual(expected_char, paragraph_text)
68
def test_html_ids(self):
    """An element carrying an ``id`` attribute must be selectable via an
    XPath ``@id`` predicate after parsing with ``recover=False``."""
    strict_parser = self.etree.HTMLParser(recover=False)
    document = self.etree.fromstring('''
            <html><body id="bodyID"><p id="pID"></p></body></html>
        ''', parser=strict_parser)
    matches = document.xpath('//p[@id="pID"]')
    self.assertEqual(len(matches), 1)
76
78 parser = self.etree.HTMLParser(recover=False, collect_ids=False) 79 fromstring = self.etree.fromstring 80 html = fromstring(''' 81 <html><body id="bodyID"><p id="pID"></p></body></html> 82 ''', parser=parser) 83 self.assertEqual(len(html.xpath('//p[@id="pID"]')), 1)
84
86 element = self.etree.HTML(self.html_str) 87 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 88 self.html_str_pretty)
89
91 parser = self.etree.HTMLParser(recover=False) 92 parse = self.etree.parse 93 f = BytesIO("<html></body>") 94 self.assertRaises(self.etree.XMLSyntaxError, 95 parse, f, parser)
96
98 parser = self.etree.HTMLParser() 99 Element = parser.makeelement 100 101 el = Element('name') 102 self.assertRaises(ValueError, Element, '{}') 103 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 104 105 self.assertRaises(ValueError, Element, '{test}') 106 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
107
109 parser = self.etree.HTMLParser() 110 Element = parser.makeelement 111 112 pname = Element('p:name') 113 self.assertEqual(pname.tag, 'p:name') 114 115 pname = Element('{test}p:name') 116 self.assertEqual(pname.tag, '{test}p:name') 117 118 pname = Element('name') 119 pname.tag = 'p:name' 120 self.assertEqual(pname.tag, 'p:name')
121
123 parser = self.etree.HTMLParser() 124 Element = parser.makeelement 125 126 self.assertRaises(ValueError, Element, 'p"name') 127 self.assertRaises(ValueError, Element, "na'me") 128 self.assertRaises(ValueError, Element, '{test}"name') 129 self.assertRaises(ValueError, Element, "{test}name'") 130 131 el = Element('name') 132 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 133 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 134 self.assertEqual(el.tag, "name")
135
137 parser = self.etree.HTMLParser() 138 Element = parser.makeelement 139 140 self.assertRaises(ValueError, Element, ' name ') 141 self.assertRaises(ValueError, Element, 'na me') 142 self.assertRaises(ValueError, Element, '{test} name') 143 144 el = Element('name') 145 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 146 self.assertEqual(el.tag, "name")
147
149 parser = self.etree.HTMLParser() 150 Element = parser.makeelement 151 152 SubElement = self.etree.SubElement 153 154 el = Element('name') 155 self.assertRaises(ValueError, SubElement, el, '{}') 156 self.assertRaises(ValueError, SubElement, el, '{test}')
157
159 parser = self.etree.HTMLParser() 160 Element = parser.makeelement 161 SubElement = self.etree.SubElement 162 163 el = Element('name') 164 pname = SubElement(el, 'p:name') 165 self.assertEqual(pname.tag, 'p:name') 166 167 pname = SubElement(el, '{test}p:name') 168 self.assertEqual(pname.tag, '{test}p:name')
169
171 parser = self.etree.HTMLParser() 172 Element = parser.makeelement 173 SubElement = self.etree.SubElement 174 175 el = Element('name') 176 self.assertRaises(ValueError, SubElement, el, "name'") 177 self.assertRaises(ValueError, SubElement, el, 'na"me') 178 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 179 self.assertRaises(ValueError, SubElement, el, '{test}"name')
180
182 parser = self.etree.HTMLParser() 183 Element = parser.makeelement 184 SubElement = self.etree.SubElement 185 186 el = Element('name') 187 self.assertRaises(ValueError, SubElement, el, ' name ') 188 self.assertRaises(ValueError, SubElement, el, 'na me') 189 self.assertRaises(ValueError, SubElement, el, '{test} name')
190
192 parser = self.etree.HTMLParser(recover=False) 193 parse = self.etree.parse 194 f = BytesIO(self.broken_html_str) 195 self.assertRaises(self.etree.XMLSyntaxError, 196 parse, f, parser)
197
199 parser = self.etree.HTMLParser(default_doctype=False) 200 d = html.fromstring('<!DOCTYPE html><h1>S</h1></html>', parser=parser) 201 self.assertEqual(d.getroottree().docinfo.doctype, '<!DOCTYPE html>') 202 203 d = html.fromstring('<html><h1>S</h1></html>', parser=parser) 204 self.assertEqual(d.getroottree().docinfo.doctype, '')
205
207 text = _str('Søk på nettet') 208 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 209 210 tree = self.etree.parse( 211 BytesIO(html_latin1), 212 self.etree.HTMLParser(encoding="iso-8859-1")) 213 p = tree.find("//p") 214 self.assertEqual(p.text, text)
215
217 text = _str('Søk på nettet') 218 wrong_head = _str(''' 219 <head> 220 <meta http-equiv="Content-Type" 221 content="text/html; charset=UTF-8" /> 222 </head>''') 223 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 224 text) 225 ).encode('iso-8859-1') 226 227 self.assertRaises(self.etree.ParseError, 228 self.etree.parse, 229 BytesIO(html_latin1)) 230 231 tree = self.etree.parse( 232 BytesIO(html_latin1), 233 self.etree.HTMLParser(encoding="iso-8859-1")) 234 p = tree.find("//p") 235 self.assertEqual(p.text, text)
236
def test_module_HTML_broken(self):
    """``etree.HTML`` applied to the malformed fixture must serialise to
    the same bytes as the well-formed reference document."""
    recovered_root = self.etree.HTML(self.broken_html_str)
    output = self.etree.tostring(recovered_root, method="html")
    self.assertEqual(output, self.html_str)
241
def test_module_HTML_cdata(self):
    """The content of a ``<style>`` element must be readable through the
    element's ``.text`` attribute."""
    # Original note: by default, libxml2 generates CDATA nodes for
    # <script> content.  The fixture below uses <style> rather than
    # <script>.  NOTE(review): presumably both are handled alike; confirm.
    root = self.etree.HTML(
        _bytes('<html><head><style>foo</style></head></html>'))
    head = root[0]
    style = head[0]
    self.assertEqual(style.text, "foo")
247
def test_module_HTML_access(self):
    """Index-based child access on the parsed reference document: the
    first child of the root's first child must be ``<title>``."""
    root = self.etree.HTML(self.html_str)
    first_grandchild = root[0][0]
    self.assertEqual(first_grandchild.tag, 'title')
251
def test_module_parse_html(self):
    """Parse the reference document from a real file on disk.

    The fixture bytes are written to a temporary ``.html`` file, parsed
    through an open binary file object with an ``HTMLParser``, and the
    HTML serialisation of the resulting root must equal ``html_str``.
    The temporary file is always removed afterwards.
    """
    parser = self.etree.HTMLParser()
    # mkstemp() creates the file atomically; mktemp() only returns a
    # name, is open to a race with other processes and is deprecated.
    fd, filename = tempfile.mkstemp(suffix=".html")
    # Only the path is needed; write_to_file() reopens it by name.
    os.close(fd)
    try:
        write_to_file(filename, self.html_str, 'wb')
        # The context manager closes the handle even if parse() raises.
        with open(filename, 'rb') as f:
            tree = self.etree.parse(f, parser)
        self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                         self.html_str)
    finally:
        os.remove(filename)
264
266 parser = self.etree.HTMLParser() 267 f = SillyFileLike(self.html_str) 268 tree = self.etree.parse(f, parser) 269 html = self.etree.tostring(tree.getroot(), 270 method="html", encoding='UTF-8') 271 self.assertEqual(html, self.html_str)
272 273 ## def test_module_parse_html_filelike_unicode(self): 274 ## parser = self.etree.HTMLParser() 275 ## f = SillyFileLike(self.uhtml_str) 276 ## tree = self.etree.parse(f, parser) 277 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 278 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 279
def test_html_file_error(self):
    """Parsing a path that (hopefully) does not exist must raise
    ``IOError``."""
    html_parser = self.etree.HTMLParser()
    missing_path = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(
        IOError, self.etree.parse, missing_path, html_parser)
286
288 self.assertRaises(self.etree.XMLSyntaxError, 289 self.etree.parse, BytesIO(self.broken_html_str)) 290 291 self.etree.set_default_parser( self.etree.HTMLParser() ) 292 293 tree = self.etree.parse(BytesIO(self.broken_html_str)) 294 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 295 self.html_str) 296 297 self.etree.set_default_parser() 298 299 self.assertRaises(self.etree.XMLSyntaxError, 300 self.etree.parse, BytesIO(self.broken_html_str))
301
def test_html_iterparse(self):
    """``iterparse(html=True)`` with default events must yield one
    'end' event per element, children before their parent."""
    source = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')

    context = self.etree.iterparse(source, html=True)
    # The root is not available before the iterator has been consumed.
    self.assertEqual(None, context.root)

    seen = list(context)
    root = context.root
    self.assertTrue(root is not None)

    head, body = root[0], root[1]
    expected = [
        ('end', node) for node in (head[0], head, body[0], body, root)]
    self.assertEqual(expected, seen)
317
319 iterparse = self.etree.iterparse 320 f = BytesIO( 321 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 322 323 iterator = iterparse(f, html=True) 324 self.assertEqual(None, iterator.root) 325 326 event, element = next(iterator) 327 self.assertEqual('end', event) 328 self.assertEqual('title', element.tag) 329 self.assertEqual(None, iterator.root) 330 del element 331 332 event, element = next(iterator) 333 self.assertEqual('end', event) 334 self.assertEqual('head', element.tag) 335 self.assertEqual(None, iterator.root) 336 del element 337 del iterator
338
340 iterparse = self.etree.iterparse 341 f = BytesIO('<head><title>TEST></head><p>P<br></div>') 342 343 iterator = iterparse(f, html=True) 344 self.assertEqual(None, iterator.root) 345 346 events = list(iterator) 347 root = iterator.root 348 self.assertTrue(root is not None) 349 self.assertEqual('html', root.tag) 350 self.assertEqual('head', root[0].tag) 351 self.assertEqual('body', root[1].tag) 352 self.assertEqual('p', root[1][0].tag) 353 self.assertEqual('br', root[1][0][0].tag) 354 self.assertEqual( 355 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]), 356 ('end', root[1][0]), ('end', root[1]), ('end', root)], 357 events)
358
360 iterparse = self.etree.iterparse 361 f = BytesIO('<p>P<br></div>') 362 iterator = iterparse(f, html=True, recover=False) 363 self.assertRaises(self.etree.XMLSyntaxError, list, iterator)
364
def test_html_iterparse_file(self):
    """Run ``iterparse(html=True)`` over the ``shakespeare.html`` test
    file: 249 events are expected and all of them must be 'end'."""
    path = fileInTestDir("shakespeare.html")
    context = self.etree.iterparse(path, html=True)

    self.assertEqual(None, context.root)
    collected = list(context)
    root = context.root
    self.assertTrue(root is not None)
    self.assertEqual(249, len(collected))

    # Any event kind other than 'end' would make this list non-empty.
    unexpected_kinds = [kind for (kind, _) in collected if kind != 'end']
    self.assertFalse(unexpected_kinds)
377
379 iterparse = self.etree.iterparse 380 f = BytesIO( 381 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 382 383 iterator = iterparse(f, html=True, events=('start',)) 384 self.assertEqual(None, iterator.root) 385 386 events = list(iterator) 387 root = iterator.root 388 self.assertNotEqual(None, root) 389 self.assertEqual( 390 [('start', root), ('start', root[0]), ('start', root[0][0]), 391 ('start', root[1]), ('start', root[1][0])], 392 events)
393
def test_html_feed_parser(self):
    """Feed a document in two chunks (split inside the closing body
    tag) and check the tree returned by ``close()``."""
    feeder = self.etree.HTMLParser()
    for chunk in ("<html><body></", "body></html>"):
        feeder.feed(chunk)
    root = feeder.close()

    self.assertEqual('html', root.tag)
    # test that we find all names in the parser dict
    html_matches = list(root.iter('html'))
    self.assertEqual([root], html_matches)
    body_matches = list(root.iter('body'))
    self.assertEqual([root[0]], body_matches)
404
406 parser = self.etree.HTMLParser() 407 parser.feed("<htm") 408 parser.feed("l><body") 409 parser.feed("><") 410 parser.feed("p><") 411 parser.feed("strong") 412 parser.feed(">some ") 413 parser.feed("text</strong></p><") 414 parser.feed("/body></html>") 415 root = parser.close() 416 417 self.assertEqual('html', root.tag) 418 # test that we find all names in the parser dict 419 self.assertEqual([root], list(root.iter('html'))) 420 self.assertEqual([root[0]], list(root.iter('body'))) 421 self.assertEqual([root[0][0]], list(root.iter('p'))) 422 self.assertEqual([root[0][0][0]], list(root.iter('strong')))
423
425 parser = self.etree.HTMLParser() 426 parser.feed('<html><head>') 427 parser.feed('<title>TITLE</title><body><p>P</p></body><') 428 parser.feed("/html>") 429 root = parser.close() 430 431 self.assertEqual('html', root.tag) 432 # test that we find all names in the parser dict 433 self.assertEqual([root], list(root.iter('html'))) 434 self.assertEqual([root[0]], list(root.iter('head'))) 435 self.assertEqual([root[0][0]], list(root.iter('title'))) 436 self.assertEqual([root[1]], list(root.iter('body'))) 437 self.assertEqual([root[1][0]], list(root.iter('p')))
438
440 assertFalse = self.assertFalse 441 events = [] 442 class Target(object): 443 def start(self, tag, attrib): 444 events.append(("start", tag)) 445 assertFalse(attrib)
446 def end(self, tag): 447 events.append(("end", tag))
448 def close(self): 449 return "DONE" 450 451 parser = self.etree.HTMLParser(target=Target()) 452 453 parser.feed("<html><body></body></html>") 454 done = parser.close() 455 456 self.assertEqual("DONE", done) 457 self.assertEqual([ 458 ("start", "html"), ("start", "body"), 459 ("end", "body"), ("end", "html")], events) 460
def test_html_parser_target_doctype_empty(self):
    """A bare ``<!DOCTYPE>`` must reach the parser target as a doctype
    callback with three ``None`` arguments, ahead of the tag events."""
    check_no_attributes = self.assertFalse
    recorded = []
    record = recorded.append

    class Recorder(object):
        """Parser target that logs every callback it receives."""

        def start(self, tag, attrib):
            record(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            record(("end", tag))

        def doctype(self, *args):
            record(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Recorder())
    parser.feed("<!DOCTYPE><html><body></body></html>")
    result = parser.close()

    # close() hands back whatever the target's close() returned.
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", (None, None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, recorded)

def test_html_parser_target_doctype_html(self):
    """``<!DOCTYPE html>`` must be reported to the target as
    ``doctype("html", None, None)`` before any tag event."""
    expect_empty = self.assertFalse
    log = []

    class LoggingTarget(object):
        """Parser target appending each callback to ``log``."""

        def start(self, tag, attrib):
            log.append(("start", tag))
            expect_empty(attrib)

        def end(self, tag):
            log.append(("end", tag))

        def doctype(self, *args):
            log.append(("doctype", args))

        def close(self):
            return "DONE"

    target_parser = self.etree.HTMLParser(target=LoggingTarget())
    target_parser.feed("<!DOCTYPE html><html><body></body></html>")
    outcome = target_parser.close()

    self.assertEqual("DONE", outcome)
    self.assertEqual(
        [("doctype", ("html", None, None)),
         ("start", "html"),
         ("start", "body"),
         ("end", "body"),
         ("end", "html")],
        log)

def test_html_parser_target_doctype_html_full(self):
    """A DOCTYPE with public and system identifiers must be passed to
    the target's ``doctype`` callback as (name, public id, system id)."""
    must_be_empty = self.assertFalse
    calls = []
    note = calls.append

    class CallLog(object):
        """Parser target that records the callbacks in call order."""

        def start(self, tag, attrib):
            note(("start", tag))
            must_be_empty(attrib)

        def end(self, tag):
            note(("end", tag))

        def doctype(self, *args):
            note(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=CallLog())
    document = ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
                '<html><body></body></html>')
    parser.feed(document)
    returned = parser.close()

    self.assertEqual("DONE", returned)
    expected_calls = [
        ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected_calls, calls)

def test_html_parser_target_exceptions(self):
    """An exception raised by the target's ``start()`` must propagate
    out of ``feed()``.

    The target raises ``ValueError("START")`` from ``start()`` and
    ``TypeError("END")`` from ``end()``.  The test passes only if the
    ValueError is the one that surfaces, the start of ``<html>`` was
    recorded and its end was not.
    """
    events = []

    class Target(object):
        """Parser target whose start/end callbacks record and then raise."""

        def start(self, tag, attrib):
            events.append(("start", tag))
            raise ValueError("START")

        def end(self, tag):
            events.append(("end", tag))
            raise TypeError("END")

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    try:
        parser.feed('<html><body>')
        parser.feed('</body></html>')
    except ValueError as exc:
        # unittest assertions instead of bare ``assert``: the latter is
        # compiled away under ``python -O`` and would check nothing.
        self.assertTrue("START" in str(exc), str(exc))
    except TypeError as exc:
        self.assertTrue("END" in str(exc), str(exc))
        self.fail("wrong exception raised")
    else:
        self.fail("no exception raised")

    self.assertTrue(("start", "html") in events, events)
    self.assertTrue(("end", "html") not in events, events)

def test_html_fromstring_target_exceptions(self):
    """An exception raised by the target's ``start()`` must propagate
    out of ``etree.fromstring()``.

    The target raises ``ValueError("START")`` from ``start()`` and
    ``TypeError("END")`` from ``end()``.  The test passes only if the
    ValueError is the one that surfaces, the start of ``<html>`` was
    recorded and its end was not.
    """
    events = []

    class Target(object):
        """Parser target whose start/end callbacks record and then raise."""

        def start(self, tag, attrib):
            events.append(("start", tag))
            raise ValueError("START")

        def end(self, tag):
            events.append(("end", tag))
            raise TypeError("END")

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    try:
        self.etree.fromstring('<html><body></body></html>', parser)
    except ValueError as exc:
        # unittest assertions instead of bare ``assert``: the latter is
        # compiled away under ``python -O`` and would check nothing.
        self.assertTrue("START" in str(exc), str(exc))
    except TypeError as exc:
        self.assertTrue("END" in str(exc), str(exc))
        self.fail("wrong exception raised")
    else:
        self.fail("no exception raised")

    self.assertTrue(("start", "html") in events, events)
    self.assertTrue(("end", "html") not in events, events)

def test_set_decl_html(self):
    """Assigning a public id and system URL on ``docinfo`` must show up
    in ``docinfo.doctype`` and in the serialised tree."""
    tree = html.Element('html').getroottree()
    tree.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
    tree.docinfo.system_url = (
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")

    expected_doctype = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">')
    self.assertEqual(tree.docinfo.doctype, expected_doctype)

    serialized = self.etree.tostring(tree)
    expected_bytes = _bytes(
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
        '<html xmlns="http://www.w3.org/1999/xhtml"></html>')
    self.assertEqual(serialized, expected_bytes)
597
def test_html5_doctype(self):
    """With both identifiers set to ``None`` the doctype must be the
    short ``<!DOCTYPE html>`` form."""
    # document type declaration with neither public id nor system url
    tree = html.Element('html').getroottree()
    tree.docinfo.public_id = None
    tree.docinfo.system_url = None

    self.assertEqual(tree.docinfo.doctype, '<!DOCTYPE html>')
    self.assertTrue(tree.docinfo.public_id is None)

    serialized = self.etree.tostring(tree)
    self.assertEqual(serialized, _bytes('<!DOCTYPE html>\n<html/>'))
608
def test_ietf_decl(self):
    """A public id without a system URL must give a PUBLIC doctype
    that contains only the public identifier."""
    # legacy declaration with public id, no system url
    tree = html.Element('html').getroottree()
    tree.docinfo.public_id = '-//IETF//DTD HTML//EN'
    tree.docinfo.system_url = None

    doctype = tree.docinfo.doctype
    self.assertEqual(
        doctype, '<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">')

    serialized = self.etree.tostring(tree)
    expected_bytes = _bytes(
        '<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">\n<html/>')
    self.assertEqual(serialized, expected_bytes)
618
def test_boolean_attribute(self):
    """Attributes set with ``None`` or with no value at all must be
    serialised by ``html.tostring`` as bare attribute names."""
    # ability to serialize boolean attribute by setting value to None
    element = html.Element('form')

    element.set('novalidate', None)
    rendered = html.tostring(element)
    self.assertEqual(rendered, _bytes('<form novalidate></form>'))

    # Same outcome when the value argument is omitted entirely.
    element.set('custom')
    rendered = html.tostring(element)
    self.assertEqual(rendered, _bytes('<form novalidate custom></form>'))
628
def test_boolean_attribute_round_trip(self):
    """A fragment with a value-less attribute must serialise back to
    exactly the same markup."""
    # ability to pass boolean attributes unmodified
    markup = '<tag attribute></tag>'
    parsed = html.fragment_fromstring(markup)
    self.assertEqual(html.tostring(parsed), _bytes(markup))
634
def test_boolean_attribute_xml_adds_empty_string(self):
    """Serialising the same fragment with ``etree.tostring`` must give
    the attribute an empty-string value and a self-closing tag."""
    # html serialized as xml converts boolean attributes to empty strings
    markup = '<tag attribute></tag>'
    parsed = html.fragment_fromstring(markup)
    as_xml = self.etree.tostring(parsed)
    self.assertEqual(as_xml, _bytes('<tag attribute=""/>'))
640 641
def test_suite():
    """Build the test suite for this module.

    Returns:
        A ``unittest.TestSuite`` holding every test method of
        ``HtmlParserTestCase``.
    """
    suite = unittest.TestSuite()
    # unittest.makeSuite() is deprecated since Python 3.11 and removed in
    # 3.13; TestLoader.loadTestsFromTestCase() is the supported
    # equivalent and exists in every unittest version.
    loader = unittest.TestLoader()
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite


if __name__ == '__main__':
    # When executed directly, only print a hint naming the test runner
    # script; no tests are run from here.
    hint = 'to test use test.py %s' % (__file__,)
    print(hint)