Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir) # needed for Py3 
 13   
 14  from common_imports import etree, html, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase, write_to_file, next 
 16   
 17  try: 
 18      unicode 
 19  except NameError: 
 20      unicode = str 
 21   
 22   
23 -class HtmlParserTestCase(HelperTestCase):
24 """HTML parser test cases 25 """ 26 etree = etree 27 28 html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>") 29 html_str_pretty = _bytes("""\ 30 <html> 31 <head><title>test</title></head> 32 <body><h1>page title</h1></body> 33 </html> 34 """) 35 broken_html_str = _bytes("<html><head><title>test" 36 "<body><h1>page title</h3></p></html>") 37 uhtml_str = _bytes( 38 "<html><head><title>test á</title></head>" 39 "<body><h1>page á title</h1></body></html>").decode('utf8') 40
def tearDown(self):
    # Run the inherited clean-up first, then call set_default_parser()
    # with no argument so that a parser installed by a test does not
    # stay in place for the following tests.
    parent = super(HtmlParserTestCase, self)
    parent.tearDown()
    self.etree.set_default_parser()
44
def test_module_HTML(self):
    # Round trip: parse the well-formed byte string with HTML() and
    # check that HTML serialisation gives back the same bytes.
    root = self.etree.HTML(self.html_str)
    serialised = self.etree.tostring(root, method="html")
    self.assertEqual(serialised, self.html_str)
49
def test_module_HTML_unicode(self):
    # Same round trip as the bytes variant, but with a unicode string
    # containing a non-ASCII character, serialised with
    # encoding='unicode'.
    root = self.etree.HTML(self.uhtml_str)
    serialised = self.etree.tostring(
        root, method="html", encoding='unicode')
    self.assertEqual(serialised, self.uhtml_str)

    # The non-ASCII character must also be intact in the element text.
    heading = root.findtext('.//h1')
    self.assertEqual(heading, _bytes("page á title").decode('utf8'))
57
def test_wide_unicode_xml(self):
    # A code point above U+FFFF must come back as one single character.
    if sys.maxunicode < 1114111:
        return  # skip test

    # The literal holds an escaped \U sequence; decoding it with
    # 'unicode_escape' produces the actual wide character.
    markup = _bytes(
        '<html><body><p>\\U00026007</p></body></html>'
        ).decode('unicode_escape')
    root = self.etree.HTML(markup)
    paragraph_text = root.findtext('.//p')
    self.assertEqual(1, len(paragraph_text))

    wide_char = _bytes('\\U00026007').decode('unicode_escape')
    self.assertEqual(wide_char, paragraph_text)
68
70 element = self.etree.HTML(self.html_str) 71 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 72 self.html_str_pretty)
73
75 parser = self.etree.HTMLParser(recover=False) 76 parse = self.etree.parse 77 f = BytesIO("<html></body>") 78 self.assertRaises(self.etree.XMLSyntaxError, 79 parse, f, parser)
80
82 parser = self.etree.HTMLParser() 83 Element = parser.makeelement 84 85 el = Element('name') 86 self.assertRaises(ValueError, Element, '{}') 87 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 88 89 self.assertRaises(ValueError, Element, '{test}') 90 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
91
93 parser = self.etree.HTMLParser() 94 Element = parser.makeelement 95 96 pname = Element('p:name') 97 self.assertEqual(pname.tag, 'p:name') 98 99 pname = Element('{test}p:name') 100 self.assertEqual(pname.tag, '{test}p:name') 101 102 pname = Element('name') 103 pname.tag = 'p:name' 104 self.assertEqual(pname.tag, 'p:name')
105
107 parser = self.etree.HTMLParser() 108 Element = parser.makeelement 109 110 self.assertRaises(ValueError, Element, 'p"name') 111 self.assertRaises(ValueError, Element, "na'me") 112 self.assertRaises(ValueError, Element, '{test}"name') 113 self.assertRaises(ValueError, Element, "{test}name'") 114 115 el = Element('name') 116 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 117 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 118 self.assertEqual(el.tag, "name")
119
121 parser = self.etree.HTMLParser() 122 Element = parser.makeelement 123 124 self.assertRaises(ValueError, Element, ' name ') 125 self.assertRaises(ValueError, Element, 'na me') 126 self.assertRaises(ValueError, Element, '{test} name') 127 128 el = Element('name') 129 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 130 self.assertEqual(el.tag, "name")
131
133 parser = self.etree.HTMLParser() 134 Element = parser.makeelement 135 136 SubElement = self.etree.SubElement 137 138 el = Element('name') 139 self.assertRaises(ValueError, SubElement, el, '{}') 140 self.assertRaises(ValueError, SubElement, el, '{test}')
141
143 parser = self.etree.HTMLParser() 144 Element = parser.makeelement 145 SubElement = self.etree.SubElement 146 147 el = Element('name') 148 pname = SubElement(el, 'p:name') 149 self.assertEqual(pname.tag, 'p:name') 150 151 pname = SubElement(el, '{test}p:name') 152 self.assertEqual(pname.tag, '{test}p:name')
153
155 parser = self.etree.HTMLParser() 156 Element = parser.makeelement 157 SubElement = self.etree.SubElement 158 159 el = Element('name') 160 self.assertRaises(ValueError, SubElement, el, "name'") 161 self.assertRaises(ValueError, SubElement, el, 'na"me') 162 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 163 self.assertRaises(ValueError, SubElement, el, '{test}"name')
164
166 parser = self.etree.HTMLParser() 167 Element = parser.makeelement 168 SubElement = self.etree.SubElement 169 170 el = Element('name') 171 self.assertRaises(ValueError, SubElement, el, ' name ') 172 self.assertRaises(ValueError, SubElement, el, 'na me') 173 self.assertRaises(ValueError, SubElement, el, '{test} name')
174
176 parser = self.etree.HTMLParser(recover=False) 177 parse = self.etree.parse 178 f = BytesIO(self.broken_html_str) 179 self.assertRaises(self.etree.XMLSyntaxError, 180 parse, f, parser)
181
183 text = _str('Søk på nettet') 184 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 185 186 tree = self.etree.parse( 187 BytesIO(html_latin1), 188 self.etree.HTMLParser(encoding="iso-8859-1")) 189 p = tree.find("//p") 190 self.assertEqual(p.text, text)
191
193 text = _str('Søk på nettet') 194 wrong_head = _str(''' 195 <head> 196 <meta http-equiv="Content-Type" 197 content="text/html; charset=UTF-8" /> 198 </head>''') 199 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 200 text) 201 ).encode('iso-8859-1') 202 203 self.assertRaises(self.etree.ParseError, 204 self.etree.parse, 205 BytesIO(html_latin1)) 206 207 tree = self.etree.parse( 208 BytesIO(html_latin1), 209 self.etree.HTMLParser(encoding="iso-8859-1")) 210 p = tree.find("//p") 211 self.assertEqual(p.text, text)
212
def test_module_HTML_broken(self):
    # HTML() is given the malformed input (unclosed <title>, stray
    # </h3> and </p>) and must serialise to the well-formed reference.
    repaired_root = self.etree.HTML(self.broken_html_str)
    serialised = self.etree.tostring(repaired_root, method="html")
    self.assertEqual(serialised, self.html_str)
217
def test_module_HTML_cdata(self):
    # The original note here says that libxml2 generates CDATA nodes
    # for <script> content by default.  This test uses a <style>
    # element and checks that its content is readable through .text.
    markup = _bytes('<html><head><style>foo</style></head></html>')
    root = self.etree.HTML(markup)
    style_element = root[0][0]
    self.assertEqual(style_element.text, "foo")
223
def test_module_HTML_access(self):
    # Index access into the parsed tree: first child of the first child
    # of the root (html -> head -> title).
    root = self.etree.HTML(self.html_str)
    first_grandchild = root[0][0]
    self.assertEqual(first_grandchild.tag, 'title')
227
def test_module_parse_html(self):
    # Parse HTML from a real file object opened on a temporary file and
    # check that it serialises back to the original bytes.
    parser = self.etree.HTMLParser()

    # mkstemp() creates the file itself, so no other process can claim
    # the name between choosing it and writing to it (mktemp() only
    # returns a name).  The descriptor is closed right away because the
    # file is re-opened by name below.
    fd, filename = tempfile.mkstemp(suffix=".html")
    os.close(fd)
    try:
        write_to_file(filename, self.html_str, 'wb')
        # The with block closes the file even when parse() raises.
        with open(filename, 'rb') as f:
            tree = self.etree.parse(f, parser)
        self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                         self.html_str)
    finally:
        # The file exists from mkstemp() on, so always remove it.
        os.remove(filename)
240
242 parser = self.etree.HTMLParser() 243 f = SillyFileLike(self.html_str) 244 tree = self.etree.parse(f, parser) 245 html = self.etree.tostring(tree.getroot(), 246 method="html", encoding='UTF-8') 247 self.assertEqual(html, self.html_str)
248 249 ## def test_module_parse_html_filelike_unicode(self): 250 ## parser = self.etree.HTMLParser() 251 ## f = SillyFileLike(self.uhtml_str) 252 ## tree = self.etree.parse(f, parser) 253 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 254 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 255
def test_html_file_error(self):
    # Parsing a path that does not exist must raise IOError.
    html_parser = self.etree.HTMLParser()
    missing_path = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, self.etree.parse, missing_path, html_parser)
262
264 self.assertRaises(self.etree.XMLSyntaxError, 265 self.etree.parse, BytesIO(self.broken_html_str)) 266 267 self.etree.set_default_parser( self.etree.HTMLParser() ) 268 269 tree = self.etree.parse(BytesIO(self.broken_html_str)) 270 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 271 self.html_str) 272 273 self.etree.set_default_parser() 274 275 self.assertRaises(self.etree.XMLSyntaxError, 276 self.etree.parse, BytesIO(self.broken_html_str))
277
def test_html_iterparse(self):
    # iterparse(html=True) with default events: only 'end' events are
    # expected, and .root stays None until the iterator is exhausted.
    source = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')
    iterator = self.etree.iterparse(source, html=True)
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)

    # Elements are reported when they close: title, head, p, body, html.
    head, body = root[0], root[1]
    expected = [
        ('end', head[0]),
        ('end', head),
        ('end', body[0]),
        ('end', body),
        ('end', root),
    ]
    self.assertEqual(expected, events)
293
295 iterparse = self.etree.iterparse 296 f = BytesIO( 297 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 298 299 iterator = iterparse(f, html=True) 300 self.assertEqual(None, iterator.root) 301 302 event, element = next(iterator) 303 self.assertEqual('end', event) 304 self.assertEqual('title', element.tag) 305 self.assertEqual(None, iterator.root) 306 del element 307 308 event, element = next(iterator) 309 self.assertEqual('end', event) 310 self.assertEqual('head', element.tag) 311 self.assertEqual(None, iterator.root) 312 del element 313 del iterator
314
316 iterparse = self.etree.iterparse 317 f = BytesIO('<head><title>TEST></head><p>P<br></div>') 318 319 iterator = iterparse(f, html=True) 320 self.assertEqual(None, iterator.root) 321 322 events = list(iterator) 323 root = iterator.root 324 self.assertTrue(root is not None) 325 self.assertEqual('html', root.tag) 326 self.assertEqual('head', root[0].tag) 327 self.assertEqual('body', root[1].tag) 328 self.assertEqual('p', root[1][0].tag) 329 self.assertEqual('br', root[1][0][0].tag) 330 self.assertEqual( 331 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]), 332 ('end', root[1][0]), ('end', root[1]), ('end', root)], 333 events)
334
336 iterparse = self.etree.iterparse 337 f = BytesIO('<p>P<br></div>') 338 iterator = iterparse(f, html=True, recover=False) 339 self.assertRaises(self.etree.XMLSyntaxError, list, iterator)
340
def test_html_iterparse_file(self):
    # iterparse() over a file from the test directory, given by path.
    source_path = fileInTestDir("shakespeare.html")
    iterator = self.etree.iterparse(source_path, html=True)
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)
    self.assertEqual(249, len(events))

    # With the default event set, nothing but 'end' may be reported.
    non_end_events = [event for (event, element) in events
                      if event != 'end']
    self.assertFalse(non_end_events)
353
355 iterparse = self.etree.iterparse 356 f = BytesIO( 357 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 358 359 iterator = iterparse(f, html=True, events=('start',)) 360 self.assertEqual(None, iterator.root) 361 362 events = list(iterator) 363 root = iterator.root 364 self.assertNotEqual(None, root) 365 self.assertEqual( 366 [('start', root), ('start', root[0]), ('start', root[0][0]), 367 ('start', root[1]), ('start', root[1][0])], 368 events)
369
def test_html_feed_parser(self):
    # Feed interface: the document arrives in two chunks, split in the
    # middle of the closing </body> tag.
    parser = self.etree.HTMLParser()
    for chunk in ("<html><body></", "body></html>"):
        parser.feed(chunk)
    root = parser.close()

    self.assertEqual('html', root.tag)
    # test that we find all names in the parser dict
    html_matches = list(root.iter('html'))
    self.assertEqual([root], html_matches)
    body_matches = list(root.iter('body'))
    self.assertEqual([root[0]], body_matches)
380
382 parser = self.etree.HTMLParser() 383 parser.feed("<htm") 384 parser.feed("l><body") 385 parser.feed("><") 386 parser.feed("p><") 387 parser.feed("strong") 388 parser.feed(">some ") 389 parser.feed("text</strong></p><") 390 parser.feed("/body></html>") 391 root = parser.close() 392 393 self.assertEqual('html', root.tag) 394 # test that we find all names in the parser dict 395 self.assertEqual([root], list(root.iter('html'))) 396 self.assertEqual([root[0]], list(root.iter('body'))) 397 self.assertEqual([root[0][0]], list(root.iter('p'))) 398 self.assertEqual([root[0][0][0]], list(root.iter('strong')))
399
401 parser = self.etree.HTMLParser() 402 parser.feed('<html><head>') 403 parser.feed('<title>TITLE</title><body><p>P</p></body><') 404 parser.feed("/html>") 405 root = parser.close() 406 407 self.assertEqual('html', root.tag) 408 # test that we find all names in the parser dict 409 self.assertEqual([root], list(root.iter('html'))) 410 self.assertEqual([root[0]], list(root.iter('head'))) 411 self.assertEqual([root[0][0]], list(root.iter('title'))) 412 self.assertEqual([root[1]], list(root.iter('body'))) 413 self.assertEqual([root[1][0]], list(root.iter('p')))
414
416 assertFalse = self.assertFalse 417 events = [] 418 class Target(object): 419 def start(self, tag, attrib): 420 events.append(("start", tag)) 421 assertFalse(attrib)
422 def end(self, tag): 423 events.append(("end", tag))
424 def close(self): 425 return "DONE" 426 427 parser = self.etree.HTMLParser(target=Target()) 428 429 parser.feed("<html><body></body></html>") 430 done = parser.close() 431 432 self.assertEqual("DONE", done) 433 self.assertEqual([ 434 ("start", "html"), ("start", "body"), 435 ("end", "body"), ("end", "html")], events) 436
def test_html_parser_target_doctype_empty(self):
    # Parser target with a doctype() callback, fed an empty
    # "<!DOCTYPE>" declaration: all three doctype arguments are
    # expected to be None.
    recorded = []
    check_no_attributes = self.assertFalse

    class Target(object):
        def start(self, tag, attrib):
            recorded.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            recorded.append(("end", tag))

        def doctype(self, *args):
            recorded.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    parser.feed("<!DOCTYPE><html><body></body></html>")
    result = parser.close()

    # close() on the parser hands back whatever Target.close() returned.
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", (None, None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, recorded)
def test_html_parser_target_doctype_html(self):
    # Parser target with a doctype() callback, fed "<!DOCTYPE html>":
    # the name is reported, public id and system url are None.
    recorded = []
    check_no_attributes = self.assertFalse

    class Target(object):
        def start(self, tag, attrib):
            recorded.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            recorded.append(("end", tag))

        def doctype(self, *args):
            recorded.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    parser.feed("<!DOCTYPE html><html><body></body></html>")
    result = parser.close()

    # close() on the parser hands back whatever Target.close() returned.
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", ("html", None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, recorded)
def test_html_parser_target_doctype_html_full(self):
    # Parser target with a doctype() callback, fed a complete
    # declaration: name, public id and system url are all reported.
    recorded = []
    check_no_attributes = self.assertFalse

    class Target(object):
        def start(self, tag, attrib):
            recorded.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            recorded.append(("end", tag))

        def doctype(self, *args):
            recorded.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
                '<html><body></body></html>')
    result = parser.close()

    # close() on the parser hands back whatever Target.close() returned.
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, recorded)
def test_html_parser_target_exceptions(self):
    # Target callbacks that raise: the ValueError from start() must
    # reach the caller of feed(), and end() must never be called for
    # the <html> element afterwards.
    events = []

    class Target(object):
        def start(self, tag, attrib):
            events.append(("start", tag))
            raise ValueError("START")

        def end(self, tag):
            events.append(("end", tag))
            raise TypeError("END")

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    try:
        parser.feed('<html><body>')
        parser.feed('</body></html>')
    except ValueError as exc:
        # unittest assertions instead of bare assert statements, which
        # are removed when Python runs with -O.
        self.assertTrue("START" in str(exc), str(exc))
    except TypeError as exc:
        self.assertTrue("END" in str(exc), str(exc))
        self.fail("wrong exception raised")
    else:
        self.fail("no exception raised")

    self.assertTrue(("start", "html") in events, events)
    self.assertTrue(("end", "html") not in events, events)
def test_html_fromstring_target_exceptions(self):
    # Same check as the feed() variant, but the document is parsed in
    # one go through fromstring(): the ValueError from start() must
    # propagate and end() must not be called for <html>.
    events = []

    class Target(object):
        def start(self, tag, attrib):
            events.append(("start", tag))
            raise ValueError("START")

        def end(self, tag):
            events.append(("end", tag))
            raise TypeError("END")

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    try:
        self.etree.fromstring('<html><body></body></html>', parser)
    except ValueError as exc:
        # unittest assertions instead of bare assert statements, which
        # are removed when Python runs with -O.
        self.assertTrue("START" in str(exc), str(exc))
    except TypeError as exc:
        self.assertTrue("END" in str(exc), str(exc))
        self.fail("wrong exception raised")
    else:
        self.fail("no exception raised")

    self.assertTrue(("start", "html") in events, events)
    self.assertTrue(("end", "html") not in events, events)
def test_set_decl_html(self):
    # Set both public id and system url on the docinfo of a fresh html
    # tree, then check the doctype string and the serialised document.
    doc = html.Element('html').getroottree()
    doc.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
    doc.docinfo.system_url = \
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"

    expected_doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
    self.assertEqual(doc.docinfo.doctype, expected_doctype)

    expected_document = _bytes('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"></html>''')
    self.assertEqual(self.etree.tostring(doc), expected_document)
573
def test_html5_doctype(self):
    # document type declaration with neither public id nor system url
    doc = html.Element('html').getroottree()
    doc.docinfo.public_id = None
    doc.docinfo.system_url = None

    doctype = doc.docinfo.doctype
    self.assertEqual(doctype, '<!DOCTYPE html>')
    self.assertTrue(doc.docinfo.public_id is None)

    serialised = self.etree.tostring(doc)
    self.assertEqual(serialised, _bytes('<!DOCTYPE html>\n<html/>'))
584
def test_ietf_decl(self):
    # legacy declaration with public id, no system url
    doc = html.Element('html').getroottree()
    doc.docinfo.public_id = '-//IETF//DTD HTML//EN'
    doc.docinfo.system_url = None

    doctype = doc.docinfo.doctype
    self.assertEqual(doctype,
                     '<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">')

    serialised = self.etree.tostring(doc)
    self.assertEqual(
        serialised,
        _bytes('<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">\n<html/>'))
594 595
def test_suite():
    """Return a TestSuite holding all tests of HtmlParserTestCase.

    Uses the default loader's loadTestsFromTestCase() rather than
    unittest.makeSuite(), which is deprecated since Python 3.11 and
    removed in Python 3.13.
    """
    suite = unittest.TestSuite()
    loader = unittest.defaultTestLoader
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite
if __name__ == '__main__':
    # Running this module directly only prints a hint naming test.py.
    usage_hint = 'to test use test.py %s' % __file__
    print(usage_hint)