Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir) # needed for Py3 
 13   
 14  from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase, write_to_file, next 
 16   
 17  try: 
 18      unicode 
 19  except NameError: 
 20      unicode = str 
 21   
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases
    """
    # The etree implementation under test, reached as ``self.etree`` by the tests.
    etree = etree

    # Well-formed reference document, serialised on a single line.
    html_str = _bytes(
        "<html><head><title>test</title></head>"
        "<body><h1>page title</h1></body></html>")

    # The same document as the tests expect it from pretty-printed HTML output.
    html_str_pretty = _bytes(
        "<html>\n"
        "<head><title>test</title></head>\n"
        "<body><h1>page title</h1></body>\n"
        "</html>\n")

    # Malformed variant: unclosed <title>/<head>, mismatched </h3>, stray </p>.
    broken_html_str = _bytes(
        "<html><head><title>test<body><h1>page title</h3></p></html>")

    # Unicode (decoded) document containing a non-ASCII character.
    uhtml_str = _bytes(
        "<html><head><title>test á</title></head>"
        "<body><h1>page á title</h1></body></html>"
    ).decode('utf8')

def tearDown(self):
    # Run the inherited cleanup first, then restore the module-wide default
    # parser so a test that replaced it cannot leak into the next test.
    super(HtmlParserTestCase, self).tearDown()
    restore_default_parser = self.etree.set_default_parser
    restore_default_parser()
43
def test_module_HTML(self):
    # Parsing the reference document with HTML() and serialising it back
    # as HTML must reproduce the input bytes.
    root = self.etree.HTML(self.html_str)
    serialised = self.etree.tostring(root, method="html")
    self.assertEqual(serialised, self.html_str)
48
def test_module_HTML_unicode(self):
    # A unicode document must round-trip through HTML()/tostring() and
    # expose its non-ASCII text unchanged via findtext().
    root = self.etree.HTML(self.uhtml_str)

    serialised = self.etree.tostring(
        root, method="html", encoding='unicode')
    self.assertEqual(serialised, self.uhtml_str)

    heading_text = root.findtext('.//h1')
    self.assertEqual(heading_text, _bytes("page á title").decode('utf8'))
56
def test_wide_unicode_xml(self):
    # A character outside the BMP must survive parsing as one code point.
    # Narrow unicode builds cannot represent it as a single character, so
    # the test returns early there.
    if sys.maxunicode < 1114111:
        return  # skip test

    # The literals keep the escape sequence as text; 'unicode_escape'
    # turns it into the actual character at run time.
    markup = _bytes(
        '<html><body><p>\\U00026007</p></body></html>'
    ).decode('unicode_escape')
    root = self.etree.HTML(markup)

    paragraph_text = root.findtext('.//p')
    self.assertEqual(1, len(paragraph_text))

    wide_char = _bytes('\\U00026007').decode('unicode_escape')
    self.assertEqual(wide_char, paragraph_text)
67
69 element = self.etree.HTML(self.html_str) 70 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 71 self.html_str_pretty)
72
74 parser = self.etree.HTMLParser(recover=False) 75 parse = self.etree.parse 76 f = BytesIO("<html></body>") 77 self.assertRaises(self.etree.XMLSyntaxError, 78 parse, f, parser)
79
81 parser = self.etree.HTMLParser() 82 Element = parser.makeelement 83 84 el = Element('name') 85 self.assertRaises(ValueError, Element, '{}') 86 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 87 88 self.assertRaises(ValueError, Element, '{test}') 89 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
90
92 parser = self.etree.HTMLParser() 93 Element = parser.makeelement 94 95 pname = Element('p:name') 96 self.assertEqual(pname.tag, 'p:name') 97 98 pname = Element('{test}p:name') 99 self.assertEqual(pname.tag, '{test}p:name') 100 101 pname = Element('name') 102 pname.tag = 'p:name' 103 self.assertEqual(pname.tag, 'p:name')
104
106 parser = self.etree.HTMLParser() 107 Element = parser.makeelement 108 109 self.assertRaises(ValueError, Element, 'p"name') 110 self.assertRaises(ValueError, Element, "na'me") 111 self.assertRaises(ValueError, Element, '{test}"name') 112 self.assertRaises(ValueError, Element, "{test}name'") 113 114 el = Element('name') 115 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 116 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 117 self.assertEqual(el.tag, "name")
118
120 parser = self.etree.HTMLParser() 121 Element = parser.makeelement 122 123 self.assertRaises(ValueError, Element, ' name ') 124 self.assertRaises(ValueError, Element, 'na me') 125 self.assertRaises(ValueError, Element, '{test} name') 126 127 el = Element('name') 128 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 129 self.assertEqual(el.tag, "name")
130
132 parser = self.etree.HTMLParser() 133 Element = parser.makeelement 134 135 SubElement = self.etree.SubElement 136 137 el = Element('name') 138 self.assertRaises(ValueError, SubElement, el, '{}') 139 self.assertRaises(ValueError, SubElement, el, '{test}')
140
142 parser = self.etree.HTMLParser() 143 Element = parser.makeelement 144 SubElement = self.etree.SubElement 145 146 el = Element('name') 147 pname = SubElement(el, 'p:name') 148 self.assertEqual(pname.tag, 'p:name') 149 150 pname = SubElement(el, '{test}p:name') 151 self.assertEqual(pname.tag, '{test}p:name')
152
154 parser = self.etree.HTMLParser() 155 Element = parser.makeelement 156 SubElement = self.etree.SubElement 157 158 el = Element('name') 159 self.assertRaises(ValueError, SubElement, el, "name'") 160 self.assertRaises(ValueError, SubElement, el, 'na"me') 161 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 162 self.assertRaises(ValueError, SubElement, el, '{test}"name')
163
165 parser = self.etree.HTMLParser() 166 Element = parser.makeelement 167 SubElement = self.etree.SubElement 168 169 el = Element('name') 170 self.assertRaises(ValueError, SubElement, el, ' name ') 171 self.assertRaises(ValueError, SubElement, el, 'na me') 172 self.assertRaises(ValueError, SubElement, el, '{test} name')
173
175 parser = self.etree.HTMLParser(recover=False) 176 parse = self.etree.parse 177 f = BytesIO(self.broken_html_str) 178 self.assertRaises(self.etree.XMLSyntaxError, 179 parse, f, parser)
180
182 text = _str('Søk på nettet') 183 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 184 185 tree = self.etree.parse( 186 BytesIO(html_latin1), 187 self.etree.HTMLParser(encoding="iso-8859-1")) 188 p = tree.find("//p") 189 self.assertEqual(p.text, text)
190
192 text = _str('Søk på nettet') 193 wrong_head = _str(''' 194 <head> 195 <meta http-equiv="Content-Type" 196 content="text/html; charset=UTF-8" /> 197 </head>''') 198 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 199 text) 200 ).encode('iso-8859-1') 201 202 self.assertRaises(self.etree.ParseError, 203 self.etree.parse, 204 BytesIO(html_latin1)) 205 206 tree = self.etree.parse( 207 BytesIO(html_latin1), 208 self.etree.HTMLParser(encoding="iso-8859-1")) 209 p = tree.find("//p") 210 self.assertEqual(p.text, text)
211
def test_module_HTML_broken(self):
    # HTML() is expected to repair the malformed fixture so that it
    # serialises exactly like the well-formed reference document.
    repaired_root = self.etree.HTML(self.broken_html_str)
    serialised = self.etree.tostring(repaired_root, method="html")
    self.assertEqual(serialised, self.html_str)
216
def test_module_HTML_cdata(self):
    # Original note: by default, libxml2 generates CDATA nodes for <script>
    # content.  This test exercises a <style> element -- presumably it gets
    # the same treatment (TODO confirm).  Either way the content must be
    # readable as plain .text.
    markup = _bytes('<html><head><style>foo</style></head></html>')
    root = self.etree.HTML(markup)
    style_element = root[0][0]
    self.assertEqual(style_element.text, "foo")
222
def test_module_HTML_access(self):
    # Index access on the parsed tree: first child of the first child of
    # the root must be the <title> element.
    root = self.etree.HTML(self.html_str)
    first_grandchild = root[0][0]
    self.assertEqual(first_grandchild.tag, 'title')
226
def test_module_parse_html(self):
    # Write the reference document to a real file, parse it through an
    # open binary file object with an HTMLParser, and check that it
    # serialises back to the same bytes.
    parser = self.etree.HTMLParser()

    # mkstemp() creates the file atomically; mktemp() only returns a
    # name, which is race-prone and deprecated.  The descriptor is closed
    # right away because write_to_file() opens the path itself.
    handle, filename = tempfile.mkstemp(suffix=".html")
    os.close(handle)
    try:
        write_to_file(filename, self.html_str, 'wb')
        f = open(filename, 'rb')
        try:
            tree = self.etree.parse(f, parser)
        finally:
            # Always release the handle, even if parsing fails; an open
            # handle would also block os.remove() on some platforms.
            f.close()
        self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                         self.html_str)
    finally:
        os.remove(filename)
239
241 parser = self.etree.HTMLParser() 242 f = SillyFileLike(self.html_str) 243 tree = self.etree.parse(f, parser) 244 html = self.etree.tostring(tree.getroot(), 245 method="html", encoding='UTF-8') 246 self.assertEqual(html, self.html_str)
247 248 ## def test_module_parse_html_filelike_unicode(self): 249 ## parser = self.etree.HTMLParser() 250 ## f = SillyFileLike(self.uhtml_str) 251 ## tree = self.etree.parse(f, parser) 252 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 253 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 254
def test_html_file_error(self):
    # Parsing a path that does not exist must raise IOError.
    parser = self.etree.HTMLParser()
    missing_path = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, self.etree.parse, missing_path, parser)
261
263 self.assertRaises(self.etree.XMLSyntaxError, 264 self.etree.parse, BytesIO(self.broken_html_str)) 265 266 self.etree.set_default_parser( self.etree.HTMLParser() ) 267 268 tree = self.etree.parse(BytesIO(self.broken_html_str)) 269 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 270 self.html_str) 271 272 self.etree.set_default_parser() 273 274 self.assertRaises(self.etree.XMLSyntaxError, 275 self.etree.parse, BytesIO(self.broken_html_str))
276
def test_html_iterparse(self):
    # iterparse(html=True) must report one 'end' event per element, in
    # closing order, and expose the root only after iteration finished.
    source = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')

    iterator = self.etree.iterparse(source, html=True)
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)

    head, body = root[0], root[1]
    closing_order = (head[0], head, body[0], body, root)
    expected = [('end', element) for element in closing_order]
    self.assertEqual(expected, events)
292
294 iterparse = self.etree.iterparse 295 f = BytesIO( 296 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 297 298 iterator = iterparse(f, html=True) 299 self.assertEqual(None, iterator.root) 300 301 event, element = next(iterator) 302 self.assertEqual('end', event) 303 self.assertEqual('title', element.tag) 304 self.assertEqual(None, iterator.root) 305 del element 306 307 event, element = next(iterator) 308 self.assertEqual('end', event) 309 self.assertEqual('head', element.tag) 310 self.assertEqual(None, iterator.root) 311 del element 312 del iterator
313
315 iterparse = self.etree.iterparse 316 f = BytesIO('<head><title>TEST></head><p>P<br></div>') 317 318 iterator = iterparse(f, html=True) 319 self.assertEqual(None, iterator.root) 320 321 events = list(iterator) 322 root = iterator.root 323 self.assertTrue(root is not None) 324 self.assertEqual('html', root.tag) 325 self.assertEqual('head', root[0].tag) 326 self.assertEqual('body', root[1].tag) 327 self.assertEqual('p', root[1][0].tag) 328 self.assertEqual('br', root[1][0][0].tag) 329 self.assertEqual( 330 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]), 331 ('end', root[1][0]), ('end', root[1]), ('end', root)], 332 events)
333
335 iterparse = self.etree.iterparse 336 f = BytesIO('<p>P<br></div>') 337 iterator = iterparse(f, html=True, recover=False) 338 self.assertRaises(self.etree.XMLSyntaxError, list, iterator)
339
def test_html_iterparse_file(self):
    # Iteratively parse the bundled shakespeare.html from disk: the root
    # appears only after iteration, exactly 249 events are produced, and
    # every one of them is an 'end' event.
    parse_iteratively = self.etree.iterparse
    iterator = parse_iteratively(fileInTestDir("shakespeare.html"), html=True)

    self.assertEqual(None, iterator.root)
    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)
    self.assertEqual(249, len(events))

    unexpected_kinds = [kind for (kind, _element) in events if kind != 'end']
    self.assertFalse(unexpected_kinds)
352
354 iterparse = self.etree.iterparse 355 f = BytesIO( 356 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 357 358 iterator = iterparse(f, html=True, events=('start',)) 359 self.assertEqual(None, iterator.root) 360 361 events = list(iterator) 362 root = iterator.root 363 self.assertNotEqual(None, root) 364 self.assertEqual( 365 [('start', root), ('start', root[0]), ('start', root[0][0]), 366 ('start', root[1]), ('start', root[1][0])], 367 events)
368
def test_html_feed_parser(self):
    # Feed a document in two chunks, splitting the "</body>" tag across
    # the chunk boundary, and check the resulting tree.
    parser = self.etree.HTMLParser()
    for chunk in ("<html><body></", "body></html>"):
        parser.feed(chunk)
    root = parser.close()

    self.assertEqual('html', root.tag)
    # test that we find all names in the parser dict
    html_matches = list(root.iter('html'))
    self.assertEqual([root], html_matches)
    body_matches = list(root.iter('body'))
    self.assertEqual([root[0]], body_matches)
379
381 parser = self.etree.HTMLParser() 382 parser.feed("<htm") 383 parser.feed("l><body") 384 parser.feed("><") 385 parser.feed("p><") 386 parser.feed("strong") 387 parser.feed(">some ") 388 parser.feed("text</strong></p><") 389 parser.feed("/body></html>") 390 root = parser.close() 391 392 self.assertEqual('html', root.tag) 393 # test that we find all names in the parser dict 394 self.assertEqual([root], list(root.iter('html'))) 395 self.assertEqual([root[0]], list(root.iter('body'))) 396 self.assertEqual([root[0][0]], list(root.iter('p'))) 397 self.assertEqual([root[0][0][0]], list(root.iter('strong')))
398
400 parser = self.etree.HTMLParser() 401 parser.feed('<html><head>') 402 parser.feed('<title>TITLE</title><body><p>P</p></body><') 403 parser.feed("/html>") 404 root = parser.close() 405 406 self.assertEqual('html', root.tag) 407 # test that we find all names in the parser dict 408 self.assertEqual([root], list(root.iter('html'))) 409 self.assertEqual([root[0]], list(root.iter('head'))) 410 self.assertEqual([root[0][0]], list(root.iter('title'))) 411 self.assertEqual([root[1]], list(root.iter('body'))) 412 self.assertEqual([root[1][0]], list(root.iter('p')))
413
415 assertFalse = self.assertFalse 416 events = [] 417 class Target(object): 418 def start(self, tag, attrib): 419 events.append(("start", tag)) 420 assertFalse(attrib)
421 def end(self, tag): 422 events.append(("end", tag))
423 def close(self): 424 return "DONE" 425 426 parser = self.etree.HTMLParser(target=Target()) 427 428 parser.feed("<html><body></body></html>") 429 done = parser.close() 430 431 self.assertEqual("DONE", done) 432 self.assertEqual([ 433 ("start", "html"), ("start", "body"), 434 ("end", "body"), ("end", "html")], events) 435
def test_html_parser_target_doctype_empty(self):
    # A bare "<!DOCTYPE>" must reach the target's doctype() callback as
    # (None, None, None), recorded before the element start/end events.
    check_no_attributes = self.assertFalse
    seen = []

    class RecordingTarget(object):
        def start(self, tag, attrib):
            seen.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            seen.append(("end", tag))

        def doctype(self, *args):
            seen.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=RecordingTarget())
    parser.feed("<!DOCTYPE><html><body></body></html>")
    result = parser.close()

    # close() hands back whatever the target's close() returned.
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", (None, None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, seen)

def test_html_parser_target_doctype_html(self):
    # "<!DOCTYPE html>" must reach the target's doctype() callback as
    # ("html", None, None), recorded before the element start/end events.
    check_no_attributes = self.assertFalse
    seen = []

    class RecordingTarget(object):
        def start(self, tag, attrib):
            seen.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            seen.append(("end", tag))

        def doctype(self, *args):
            seen.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=RecordingTarget())
    parser.feed("<!DOCTYPE html><html><body></body></html>")
    result = parser.close()

    # close() hands back whatever the target's close() returned.
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", ("html", None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, seen)

def test_html_parser_target_doctype_html_full(self):
    # A DOCTYPE carrying both a public and a system identifier must reach
    # the target's doctype() callback as (name, public id, system id),
    # recorded before the element start/end events.
    check_no_attributes = self.assertFalse
    seen = []

    class RecordingTarget(object):
        def start(self, tag, attrib):
            seen.append(("start", tag))
            check_no_attributes(attrib)

        def end(self, tag):
            seen.append(("end", tag))

        def doctype(self, *args):
            seen.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=RecordingTarget())
    document = ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
                '<html><body></body></html>')
    parser.feed(document)
    result = parser.close()

    # close() hands back whatever the target's close() returned.
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, seen)


def test_suite():
    """Build the unittest suite holding all HtmlParserTestCase tests.

    Returns:
        A ``unittest.TestSuite`` with every ``test*`` method of
        ``HtmlParserTestCase``.
    """
    suite = unittest.TestSuite()
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in
    # 3.13.  A fresh TestLoader with default settings is what makeSuite()
    # used internally, so the collected tests are the same.
    loader = unittest.TestLoader()
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite

if __name__ == '__main__':
    # Running this module directly only prints a hint naming the runner
    # script; it does not execute any tests.
    usage_hint = 'to test use test.py %s' % __file__
    print(usage_hint)