Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir) # needed for Py3 
 13   
 14  from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase, write_to_file, next 
 16   
 17  try: 
 18      unicode 
 19  except NameError: 
 20      unicode = str 
 21   
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases

    The class attributes are the fixtures shared by the test methods:
    a compact document, the pretty-printed form it is compared against,
    a malformed variant and a text (unicode) variant.
    """
    # etree module the tests reach through ``self.etree``
    etree = etree

    # compact, well-formed document
    html_str = _bytes(
        "<html><head><title>test</title></head>"
        "<body><h1>page title</h1></body></html>")

    # the same document, one top-level element per line
    # NOTE(review): the leading whitespace of these lines could not be
    # recovered from the listing; none is assumed -- confirm against the
    # upstream file.
    html_str_pretty = _bytes(
        "<html>\n"
        "<head><title>test</title></head>\n"
        "<body><h1>page title</h1></body>\n"
        "</html>\n")

    # unclosed <title>, missing </head>, mismatched </h3> and stray </p>
    broken_html_str = _bytes(
        "<html><head><title>test"
        "<body><h1>page title</h3></p></html>")

    # document with a non-ASCII character, decoded to a text string
    uhtml_str = _bytes(
        "<html><head><title>test á</title></head>"
        "<body><h1>page á title</h1></body></html>"
    ).decode('utf8')
def tearDown(self):
    # Run the inherited clean-up first, then call set_default_parser()
    # without an argument -- presumably to restore the module default
    # after a test has installed its own parser.
    super(HtmlParserTestCase, self).tearDown()
    reset_default_parser = self.etree.set_default_parser
    reset_default_parser()
43
def test_module_HTML(self):
    # Parsing html_str with etree.HTML() and serialising the result with
    # method="html" must give back the original bytes.
    root = self.etree.HTML(self.html_str)
    serialised = self.etree.tostring(root, method="html")
    self.assertEqual(serialised, self.html_str)
48
def test_module_HTML_unicode(self):
    # Same round trip as for bytes, but with text input and
    # encoding='unicode' output; the non-ASCII heading must also be
    # readable through findtext().
    root = self.etree.HTML(self.uhtml_str)
    as_text = self.etree.tostring(
        root, method="html", encoding='unicode')
    self.assertEqual(as_text, self.uhtml_str)

    heading = root.findtext('.//h1')
    expected_heading = _bytes("page á title").decode('utf8')
    self.assertEqual(heading, expected_heading)
56
def test_wide_unicode_xml(self):
    # Only meaningful when sys.maxunicode covers the full code point
    # range; otherwise the test returns without checking anything.
    if sys.maxunicode < 0x10FFFF:
        return  # skip test

    markup = _bytes(
        '<html><body><p>\\U00026007</p></body></html>'
    ).decode('unicode_escape')
    root = self.etree.HTML(markup)
    paragraph_text = root.findtext('.//p')

    # the character above the BMP must come back as a single character
    self.assertEqual(1, len(paragraph_text))
    wide_char = _bytes('\\U00026007').decode('unicode_escape')
    self.assertEqual(wide_char, paragraph_text)
67
69 element = self.etree.HTML(self.html_str) 70 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 71 self.html_str_pretty)
72
74 parser = self.etree.HTMLParser(recover=False) 75 parse = self.etree.parse 76 f = BytesIO("<html></body>") 77 self.assertRaises(self.etree.XMLSyntaxError, 78 parse, f, parser)
79
81 parser = self.etree.HTMLParser() 82 Element = parser.makeelement 83 84 el = Element('name') 85 self.assertRaises(ValueError, Element, '{}') 86 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 87 88 self.assertRaises(ValueError, Element, '{test}') 89 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
90
92 parser = self.etree.HTMLParser() 93 Element = parser.makeelement 94 95 pname = Element('p:name') 96 self.assertEqual(pname.tag, 'p:name') 97 98 pname = Element('{test}p:name') 99 self.assertEqual(pname.tag, '{test}p:name') 100 101 pname = Element('name') 102 pname.tag = 'p:name' 103 self.assertEqual(pname.tag, 'p:name')
104
106 parser = self.etree.HTMLParser() 107 Element = parser.makeelement 108 109 self.assertRaises(ValueError, Element, 'p"name') 110 self.assertRaises(ValueError, Element, "na'me") 111 self.assertRaises(ValueError, Element, '{test}"name') 112 self.assertRaises(ValueError, Element, "{test}name'") 113 114 el = Element('name') 115 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 116 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 117 self.assertEqual(el.tag, "name")
118
120 parser = self.etree.HTMLParser() 121 Element = parser.makeelement 122 123 self.assertRaises(ValueError, Element, ' name ') 124 self.assertRaises(ValueError, Element, 'na me') 125 self.assertRaises(ValueError, Element, '{test} name') 126 127 el = Element('name') 128 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 129 self.assertEqual(el.tag, "name")
130
132 parser = self.etree.HTMLParser() 133 Element = parser.makeelement 134 135 SubElement = self.etree.SubElement 136 137 el = Element('name') 138 self.assertRaises(ValueError, SubElement, el, '{}') 139 self.assertRaises(ValueError, SubElement, el, '{test}')
140
142 parser = self.etree.HTMLParser() 143 Element = parser.makeelement 144 SubElement = self.etree.SubElement 145 146 el = Element('name') 147 pname = SubElement(el, 'p:name') 148 self.assertEqual(pname.tag, 'p:name') 149 150 pname = SubElement(el, '{test}p:name') 151 self.assertEqual(pname.tag, '{test}p:name')
152
154 parser = self.etree.HTMLParser() 155 Element = parser.makeelement 156 SubElement = self.etree.SubElement 157 158 el = Element('name') 159 self.assertRaises(ValueError, SubElement, el, "name'") 160 self.assertRaises(ValueError, SubElement, el, 'na"me') 161 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 162 self.assertRaises(ValueError, SubElement, el, '{test}"name')
163
165 parser = self.etree.HTMLParser() 166 Element = parser.makeelement 167 SubElement = self.etree.SubElement 168 169 el = Element('name') 170 self.assertRaises(ValueError, SubElement, el, ' name ') 171 self.assertRaises(ValueError, SubElement, el, 'na me') 172 self.assertRaises(ValueError, SubElement, el, '{test} name')
173
175 parser = self.etree.HTMLParser(recover=False) 176 parse = self.etree.parse 177 f = BytesIO(self.broken_html_str) 178 self.assertRaises(self.etree.XMLSyntaxError, 179 parse, f, parser)
180
182 text = _str('Søk på nettet') 183 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 184 185 tree = self.etree.parse( 186 BytesIO(html_latin1), 187 self.etree.HTMLParser(encoding="iso-8859-1")) 188 p = tree.find("//p") 189 self.assertEqual(p.text, text)
190
192 text = _str('Søk på nettet') 193 wrong_head = _str(''' 194 <head> 195 <meta http-equiv="Content-Type" 196 content="text/html; charset=UTF-8" /> 197 </head>''') 198 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 199 text) 200 ).encode('iso-8859-1') 201 202 self.assertRaises(self.etree.ParseError, 203 self.etree.parse, 204 BytesIO(html_latin1)) 205 206 tree = self.etree.parse( 207 BytesIO(html_latin1), 208 self.etree.HTMLParser(encoding="iso-8859-1")) 209 p = tree.find("//p") 210 self.assertEqual(p.text, text)
211
def test_module_HTML_broken(self):
    # etree.HTML() on the malformed fixture must serialise to exactly
    # the well-formed html_str.
    root = self.etree.HTML(self.broken_html_str)
    repaired = self.etree.tostring(root, method="html")
    self.assertEqual(repaired, self.html_str)
216
def test_module_HTML_cdata(self):
    # by default, libxml2 generates CDATA nodes for <script> content;
    # this test uses a <style> element and checks that its content is
    # available through .text
    markup = _bytes('<html><head><style>foo</style></head></html>')
    root = self.etree.HTML(markup)
    style = root[0][0]
    self.assertEqual(style.text, "foo")
222
def test_module_HTML_access(self):
    # The first grandchild of the parsed root (html -> head -> title)
    # must be reachable by index.
    root = self.etree.HTML(self.html_str)
    first_grandchild = root[0][0]
    self.assertEqual(first_grandchild.tag, 'title')
226
def test_module_parse_html(self):
    # Write html_str to a temporary file, parse it back through an open
    # file object with an HTMLParser and check that the serialised tree
    # equals the original bytes.
    parser = self.etree.HTMLParser()

    # mkstemp() creates the file itself; the deprecated mktemp() only
    # returns a name, which another process could claim before use.
    fd, filename = tempfile.mkstemp(suffix=".html")
    os.close(fd)
    try:
        write_to_file(filename, self.html_str, 'wb')
        f = open(filename, 'rb')
        try:
            tree = self.etree.parse(f, parser)
        finally:
            # close even when parsing fails, so the handle is not leaked
            # and the file can be removed below
            f.close()
        self.assertEqual(
            self.etree.tostring(tree.getroot(), method="html"),
            self.html_str)
    finally:
        os.remove(filename)
239
241 parser = self.etree.HTMLParser() 242 f = SillyFileLike(self.html_str) 243 tree = self.etree.parse(f, parser) 244 html = self.etree.tostring(tree.getroot(), 245 method="html", encoding='UTF-8') 246 self.assertEqual(html, self.html_str)
247 248 ## def test_module_parse_html_filelike_unicode(self): 249 ## parser = self.etree.HTMLParser() 250 ## f = SillyFileLike(self.uhtml_str) 251 ## tree = self.etree.parse(f, parser) 252 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 253 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 254
def test_html_file_error(self):
    # Parsing a path that does not exist must raise IOError.
    parser = self.etree.HTMLParser()
    parse = self.etree.parse
    missing_path = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, parse, missing_path, parser)
261
263 self.assertRaises(self.etree.XMLSyntaxError, 264 self.etree.parse, BytesIO(self.broken_html_str)) 265 266 self.etree.set_default_parser( self.etree.HTMLParser() ) 267 268 tree = self.etree.parse(BytesIO(self.broken_html_str)) 269 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 270 self.html_str) 271 272 self.etree.set_default_parser() 273 274 self.assertRaises(self.etree.XMLSyntaxError, 275 self.etree.parse, BytesIO(self.broken_html_str))
276
def test_html_iterparse(self):
    # With html=True and no explicit events, iterparse() must report one
    # 'end' event per element in closing order, and expose the root only
    # after the input has been consumed.
    iterparse = self.etree.iterparse
    source = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')

    iterator = iterparse(source, html=True)
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)

    expected = [
        ('end', root[0][0]),
        ('end', root[0]),
        ('end', root[1][0]),
        ('end', root[1]),
        ('end', root),
    ]
    self.assertEqual(expected, events)
292
294 iterparse = self.etree.iterparse 295 f = BytesIO( 296 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 297 298 iterator = iterparse(f, html=True) 299 self.assertEqual(None, iterator.root) 300 301 event, element = next(iterator) 302 self.assertEqual('end', event) 303 self.assertEqual('title', element.tag) 304 self.assertEqual(None, iterator.root) 305 del element 306 307 event, element = next(iterator) 308 self.assertEqual('end', event) 309 self.assertEqual('head', element.tag) 310 self.assertEqual(None, iterator.root) 311 del element 312 del iterator
313
315 iterparse = self.etree.iterparse 316 f = BytesIO('<head><title>TEST></head><p>P<br></div>') 317 318 iterator = iterparse(f, html=True) 319 self.assertEqual(None, iterator.root) 320 321 events = list(iterator) 322 root = iterator.root 323 self.assertTrue(root is not None) 324 self.assertEqual('html', root.tag) 325 self.assertEqual('head', root[0].tag) 326 self.assertEqual('body', root[1].tag) 327 self.assertEqual('p', root[1][0].tag) 328 self.assertEqual('br', root[1][0][0].tag) 329 self.assertEqual( 330 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]), 331 ('end', root[1][0]), ('end', root[1]), ('end', root)], 332 events)
333
335 iterparse = self.etree.iterparse 336 f = BytesIO('<p>P<br></div>') 337 iterator = iterparse(f, html=True, recover=False) 338 self.assertRaises(self.etree.XMLSyntaxError, list, iterator)
339
def test_html_iterparse_file(self):
    # iterparse() over the shakespeare.html fixture must yield 249
    # events, all of them 'end' events.
    iterparse = self.etree.iterparse
    iterator = iterparse(fileInTestDir("shakespeare.html"), html=True)

    self.assertEqual(None, iterator.root)
    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)
    self.assertEqual(249, len(events))

    other_kinds = [kind for (kind, _element) in events if kind != 'end']
    self.assertEqual([], other_kinds)
353
355 iterparse = self.etree.iterparse 356 f = BytesIO( 357 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 358 359 iterator = iterparse(f, html=True, events=('start',)) 360 self.assertEqual(None, iterator.root) 361 362 events = list(iterator) 363 root = iterator.root 364 self.assertTrue(root is not None) 365 self.assertEqual( 366 [('start', root), ('start', root[0]), ('start', root[0][0]), 367 ('start', root[1]), ('start', root[1][0])], 368 events)
369
371 assertFalse = self.assertFalse 372 events = [] 373 class Target(object): 374 def start(self, tag, attrib): 375 events.append(("start", tag)) 376 assertFalse(attrib)
377 def end(self, tag): 378 events.append(("end", tag))
379 def close(self): 380 return "DONE" 381 382 parser = self.etree.HTMLParser(target=Target()) 383 384 parser.feed("<html><body></body></html>") 385 done = parser.close() 386 387 self.assertEqual("DONE", done) 388 self.assertEqual([ 389 ("start", "html"), ("start", "body"), 390 ("end", "body"), ("end", "html")], events) 391
def test_html_parser_target_doctype_empty(self):
    # A bare "<!DOCTYPE>" must reach the target's doctype() callback as
    # (None, None, None), ahead of the element start/end events.
    seen = []
    record = seen.append
    expect_no_attributes = self.assertFalse

    class RecordingTarget(object):
        def start(self, tag, attrib):
            record(("start", tag))
            expect_no_attributes(attrib)

        def end(self, tag):
            record(("end", tag))

        def doctype(self, *args):
            record(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=RecordingTarget())
    parser.feed("<!DOCTYPE><html><body></body></html>")
    result = parser.close()

    # close() hands back whatever the target's close() returned
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", (None, None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, seen)
def test_html_parser_target_doctype_html(self):
    # "<!DOCTYPE html>" must reach the target's doctype() callback as
    # ("html", None, None), ahead of the element start/end events.
    seen = []
    record = seen.append
    expect_no_attributes = self.assertFalse

    class RecordingTarget(object):
        def start(self, tag, attrib):
            record(("start", tag))
            expect_no_attributes(attrib)

        def end(self, tag):
            record(("end", tag))

        def doctype(self, *args):
            record(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=RecordingTarget())
    parser.feed("<!DOCTYPE html><html><body></body></html>")
    result = parser.close()

    # close() hands back whatever the target's close() returned
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", ("html", None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, seen)
def test_html_parser_target_doctype_html_full(self):
    # A DOCTYPE with public and system identifiers must reach the
    # target's doctype() callback as (name, public id, system id),
    # ahead of the element start/end events.
    seen = []
    record = seen.append
    expect_no_attributes = self.assertFalse

    class RecordingTarget(object):
        def start(self, tag, attrib):
            record(("start", tag))
            expect_no_attributes(attrib)

        def end(self, tag):
            record(("end", tag))

        def doctype(self, *args):
            record(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=RecordingTarget())
    parser.feed(
        '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
        '<html><body></body></html>')
    result = parser.close()

    # close() hands back whatever the target's close() returned
    self.assertEqual("DONE", result)
    expected = [
        ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html"),
    ]
    self.assertEqual(expected, seen)
def test_suite():
    """Build the test suite for this module.

    Returns:
        A ``unittest.TestSuite`` holding the tests collected from
        ``HtmlParserTestCase``.
    """
    suite = unittest.TestSuite()
    # unittest.makeSuite() is deprecated since Python 3.11 and removed
    # in 3.13; the default loader collects the same "test*" methods and
    # is available on every supported Python version.
    loader = unittest.defaultTestLoader
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite
if __name__ == '__main__':
    # Running this module directly only prints a hint pointing at test.py.
    hint = 'to test use test.py %s' % __file__
    print(hint)