Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames] | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir) # needed for Py3 
 13   
 14  from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase, write_to_file 
 16   
 17  try: 
 18      unicode = __builtins__["unicode"] 
 19  except (NameError, KeyError): 
 20      unicode = str 
 21   
22 -class HtmlParserTestCase(HelperTestCase):
23 """HTML parser test cases 24 """ 25 etree = etree 26 27 html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>") 28 html_str_pretty = _bytes("""\ 29 <html> 30 <head><title>test</title></head> 31 <body><h1>page title</h1></body> 32 </html> 33 """) 34 broken_html_str = _bytes("<html><head><title>test<body><h1>page title</h3></p></html>") 35 uhtml_str = _str("<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>") 36
37 - def tearDown(self):
38 super(HtmlParserTestCase, self).tearDown() 39 self.etree.set_default_parser()
40
41 - def test_module_HTML(self):
42 element = self.etree.HTML(self.html_str) 43 self.assertEqual(self.etree.tostring(element, method="html"), 44 self.html_str)
45
46 - def test_module_HTML_unicode(self):
47 element = self.etree.HTML(self.uhtml_str) 48 self.assertEqual(unicode(self.etree.tostring(element, method="html", 49 encoding='UTF8'), 'UTF8'), 50 unicode(self.uhtml_str.encode('UTF8'), 'UTF8'))
51
53 element = self.etree.HTML(self.html_str) 54 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 55 self.html_str_pretty)
56
58 parser = self.etree.HTMLParser(recover=False) 59 parse = self.etree.parse 60 f = BytesIO("<html></body>") 61 self.assertRaises(self.etree.XMLSyntaxError, 62 parse, f, parser)
63
65 parser = self.etree.HTMLParser() 66 Element = parser.makeelement 67 68 el = Element('name') 69 self.assertRaises(ValueError, Element, '{}') 70 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 71 72 self.assertRaises(ValueError, Element, '{test}') 73 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
74
76 parser = self.etree.HTMLParser() 77 Element = parser.makeelement 78 79 pname = Element('p:name') 80 self.assertEquals(pname.tag, 'p:name') 81 82 pname = Element('{test}p:name') 83 self.assertEquals(pname.tag, '{test}p:name') 84 85 pname = Element('name') 86 pname.tag = 'p:name' 87 self.assertEquals(pname.tag, 'p:name')
88
90 parser = self.etree.HTMLParser() 91 Element = parser.makeelement 92 93 self.assertRaises(ValueError, Element, 'p"name') 94 self.assertRaises(ValueError, Element, "na'me") 95 self.assertRaises(ValueError, Element, '{test}"name') 96 self.assertRaises(ValueError, Element, "{test}name'") 97 98 el = Element('name') 99 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 100 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 101 self.assertEquals(el.tag, "name")
102
104 parser = self.etree.HTMLParser() 105 Element = parser.makeelement 106 107 self.assertRaises(ValueError, Element, ' name ') 108 self.assertRaises(ValueError, Element, 'na me') 109 self.assertRaises(ValueError, Element, '{test} name') 110 111 el = Element('name') 112 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 113 self.assertEquals(el.tag, "name")
114
116 parser = self.etree.HTMLParser() 117 Element = parser.makeelement 118 119 SubElement = self.etree.SubElement 120 121 el = Element('name') 122 self.assertRaises(ValueError, SubElement, el, '{}') 123 self.assertRaises(ValueError, SubElement, el, '{test}')
124
126 parser = self.etree.HTMLParser() 127 Element = parser.makeelement 128 SubElement = self.etree.SubElement 129 130 el = Element('name') 131 pname = SubElement(el, 'p:name') 132 self.assertEquals(pname.tag, 'p:name') 133 134 pname = SubElement(el, '{test}p:name') 135 self.assertEquals(pname.tag, '{test}p:name')
136
138 parser = self.etree.HTMLParser() 139 Element = parser.makeelement 140 SubElement = self.etree.SubElement 141 142 el = Element('name') 143 self.assertRaises(ValueError, SubElement, el, "name'") 144 self.assertRaises(ValueError, SubElement, el, 'na"me') 145 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 146 self.assertRaises(ValueError, SubElement, el, '{test}"name')
147
149 parser = self.etree.HTMLParser() 150 Element = parser.makeelement 151 SubElement = self.etree.SubElement 152 153 el = Element('name') 154 self.assertRaises(ValueError, SubElement, el, ' name ') 155 self.assertRaises(ValueError, SubElement, el, 'na me') 156 self.assertRaises(ValueError, SubElement, el, '{test} name')
157
159 parser = self.etree.HTMLParser(recover=False) 160 parse = self.etree.parse 161 f = BytesIO(self.broken_html_str) 162 self.assertRaises(self.etree.XMLSyntaxError, 163 parse, f, parser)
164
166 text = _str('Søk på nettet') 167 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 168 169 tree = self.etree.parse( 170 BytesIO(html_latin1), 171 self.etree.HTMLParser(encoding="iso-8859-1")) 172 p = tree.find("//p") 173 self.assertEquals(p.text, text)
174
176 text = _str('Søk på nettet') 177 wrong_head = _str(''' 178 <head> 179 <meta http-equiv="Content-Type" 180 content="text/html; charset=UTF-8" /> 181 </head>''') 182 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 183 text) 184 ).encode('iso-8859-1') 185 186 self.assertRaises(self.etree.ParseError, 187 self.etree.parse, 188 BytesIO(html_latin1)) 189 190 tree = self.etree.parse( 191 BytesIO(html_latin1), 192 self.etree.HTMLParser(encoding="iso-8859-1")) 193 p = tree.find("//p") 194 self.assertEquals(p.text, text)
195
196 - def test_module_HTML_broken(self):
197 element = self.etree.HTML(self.broken_html_str) 198 self.assertEqual(self.etree.tostring(element, method="html"), 199 self.html_str)
200
201 - def test_module_HTML_cdata(self):
202 # by default, libxml2 generates CDATA nodes for <script> content 203 html = _bytes('<html><head><style>foo</style></head></html>') 204 element = self.etree.HTML(html) 205 self.assertEquals(element[0][0].text, "foo")
206
207 - def test_module_HTML_access(self):
208 element = self.etree.HTML(self.html_str) 209 self.assertEqual(element[0][0].tag, 'title')
210
211 - def test_module_parse_html(self):
212 parser = self.etree.HTMLParser() 213 filename = tempfile.mktemp(suffix=".html") 214 write_to_file(filename, self.html_str, 'wb') 215 try: 216 f = open(filename, 'rb') 217 tree = self.etree.parse(f, parser) 218 f.close() 219 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 220 self.html_str) 221 finally: 222 os.remove(filename)
223
225 parser = self.etree.HTMLParser() 226 f = SillyFileLike(self.html_str) 227 tree = self.etree.parse(f, parser) 228 html = self.etree.tostring(tree.getroot(), 229 method="html", encoding='UTF-8') 230 self.assertEqual(html, self.html_str)
231 232 ## def test_module_parse_html_filelike_unicode(self): 233 ## parser = self.etree.HTMLParser() 234 ## f = SillyFileLike(self.uhtml_str) 235 ## tree = self.etree.parse(f, parser) 236 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 237 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 238
239 - def test_html_file_error(self):
240 parser = self.etree.HTMLParser() 241 parse = self.etree.parse 242 self.assertRaises(IOError, 243 parse, "__some_hopefully_nonexisting_file__.html", 244 parser)
245
247 self.assertRaises(self.etree.XMLSyntaxError, 248 self.etree.parse, BytesIO(self.broken_html_str)) 249 250 self.etree.set_default_parser( self.etree.HTMLParser() ) 251 252 tree = self.etree.parse(BytesIO(self.broken_html_str)) 253 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 254 self.html_str) 255 256 self.etree.set_default_parser() 257 258 self.assertRaises(self.etree.XMLSyntaxError, 259 self.etree.parse, BytesIO(self.broken_html_str))
260
261 - def test_html_iterparse(self):
262 iterparse = self.etree.iterparse 263 f = BytesIO( 264 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 265 266 iterator = iterparse(f, html=True) 267 self.assertEquals(None, iterator.root) 268 269 events = list(iterator) 270 root = iterator.root 271 self.assert_(root is not None) 272 self.assertEquals( 273 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]), 274 ('end', root[1]), ('end', root)], 275 events)
276
277 - def test_html_iterparse_file(self):
278 iterparse = self.etree.iterparse 279 iterator = iterparse(fileInTestDir("css_shakespear.html"), 280 html=True) 281 282 self.assertEquals(None, iterator.root) 283 events = list(iterator) 284 root = iterator.root 285 self.assert_(root is not None) 286 self.assertEquals(249, len(events)) 287 self.assertEquals( 288 [], 289 [ event for (event, element) in events if event != 'end' ])
290
292 iterparse = self.etree.iterparse 293 f = BytesIO( 294 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 295 296 iterator = iterparse(f, html=True, events=('start',)) 297 self.assertEquals(None, iterator.root) 298 299 events = list(iterator) 300 root = iterator.root 301 self.assert_(root is not None) 302 self.assertEquals( 303 [('start', root), ('start', root[0]), ('start', root[0][0]), 304 ('start', root[1]), ('start', root[1][0])], 305 events)
306
308 assertFalse = self.assertFalse 309 events = [] 310 class Target(object): 311 def start(self, tag, attrib): 312 events.append(("start", tag)) 313 assertFalse(attrib)
314 def end(self, tag): 315 events.append(("end", tag))
316 def close(self): 317 return "DONE" 318 319 parser = self.etree.HTMLParser(target=Target()) 320 321 parser.feed("<html><body></body></html>") 322 done = parser.close() 323 324 self.assertEquals("DONE", done) 325 self.assertEquals([ 326 ("start", "html"), ("start", "body"), 327 ("end", "body"), ("end", "html")], events) 328
329 - def test_html_parser_target_doctype_empty(self):
330 assertFalse = self.assertFalse 331 events = [] 332 class Target(object): 333 def start(self, tag, attrib): 334 events.append(("start", tag)) 335 assertFalse(attrib)
336 def end(self, tag): 337 events.append(("end", tag)) 338 def doctype(self, *args): 339 events.append(("doctype", args)) 340 def close(self): 341 return "DONE" 342 343 parser = self.etree.HTMLParser(target=Target()) 344 parser.feed("<!DOCTYPE><html><body></body></html>") 345 done = parser.close() 346 347 self.assertEquals("DONE", done) 348 self.assertEquals([ 349 ("doctype", (None, None, None)), 350 ("start", "html"), ("start", "body"), 351 ("end", "body"), ("end", "html")], events) 352
353 - def test_html_parser_target_doctype_html(self):
354 assertFalse = self.assertFalse 355 events = [] 356 class Target(object): 357 def start(self, tag, attrib): 358 events.append(("start", tag)) 359 assertFalse(attrib)
360 def end(self, tag): 361 events.append(("end", tag)) 362 def doctype(self, *args): 363 events.append(("doctype", args)) 364 def close(self): 365 return "DONE" 366 367 parser = self.etree.HTMLParser(target=Target()) 368 parser.feed("<!DOCTYPE html><html><body></body></html>") 369 done = parser.close() 370 371 self.assertEquals("DONE", done) 372 self.assertEquals([ 373 ("doctype", ("html", None, None)), 374 ("start", "html"), ("start", "body"), 375 ("end", "body"), ("end", "html")], events) 376
377 - def test_html_parser_target_doctype_html_full(self):
378 assertFalse = self.assertFalse 379 events = [] 380 class Target(object): 381 def start(self, tag, attrib): 382 events.append(("start", tag)) 383 assertFalse(attrib)
384 def end(self, tag): 385 events.append(("end", tag)) 386 def doctype(self, *args): 387 events.append(("doctype", args)) 388 def close(self): 389 return "DONE" 390 391 parser = self.etree.HTMLParser(target=Target()) 392 parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">' 393 '<html><body></body></html>') 394 done = parser.close() 395 396 self.assertEquals("DONE", done) 397 self.assertEquals([ 398 ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")), 399 ("start", "html"), ("start", "body"), 400 ("end", "body"), ("end", "html")], events) 401 402
403 -def test_suite():
404 suite = unittest.TestSuite() 405 suite.addTests([unittest.makeSuite(HtmlParserTestCase)]) 406 return suite
407 408 if __name__ == '__main__': 409 print('to test use test.py %s' % __file__) 410