Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir) # needed for Py3 
 13   
 14  from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase 
 16   
 17  try: 
 18      unicode = __builtins__["unicode"] 
 19  except (NameError, KeyError): 
 20      unicode = str 
 21   
class HtmlParserTestCase(HelperTestCase):
    """Test cases for etree's HTML parser.
    """
    # Library under test; the test methods reach it as ``self.etree``.
    etree = etree

    # Reference document in compact form.
    html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
    # Expected output for the same document when pretty-printed.
    html_str_pretty = _bytes("""\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>
""")
    # Markup with unclosed and mismatched tags.
    broken_html_str = _bytes("<html><head><title>test<body><h1>page title</h3></p></html>")
    # Reference document containing non-ASCII characters.
    uhtml_str = _str("<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>")
def tearDown(self):
    """Run the inherited cleanup, then reset etree's default parser."""
    super(HtmlParserTestCase, self).tearDown()
    # Called without arguments so that a parser installed by one test
    # does not stay in effect for the next one.
    self.etree.set_default_parser()
40
def test_module_HTML(self):
    """etree.HTML() output serialises back to the reference markup."""
    root = self.etree.HTML(self.html_str)
    serialized = self.etree.tostring(root, method="html")
    self.assertEqual(serialized, self.html_str)
45
def test_module_HTML_unicode(self):
    """A unicode document survives parsing and UTF-8 serialisation."""
    root = self.etree.HTML(self.uhtml_str)
    serialized = self.etree.tostring(root, method="html", encoding='UTF8')
    expected = self.uhtml_str.encode('UTF8')
    # Both sides are decoded again so the comparison is text against text.
    self.assertEqual(unicode(serialized, 'UTF8'),
                     unicode(expected, 'UTF8'))
51
53 element = self.etree.HTML(self.html_str) 54 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 55 self.html_str_pretty)
56
58 parser = self.etree.HTMLParser(recover=False) 59 parse = self.etree.parse 60 f = BytesIO("<html></body>") 61 self.assertRaises(self.etree.XMLSyntaxError, 62 parse, f, parser)
63
65 parser = self.etree.HTMLParser() 66 Element = parser.makeelement 67 68 el = Element('name') 69 self.assertRaises(ValueError, Element, '{}') 70 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 71 72 self.assertRaises(ValueError, Element, '{test}') 73 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
74
76 parser = self.etree.HTMLParser() 77 Element = parser.makeelement 78 79 pname = Element('p:name') 80 self.assertEquals(pname.tag, 'p:name') 81 82 pname = Element('{test}p:name') 83 self.assertEquals(pname.tag, '{test}p:name') 84 85 pname = Element('name') 86 pname.tag = 'p:name' 87 self.assertEquals(pname.tag, 'p:name')
88
90 parser = self.etree.HTMLParser() 91 Element = parser.makeelement 92 93 self.assertRaises(ValueError, Element, 'p"name') 94 self.assertRaises(ValueError, Element, "na'me") 95 self.assertRaises(ValueError, Element, '{test}"name') 96 self.assertRaises(ValueError, Element, "{test}name'") 97 98 el = Element('name') 99 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 100 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 101 self.assertEquals(el.tag, "name")
102
104 parser = self.etree.HTMLParser() 105 Element = parser.makeelement 106 107 self.assertRaises(ValueError, Element, ' name ') 108 self.assertRaises(ValueError, Element, 'na me') 109 self.assertRaises(ValueError, Element, '{test} name') 110 111 el = Element('name') 112 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 113 self.assertEquals(el.tag, "name")
114
116 parser = self.etree.HTMLParser() 117 Element = parser.makeelement 118 119 SubElement = self.etree.SubElement 120 121 el = Element('name') 122 self.assertRaises(ValueError, SubElement, el, '{}') 123 self.assertRaises(ValueError, SubElement, el, '{test}')
124
126 parser = self.etree.HTMLParser() 127 Element = parser.makeelement 128 SubElement = self.etree.SubElement 129 130 el = Element('name') 131 pname = SubElement(el, 'p:name') 132 self.assertEquals(pname.tag, 'p:name') 133 134 pname = SubElement(el, '{test}p:name') 135 self.assertEquals(pname.tag, '{test}p:name')
136
138 parser = self.etree.HTMLParser() 139 Element = parser.makeelement 140 SubElement = self.etree.SubElement 141 142 el = Element('name') 143 self.assertRaises(ValueError, SubElement, el, "name'") 144 self.assertRaises(ValueError, SubElement, el, 'na"me') 145 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 146 self.assertRaises(ValueError, SubElement, el, '{test}"name')
147
149 parser = self.etree.HTMLParser() 150 Element = parser.makeelement 151 SubElement = self.etree.SubElement 152 153 el = Element('name') 154 self.assertRaises(ValueError, SubElement, el, ' name ') 155 self.assertRaises(ValueError, SubElement, el, 'na me') 156 self.assertRaises(ValueError, SubElement, el, '{test} name')
157
159 parser = self.etree.HTMLParser(recover=False) 160 parse = self.etree.parse 161 f = BytesIO(self.broken_html_str) 162 self.assertRaises(self.etree.XMLSyntaxError, 163 parse, f, parser)
164
166 text = _str('Søk på nettet') 167 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 168 169 tree = self.etree.parse( 170 BytesIO(html_latin1), 171 self.etree.HTMLParser(encoding="iso-8859-1")) 172 p = tree.find("//p") 173 self.assertEquals(p.text, text)
174
176 text = _str('Søk på nettet') 177 wrong_head = _str(''' 178 <head> 179 <meta http-equiv="Content-Type" 180 content="text/html; charset=UTF-8" /> 181 </head>''') 182 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 183 text) 184 ).encode('iso-8859-1') 185 186 self.assertRaises(self.etree.ParseError, 187 self.etree.parse, 188 BytesIO(html_latin1)) 189 190 tree = self.etree.parse( 191 BytesIO(html_latin1), 192 self.etree.HTMLParser(encoding="iso-8859-1")) 193 p = tree.find("//p") 194 self.assertEquals(p.text, text)
195
def test_module_HTML_broken(self):
    """Broken markup given to etree.HTML() serialises as the reference document."""
    root = self.etree.HTML(self.broken_html_str)
    serialized = self.etree.tostring(root, method="html")
    self.assertEqual(serialized, self.html_str)
200
def test_module_HTML_cdata(self):
    """Content of a <style> element is readable through ``.text``."""
    # The original note here said that libxml2 generates CDATA nodes for
    # <script> content by default; the document below exercises <style>
    # instead (NOTE(review): confirm <style> takes the same code path).
    html = _bytes('<html><head><style>foo</style></head></html>')
    element = self.etree.HTML(html)
    # assertEqual instead of the deprecated assertEquals alias, matching
    # the other tests in this class.
    self.assertEqual(element[0][0].text, "foo")
206
def test_module_HTML_access(self):
    """Children of a parsed tree are reachable by index."""
    root = self.etree.HTML(self.html_str)
    first_child = root[0]
    self.assertEqual(first_child[0].tag, 'title')
210
def test_module_parse_html(self):
    """Parse the reference document from an on-disk file object."""
    parser = self.etree.HTMLParser()
    # mkstemp() creates the file itself and hands back an open descriptor,
    # avoiding the name-guessing race of the deprecated mktemp().
    fd, filename = tempfile.mkstemp(suffix=".html")
    try:
        out = os.fdopen(fd, 'wb')
        try:
            out.write(self.html_str)
        finally:
            # Close explicitly so the data is flushed before re-opening.
            out.close()

        f = open(filename, 'rb')
        try:
            tree = self.etree.parse(f, parser)
        finally:
            # Close even when parsing fails, so the removal below cannot
            # be blocked by a still-open handle.
            f.close()

        self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                         self.html_str)
    finally:
        os.remove(filename)
223
225 parser = self.etree.HTMLParser() 226 f = SillyFileLike(self.html_str) 227 tree = self.etree.parse(f, parser) 228 html = self.etree.tostring(tree.getroot(), 229 method="html", encoding='UTF-8') 230 self.assertEqual(html, self.html_str)
231 232 ## def test_module_parse_html_filelike_unicode(self): 233 ## parser = self.etree.HTMLParser() 234 ## f = SillyFileLike(self.uhtml_str) 235 ## tree = self.etree.parse(f, parser) 236 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 237 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 238
def test_html_file_error(self):
    """Parsing a file name that does not exist raises IOError."""
    html_parser = self.etree.HTMLParser()
    missing_file = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, self.etree.parse, missing_file, html_parser)
245
247 self.assertRaises(self.etree.XMLSyntaxError, 248 self.etree.parse, BytesIO(self.broken_html_str)) 249 250 self.etree.set_default_parser( self.etree.HTMLParser() ) 251 252 tree = self.etree.parse(BytesIO(self.broken_html_str)) 253 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 254 self.html_str) 255 256 self.etree.set_default_parser() 257 258 self.assertRaises(self.etree.XMLSyntaxError, 259 self.etree.parse, BytesIO(self.broken_html_str))
260
def test_html_iterparse(self):
    """iterparse(html=True) yields one 'end' event per element."""
    iterparse = self.etree.iterparse
    f = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')

    iterator = iterparse(f, html=True)
    # Nothing has been consumed yet, so no root is available.
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    # assertTrue/assertEqual replace the deprecated assert_/assertEquals
    # aliases, matching the assertEqual calls used elsewhere in the class.
    self.assertTrue(root is not None)
    self.assertEqual(
        [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
         ('end', root[1]), ('end', root)],
        events)
276
def test_html_iterparse_file(self):
    """iterparse(html=True) on a test-directory file yields only 'end' events."""
    iterparse = self.etree.iterparse
    iterator = iterparse(fileInTestDir("css_shakespear.html"),
                         html=True)

    # Nothing has been consumed yet, so no root is available.
    self.assertEqual(None, iterator.root)
    events = list(iterator)
    root = iterator.root
    # assertTrue/assertEqual replace the deprecated assert_/assertEquals
    # aliases, matching the assertEqual calls used elsewhere in the class.
    self.assertTrue(root is not None)
    self.assertEqual(249, len(events))
    self.assertEqual(
        [],
        [event for (event, element) in events if event != 'end'])
290
def test_suite():
    """Return a TestSuite holding every test of HtmlParserTestCase."""
    suite = unittest.TestSuite()
    # unittest.makeSuite() is deprecated and no longer exists in recent
    # Python versions; the default loader collects the same test methods.
    suite.addTests([
        unittest.defaultTestLoader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite
if __name__ == '__main__':
    # Running this module directly only prints a hint naming test.py.
    print('to test use test.py %s' % __file__)