Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os 
  9   
 10  from common_imports import StringIO, etree, fileInTestDir 
 11  from common_imports import SillyFileLike, HelperTestCase 
 12   
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases

    The class-level strings below are the shared fixtures that the test
    methods feed to the parser or compare serialised output against.
    """
    # Expose the imported etree module as an attribute; the tests reach it
    # through ``self.etree``.
    etree = etree

    # Well-formed document; also used as the expected serialisation in the
    # tests that compare ``tostring()`` output.
    html_str = "<html><head><title>test</title></head><body><h1>page title</h1></body></html>"
    # Expected output when html_str is serialised with method="html" and
    # pretty_print=True.
    html_str_pretty = """\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>"""
    # Malformed markup (unclosed <title>, mismatched </h3>, stray </p>).
    broken_html_str = "<html><head><title>test<body><h1>page title</h3></p></html>"
    # Unicode variant of html_str containing non-ASCII characters.
    uhtml_str = u"<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>"
def tearDown(self):
    """Run the inherited cleanup, then call ``setDefaultParser()`` bare.

    Calling it without an argument is presumably what resets etree's
    default parser after a test has installed its own.
    """
    parent = super(HtmlParserTestCase, self)
    parent.tearDown()
    self.etree.setDefaultParser()
30
def test_module_HTML(self):
    """``etree.HTML()`` on well-formed input serialises back to the input."""
    root = self.etree.HTML(self.html_str)
    serialized = self.etree.tostring(root)
    self.assertEqual(serialized, self.html_str)
35
def test_module_HTML_unicode(self):
    """A unicode document survives parsing and UTF-8 serialisation."""
    root = self.etree.HTML(self.uhtml_str)
    # Both sides are decoded from UTF-8 bytes so unicode objects are compared.
    actual = unicode(self.etree.tostring(root, encoding='UTF8'), 'UTF8')
    expected = unicode(self.uhtml_str.encode('UTF8'), 'UTF8')
    self.assertEqual(actual, expected)
40
42 element = self.etree.HTML(self.html_str) 43 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 44 self.html_str_pretty)
45
47 parser = self.etree.HTMLParser(recover=False) 48 parse = self.etree.parse 49 f = StringIO("<html></body>") 50 self.assertRaises(self.etree.XMLSyntaxError, 51 parse, f, parser)
52
54 parser = self.etree.HTMLParser() 55 Element = parser.makeelement 56 57 el = Element('name') 58 self.assertRaises(ValueError, Element, '{}') 59 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 60 61 self.assertRaises(ValueError, Element, '{test}') 62 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
63
65 parser = self.etree.HTMLParser() 66 Element = parser.makeelement 67 68 pname = Element('p:name') 69 self.assertEquals(pname.tag, 'p:name') 70 71 pname = Element('{test}p:name') 72 self.assertEquals(pname.tag, '{test}p:name') 73 74 pname = Element('name') 75 pname.tag = 'p:name' 76 self.assertEquals(pname.tag, 'p:name')
77
79 parser = self.etree.HTMLParser() 80 Element = parser.makeelement 81 82 self.assertRaises(ValueError, Element, 'p"name') 83 self.assertRaises(ValueError, Element, "na'me") 84 self.assertRaises(ValueError, Element, '{test}"name') 85 self.assertRaises(ValueError, Element, "{test}name'") 86 87 el = Element('name') 88 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 89 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 90 self.assertEquals(el.tag, "name")
91
93 parser = self.etree.HTMLParser() 94 Element = parser.makeelement 95 96 self.assertRaises(ValueError, Element, ' name ') 97 self.assertRaises(ValueError, Element, 'na me') 98 self.assertRaises(ValueError, Element, '{test} name') 99 100 el = Element('name') 101 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 102 self.assertEquals(el.tag, "name")
103
105 parser = self.etree.HTMLParser() 106 Element = parser.makeelement 107 108 SubElement = self.etree.SubElement 109 110 el = Element('name') 111 self.assertRaises(ValueError, SubElement, el, '{}') 112 self.assertRaises(ValueError, SubElement, el, '{test}')
113
115 parser = self.etree.HTMLParser() 116 Element = parser.makeelement 117 SubElement = self.etree.SubElement 118 119 el = Element('name') 120 pname = SubElement(el, 'p:name') 121 self.assertEquals(pname.tag, 'p:name') 122 123 pname = SubElement(el, '{test}p:name') 124 self.assertEquals(pname.tag, '{test}p:name')
125
127 parser = self.etree.HTMLParser() 128 Element = parser.makeelement 129 SubElement = self.etree.SubElement 130 131 el = Element('name') 132 self.assertRaises(ValueError, SubElement, el, "name'") 133 self.assertRaises(ValueError, SubElement, el, 'na"me') 134 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 135 self.assertRaises(ValueError, SubElement, el, '{test}"name')
136
138 parser = self.etree.HTMLParser() 139 Element = parser.makeelement 140 SubElement = self.etree.SubElement 141 142 el = Element('name') 143 self.assertRaises(ValueError, SubElement, el, ' name ') 144 self.assertRaises(ValueError, SubElement, el, 'na me') 145 self.assertRaises(ValueError, SubElement, el, '{test} name')
146
148 parser = self.etree.HTMLParser(recover=False) 149 parse = self.etree.parse 150 f = StringIO(self.broken_html_str) 151 self.assertRaises(self.etree.XMLSyntaxError, 152 parse, f, parser)
153
155 text = u'Søk på nettet' 156 html_latin1 = (u'<p>%s</p>' % text).encode('iso-8859-1') 157 158 tree = self.etree.parse( 159 StringIO(html_latin1), 160 self.etree.HTMLParser(encoding="iso-8859-1")) 161 p = tree.find("//p") 162 self.assertEquals(p.text, text)
163
165 text = u'Søk på nettet' 166 wrong_head = ''' 167 <head> 168 <meta http-equiv="Content-Type" 169 content="text/html; charset=UTF-8" /> 170 </head>''' 171 html_latin1 = (u'<html>%s<body><p>%s</p></body></html>' % (wrong_head, 172 text) 173 ).encode('iso-8859-1') 174 175 self.assertRaises(self.etree.ParseError, 176 self.etree.parse, 177 StringIO(html_latin1)) 178 179 tree = self.etree.parse( 180 StringIO(html_latin1), 181 self.etree.HTMLParser(encoding="iso-8859-1")) 182 p = tree.find("//p") 183 self.assertEquals(p.text, text)
184
def test_module_HTML_broken(self):
    """Parsing the broken fixture with ``etree.HTML()`` yields ``html_str``."""
    root = self.etree.HTML(self.broken_html_str)
    repaired = self.etree.tostring(root)
    self.assertEqual(repaired, self.html_str)
189
def test_module_HTML_cdata(self):
    """The text of a ``<style>`` element is readable through ``.text``."""
    # Inherited note: by default, libxml2 generates CDATA nodes for <script>
    # content.
    # NOTE(review): the markup below uses <style>, not <script>; presumably
    # the same handling applies -- confirm and align comment with test.
    markup = '<html><head><style>foo</style></head></html>'
    root = self.etree.HTML(markup)
    style = root[0][0]
    self.assertEquals(style.text, "foo")
195
def test_module_HTML_access(self):
    """Index access on the parsed tree reaches the ``<title>`` element."""
    root = self.etree.HTML(self.html_str)
    first_child = root[0]
    self.assertEqual(first_child[0].tag, 'title')
199
def test_module_parse_html(self):
    """Parse ``html_str`` from a real file on disk and check the round trip.

    The fixture is written to a temporary ``.html`` file, parsed through
    ``etree.parse()`` with an ``HTMLParser``, and the serialised root is
    compared with the original string. The temporary file is always
    removed, and every file handle is closed even when a step fails.
    """
    parser = self.etree.HTMLParser()
    # mkstemp creates and opens the file in one step, avoiding the window
    # between picking a name and creating the file that mktemp leaves open.
    handle, filename = tempfile.mkstemp(suffix=".html")
    try:
        out = os.fdopen(handle, 'wb')
        try:
            out.write(self.html_str)
        finally:
            # Close explicitly so the data is flushed before it is re-read.
            out.close()

        f = open(filename, 'r')
        try:
            tree = self.etree.parse(f, parser)
        finally:
            f.close()
        self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str)
    finally:
        os.remove(filename)
211
213 parser = self.etree.HTMLParser() 214 f = SillyFileLike(self.html_str) 215 tree = self.etree.parse(f, parser) 216 html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 217 self.assertEqual(html, self.html_str)
218 219 ## def test_module_parse_html_filelike_unicode(self): 220 ## parser = self.etree.HTMLParser() 221 ## f = SillyFileLike(self.uhtml_str) 222 ## tree = self.etree.parse(f, parser) 223 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 224 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 225
def test_html_file_error(self):
    """Parsing a file name that does not exist raises ``IOError``."""
    html_parser = self.etree.HTMLParser()
    missing = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, self.etree.parse, missing, html_parser)
232
234 self.assertRaises(self.etree.XMLSyntaxError, 235 self.etree.parse, StringIO(self.broken_html_str)) 236 237 self.etree.setDefaultParser( self.etree.HTMLParser() ) 238 239 tree = self.etree.parse(StringIO(self.broken_html_str)) 240 self.assertEqual(self.etree.tostring(tree.getroot()), 241 self.html_str) 242 243 self.etree.setDefaultParser() 244 245 self.assertRaises(self.etree.XMLSyntaxError, 246 self.etree.parse, StringIO(self.broken_html_str))
247
def test_html_iterparse(self):
    """``iterparse(html=True)`` emits one 'end' event per element, in order."""
    source = StringIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')
    walker = self.etree.iterparse(source, html=True)
    # The root is not available before the iterator has been consumed.
    self.assertEquals(None, walker.root)

    seen = list(walker)
    root = walker.root
    self.assert_(root is not None)
    expected = [
        ('end', root[0][0]),
        ('end', root[0]),
        ('end', root[1][0]),
        ('end', root[1]),
        ('end', root),
    ]
    self.assertEquals(expected, seen)
263
def test_html_iterparse_file(self):
    """Iterparsing the bundled HTML file gives 249 events, all of them 'end'."""
    path = fileInTestDir("css_shakespear.html")
    walker = self.etree.iterparse(path, html=True)

    self.assertEquals(None, walker.root)
    seen = list(walker)
    root = walker.root
    self.assert_(root is not None)
    self.assertEquals(249, len(seen))
    # Collect any event that is not 'end'; the list is expected to be empty.
    unexpected = [kind for (kind, _node) in seen if kind != 'end']
    self.assertEquals([], unexpected)
277
def test_suite():
    """Return a ``unittest.TestSuite`` holding the HtmlParserTestCase tests."""
    suite = unittest.TestSuite()
    html_cases = unittest.makeSuite(HtmlParserTestCase)
    suite.addTests([html_cases])
    return suite
if __name__ == '__main__':
    # This module is not meant to be run directly; point at the runner.
    hint = 'to test use test.py %s' % __file__
    print(hint)