Package lxml :: Package tests :: Module test_htmlparser
[frames] | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os 
  9   
 10  from common_imports import StringIO, etree, fileInTestDir 
 11  from common_imports import SillyFileLike, HelperTestCase 
 12   
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases

    The class attributes below are the shared fixtures that the individual
    test methods parse and compare against.
    """
    # Module under test, bound on the class so tests reach it as self.etree.
    etree = etree

    # Well-formed reference document in compact (single line) form.
    html_str = "<html><head><title>test</title></head><body><h1>page title</h1></body></html>"
    # The same document laid out over several lines; compared against
    # serialisation with method="html" and pretty_print=True.
    html_str_pretty = """\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>
"""
    # Malformed markup: unclosed <title>/<head>, <h1> closed by </h3>, and a
    # stray </p>.  Tests expect a recovering parse of it to yield html_str.
    broken_html_str = "<html><head><title>test<body><h1>page title</h3></p></html>"
    # Unicode variant of the reference document containing non-ASCII text.
    uhtml_str = u"<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>"
def tearDown(self):
    """Run the inherited clean-up, then reset etree's default parser.

    Tests in this class may install a custom default parser through
    ``setDefaultParser()``; calling it without an argument afterwards
    keeps that choice from carrying over into later tests.
    """
    base = super(HtmlParserTestCase, self)
    base.tearDown()
    self.etree.setDefaultParser()
31
def test_module_HTML(self):
    """``etree.HTML()`` on the well-formed fixture serialises back unchanged."""
    root = self.etree.HTML(self.html_str)
    serialized = self.etree.tostring(root)
    self.assertEqual(serialized, self.html_str)
36
def test_module_HTML_unicode(self):
    """``etree.HTML()`` accepts a unicode document and keeps its text intact.

    Both sides of the comparison are decoded from UTF-8 byte strings so
    that two unicode objects are compared.
    """
    root = self.etree.HTML(self.uhtml_str)

    # Serialise to UTF-8 bytes, then decode back to unicode.
    serialized_bytes = self.etree.tostring(root, encoding='UTF8')
    actual = unicode(serialized_bytes, 'UTF8')

    # Round-trip the fixture through UTF-8 in the same way.
    expected_bytes = self.uhtml_str.encode('UTF8')
    expected = unicode(expected_bytes, 'UTF8')

    self.assertEqual(actual, expected)
41
43 element = self.etree.HTML(self.html_str) 44 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 45 self.html_str_pretty)
46
48 parser = self.etree.HTMLParser(recover=False) 49 parse = self.etree.parse 50 f = StringIO("<html></body>") 51 self.assertRaises(self.etree.XMLSyntaxError, 52 parse, f, parser)
53
55 parser = self.etree.HTMLParser() 56 Element = parser.makeelement 57 58 el = Element('name') 59 self.assertRaises(ValueError, Element, '{}') 60 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 61 62 self.assertRaises(ValueError, Element, '{test}') 63 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
64
66 parser = self.etree.HTMLParser() 67 Element = parser.makeelement 68 69 pname = Element('p:name') 70 self.assertEquals(pname.tag, 'p:name') 71 72 pname = Element('{test}p:name') 73 self.assertEquals(pname.tag, '{test}p:name') 74 75 pname = Element('name') 76 pname.tag = 'p:name' 77 self.assertEquals(pname.tag, 'p:name')
78
80 parser = self.etree.HTMLParser() 81 Element = parser.makeelement 82 83 self.assertRaises(ValueError, Element, 'p"name') 84 self.assertRaises(ValueError, Element, "na'me") 85 self.assertRaises(ValueError, Element, '{test}"name') 86 self.assertRaises(ValueError, Element, "{test}name'") 87 88 el = Element('name') 89 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 90 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 91 self.assertEquals(el.tag, "name")
92
94 parser = self.etree.HTMLParser() 95 Element = parser.makeelement 96 97 self.assertRaises(ValueError, Element, ' name ') 98 self.assertRaises(ValueError, Element, 'na me') 99 self.assertRaises(ValueError, Element, '{test} name') 100 101 el = Element('name') 102 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 103 self.assertEquals(el.tag, "name")
104
106 parser = self.etree.HTMLParser() 107 Element = parser.makeelement 108 109 SubElement = self.etree.SubElement 110 111 el = Element('name') 112 self.assertRaises(ValueError, SubElement, el, '{}') 113 self.assertRaises(ValueError, SubElement, el, '{test}')
114
116 parser = self.etree.HTMLParser() 117 Element = parser.makeelement 118 SubElement = self.etree.SubElement 119 120 el = Element('name') 121 pname = SubElement(el, 'p:name') 122 self.assertEquals(pname.tag, 'p:name') 123 124 pname = SubElement(el, '{test}p:name') 125 self.assertEquals(pname.tag, '{test}p:name')
126
128 parser = self.etree.HTMLParser() 129 Element = parser.makeelement 130 SubElement = self.etree.SubElement 131 132 el = Element('name') 133 self.assertRaises(ValueError, SubElement, el, "name'") 134 self.assertRaises(ValueError, SubElement, el, 'na"me') 135 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 136 self.assertRaises(ValueError, SubElement, el, '{test}"name')
137
139 parser = self.etree.HTMLParser() 140 Element = parser.makeelement 141 SubElement = self.etree.SubElement 142 143 el = Element('name') 144 self.assertRaises(ValueError, SubElement, el, ' name ') 145 self.assertRaises(ValueError, SubElement, el, 'na me') 146 self.assertRaises(ValueError, SubElement, el, '{test} name')
147
149 parser = self.etree.HTMLParser(recover=False) 150 parse = self.etree.parse 151 f = StringIO(self.broken_html_str) 152 self.assertRaises(self.etree.XMLSyntaxError, 153 parse, f, parser)
154
156 text = u'Søk på nettet' 157 html_latin1 = (u'<p>%s</p>' % text).encode('iso-8859-1') 158 159 tree = self.etree.parse( 160 StringIO(html_latin1), 161 self.etree.HTMLParser(encoding="iso-8859-1")) 162 p = tree.find("//p") 163 self.assertEquals(p.text, text)
164
166 text = u'Søk på nettet' 167 wrong_head = ''' 168 <head> 169 <meta http-equiv="Content-Type" 170 content="text/html; charset=UTF-8" /> 171 </head>''' 172 html_latin1 = (u'<html>%s<body><p>%s</p></body></html>' % (wrong_head, 173 text) 174 ).encode('iso-8859-1') 175 176 self.assertRaises(self.etree.ParseError, 177 self.etree.parse, 178 StringIO(html_latin1)) 179 180 tree = self.etree.parse( 181 StringIO(html_latin1), 182 self.etree.HTMLParser(encoding="iso-8859-1")) 183 p = tree.find("//p") 184 self.assertEquals(p.text, text)
185
def test_module_HTML_broken(self):
    """``etree.HTML()`` on the broken fixture serialises to the clean document."""
    root = self.etree.HTML(self.broken_html_str)
    serialized = self.etree.tostring(root)
    # The malformed input is expected to come out as the well-formed fixture.
    self.assertEqual(serialized, self.html_str)
190
def test_module_HTML_cdata(self):
    """The content of a ``<style>`` element is readable through ``.text``."""
    # The original note here says that libxml2 generates CDATA nodes for
    # <script> content by default; the document used below exercises a
    # <style> element instead.
    document = '<html><head><style>foo</style></head></html>'
    root = self.etree.HTML(document)
    head = root[0]
    style = head[0]
    self.assertEquals(style.text, "foo")
196
def test_module_HTML_access(self):
    """The first grandchild of the parsed root is the ``<title>`` element."""
    root = self.etree.HTML(self.html_str)
    first_grandchild = root[0][0]
    self.assertEqual(first_grandchild.tag, 'title')
200
def test_module_parse_html(self):
    """Parse the reference document from a real file and check the round trip.

    The fixture is written to a temporary ``.html`` file, parsed through
    ``etree.parse()`` with an ``HTMLParser``, and the serialised root is
    compared with the original string.  The temporary file is removed and
    every file handle is closed even when a step fails.
    """
    parser = self.etree.HTMLParser()
    # mkstemp() creates the file atomically and hands back an open
    # descriptor, avoiding the name race of the deprecated mktemp().
    fd, filename = tempfile.mkstemp(suffix=".html")
    try:
        out = os.fdopen(fd, 'wb')
        try:
            out.write(self.html_str)
        finally:
            # Close explicitly so the data is flushed before re-opening.
            out.close()

        f = open(filename, 'r')
        try:
            tree = self.etree.parse(f, parser)
        finally:
            f.close()
        self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str)
    finally:
        os.remove(filename)
212
214 parser = self.etree.HTMLParser() 215 f = SillyFileLike(self.html_str) 216 tree = self.etree.parse(f, parser) 217 html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 218 self.assertEqual(html, self.html_str)
219 220 ## def test_module_parse_html_filelike_unicode(self): 221 ## parser = self.etree.HTMLParser() 222 ## f = SillyFileLike(self.uhtml_str) 223 ## tree = self.etree.parse(f, parser) 224 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 225 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 226
def test_html_file_error(self):
    """Parsing a path that does not exist raises ``IOError``."""
    html_parser = self.etree.HTMLParser()
    missing_path = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, self.etree.parse, missing_path, html_parser)
233
235 self.assertRaises(self.etree.XMLSyntaxError, 236 self.etree.parse, StringIO(self.broken_html_str)) 237 238 self.etree.setDefaultParser( self.etree.HTMLParser() ) 239 240 tree = self.etree.parse(StringIO(self.broken_html_str)) 241 self.assertEqual(self.etree.tostring(tree.getroot()), 242 self.html_str) 243 244 self.etree.setDefaultParser() 245 246 self.assertRaises(self.etree.XMLSyntaxError, 247 self.etree.parse, StringIO(self.broken_html_str))
248
def test_html_iterparse(self):
    """``iterparse(..., html=True)`` reports one 'end' event per element.

    The iterator's ``root`` is ``None`` before iteration and set once it
    has been consumed; the events are expected in document-closing order
    (title, head, p, body, html).
    """
    source = StringIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')
    walker = self.etree.iterparse(source, html=True)

    # Nothing has been parsed yet, so no root is available.
    self.assertEquals(None, walker.root)

    seen = list(walker)
    root = walker.root
    self.assert_(root is not None)

    head = root[0]
    body = root[1]
    expected = [
        ('end', head[0]),
        ('end', head),
        ('end', body[0]),
        ('end', body),
        ('end', root),
    ]
    self.assertEquals(expected, seen)
264
def test_html_iterparse_file(self):
    """``iterparse(..., html=True)`` works on a file from the test directory.

    For ``css_shakespear.html`` the test expects exactly 249 events, all
    of them of type 'end'.
    """
    path = fileInTestDir("css_shakespear.html")
    walker = self.etree.iterparse(path, html=True)

    # The root only becomes available after the iterator has been run.
    self.assertEquals(None, walker.root)
    seen = list(walker)
    root = walker.root
    self.assert_(root is not None)

    self.assertEquals(249, len(seen))
    unexpected_kinds = [kind for (kind, node) in seen if kind != 'end']
    self.assertEquals([], unexpected_kinds)
278
def test_suite():
    """Return a ``unittest.TestSuite`` holding the HtmlParserTestCase tests."""
    case_tests = unittest.makeSuite(HtmlParserTestCase)
    collected = unittest.TestSuite()
    collected.addTests([case_tests])
    return collected
if __name__ == '__main__':
    # This module is not meant to be run directly; point at the test runner.
    hint = 'to test use test.py %s' % __file__
    print(hint)