Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames] | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os 
  9   
 10  from common_imports import StringIO, etree, fileInTestDir 
 11  from common_imports import SillyFileLike, HelperTestCase 
 12   
class HtmlParserTestCaseBase(HelperTestCase):
    """HTML parser test cases
    """
    # The etree implementation under test; every test goes through
    # ``self.etree`` instead of the module-level name.
    etree = etree

    # Well-formed document; also the expected serialisation of the
    # recovered ``broken_html_str`` (see test_module_HTML_broken).
    html_str = "<html><head><title>test</title></head><body><h1>page title</h1></body></html>"
    # Same document with unclosed and mismatched tags.
    broken_html_str = "<html><head><title>test<body><h1>page title</h3></p></html>"
    # Unicode variant containing non-ASCII characters.
    uhtml_str = u"<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>"

    def tearDown(self):
        # NOTE(review): the body of tearDown was missing from the extracted
        # listing.  Resetting the default parser is assumed here because the
        # default-parser test below installs an HTMLParser globally --
        # confirm against the upstream file.
        self.etree.setDefaultParser()

    def test_module_HTML(self):
        # Parsing and re-serialising a well-formed document round-trips.
        element = self.etree.HTML(self.html_str)
        self.assertEqual(self.etree.tostring(element),
                         self.html_str)

    def test_module_HTML_unicode(self):
        # Same round trip for unicode input, compared as unicode strings.
        element = self.etree.HTML(self.uhtml_str)
        self.assertEqual(unicode(self.etree.tostring(element, 'UTF8'), 'UTF8'),
                         unicode(self.uhtml_str.encode('UTF8'), 'UTF8'))

    # NOTE(review): the def line of this test was missing from the extracted
    # listing; the name is reconstructed -- confirm against the upstream file.
    def test_module_parse_html_error(self):
        # With recover=False a stray end tag must raise XMLSyntaxError.
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = StringIO("<html></body>")
        self.assertRaises(self.etree.XMLSyntaxError,
                          parse, f, parser)

    # NOTE(review): the def line of this test was missing from the extracted
    # listing; the name is reconstructed -- confirm against the upstream file.
    def test_module_parse_html_norecover(self):
        # With recover=False the broken sample document must be rejected.
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = StringIO(self.broken_html_str)
        self.assertRaises(self.etree.XMLSyntaxError,
                          parse, f, parser)

    # NOTE(review): the def line of this test was missing from the extracted
    # listing; the name is reconstructed -- confirm against the upstream file.
    def test_parse_encoding_8bit_explicit(self):
        # Latin-1 bytes decode correctly when the parser is told the encoding.
        text = u'Søk på nettet'
        html_latin1 = (u'<p>%s</p>' % text).encode('iso-8859-1')

        tree = self.etree.parse(
            StringIO(html_latin1),
            self.etree.HTMLParser(encoding="iso-8859-1"))
        p = tree.find("//p")
        self.assertEqual(p.text, text)

    # NOTE(review): the def line of this test was missing from the extracted
    # listing; the name is reconstructed -- confirm against the upstream file.
    def test_parse_encoding_8bit_override(self):
        # The document claims UTF-8 in its <meta> tag but is really Latin-1;
        # an explicit parser encoding must win over the wrong declaration.
        text = u'Søk på nettet'
        # NOTE(review): whitespace inside this literal was collapsed in the
        # extracted listing; the layout below is a reconstruction.
        wrong_head = '''
        <head>
          <meta http-equiv="Content-Type"
                content="text/html; charset=UTF-8" />
        </head>'''
        html_latin1 = (u'<html>%s<body><p>%s</p></body></html>' % (wrong_head,
                                                                    text)
                       ).encode('iso-8859-1')

        # Without an explicit parser the input is expected to be rejected.
        self.assertRaises(self.etree.ParseError,
                          self.etree.parse,
                          StringIO(html_latin1))

        tree = self.etree.parse(
            StringIO(html_latin1),
            self.etree.HTMLParser(encoding="iso-8859-1"))
        p = tree.find("//p")
        self.assertEqual(p.text, text)

    def test_module_HTML_broken(self):
        # etree.HTML() recovers the broken document into the well-formed one.
        element = self.etree.HTML(self.broken_html_str)
        self.assertEqual(self.etree.tostring(element),
                         self.html_str)

    def test_module_HTML_cdata(self):
        # Original note: by default, libxml2 generates CDATA nodes for
        # <script> content.  The sample below uses <style>; either way the
        # element text must be readable through ``.text``.
        html = '<html><head><style>foo</style></head></html>'
        element = self.etree.HTML(html)
        self.assertEqual(element[0][0].text, "foo")

    def test_module_HTML_access(self):
        # Child indexing reaches <head>/<title>.
        element = self.etree.HTML(self.html_str)
        self.assertEqual(element[0][0].tag, 'title')

    def test_module_parse_html(self):
        # Parse from a real file on disk.
        parser = self.etree.HTMLParser()
        # mkstemp creates the file atomically; mktemp only returns a name
        # and is open to a race before the file is created.
        fd, filename = tempfile.mkstemp(suffix=".html")
        try:
            out = os.fdopen(fd, 'wb')
            try:
                out.write(self.html_str)
            finally:
                # Close explicitly so the data is flushed before reading.
                out.close()

            f = open(filename, 'r')
            try:
                tree = self.etree.parse(f, parser)
            finally:
                # Do not leak the handle if parsing raises.
                f.close()
            self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str)
        finally:
            os.remove(filename)

    # NOTE(review): the def line of this test was missing from the extracted
    # listing; the name is reconstructed -- confirm against the upstream file.
    def test_module_parse_html_filelike(self):
        # Parse from the SillyFileLike helper instead of a real file.
        parser = self.etree.HTMLParser()
        f = SillyFileLike(self.html_str)
        tree = self.etree.parse(f, parser)
        html = self.etree.tostring(tree.getroot(), 'UTF-8')
        self.assertEqual(html, self.html_str)

##     def test_module_parse_html_filelike_unicode(self):
##         parser = self.etree.HTMLParser()
##         f = SillyFileLike(self.uhtml_str)
##         tree = self.etree.parse(f, parser)
##         html = self.etree.tostring(tree.getroot(), 'UTF-8')
##         self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str)

    def test_html_file_error(self):
        # A missing file surfaces as IOError.
        parser = self.etree.HTMLParser()
        parse = self.etree.parse
        self.assertRaises(IOError,
                          parse, "__some_hopefully_nonexisting_file__.html",
                          parser)

    # NOTE(review): the def line of this test was missing from the extracted
    # listing; the name is reconstructed -- confirm against the upstream file.
    def test_default_parser_HTML_broken(self):
        # The default parser rejects the broken document ...
        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, StringIO(self.broken_html_str))

        # ... but once an HTMLParser is installed as default it recovers.
        self.etree.setDefaultParser( self.etree.HTMLParser() )
        try:
            tree = self.etree.parse(StringIO(self.broken_html_str))
            self.assertEqual(self.etree.tostring(tree.getroot()),
                             self.html_str)
        finally:
            # Always restore the global default, even when the assertion
            # above fails, so later tests are not affected.
            self.etree.setDefaultParser()

        # After the reset the broken document is rejected again.
        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, StringIO(self.broken_html_str))

    def test_html_iterparse(self):
        # iterparse(html=True) yields only 'end' events, in document order.
        iterparse = self.etree.iterparse
        f = StringIO(
            '<html><head><title>TITLE</title><body><p>P</p></body></html>')

        iterator = iterparse(f, html=True)
        # No root is available before the iterator has been consumed.
        self.assertEqual(None, iterator.root)

        events = list(iterator)
        root = iterator.root
        self.assert_(root is not None)
        self.assertEqual(
            [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
             ('end', root[1]), ('end', root)],
            events)

    def test_html_iterparse_file(self):
        # Same as above, reading a sample file from the test directory.
        iterparse = self.etree.iterparse
        iterator = iterparse(fileInTestDir("css_shakespear.html"),
                             html=True)

        self.assertEqual(None, iterator.root)
        events = list(iterator)
        root = iterator.root
        self.assert_(root is not None)
        self.assertEqual(249, len(events))
        # Every reported event must be an 'end' event.
        self.assertEqual(
            [],
            [ event for (event, element) in events if event != 'end' ])
172
def test_suite():
    """Return a suite holding every test of HtmlParserTestCaseBase."""
    html_parser_cases = unittest.makeSuite(HtmlParserTestCaseBase)
    collected = unittest.TestSuite()
    collected.addTests([html_parser_cases])
    return collected
# Running this module directly only prints a hint; the tests themselves are
# meant to be driven through test.py.
if __name__ == '__main__':
    print('to test use test.py %s' % __file__)