Package lxml :: Package tests :: Module test_unicode
[hide private]
[frames] | no frames]

Source Code for Module lxml.tests.test_unicode

  1  # -*- coding: utf-8 -*- 
  2  from __future__ import absolute_import 
  3   
  4  import unittest 
  5  import sys 
  6   
  7  from .common_imports import StringIO, etree, HelperTestCase, _str, _bytes, _chr 
  8   
  9  try: 
 10      unicode 
 11  except NameError: 
 12      unicode = str 
 13   
 14  ascii_uni = _bytes('a').decode('utf8') 
 15   
 16  klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names 
 17   
 18  invalid_tag = _bytes("test").decode('utf8') + klingon 
 19   
 20  uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters 
 21   
 22  uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>" 
 23                ).decode("unicode_escape") 
 24   
 25   
class UnicodeTestCase(HelperTestCase):
    # Exercises etree with non-ASCII input: parsing text strings, tag and
    # attribute names, QNames, comments, repr() of nodes and the validation
    # applied when assigning ``.text``.
    #
    # Plain comments are used instead of docstrings on the test methods so
    # that unittest keeps reporting the tests by method name.

    def test__str(self):
        # test the testing framework, namely _str from common_imports
        self.assertEqual(_str('\x10'), _str('\u0010'))
        self.assertEqual(_str('\x10'), _str('\U00000010'))
        self.assertEqual(_str('\u1234'), _str('\U00001234'))

    def test_unicode_xml(self):
        tree = etree.XML('<p>%s</p>' % uni)
        self.assertEqual(uni, tree.text)

    def test_wide_unicode_xml(self):
        # Only meaningful when the interpreter's maximum code point is
        # U+10FFFF; otherwise the test silently does nothing.
        if sys.maxunicode < 1114111:
            return  # skip test
        tree = etree.XML(_bytes('<p>\\U00026007</p>').decode('unicode_escape'))
        self.assertEqual(1, len(tree.text))
        self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
                         tree.text)

    def test_unicode_xml_broken(self):
        # A text string that carries an encoding declaration is expected to
        # be rejected with ValueError.
        uxml = ('<?xml version="1.0" encoding="UTF-8"?>' +
                '<p>%s</p>' % uni)
        self.assertRaises(ValueError, etree.XML, uxml)

    def test_unicode_tag(self):
        el = etree.Element(uni)
        self.assertEqual(uni, el.tag)

    def test_unicode_tag_invalid(self):
        # sadly, Klingon is not well-formed
        self.assertRaises(ValueError, etree.Element, invalid_tag)

    def test_unicode_nstag(self):
        tag = "{http://abc/}%s" % uni
        el = etree.Element(tag)
        self.assertEqual(tag, el.tag)

    def test_unicode_ns_invalid(self):
        # namespace URIs must conform to RFC 3986
        tag = "{http://%s/}abc" % uni
        self.assertRaises(ValueError, etree.Element, tag)

    # NOTE(review): the header of this method was missing from the listing
    # (listing line 68); the name is reconstructed from the sibling tests --
    # confirm against the upstream file.
    def test_unicode_nstag_invalid(self):
        # sadly, Klingon is not well-formed
        tag = "{http://abc/}%s" % invalid_tag
        self.assertRaises(ValueError, etree.Element, tag)

    def test_unicode_qname(self):
        qname = etree.QName(uni, uni)
        tag = "{%s}%s" % (uni, uni)
        self.assertEqual(qname.text, tag)
        self.assertEqual(unicode(qname), tag)

    # NOTE(review): the header of this method was missing from the listing
    # (listing line 79); the name is reconstructed -- confirm against the
    # upstream file.
    def test_unicode_qname_invalid(self):
        self.assertRaises(ValueError, etree.QName, invalid_tag)

    def test_unicode_attr(self):
        el = etree.Element('foo', {'bar': uni})
        self.assertEqual(uni, el.attrib['bar'])

    def test_unicode_comment(self):
        el = etree.Comment(uni)
        self.assertEqual(uni, el.text)

    def test_unicode_repr1(self):
        x = etree.Element(_str('å'))
        # must not raise UnicodeEncodeError
        repr(x)

    def test_unicode_repr2(self):
        x = etree.Comment(_str('ö'))
        repr(x)

    def test_unicode_repr3(self):
        x = etree.ProcessingInstruction(_str('Å'), _str('\u0131'))
        repr(x)

    def test_unicode_repr4(self):
        x = etree.Entity(_str('ä'))
        repr(x)

    def test_unicode_text(self):
        e = etree.Element('e')

        def settext(text):
            e.text = text

        # Text strings that must be rejected.
        self.assertRaises(ValueError, settext, _str('ab\ufffe'))
        # The original literal here was 'ö\ffff': '\f' is a form feed, so
        # U+FFFF was never actually tested.  Test the intended code point
        # and keep the old input under an explicit spelling.
        self.assertRaises(ValueError, settext, _str('ö\uffff'))
        self.assertRaises(ValueError, settext, _str('ö\x0cfff'))
        self.assertRaises(ValueError, settext, _str('\u0123\ud800'))
        self.assertRaises(ValueError, settext, _str('x\ud8ff'))
        self.assertRaises(ValueError, settext, _str('\U00010000\udfff'))
        self.assertRaises(ValueError, settext, _str('abd\x00def'))
        # should not Raise
        settext(_str('\ud7ff\ue000\U00010000\U0010FFFFäöas'))

        # Every code point in the surrogate range, at any position.
        for char_val in range(0xD800, 0xDFFF+1):
            self.assertRaises(ValueError, settext, 'abc' + _chr(char_val))
            self.assertRaises(ValueError, settext, _chr(char_val))
            self.assertRaises(ValueError, settext, _chr(char_val) + 'abc')

        # Byte strings that must be rejected.
        self.assertRaises(ValueError, settext, _bytes('\xe4'))
        self.assertRaises(ValueError, settext, _bytes('\x80'))
        self.assertRaises(ValueError, settext, _bytes('\xff'))
        self.assertRaises(ValueError, settext, _bytes('\x08'))
        self.assertRaises(ValueError, settext, _bytes('\x19'))
        self.assertRaises(ValueError, settext, _bytes('\x20\x00'))
        # should not Raise
        settext(_bytes('\x09\x0A\x0D\x20\x60\x7f'))

    def test_uniname(self):
        Element = etree.Element

        def el(name):
            return Element(name)

        self.assertRaises(ValueError, el, ':')
        self.assertRaises(ValueError, el, '0a')
        self.assertRaises(ValueError, el, _str('\u203f'))
        # should not Raise
        el(_str('\u0132'))

    def test_unicode_parse_stringio(self):
        el = etree.parse(StringIO('<p>%s</p>' % uni)).getroot()
        self.assertEqual(uni, el.text)

##     def test_parse_fileobject_unicode(self):
##         # parse unicode from unnamed file object (not supported by ElementTree)
##         f = SillyFileLike(uxml)
##         root = etree.parse(f).getroot()
##         self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
##                          uxml)

161 -class EncodingsTestCase(HelperTestCase):
162 - def test_illegal_utf8(self):
163 data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1') 164 self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data)
165
167 data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1') 168 parser = etree.XMLParser(recover=True) 169 self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser)
170
171 - def _test_encoding(self, encoding, xml_encoding_name=None):
172 foo = """<?xml version='1.0' encoding='%s'?>\n<tag attrib='123'></tag>""" % ( 173 xml_encoding_name or encoding) 174 root = etree.fromstring(foo.encode(encoding)) 175 self.assertEqual('tag', root.tag) 176 177 doc_encoding = root.getroottree().docinfo.encoding 178 self.assertTrue( 179 doc_encoding.lower().rstrip('lbe'), 180 (xml_encoding_name or encoding).lower().rstrip('lbe'))
181
182 - def test_utf8_fromstring(self):
183 self._test_encoding('utf-8')
184
185 - def test_utf8sig_fromstring(self):
186 self._test_encoding('utf_8_sig', 'utf-8')
187
188 - def test_utf16_fromstring(self):
189 self._test_encoding('utf-16')
190
191 - def test_utf16LE_fromstring(self):
192 self._test_encoding('utf-16le', 'utf-16')
193
194 - def test_utf16BE_fromstring(self):
195 self._test_encoding('utf-16be', 'utf-16')
196
197 - def test_utf32_fromstring(self):
198 self._test_encoding('utf-32', 'utf-32')
199
200 - def test_utf32LE_fromstring(self):
201 self._test_encoding('utf-32le', 'utf-32')
202
203 - def test_utf32BE_fromstring(self):
204 self._test_encoding('utf-32be', 'utf-32')
205 206
def test_suite():
    """Return a TestSuite holding every test of this module.

    Uses the default loader's ``loadTestsFromTestCase`` rather than
    ``unittest.makeSuite``, which is deprecated and no longer available in
    recent Python versions.
    """
    load = unittest.defaultTestLoader.loadTestsFromTestCase
    suite = unittest.TestSuite()
    suite.addTests([load(UnicodeTestCase)])
    suite.addTests([load(EncodingsTestCase)])
    return suite