Package lxml :: Package tests :: Module test_unicode
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_unicode

  1  # -*- coding: utf-8 -*- 
  2  import unittest 
  3  import sys 
  4  import os.path 
  5   
  6  this_dir = os.path.dirname(__file__) 
  7  if this_dir not in sys.path: 
  8      sys.path.insert(0, this_dir)  # needed for Py3 
  9   
 10  from common_imports import StringIO, etree, SillyFileLike, HelperTestCase 
 11  from common_imports import _str, _bytes 
 12   
 13  try: 
 14      unicode 
 15  except NameError: 
 16      unicode = str 
 17   
 18  ascii_uni = _bytes('a').decode('utf8') 
 19   
 20  klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names 
 21   
 22  invalid_tag = _bytes("test").decode('utf8') + klingon 
 23   
 24  uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters 
 25   
 26  uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>" 
 27                ).decode("unicode_escape") 
 28   
 29   
class UnicodeTestCase(HelperTestCase):
    """Tests for passing unicode strings into the etree API.

    Covers parsing unicode XML, and using unicode text for tags, namespaced
    tags, QNames, attribute values and comments.  The ``*_invalid`` tests
    expect ``ValueError`` for names built from ``invalid_tag``.
    """

    def test_unicode_xml(self):
        # Non-ASCII text content must come back unchanged after parsing.
        tree = etree.XML('<p>%s</p>' % uni)
        self.assertEqual(uni, tree.text)

    def test_wide_unicode_xml(self):
        # 1114111 == 0x10FFFF; smaller values mean a narrow unicode build
        # where a non-BMP character is not a single code unit.
        if sys.maxunicode < 1114111:
            return  # skip test
        tree = etree.XML(_bytes('<p>\\U00026007</p>').decode('unicode_escape'))
        self.assertEqual(1, len(tree.text))
        self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
                         tree.text)

    def test_unicode_xml_broken(self):
        # A unicode string that carries an XML encoding declaration is
        # expected to be rejected with ValueError.
        uxml = ('<?xml version="1.0" encoding="UTF-8"?>' +
                '<p>%s</p>' % uni)
        self.assertRaises(ValueError, etree.XML, uxml)

    def test_unicode_tag(self):
        el = etree.Element(uni)
        self.assertEqual(uni, el.tag)

    def test_unicode_tag_invalid(self):
        # sadly, Klingon is not well-formed
        self.assertRaises(ValueError, etree.Element, invalid_tag)

    def test_unicode_nstag(self):
        tag = "{http://abc/}%s" % uni
        el = etree.Element(tag)
        self.assertEqual(tag, el.tag)

    def test_unicode_ns_invalid(self):
        # namespace URIs must conform to RFC 3986
        tag = "{http://%s/}abc" % uni
        self.assertRaises(ValueError, etree.Element, tag)

    # NOTE(review): the ``def`` line of this test was lost from the listing;
    # the name is reconstructed from the sibling tests -- confirm upstream.
    def test_unicode_nstag_invalid(self):
        # sadly, Klingon is not well-formed
        tag = "{http://abc/}%s" % invalid_tag
        self.assertRaises(ValueError, etree.Element, tag)

    def test_unicode_qname(self):
        qname = etree.QName(uni, uni)
        tag = "{%s}%s" % (uni, uni)
        self.assertEqual(qname.text, tag)
        self.assertEqual(unicode(qname), tag)

    # NOTE(review): the ``def`` line of this test was lost from the listing;
    # the name is reconstructed from the sibling tests -- confirm upstream.
    def test_unicode_qname_invalid(self):
        self.assertRaises(ValueError, etree.QName, invalid_tag)

    def test_unicode_attr(self):
        el = etree.Element('foo', {'bar': uni})
        self.assertEqual(uni, el.attrib['bar'])

    def test_unicode_comment(self):
        el = etree.Comment(uni)
        self.assertEqual(uni, el.text)

    # NOTE(review): the ``def`` line of this test was lost from the listing;
    # the name is reconstructed from the body -- confirm upstream.
    def test_unicode_parse_stringio(self):
        el = etree.parse(StringIO('<p>%s</p>' % uni)).getroot()
        self.assertEqual(uni, el.text)

##     def test_parse_fileobject_unicode(self):
##         # parse unicode from unnamed file object (not supported by ElementTree)
##         f = SillyFileLike(uxml)
##         root = etree.parse(f).getroot()
##         self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
##                          uxml)


100 -class EncodingsTestCase(HelperTestCase):
101 - def test_illegal_utf8(self):
102 data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1') 103 self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data)
104
106 data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1') 107 parser = etree.XMLParser(recover=True) 108 self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser)
109 110
def test_suite():
    """Build a suite with the UnicodeTestCase and EncodingsTestCase tests.

    Returns:
        A ``unittest.TestSuite`` holding one sub-suite per test case class.
    """
    # ``unittest.makeSuite`` is deprecated (removed in Python 3.13); the
    # default loader's ``loadTestsFromTestCase`` collects the same tests.
    load = unittest.defaultTestLoader.loadTestsFromTestCase
    suite = unittest.TestSuite()
    suite.addTests([load(UnicodeTestCase)])
    suite.addTests([load(EncodingsTestCase)])
    return suite