Package lxml :: Package tests :: Module test_unicode
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_unicode

  1  # -*- coding: utf-8 -*- 
  2  import unittest, doctest, sys, os.path 
  3   
  4  this_dir = os.path.dirname(__file__) 
  5  if this_dir not in sys.path: 
  6      sys.path.insert(0, this_dir) # needed for Py3 
  7   
  8  from common_imports import StringIO, etree, SillyFileLike, HelperTestCase 
  9  from common_imports import _str, _bytes 
 10   
 11  try: 
 12      unicode 
 13  except NameError: 
 14      unicode = str 
 15   
 16  ascii_uni = _bytes('a').decode('utf8') 
 17   
 18  klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names 
 19   
 20  invalid_tag = _bytes("test").decode('utf8') + klingon 
 21   
 22  uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters 
 23   
 24  uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>" 
 25                ).decode("unicode_escape") 
 26   
class UnicodeTestCase(HelperTestCase):
    """Exercise etree with unicode text, tags, namespaces, QNames and attributes.

    The tests rely on the module-level fixtures ``uni`` (non-ASCII text) and
    ``invalid_tag`` (a name ending in a character that is not valid in XML
    names).
    """

    def test_unicode_xml(self):
        # Text parsed from a unicode document must come back unchanged.
        tree = etree.XML('<p>%s</p>' % uni)
        self.assertEqual(uni, tree.text)

    def test_wide_unicode_xml(self):
        # Only meaningful when the interpreter can represent code points
        # above the BMP as a single character.
        if sys.maxunicode < 1114111:
            return # skip test
        tree = etree.XML(_bytes('<p>\\U00026007</p>').decode('unicode_escape'))
        self.assertEqual(1, len(tree.text))
        self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
                         tree.text)

    def test_unicode_xml_broken(self):
        # A unicode string that carries an XML encoding declaration is
        # expected to be rejected with ValueError.
        declared_xml = ('<?xml version="1.0" encoding="UTF-8"?>' +
                        '<p>%s</p>' % uni)
        self.assertRaises(ValueError, etree.XML, declared_xml)

    def test_unicode_tag(self):
        el = etree.Element(uni)
        self.assertEqual(uni, el.tag)

    def test_unicode_tag_invalid(self):
        # sadly, Klingon is not well-formed
        self.assertRaises(ValueError, etree.Element, invalid_tag)

    def test_unicode_nstag(self):
        tag = "{http://abc/}%s" % uni
        el = etree.Element(tag)
        self.assertEqual(tag, el.tag)

    def test_unicode_ns_invalid(self):
        # namespace URIs must conform to RFC 3986
        tag = "{http://%s/}abc" % uni
        self.assertRaises(ValueError, etree.Element, tag)

    # NOTE(review): this method's header line was absent from the listing;
    # the name follows the sibling naming pattern -- confirm against upstream.
    def test_unicode_nstag_invalid(self):
        # sadly, Klingon is not well-formed
        tag = "{http://abc/}%s" % invalid_tag
        self.assertRaises(ValueError, etree.Element, tag)

    def test_unicode_qname(self):
        # Both the text attribute and the unicode() conversion of a QName
        # must give the "{namespace}name" form.
        qname = etree.QName(uni, uni)
        tag = "{%s}%s" % (uni, uni)
        self.assertEqual(qname.text, tag)
        self.assertEqual(unicode(qname), tag)

    # NOTE(review): header line was absent from the listing; name
    # reconstructed from the sibling naming pattern -- confirm.
    def test_unicode_qname_invalid(self):
        self.assertRaises(ValueError, etree.QName, invalid_tag)

    def test_unicode_attr(self):
        el = etree.Element('foo', {'bar': uni})
        self.assertEqual(uni, el.attrib['bar'])

    def test_unicode_comment(self):
        el = etree.Comment(uni)
        self.assertEqual(uni, el.text)

    # NOTE(review): header line was absent from the listing; name
    # reconstructed from the sibling naming pattern -- confirm.
    def test_unicode_parse_stringio(self):
        el = etree.parse(StringIO('<p>%s</p>' % uni)).getroot()
        self.assertEqual(uni, el.text)

##     def test_parse_fileobject_unicode(self):
##         # parse unicode from unnamed file object (not supported by ElementTree)
##         f = SillyFileLike(uxml)
##         root = etree.parse(f).getroot()
##         self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
##                          uxml)

def test_suite():
    """Return a ``unittest.TestSuite`` holding every test in UnicodeTestCase."""
    suite = unittest.TestSuite()
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13;
    # TestLoader.loadTestsFromTestCase() is the equivalent that works on all
    # supported interpreters.
    loader = unittest.TestLoader()
    suite.addTests([loader.loadTestsFromTestCase(UnicodeTestCase)])
    return suite
100