Package lxml :: Package tests :: Module selftest
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.selftest

   1  # $Id: selftest.py 3276 2007-09-12 06:52:30Z fredrik $ 
   2  # -*- coding: iso-8859-1 -*- 
   3  # elementtree selftest program 
   4   
   5  # this test script uses Python's "doctest" module to check that the 
   6  # *test script* works as expected. 
   7   
   8  # TODO: add more elementtree method tests 
   9  # TODO: add xml/html parsing tests 
  10  # TODO: etc 
  11   
  12  import re, sys 
  13   
def stdout():
    """Return a writable object whose output ends up on ``sys.stdout``.

    On Python 2 this is ``sys.stdout`` itself.  On Python 3 it is a small
    adapter whose ``write()`` decodes ``bytes`` as ISO-8859-1 before passing
    the text on to ``sys.stdout``; non-bytes data is forwarded unchanged.
    """
    class bytes_stdout(object):
        def write(self, data):
            # sys.stdout is looked up on every call, not captured once.
            text = data.decode('ISO8859-1') if isinstance(data, bytes) else data
            sys.stdout.write(text)

    running_py3 = sys.version_info[0] >= 3
    return bytes_stdout() if running_py3 else sys.stdout

try:
    from StringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from lxml import etree as ElementTree
from lxml import _elementpath as ElementPath
from lxml import ElementInclude
ET = ElementTree

#from elementtree import ElementTree
#from elementtree import ElementPath
#from elementtree import ElementInclude
#from elementtree import HTMLTreeBuilder
#from elementtree import SimpleXMLWriter

def fix_compatibility(xml_data):
    """Normalise serialised XML text before it is compared in the doctests.

    Three rewrites are applied, in this order:

    * every ``xmlns:<prefix>="http://www.w3.org/2001/XInclude"`` declaration
      is removed together with the whitespace in front of it;
    * each `` />`` is replaced by ``/>``;
    * a single trailing newline, if present, is dropped.

    Returns the rewritten string.
    """
    xinclude_decl = r'\s*xmlns:[a-z0-9]+="http://www.w3.org/2001/XInclude"'
    cleaned = re.sub(xinclude_decl, '', xml_data).replace(' />', '/>')
    if cleaned.endswith('\n'):
        cleaned = cleaned[:-1]
    return cleaned
46
def serialize(elem, **options):
    """Serialise *elem* and return the text after ``fix_compatibility()``.

    *elem* is wrapped in an ``ElementTree`` and written to an in-memory
    buffer; *options* are passed straight through to ``tree.write()``.

    On Python 2 the buffer is decoded with the ``encoding`` option (default
    ``"utf-8"``), cleaned up, and encoded again with the same encoding.  On
    Python 3 the buffer is always decoded as ISO-8859-1 and the cleaned-up
    text is returned as is.
    """
    buf = BytesIO()
    ElementTree.ElementTree(elem).write(buf, **options)

    is_py2 = sys.version_info[0] < 3
    if is_py2:
        encoding = options.get("encoding", "utf-8")
    else:
        encoding = 'ISO8859-1'

    result = fix_compatibility(buf.getvalue().decode(encoding))
    return result.encode(encoding) if is_py2 else result
62
def summarize(elem):
    """Return the ``tag`` attribute of *elem*, used as its one-word summary."""
    tag = elem.tag
    return tag
65
def summarize_list(seq):
    """Return a list with ``summarize(item)`` for each item of *seq*, in order."""
    return [summarize(item) for item in seq]
68
def normalize_crlf(tree):
    """Convert CR LF line endings to LF in the text of *tree*, in place.

    For every element yielded by ``tree.iter()``, each carriage-return /
    line-feed pair in ``text`` and ``tail`` is replaced by a single line
    feed.  Values that are ``None`` or empty are left untouched.  Nothing is
    returned.
    """
    # iter() replaces the deprecated getiterator() spelling and matches the
    # iter() calls used by the iterator tests in this module.
    for elem in tree.iter():
        if elem.text:
            elem.text = elem.text.replace("\r\n", "\n")
        if elem.tail:
            elem.tail = elem.tail.replace("\r\n", "\n")
73 74 SAMPLE_XML = ElementTree.XML(""" 75 <body> 76 <tag class='a'>text</tag> 77 <tag class='b' /> 78 <section> 79 <tag class='b' id='inner'>subtext</tag> 80 </section> 81 </body> 82 """) 83 84 # 85 # interface tests 86
def check_string(string):
    """Exercise basic string operations on *string*.

    Calls ``len()``, iterates over the value, concatenates ``""`` and ``" "``
    to it and takes an empty slice.  The results are discarded; the point is
    that each operation is attempted.  A message is printed for every item
    yielded by iteration whose length is not 1.
    """
    len(string)
    for char in string:
        if len(char) == 1:
            continue
        print("expected one-character string, got %r" % char)
    # Results are intentionally thrown away.
    string + ""
    string + " "
    string[:0]
95
def check_string_or_none(value):
    """Run ``check_string()`` on *value* unless it is ``None``.

    Returns whatever ``check_string()`` returns, or ``None`` when *value*
    is ``None``.
    """
    if value is not None:
        return check_string(value)
    return None
100
def check_mapping(mapping):
    """Exercise basic mapping operations on *mapping*.

    Calls ``len()``, ``keys()`` and ``items()``, looks up every key, then
    stores ``"value"`` under ``"key"`` and reads it back, printing a message
    if the stored value does not compare equal to ``"value"``.

    Note that *mapping* is modified: the ``"key"`` entry is left in place.
    """
    len(mapping)
    key_view = mapping.keys()
    mapping.items()
    for name in key_view:
        mapping[name]  # lookup only; the value is not needed
    mapping["key"] = "value"
    if not (mapping["key"] == "value"):
        print("expected value string, got %r" % mapping["key"])
110
def check_element(element):
    """Check that *element* and all of its descendants look like elements.

    Prints a message for each of ``tag``, ``attrib``, ``text`` and ``tail``
    that is missing, then passes ``tag`` to ``check_string()``, ``attrib`` to
    ``check_mapping()`` and ``text`` / ``tail`` to ``check_string_or_none()``.
    Finally every child obtained by iterating *element* is checked the same
    way, recursively.
    """
    required = (
        ("tag", "no tag member"),
        ("attrib", "no attrib member"),
        ("text", "no text member"),
        ("tail", "no tail member"),
    )
    for attr_name, complaint in required:
        if not hasattr(element, attr_name):
            print(complaint)

    check_string(element.tag)
    check_mapping(element.attrib)
    check_string_or_none(element.text)
    check_string_or_none(element.tail)

    for child in element:
        check_element(child)
126
def check_element_tree(tree):
    """Run ``check_element()`` on the root element returned by ``tree.getroot()``."""
    root = tree.getroot()
    check_element(root)
129 130 # -------------------------------------------------------------------- 131 # element tree tests 132
133 -def sanity():
134 """ 135 >>> from elementtree.ElementTree import * 136 >>> from elementtree.ElementInclude import * 137 >>> from elementtree.ElementPath import * 138 >>> from elementtree.HTMLTreeBuilder import * 139 >>> from elementtree.SimpleXMLWriter import * 140 >>> from elementtree.TidyTools import * 141 """
142 143 # doesn't work with lxml.etree 144 del sanity 145
146 -def version():
147 """ 148 >>> ElementTree.VERSION 149 '1.3a2' 150 """
151 152 # doesn't work with lxml.etree 153 del version 154
155 -def interface():
156 """ 157 Test element tree interface. 158 159 >>> element = ElementTree.Element("tag") 160 >>> check_element(element) 161 >>> tree = ElementTree.ElementTree(element) 162 >>> check_element_tree(tree) 163 """
164
165 -def simpleops():
166 """ 167 >>> elem = ElementTree.XML("<body><tag/></body>") 168 >>> serialize(elem) 169 '<body><tag/></body>' 170 >>> e = ElementTree.Element("tag2") 171 >>> elem.append(e) 172 >>> serialize(elem) 173 '<body><tag/><tag2/></body>' 174 >>> elem.remove(e) 175 >>> serialize(elem) 176 '<body><tag/></body>' 177 >>> elem.insert(0, e) 178 >>> serialize(elem) 179 '<body><tag2/><tag/></body>' 180 >>> elem.remove(e) 181 >>> elem.extend([e]) 182 >>> serialize(elem) 183 '<body><tag/><tag2/></body>' 184 >>> elem.remove(e) 185 """
186
187 -def simplefind():
188 """ 189 Test find methods using the elementpath fallback. 190 191 >>> CurrentElementPath = ElementTree.ElementPath 192 >>> ElementTree.ElementPath = ElementTree._SimpleElementPath() 193 >>> elem = SAMPLE_XML 194 >>> elem.find("tag").tag 195 'tag' 196 >>> ElementTree.ElementTree(elem).find("tag").tag 197 'tag' 198 >>> elem.findtext("tag") 199 'text' 200 >>> elem.findtext("tog") 201 >>> elem.findtext("tog", "default") 202 'default' 203 >>> ElementTree.ElementTree(elem).findtext("tag") 204 'text' 205 >>> summarize_list(elem.findall("tag")) 206 ['tag', 'tag'] 207 >>> summarize_list(elem.findall(".//tag")) 208 ['tag', 'tag', 'tag'] 209 210 Path syntax doesn't work in this case. 211 212 >>> elem.find("section/tag") 213 >>> elem.findtext("section/tag") 214 >>> elem.findall("section/tag") 215 [] 216 217 >>> ElementTree.ElementPath = CurrentElementPath 218 """
219 220 # doesn't work with lxml.etree 221 del simplefind 222
223 -def find():
224 """ 225 Test find methods (including xpath syntax). 226 227 >>> elem = SAMPLE_XML 228 >>> elem.find("tag").tag 229 'tag' 230 >>> ElementTree.ElementTree(elem).find("tag").tag 231 'tag' 232 >>> elem.find("section/tag").tag 233 'tag' 234 >>> ElementTree.ElementTree(elem).find("section/tag").tag 235 'tag' 236 >>> elem.findtext("tag") 237 'text' 238 >>> elem.findtext("tog") 239 >>> elem.findtext("tog", "default") 240 'default' 241 >>> ElementTree.ElementTree(elem).findtext("tag") 242 'text' 243 >>> elem.findtext("section/tag") 244 'subtext' 245 >>> ElementTree.ElementTree(elem).findtext("section/tag") 246 'subtext' 247 >>> summarize_list(elem.findall("tag")) 248 ['tag', 'tag'] 249 >>> summarize_list(elem.findall("*")) 250 ['tag', 'tag', 'section'] 251 >>> summarize_list(elem.findall(".//tag")) 252 ['tag', 'tag', 'tag'] 253 >>> summarize_list(elem.findall("section/tag")) 254 ['tag'] 255 >>> summarize_list(elem.findall("section//tag")) 256 ['tag'] 257 >>> summarize_list(elem.findall("section/*")) 258 ['tag'] 259 >>> summarize_list(elem.findall("section//*")) 260 ['tag'] 261 >>> summarize_list(elem.findall("section/.//*")) 262 ['tag'] 263 >>> summarize_list(elem.findall("*/*")) 264 ['tag'] 265 >>> summarize_list(elem.findall("*//*")) 266 ['tag'] 267 >>> summarize_list(elem.findall("*/tag")) 268 ['tag'] 269 >>> summarize_list(elem.findall("*/./tag")) 270 ['tag'] 271 >>> summarize_list(elem.findall("./tag")) 272 ['tag', 'tag'] 273 >>> summarize_list(elem.findall(".//tag")) 274 ['tag', 'tag', 'tag'] 275 >>> summarize_list(elem.findall("././tag")) 276 ['tag', 'tag'] 277 >>> summarize_list(elem.findall(".//tag[@class]")) 278 ['tag', 'tag', 'tag'] 279 >>> summarize_list(elem.findall(".//tag[@class='a']")) 280 ['tag'] 281 >>> summarize_list(elem.findall(".//tag[@class='b']")) 282 ['tag', 'tag'] 283 >>> summarize_list(elem.findall(".//tag[@id]")) 284 ['tag'] 285 >>> summarize_list(elem.findall(".//section[tag]")) 286 ['section'] 287 >>> 
summarize_list(elem.findall(".//section[element]")) 288 [] 289 >>> summarize_list(elem.findall("../tag")) 290 [] 291 >>> summarize_list(elem.findall("section/../tag")) 292 ['tag', 'tag'] 293 >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag")) 294 ['tag', 'tag'] 295 296 FIXME: ET's Path module handles this case incorrectly; this gives 297 a warning in 1.3, and the behaviour will be modified in 1.4. 298 299 >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag")) 300 ['tag', 'tag'] 301 """
302
303 -def bad_find():
304 """ 305 Check bad or unsupported path expressions. 306 307 >>> elem = SAMPLE_XML 308 >>> elem.findall("/tag") 309 Traceback (most recent call last): 310 SyntaxError: cannot use absolute path on element 311 312 # this is supported in ET 1.3: 313 #>>> elem.findall("section//") 314 #Traceback (most recent call last): 315 #SyntaxError: invalid path 316 """
317
318 -def parsefile():
319 """ 320 Test parsing from file. 321 322 >>> tree = ElementTree.parse("samples/simple.xml") 323 >>> normalize_crlf(tree) 324 >>> tree.write(stdout()) 325 <root> 326 <element key="value">text</element> 327 <element>text</element>tail 328 <empty-element/> 329 </root> 330 >>> tree = ElementTree.parse("samples/simple-ns.xml") 331 >>> normalize_crlf(tree) 332 >>> tree.write(stdout()) 333 <root xmlns="http://namespace/"> 334 <element key="value">text</element> 335 <element>text</element>tail 336 <empty-element/> 337 </root> 338 339 ## <ns0:root xmlns:ns0="http://namespace/"> 340 ## <ns0:element key="value">text</ns0:element> 341 ## <ns0:element>text</ns0:element>tail 342 ## <ns0:empty-element/> 343 ## </ns0:root> 344 """
345
346 -def parsehtml():
347 """ 348 Test HTML parsing. 349 350 >>> # p = HTMLTreeBuilder.TreeBuilder() 351 >>> p = ElementTree.HTMLParser() 352 >>> p.feed("<p><p>spam<b>egg</b></p>") 353 >>> serialize(p.close()) 354 '<p>spam<b>egg</b></p>' 355 """
356 357 # doesn't work with lxml.etree 358 del parsehtml 359
360 -def parseliteral():
361 r""" 362 >>> element = ElementTree.XML("<html><body>text</body></html>") 363 >>> ElementTree.ElementTree(element).write(stdout()) 364 <html><body>text</body></html> 365 >>> element = ElementTree.fromstring("<html><body>text</body></html>") 366 >>> ElementTree.ElementTree(element).write(stdout()) 367 <html><body>text</body></html> 368 369 ## >>> sequence = ["<html><body>", "text</bo", "dy></html>"] 370 ## >>> element = ElementTree.fromstringlist(sequence) 371 ## >>> ElementTree.ElementTree(element).write(stdout()) 372 ## <html><body>text</body></html> 373 374 >>> print(repr(ElementTree.tostring(element)).lstrip('b')) 375 '<html><body>text</body></html>' 376 377 # looks different in lxml 378 # >>> print(ElementTree.tostring(element, "ascii")) 379 # <?xml version='1.0' encoding='ascii'?> 380 # <html><body>text</body></html> 381 382 >>> _, ids = ElementTree.XMLID("<html><body>text</body></html>") 383 >>> len(ids) 384 0 385 >>> _, ids = ElementTree.XMLID("<html><body id='body'>text</body></html>") 386 >>> len(ids) 387 1 388 >>> ids["body"].tag 389 'body' 390 """
391
392 -def simpleparsefile():
393 """ 394 Test the xmllib-based parser. 395 396 >>> from elementtree import SimpleXMLTreeBuilder 397 >>> parser = SimpleXMLTreeBuilder.TreeBuilder() 398 >>> tree = ElementTree.parse("samples/simple.xml", parser) 399 >>> normalize_crlf(tree) 400 >>> tree.write(sys.stdout) 401 <root> 402 <element key="value">text</element> 403 <element>text</element>tail 404 <empty-element /> 405 </root> 406 """
407 408 # doesn't work with lxml.etree 409 del simpleparsefile 410
411 -def iterparse():
412 """ 413 Test iterparse interface. 414 415 >>> iterparse = ElementTree.iterparse 416 417 >>> context = iterparse("samples/simple.xml") 418 >>> for action, elem in context: 419 ... print("%s %s" % (action, elem.tag)) 420 end element 421 end element 422 end empty-element 423 end root 424 >>> context.root.tag 425 'root' 426 427 >>> context = iterparse("samples/simple-ns.xml") 428 >>> for action, elem in context: 429 ... print("%s %s" % (action, elem.tag)) 430 end {http://namespace/}element 431 end {http://namespace/}element 432 end {http://namespace/}empty-element 433 end {http://namespace/}root 434 435 >>> events = () 436 >>> context = iterparse("samples/simple.xml", events) 437 >>> for action, elem in context: 438 ... print("%s %s" % (action, elem.tag)) 439 440 >>> events = () 441 >>> context = iterparse("samples/simple.xml", events=events) 442 >>> for action, elem in context: 443 ... print("%s %s" % (action, elem.tag)) 444 445 >>> events = ("start", "end") 446 >>> context = iterparse("samples/simple.xml", events) 447 >>> for action, elem in context: 448 ... print("%s %s" % (action, elem.tag)) 449 start root 450 start element 451 end element 452 start element 453 end element 454 start empty-element 455 end empty-element 456 end root 457 458 >>> events = ("start", "end", "start-ns", "end-ns") 459 >>> context = iterparse("samples/simple-ns.xml", events) 460 >>> for action, elem in context: 461 ... if action in ("start", "end"): 462 ... print("%s %s" % (action, elem.tag)) 463 ... else: 464 ... print("%s %s" % (action, elem)) 465 start-ns ('', 'http://namespace/') 466 start {http://namespace/}root 467 start {http://namespace/}element 468 end {http://namespace/}element 469 start {http://namespace/}element 470 end {http://namespace/}element 471 start {http://namespace/}empty-element 472 end {http://namespace/}empty-element 473 end {http://namespace/}root 474 end-ns None 475 476 """
477
478 -def fancyparsefile():
479 """ 480 Test the "fancy" parser. 481 482 Sanity check. 483 >>> from elementtree import XMLTreeBuilder 484 >>> parser = XMLTreeBuilder.FancyTreeBuilder() 485 >>> tree = ElementTree.parse("samples/simple.xml", parser) 486 >>> normalize_crlf(tree) 487 >>> tree.write(sys.stdout) 488 <root> 489 <element key="value">text</element> 490 <element>text</element>tail 491 <empty-element /> 492 </root> 493 494 Callback check. 495 >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder): 496 ... def start(self, elem): 497 ... print("START %s" % elem.tag) 498 ... def end(self, elem): 499 ... print("END %s" % elem.tag) 500 >>> parser = MyFancyParser() 501 >>> tree = ElementTree.parse("samples/simple.xml", parser) 502 START root 503 START element 504 END element 505 START element 506 END element 507 START empty-element 508 END empty-element 509 END root 510 """
511 512 # doesn't work with lxml.etree 513 del fancyparsefile 514
515 -def writefile():
516 """ 517 >>> elem = ElementTree.Element("tag") 518 >>> elem.text = "text" 519 >>> serialize(elem) 520 '<tag>text</tag>' 521 >>> ElementTree.SubElement(elem, "subtag").text = "subtext" 522 >>> serialize(elem) 523 '<tag>text<subtag>subtext</subtag></tag>' 524 525 ## Test tag suppression 526 ## >>> elem.tag = None 527 ## >>> serialize(elem) 528 ## 'text<subtag>subtext</subtag>' 529 """
530
531 -def writestring():
532 """ 533 >>> elem = ElementTree.XML("<html><body>text</body></html>") 534 >>> print(repr(ElementTree.tostring(elem)).lstrip('b')) 535 '<html><body>text</body></html>' 536 >>> elem = ElementTree.fromstring("<html><body>text</body></html>") 537 >>> print(repr(ElementTree.tostring(elem)).lstrip('b')) 538 '<html><body>text</body></html>' 539 """
540
541 -def encoding():
542 r""" 543 Test encoding issues. 544 545 >>> elem = ElementTree.Element("tag") 546 >>> elem.text = u'abc' 547 >>> serialize(elem) 548 '<tag>abc</tag>' 549 >>> serialize(elem, encoding="utf-8") 550 '<tag>abc</tag>' 551 >>> serialize(elem, encoding="us-ascii") 552 '<tag>abc</tag>' 553 >>> serialize(elem, encoding="iso-8859-1").lower() 554 "<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>abc</tag>" 555 556 >>> elem.text = "<&\"\'>" 557 >>> serialize(elem) 558 '<tag>&lt;&amp;"\'&gt;</tag>' 559 >>> serialize(elem, encoding="utf-8") 560 '<tag>&lt;&amp;"\'&gt;</tag>' 561 >>> serialize(elem, encoding="us-ascii") # cdata characters 562 '<tag>&lt;&amp;"\'&gt;</tag>' 563 >>> serialize(elem, encoding="iso-8859-1").lower() 564 '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag>&lt;&amp;"\'&gt;</tag>' 565 566 >>> elem.attrib["key"] = "<&\"\'>" 567 >>> elem.text = None 568 >>> serialize(elem) 569 '<tag key="&lt;&amp;&quot;\'&gt;"/>' 570 >>> serialize(elem, encoding="utf-8") 571 '<tag key="&lt;&amp;&quot;\'&gt;"/>' 572 >>> serialize(elem, encoding="us-ascii") 573 '<tag key="&lt;&amp;&quot;\'&gt;"/>' 574 >>> serialize(elem, encoding="iso-8859-1").lower() 575 '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="&lt;&amp;&quot;\'&gt;"/>' 576 577 >>> elem.text = u'\xe5\xf6\xf6<>' 578 >>> elem.attrib.clear() 579 >>> serialize(elem) 580 '<tag>&#229;&#246;&#246;&lt;&gt;</tag>' 581 >>> serialize(elem, encoding="utf-8") 582 '<tag>\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;</tag>' 583 >>> serialize(elem, encoding="us-ascii") 584 '<tag>&#229;&#246;&#246;&lt;&gt;</tag>' 585 >>> serialize(elem, encoding="iso-8859-1").lower() 586 "<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>\xe5\xf6\xf6&lt;&gt;</tag>" 587 588 >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>' 589 >>> elem.text = None 590 >>> serialize(elem) 591 '<tag key="&#229;&#246;&#246;&lt;&gt;"/>' 592 >>> serialize(elem, encoding="utf-8") 593 '<tag key="\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;"/>' 594 >>> serialize(elem, encoding="us-ascii") 595 
'<tag key="&#229;&#246;&#246;&lt;&gt;"/>' 596 >>> serialize(elem, encoding="iso-8859-1").lower() 597 '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="\xe5\xf6\xf6&lt;&gt;"/>' 598 """
599 600 if sys.version_info[0] >= 3: 601 encoding.__doc__ = encoding.__doc__.replace("u'", "'") 602
603 -def methods():
604 r""" 605 Test serialization methods. 606 607 >>> e = ET.XML("<html><link/><script>1 &lt; 2</script></html>") 608 >>> e.tail = "\n" 609 >>> serialize(e) 610 '<html><link /><script>1 &lt; 2</script></html>\n' 611 >>> serialize(e, method=None) 612 '<html><link /><script>1 &lt; 2</script></html>\n' 613 >>> serialize(e, method="xml") 614 '<html><link /><script>1 &lt; 2</script></html>\n' 615 >>> serialize(e, method="html") 616 '<html><link><script>1 < 2</script></html>\n' 617 >>> serialize(e, method="text") 618 '1 < 2\n' 619 620 """
621 622 # doesn't work with lxml.etree 623 del methods 624
625 -def iterators():
626 """ 627 Test iterators. 628 629 >>> e = ET.XML("<html><body>this is a <i>paragraph</i>.</body>..</html>") 630 >>> summarize_list(e.iter()) 631 ['html', 'body', 'i'] 632 >>> summarize_list(e.find("body").iter()) 633 ['body', 'i'] 634 >>> "".join(e.itertext()) 635 'this is a paragraph...' 636 >>> "".join(e.find("body").itertext()) 637 'this is a paragraph.' 638 """
639 640 ENTITY_XML = """\ 641 <!DOCTYPE points [ 642 <!ENTITY % user-entities SYSTEM 'user-entities.xml'> 643 %user-entities; 644 ]> 645 <document>&entity;</document> 646 """ 647
648 -def entity():
649 """ 650 Test entity handling. 651 652 1) bad entities 653 654 >>> ElementTree.XML("<document>&entity;</document>") 655 Traceback (most recent call last): 656 ExpatError: undefined entity: line 1, column 10 657 658 >>> ElementTree.XML(ENTITY_XML) 659 Traceback (most recent call last): 660 ExpatError: undefined entity &entity;: line 5, column 10 661 662 (add more tests here) 663 664 """
665 666 # doesn't work with lxml.etree 667 del entity 668
def error(xml):
    """
    Test error handling.

    Parses *xml* with ``ET.XML()`` and returns the ``ET.ParseError`` that was
    raised, or ``None`` if parsing succeeded.

    >>> error("foo").position
    (1, 0)
    >>> error("<tag>&foo;</tag>").position
    (1, 5)
    >>> error("foobar<").position
    (1, 6)

    """
    try:
        ET.XML(xml)
    except ET.ParseError:
        # sys.exc_value only exists on Python 2; sys.exc_info() is available
        # on every Python version this module supports.
        return sys.exc_info()[1]
685 686 # doesn't work with lxml.etree -> different positions 687 del error 688
689 -def namespace():
690 """ 691 Test namespace issues. 692 693 1) xml namespace 694 695 >>> elem = ElementTree.XML("<tag xml:lang='en' />") 696 >>> serialize(elem) # 1.1 697 '<tag xml:lang="en"/>' 698 699 2) other "well-known" namespaces 700 701 >>> elem = ElementTree.XML("<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' />") 702 >>> serialize(elem) # 2.1 703 '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>' 704 705 >>> elem = ElementTree.XML("<html:html xmlns:html='http://www.w3.org/1999/xhtml' />") 706 >>> serialize(elem) # 2.2 707 '<html:html xmlns:html="http://www.w3.org/1999/xhtml"/>' 708 709 >>> elem = ElementTree.XML("<soap:Envelope xmlns:soap='http://schemas.xmlsoap.org/soap/envelope' />") 710 >>> serialize(elem) # 2.3 711 '<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope"/>' 712 713 3) unknown namespaces 714 715 """
716
717 -def qname():
718 """ 719 Test QName handling. 720 721 1) decorated tags 722 723 >>> elem = ElementTree.Element("{uri}tag") 724 >>> serialize(elem) # 1.1 725 '<ns0:tag xmlns:ns0="uri"/>' 726 >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag")) 727 >>> serialize(elem) # 1.2 728 '<ns0:tag xmlns:ns0="uri"/>' 729 >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag")) 730 >>> serialize(elem) # 1.3 731 '<ns0:tag xmlns:ns0="uri"/>' 732 733 # ns/attribute order ... 734 735 ## 2) decorated attributes 736 737 ## >>> elem.clear() 738 ## >>> elem.attrib["{uri}key"] = "value" 739 ## >>> serialize(elem) # 2.1 740 ## '<ns0:tag ns0:key="value" xmlns:ns0="uri"/>' 741 742 ## >>> elem.clear() 743 ## >>> elem.attrib[ElementTree.QName("{uri}key")] = "value" 744 ## >>> serialize(elem) # 2.2 745 ## '<ns0:tag ns0:key="value" xmlns:ns0="uri"/>' 746 747 ## 3) decorated values are not converted by default, but the 748 ## QName wrapper can be used for values 749 750 ## >>> elem.clear() 751 ## >>> elem.attrib["{uri}key"] = "{uri}value" 752 ## >>> serialize(elem) # 3.1 753 ## '<ns0:tag ns0:key="{uri}value" xmlns:ns0="uri"/>' 754 755 ## >>> elem.clear() 756 ## >>> elem.attrib["{uri}key"] = ElementTree.QName("{uri}value") 757 ## >>> serialize(elem) # 3.2 758 ## '<ns0:tag ns0:key="ns0:value" xmlns:ns0="uri"/>' 759 760 ## >>> elem.clear() 761 ## >>> subelem = ElementTree.Element("tag") 762 ## >>> subelem.attrib["{uri1}key"] = ElementTree.QName("{uri2}value") 763 ## >>> elem.append(subelem) 764 ## >>> elem.append(subelem) 765 ## >>> serialize(elem) # 3.3 766 ## '<ns0:tag xmlns:ns0="uri"><tag ns1:key="ns2:value" xmlns:ns1="uri1" xmlns:ns2="uri2"/><tag ns1:key="ns2:value" xmlns:ns1="uri1" xmlns:ns2="uri2"/></ns0:tag>' 767 768 """
769
def xpath_tokenizer(p):
    """
    Test the XPath tokenizer.

    Returns the tokens of path *p* as a flat list: for each ``(op, tag)``
    pair produced by ``ElementPath.xpath_tokenizer()``, the op if it is
    non-empty, otherwise the tag.

    >>> # tests from the xml specification
    >>> xpath_tokenizer("*")
    ['*']
    >>> xpath_tokenizer("text()")
    ['text', '()']
    >>> xpath_tokenizer("@name")
    ['@', 'name']
    >>> xpath_tokenizer("@*")
    ['@', '*']
    >>> xpath_tokenizer("para[1]")
    ['para', '[', '1', ']']
    >>> xpath_tokenizer("para[last()]")
    ['para', '[', 'last', '()', ']']
    >>> xpath_tokenizer("*/para")
    ['*', '/', 'para']
    >>> xpath_tokenizer("/doc/chapter[5]/section[2]")
    ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']']
    >>> xpath_tokenizer("chapter//para")
    ['chapter', '//', 'para']
    >>> xpath_tokenizer("//para")
    ['//', 'para']
    >>> xpath_tokenizer("//olist/item")
    ['//', 'olist', '/', 'item']
    >>> xpath_tokenizer(".")
    ['.']
    >>> xpath_tokenizer(".//para")
    ['.', '//', 'para']
    >>> xpath_tokenizer("..")
    ['..']
    >>> xpath_tokenizer("../@lang")
    ['..', '/', '@', 'lang']
    >>> xpath_tokenizer("chapter[title]")
    ['chapter', '[', 'title', ']']
    >>> xpath_tokenizer("employee[@secretary and @assistant]")
    ['employee', '[', '@', 'secretary', '', 'and', '', '@', 'assistant', ']']

    >>> # additional tests
    >>> xpath_tokenizer("{http://spam}egg")
    ['{http://spam}egg']
    >>> xpath_tokenizer("./spam.egg")
    ['.', '/', 'spam.egg']
    >>> xpath_tokenizer(".//{http://spam}egg")
    ['.', '//', '{http://spam}egg']
    """
    return [op or tag for op, tag in ElementPath.xpath_tokenizer(p)]
822 823 # 824 # xinclude tests (samples from appendix C of the xinclude specification) 825 826 XINCLUDE = {} 827 828 XINCLUDE["C1.xml"] = """\ 829 <?xml version='1.0'?> 830 <document xmlns:xi="http://www.w3.org/2001/XInclude"> 831 <p>120 Mz is adequate for an average home user.</p> 832 <xi:include href="disclaimer.xml"/> 833 </document> 834 """ 835 836 XINCLUDE["disclaimer.xml"] = """\ 837 <?xml version='1.0'?> 838 <disclaimer> 839 <p>The opinions represented herein represent those of the individual 840 and should not be interpreted as official policy endorsed by this 841 organization.</p> 842 </disclaimer> 843 """ 844 845 XINCLUDE["C2.xml"] = """\ 846 <?xml version='1.0'?> 847 <document xmlns:xi="http://www.w3.org/2001/XInclude"> 848 <p>This document has been accessed 849 <xi:include href="count.txt" parse="text"/> times.</p> 850 </document> 851 """ 852 853 XINCLUDE["count.txt"] = "324387" 854 855 XINCLUDE["C3.xml"] = """\ 856 <?xml version='1.0'?> 857 <document xmlns:xi="http://www.w3.org/2001/XInclude"> 858 <p>The following is the source of the "data.xml" resource:</p> 859 <example><xi:include href="data.xml" parse="text"/></example> 860 </document> 861 """ 862 863 XINCLUDE["data.xml"] = """\ 864 <?xml version='1.0'?> 865 <data> 866 <item><![CDATA[Brooks & Shields]]></item> 867 </data> 868 """ 869 870 XINCLUDE["C5.xml"] = """\ 871 <?xml version='1.0'?> 872 <div xmlns:xi="http://www.w3.org/2001/XInclude"> 873 <xi:include href="example.txt" parse="text"> 874 <xi:fallback> 875 <xi:include href="fallback-example.txt" parse="text"> 876 <xi:fallback><a href="mailto:bob@example.org">Report error</a></xi:fallback> 877 </xi:include> 878 </xi:fallback> 879 </xi:include> 880 </div> 881 """ 882 883 XINCLUDE["default.xml"] = """\ 884 <?xml version='1.0'?> 885 <document xmlns:xi="http://www.w3.org/2001/XInclude"> 886 <p>Example.</p> 887 <xi:include href="samples/simple.xml"/> 888 </document> 889 """ 890
def xinclude_loader(href, parse="xml", encoding=None):
    """Loader for ``ElementInclude.include()`` backed by the ``XINCLUDE`` dict.

    Looks *href* up in ``XINCLUDE`` and raises ``IOError("resource not
    found")`` if there is no such entry.  When *parse* is ``"xml"`` the stored
    text is parsed with ``ElementTree.XML()`` and the resulting element is
    returned; for any other *parse* value the stored text is returned as is.
    The *encoding* argument is accepted but not used.
    """
    try:
        payload = XINCLUDE[href]
    except KeyError:
        raise IOError("resource not found")
    if parse != "xml":
        return payload
    return ElementTree.XML(payload)
899
900 -def xinclude():
901 r""" 902 Basic inclusion example (XInclude C.1) 903 904 >>> document = xinclude_loader("C1.xml") 905 >>> ElementInclude.include(document, xinclude_loader) 906 >>> print(serialize(document)) # C1 907 <document> 908 <p>120 Mz is adequate for an average home user.</p> 909 <disclaimer> 910 <p>The opinions represented herein represent those of the individual 911 and should not be interpreted as official policy endorsed by this 912 organization.</p> 913 </disclaimer> 914 </document> 915 916 Textual inclusion example (XInclude C.2) 917 918 >>> document = xinclude_loader("C2.xml") 919 >>> ElementInclude.include(document, xinclude_loader) 920 >>> print(serialize(document)) # C2 921 <document> 922 <p>This document has been accessed 923 324387 times.</p> 924 </document> 925 926 Textual inclusion of XML example (XInclude C.3) 927 928 >>> document = xinclude_loader("C3.xml") 929 >>> ElementInclude.include(document, xinclude_loader) 930 >>> print(serialize(document)) # C3 931 <document> 932 <p>The following is the source of the "data.xml" resource:</p> 933 <example>&lt;?xml version='1.0'?&gt; 934 &lt;data&gt; 935 &lt;item&gt;&lt;![CDATA[Brooks &amp; Shields]]&gt;&lt;/item&gt; 936 &lt;/data&gt; 937 </example> 938 </document> 939 940 ## Fallback example (XInclude C.5) 941 ## Note! Fallback support is not yet implemented 942 943 ## >>> document = xinclude_loader("C5.xml") 944 ## >>> ElementInclude.include(document, xinclude_loader) 945 ## Traceback (most recent call last): 946 ## IOError: resource not found 947 ## >>> # print(serialize(document)) # C5 948 949 """
950
951 -def xinclude_default():
952 """ 953 >>> document = xinclude_loader("default.xml") 954 >>> ElementInclude.include(document) 955 >>> print(serialize(document)) # default 956 <document> 957 <p>Example.</p> 958 <root> 959 <element key="value">text</element> 960 <element>text</element>tail 961 <empty-element/> 962 </root> 963 </document> 964 """
965 966 # 967 # xmlwriter 968
969 -def xmlwriter():
970 r""" 971 >>> file = BytesIO() 972 >>> w = SimpleXMLWriter.XMLWriter(file) 973 >>> html = w.start("html") 974 >>> x = w.start("head") 975 >>> w.element("title", "my document") 976 >>> w.data("\n") 977 >>> w.element("meta", name="hello", value="goodbye") 978 >>> w.data("\n") 979 >>> w.end() 980 >>> x = w.start("body") 981 >>> w.element("h1", "this is a heading") 982 >>> w.data("\n") 983 >>> w.element("p", u"this is a paragraph") 984 >>> w.data("\n") 985 >>> w.element("p", u"reserved characters: <&>") 986 >>> w.data("\n") 987 >>> w.element("p", u"detta är också ett stycke") 988 >>> w.data("\n") 989 >>> w.close(html) 990 >>> print(file.getvalue()) 991 <html><head><title>my document</title> 992 <meta name="hello" value="goodbye" /> 993 </head><body><h1>this is a heading</h1> 994 <p>this is a paragraph</p> 995 <p>reserved characters: &lt;&amp;&gt;</p> 996 <p>detta &#228;r ocks&#229; ett stycke</p> 997 </body></html> 998 """
999 1000 # doesn't work with lxml.etree 1001 del xmlwriter 1002 1003 # -------------------------------------------------------------------- 1004 # reported bugs 1005
1006 -def bug_xmltoolkit21():
1007 """ 1008 marshaller gives obscure errors for non-string values 1009 1010 >>> elem = ElementTree.Element(123) 1011 >>> serialize(elem) # tag 1012 Traceback (most recent call last): 1013 TypeError: cannot serialize 123 (type int) 1014 >>> elem = ElementTree.Element("elem") 1015 >>> elem.text = 123 1016 >>> serialize(elem) # text 1017 Traceback (most recent call last): 1018 TypeError: cannot serialize 123 (type int) 1019 >>> elem = ElementTree.Element("elem") 1020 >>> elem.tail = 123 1021 >>> serialize(elem) # tail 1022 Traceback (most recent call last): 1023 TypeError: cannot serialize 123 (type int) 1024 >>> elem = ElementTree.Element("elem") 1025 >>> elem.set(123, "123") 1026 >>> serialize(elem) # attribute key 1027 Traceback (most recent call last): 1028 TypeError: cannot serialize 123 (type int) 1029 >>> elem = ElementTree.Element("elem") 1030 >>> elem.set("123", 123) 1031 >>> serialize(elem) # attribute value 1032 Traceback (most recent call last): 1033 TypeError: cannot serialize 123 (type int) 1034 1035 """
1036 1037 # doesn't work with lxml.etree 1038 del bug_xmltoolkit21 1039
1040 -def bug_xmltoolkit25():
1041 """ 1042 typo in ElementTree.findtext 1043 1044 >>> tree = ElementTree.ElementTree(SAMPLE_XML) 1045 >>> tree.findtext("tag") 1046 'text' 1047 >>> tree.findtext("section/tag") 1048 'subtext' 1049 """
1050
1051 -def bug_xmltoolkit28():
1052 """ 1053 .//tag causes exceptions 1054 1055 >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>") 1056 >>> summarize_list(tree.findall(".//thead")) 1057 [] 1058 >>> summarize_list(tree.findall(".//tbody")) 1059 ['tbody'] 1060 """
1061
1062 -def bug_xmltoolkitX1():
1063 """ 1064 dump() doesn't flush the output buffer 1065 1066 >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>") 1067 >>> ElementTree.dump(tree); sys.stdout.write("tail") 1068 <doc><table><tbody /></table></doc> 1069 tail 1070 """
1071 1072 # doesn't work with lxml.etree 1073 del bug_xmltoolkitX1 1074
def bug_xmltoolkit39():
    """
    Regression doctest: non-ascii element and attribute names don't work.

    The examples cover names parsed from documents declared as iso-8859-1
    as well as names set through the Element API; every tree is then
    serialized with tostring(..., "utf-8").

    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg />")
    >>> ElementTree.tostring(tree, "utf-8")
    '<t\\xc3\\xa4g />'

    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><tag ättr='v&#228;lue' />")
    >>> tree.attrib
    {u'\\xe4ttr': u'v\\xe4lue'}
    >>> ElementTree.tostring(tree, "utf-8")
    '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'

    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg>text</täg>")
    >>> ElementTree.tostring(tree, "utf-8")
    '<t\\xc3\\xa4g>text</t\\xc3\\xa4g>'

    >>> tree = ElementTree.Element(u"täg")
    >>> ElementTree.tostring(tree, "utf-8")
    '<t\\xc3\\xa4g />'

    >>> tree = ElementTree.Element("tag")
    >>> tree.set(u"ättr", u"välue")
    >>> ElementTree.tostring(tree, "utf-8")
    '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'

    """

# doesn't work with lxml.etree: delete the function so that its examples
# are no longer part of the module when the doctests are collected
del bug_xmltoolkit39

def bug_xmltoolkit45():
    """
    Regression doctest: problems parsing mixed unicode/non-ascii html
    documents.

    Every case feeds one small HTML fragment to a fresh
    HTMLTreeBuilder.TreeBuilder and expects the a-umlaut to be serialized
    as the character reference &#228;.  NOTE: the HTMLTreeBuilder import is
    commented out at the top of this file, so these examples cannot run as
    the module stands.

    latin-1 text
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>välue</p>")
    >>> serialize(p.close())
    '<p>v&#228;lue</p>'

    utf-8 text
    >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
    >>> p.feed("<p>v\xc3\xa4lue</p>")
    >>> serialize(p.close())
    '<p>v&#228;lue</p>'

    utf-8 text using meta tag
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><p>v\xc3\xa4lue</p></html>")
    >>> serialize(p.close().find("p"))
    '<p>v&#228;lue</p>'

    latin-1 character references
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>v&#228;lue</p>")
    >>> serialize(p.close())
    '<p>v&#228;lue</p>'

    latin-1 character entities
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>v&auml;lue</p>")
    >>> serialize(p.close())
    '<p>v&#228;lue</p>'

    mixed latin-1 text and unicode entities
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>&#8221;välue&#8221;</p>")
    >>> serialize(p.close())
    '<p>&#8221;v&#228;lue&#8221;</p>'

    mixed unicode and latin-1 entities
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>&#8221;v&auml;lue&#8221;</p>")
    >>> serialize(p.close())
    '<p>&#8221;v&#228;lue&#8221;</p>'

    """

# doesn't work with lxml.etree: delete the function so that its examples
# are no longer part of the module when the doctests are collected
del bug_xmltoolkit45

def bug_xmltoolkit46():
    """
    Regression doctest: problems parsing open BR tags.

    A <br> without a closing tag inside a paragraph is expected to come
    back as an empty <br /> element followed by the remaining text.
    NOTE: the HTMLTreeBuilder import is commented out at the top of this
    file, so this example cannot run as the module stands.

    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>key<br>value</p>")
    >>> serialize(p.close())
    '<p>key<br />value</p>'

    """

# doesn't work with lxml.etree: delete the function so that its examples
# are no longer part of the module when the doctests are collected
del bug_xmltoolkit46

def bug_xmltoolkit54():
    """
    Regression doctest: problems handling internally defined entities.

    The document declares the entity ldots in its internal DTD subset as
    the character reference &#x8230; (hexadecimal 8230 is decimal 33328),
    so the expanded entity is expected to serialize as &#33328;.

    >>> e = ElementTree.XML("<!DOCTYPE doc [<!ENTITY ldots '&#x8230;'>]><doc>&ldots;</doc>")
    >>> serialize(e)
    '<doc>&#33328;</doc>'
    """

# doesn't work with lxml.etree: delete the function so that its examples
# are no longer part of the module when the doctests are collected
del bug_xmltoolkit54

def bug_xmltoolkit55():
    """
    Regression doctest: make sure we're reporting the first error, not
    the last.

    The document uses three undefined entities in a row; the expected
    ParseError names the first of them (&ldots;).

    >>> e = ElementTree.XML("<!DOCTYPE doc SYSTEM 'doc.dtd'><doc>&ldots;&ndots;&rdots;</doc>")
    Traceback (most recent call last):
    ParseError: undefined entity &ldots;: line 1, column 36
    """

# doesn't work with lxml.etree: delete the function so that its examples
# are no longer part of the module when the doctests are collected
del bug_xmltoolkit55

def bug_200708_version():
    """
    Check the parser's version attribute and a feed()/close() round trip.

    The example expects XMLParser to expose a version string
    ('Expat 2.0.0') and then parses samples/simple.xml (a path relative to
    the current working directory) through feed() and close().

    >>> parser = ET.XMLParser()
    >>> parser.version
    'Expat 2.0.0'
    >>> parser.feed(open("samples/simple.xml").read())
    >>> print(serialize(parser.close()))
    <root>
    <element key="value">text</element>
    <element>text</element>tail
    <empty-element />
    </root>
    """

# doesn't work with lxml.etree: delete the function so that its examples
# are no longer part of the module when the doctests are collected
del bug_200708_version

def bug_200708_newline():
    r"""

    Preserve newlines in attributes.

    A newline inside an attribute value is expected to be written as the
    character reference &#10; and to come back unchanged when the
    serialized element is parsed and serialized again.

    >>> e = ET.Element('SomeTag', text="def _f():\n return 3\n")
    >>> ET.tostring(e)
    '<SomeTag text="def _f():&#10; return 3&#10;" />'
    >>> ET.XML(ET.tostring(e)).get("text")
    'def _f():\n return 3\n'
    >>> ET.tostring(ET.XML(ET.tostring(e)))
    '<SomeTag text="def _f():&#10; return 3&#10;" />'
    """

# doesn't work with lxml.etree: delete the function so that its examples
# are no longer part of the module when the doctests are collected
del bug_200708_newline

def bug_200709_default_namespace():
    """
    Serialization with the default_namespace option.

    Case 1: every element lives in the default namespace, so only an
    unprefixed xmlns declaration is expected.
    Case 2: an element from a second namespace is expected to get the
    prefix ns1.
    Case 3: an element without any namespace is expected to raise
    ValueError.

    >>> e = ET.Element("{default}elem")
    >>> s = ET.SubElement(e, "{default}elem")
    >>> serialize(e, default_namespace="default") # 1
    '<elem xmlns="default"><elem /></elem>'

    >>> e = ET.Element("{default}elem")
    >>> s = ET.SubElement(e, "{default}elem")
    >>> s = ET.SubElement(e, "{not-default}elem")
    >>> serialize(e, default_namespace="default") # 2
    '<elem xmlns="default" xmlns:ns1="not-default"><elem /><ns1:elem /></elem>'

    >>> e = ET.Element("{default}elem")
    >>> s = ET.SubElement(e, "{default}elem")
    >>> s = ET.SubElement(e, "elem") # unprefixed name
    >>> serialize(e, default_namespace="default") # 3
    Traceback (most recent call last):
    ValueError: cannot use non-qualified names with default_namespace option

    """

# doesn't work with lxml.etree: delete the function so that its examples
# are no longer part of the module when the doctests are collected
del bug_200709_default_namespace

# --------------------------------------------------------------------

if __name__ == "__main__":
    # Run every doctest that is still defined in this module and report
    # the totals; the process exits with status 1 if any example failed.
    import doctest

    # testmod() without an argument tests the running script (__main__).
    # Importing this file again under the hard-coded name "selftest" would
    # execute the module body a second time and raises ImportError when
    # the file's directory is not on sys.path.
    failed, tested = doctest.testmod()
    print("%d tests ok." % (tested - failed))
    if failed > 0:
        print("%d tests failed. Exiting with non-zero return code." % failed)
        sys.exit(1)