Package lxml :: Package tests :: Module selftest
[hide private]
[frames] | [no frames]

Source Code for Module lxml.tests.selftest

   1  # $Id: selftest.py 3276 2007-09-12 06:52:30Z fredrik $ 
   2  # -*- coding: iso-8859-1 -*- 
   3  # elementtree selftest program 
   4   
   5  # this test script uses Python's "doctest" module to check that the 
   6  # *test script* works as expected. 
   7   
   8  # TODO: add more elementtree method tests 
   9  # TODO: add xml/html parsing tests 
  10  # TODO: etc 
  11   
  12  import re, sys 
  13   
def stdout():
    """Return the object that doctests pass to ``tree.write()``.

    On Python 2 this is ``sys.stdout`` itself.  On Python 3 it is a small
    adapter whose ``write()`` decodes ``bytes`` as ISO8859-1 before
    forwarding the text to ``sys.stdout``.
    """
    if sys.version_info[0] < 3:
        return sys.stdout

    class bytes_stdout(object):
        def write(self, data):
            if isinstance(data, bytes):
                data = data.decode('ISO8859-1')
            # sys.stdout is looked up on every write, not captured once.
            sys.stdout.write(data)

    return bytes_stdout()


try:
    from StringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from lxml import etree as ElementTree
from lxml import _elementpath as ElementPath
from lxml import ElementInclude
ET = ElementTree

#from elementtree import ElementTree
#from elementtree import ElementPath
#from elementtree import ElementInclude
#from elementtree import HTMLTreeBuilder
#from elementtree import SimpleXMLWriter

def fix_compatibility(xml_data):
    """Normalise serialized XML text so it matches the expected doctest output.

    Three adjustments are applied, in order:

    * XInclude namespace declarations (``xmlns:<prefix>=".../XInclude"``,
      with any whitespace in front of them) are removed;
    * ``' />'`` is rewritten as ``'/>'``;
    * a single trailing newline, if present, is dropped.

    Returns the adjusted string.
    """
    xml_data = re.sub(
        r'\s*xmlns:[a-z0-9]+="http://www.w3.org/2001/XInclude"', '', xml_data)
    xml_data = xml_data.replace(' />', '/>')
    if xml_data.endswith('\n'):
        xml_data = xml_data[:-1]
    return xml_data
46
def serialize(elem, **options):
    """Serialize *elem* through ``ElementTree.write()`` and return the result.

    *options* are passed straight to ``tree.write()``.  The written bytes
    are decoded, run through ``fix_compatibility()`` and, on Python 2 only,
    encoded back to a byte string.
    """
    buf = BytesIO()
    ElementTree.ElementTree(elem).write(buf, **options)
    if sys.version_info[0] < 3:
        encoding = options.get("encoding", "utf-8")
    else:
        # On Python 3 the bytes are always decoded as ISO8859-1, whatever
        # encoding was requested, so every byte maps to exactly one
        # character of the returned str.
        encoding = 'ISO8859-1'
    result = fix_compatibility(buf.getvalue().decode(encoding))
    if sys.version_info[0] < 3:
        result = result.encode(encoding)
    return result
62
def summarize(elem):
    """Return the ``tag`` of *elem*, used as its short summary."""
    return elem.tag
65
def summarize_list(seq):
    """Return a list holding ``summarize(elem)`` for every item of *seq*."""
    return [summarize(elem) for elem in seq]
68
def normalize_crlf(tree):
    """Replace ``"\\r\\n"`` with ``"\\n"`` in the text and tail of every element.

    The elements reachable from *tree* are modified in place; nothing is
    returned.
    """
    # iter() replaces the deprecated getiterator() spelling.
    for elem in tree.iter():
        if elem.text:
            elem.text = elem.text.replace("\r\n", "\n")
        if elem.tail:
            elem.tail = elem.tail.replace("\r\n", "\n")
# SAMPLE_XML: parsed fixture document that the find/findtext/findall doctests query.
# NOTE(review): whitespace inside the XML literal appears collapsed in this listing - confirm against the upstream file before reformatting.
73 74 SAMPLE_XML = ElementTree.XML(""" 75 <body> 76 <tag class='a'>text</tag> 77 <tag class='b' /> 78 <section> 79 <tag class='b' id='inner'>subtext</tag> 80 </section> 81 </body> 82 """) 83 84 # 85 # interface tests 86
def check_string(string):
    """Exercise the string-like interface of *string*.

    Calls ``len()``, iterates the value, concatenates it with ``str``
    literals and takes an empty slice.  A message is printed for any item
    produced by iteration whose length is not 1; any operation the object
    does not support raises.
    """
    len(string)
    for char in string:
        if len(char) != 1:
            print("expected one-character string, got %r" % char)
    # The results are discarded on purpose: only the operations are probed.
    string + ""
    string + " "
    string[:0]
95
def check_string_or_none(value):
    """Run ``check_string()`` on *value* unless it is ``None``."""
    if value is not None:
        return check_string(value)
    return None
100
def check_mapping(mapping):
    """Exercise the mapping interface of *mapping*.

    Calls ``len()``, ``keys()`` and ``items()``, looks up every key, then
    stores ``"value"`` under ``"key"`` and prints a message if reading it
    back gives something else.  Note that *mapping* is modified: the
    ``"key"`` entry stays in place afterwards.
    """
    len(mapping)
    keys = mapping.keys()
    mapping.items()
    for key in keys:
        mapping[key]  # lookup only; the value is not needed
    mapping["key"] = "value"
    if mapping["key"] != "value":
        print("expected value string, got %r" % mapping["key"])
110
def check_element(element):
    """Recursively check that *element* offers the element interface.

    Prints a message for each of the ``tag``, ``attrib``, ``text`` and
    ``tail`` attributes that is missing, then validates them with
    ``check_string``, ``check_mapping`` and ``check_string_or_none`` and
    repeats the whole check for every child found by iterating *element*.
    """
    for name in ("tag", "attrib", "text", "tail"):
        if not hasattr(element, name):
            print("no %s member" % name)
    check_string(element.tag)
    check_mapping(element.attrib)
    check_string_or_none(element.text)
    check_string_or_none(element.tail)
    for elem in element:
        check_element(elem)
126
def check_element_tree(tree):
    """Run ``check_element()`` on the root element of *tree*."""
    check_element(tree.getroot())
129 130 # -------------------------------------------------------------------- 131 # element tree tests 132
def sanity():
    """
    Check that the modules of the ``elementtree`` package can be imported.

    >>> from elementtree.ElementTree import *
    >>> from elementtree.ElementInclude import *
    >>> from elementtree.ElementPath import *
    >>> from elementtree.HTMLTreeBuilder import *
    >>> from elementtree.SimpleXMLWriter import *
    >>> from elementtree.TidyTools import *
    """
142 143 # doesn't work with lxml.etree 144 del sanity 145
def version():
    """
    Check the version string reported by ``ElementTree.VERSION``.

    >>> ElementTree.VERSION
    '1.3a2'
    """
151 152 # doesn't work with lxml.etree 153 del version 154
def interface():
    """
    Test element tree interface.

    >>> element = ElementTree.Element("tag")
    >>> check_element(element)
    >>> tree = ElementTree.ElementTree(element)
    >>> check_element_tree(tree)
    """
164
def simpleops():
    """
    Test basic child operations: append, remove, insert and extend.

    >>> elem = ElementTree.XML("<body><tag/></body>")
    >>> serialize(elem)
    '<body><tag/></body>'
    >>> e = ElementTree.Element("tag2")
    >>> elem.append(e)
    >>> serialize(elem)
    '<body><tag/><tag2/></body>'
    >>> elem.remove(e)
    >>> serialize(elem)
    '<body><tag/></body>'
    >>> elem.insert(0, e)
    >>> serialize(elem)
    '<body><tag2/><tag/></body>'
    >>> elem.remove(e)
    >>> elem.extend([e])
    >>> serialize(elem)
    '<body><tag/><tag2/></body>'
    >>> elem.remove(e)
    """
186
def simplefind():
    """
    Test find methods using the elementpath fallback.

    >>> CurrentElementPath = ElementTree.ElementPath
    >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
    >>> elem = SAMPLE_XML
    >>> elem.find("tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("tag").tag
    'tag'
    >>> elem.findtext("tag")
    'text'
    >>> elem.findtext("tog")
    >>> elem.findtext("tog", "default")
    'default'
    >>> ElementTree.ElementTree(elem).findtext("tag")
    'text'
    >>> summarize_list(elem.findall("tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']

    Path syntax doesn't work in this case.

    >>> elem.find("section/tag")
    >>> elem.findtext("section/tag")
    >>> elem.findall("section/tag")
    []

    >>> ElementTree.ElementPath = CurrentElementPath
    """
219 220 # doesn't work with lxml.etree 221 del simplefind 222
def find():
    """
    Test find methods (including xpath syntax).

    >>> elem = SAMPLE_XML
    >>> elem.find("tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("tag").tag
    'tag'
    >>> elem.find("section/tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("section/tag").tag
    'tag'
    >>> elem.findtext("tag")
    'text'
    >>> elem.findtext("tog")
    >>> elem.findtext("tog", "default")
    'default'
    >>> ElementTree.ElementTree(elem).findtext("tag")
    'text'
    >>> elem.findtext("section/tag")
    'subtext'
    >>> ElementTree.ElementTree(elem).findtext("section/tag")
    'subtext'
    >>> summarize_list(elem.findall("tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall("*"))
    ['tag', 'tag', 'section']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("section/tag"))
    ['tag']
    >>> summarize_list(elem.findall("section//tag"))
    ['tag']
    >>> summarize_list(elem.findall("section/*"))
    ['tag']
    >>> summarize_list(elem.findall("section//*"))
    ['tag']
    >>> summarize_list(elem.findall("section/.//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/*"))
    ['tag']
    >>> summarize_list(elem.findall("*//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/tag"))
    ['tag']
    >>> summarize_list(elem.findall("*/./tag"))
    ['tag']
    >>> summarize_list(elem.findall("./tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("././tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall(".//tag[@class]"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall(".//tag[@class='a']"))
    ['tag']
    >>> summarize_list(elem.findall(".//tag[@class='b']"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall(".//tag[@id]"))
    ['tag']
    >>> summarize_list(elem.findall(".//section[tag]"))
    ['section']
    >>> summarize_list(elem.findall(".//section[element]"))
    []
    >>> summarize_list(elem.findall("../tag"))
    []
    >>> summarize_list(elem.findall("section/../tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
    ['tag', 'tag']

    FIXME: ET's Path module handles this case incorrectly; this gives
    a warning in 1.3, and the behaviour will be modified in 1.4.

    >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
    ['tag', 'tag']
    """
302
def bad_find():
    """
    Check bad or unsupported path expressions.

    >>> elem = SAMPLE_XML
    >>> elem.findall("/tag")
    Traceback (most recent call last):
    SyntaxError: cannot use absolute path on element

    # this is supported in ET 1.3:
    #>>> elem.findall("section//")
    #Traceback (most recent call last):
    #SyntaxError: invalid path
    """
317
# Doctest: parses samples/simple.xml and samples/simple-ns.xml, normalizes CRLF and writes each tree to stdout().
# NOTE(review): leading whitespace of the expected XML output appears collapsed in this listing - restore it from the upstream file before running.
318 -def parsefile():
319 """ 320 Test parsing from file. 321 322 >>> tree = ElementTree.parse("samples/simple.xml") 323 >>> normalize_crlf(tree) 324 >>> tree.write(stdout()) 325 <root> 326 <element key="value">text</element> 327 <element>text</element>tail 328 <empty-element/> 329 </root> 330 >>> tree = ElementTree.parse("samples/simple-ns.xml") 331 >>> normalize_crlf(tree) 332 >>> tree.write(stdout()) 333 <root xmlns="http://namespace/"> 334 <element key="value">text</element> 335 <element>text</element>tail 336 <empty-element/> 337 </root> 338 339 ## <ns0:root xmlns:ns0="http://namespace/"> 340 ## <ns0:element key="value">text</ns0:element> 341 ## <ns0:element>text</ns0:element>tail 342 ## <ns0:empty-element/> 343 ## </ns0:root> 344 """
345
def parsehtml():
    """
    Test HTML parsing.

    >>> # p = HTMLTreeBuilder.TreeBuilder()
    >>> p = ElementTree.HTMLParser()
    >>> p.feed("<p><p>spam<b>egg</b></p>")
    >>> serialize(p.close())
    '<p>spam<b>egg</b></p>'
    """
356 357 # doesn't work with lxml.etree 358 del parsehtml 359
def parseliteral():
    r"""
    Test parsing from string literals (XML, fromstring, XMLID).

    >>> element = ElementTree.XML("<html><body>text</body></html>")
    >>> ElementTree.ElementTree(element).write(stdout())
    <html><body>text</body></html>
    >>> element = ElementTree.fromstring("<html><body>text</body></html>")
    >>> ElementTree.ElementTree(element).write(stdout())
    <html><body>text</body></html>

    ## >>> sequence = ["<html><body>", "text</bo", "dy></html>"]
    ## >>> element = ElementTree.fromstringlist(sequence)
    ## >>> ElementTree.ElementTree(element).write(stdout())
    ## <html><body>text</body></html>

    >>> print(repr(ElementTree.tostring(element)).lstrip('b'))
    '<html><body>text</body></html>'

    # looks different in lxml
    # >>> print(ElementTree.tostring(element, "ascii"))
    # <?xml version='1.0' encoding='ascii'?>
    # <html><body>text</body></html>

    >>> _, ids = ElementTree.XMLID("<html><body>text</body></html>")
    >>> len(ids)
    0
    >>> _, ids = ElementTree.XMLID("<html><body id='body'>text</body></html>")
    >>> len(ids)
    1
    >>> ids["body"].tag
    'body'
    """
391
# Doctest: parses samples/simple.xml with SimpleXMLTreeBuilder.TreeBuilder and writes the tree to sys.stdout.
# NOTE(review): leading whitespace of the expected XML output appears collapsed in this listing - restore it from the upstream file before running.
392 -def simpleparsefile():
393 """ 394 Test the xmllib-based parser. 395 396 >>> from elementtree import SimpleXMLTreeBuilder 397 >>> parser = SimpleXMLTreeBuilder.TreeBuilder() 398 >>> tree = ElementTree.parse("samples/simple.xml", parser) 399 >>> normalize_crlf(tree) 400 >>> tree.write(sys.stdout) 401 <root> 402 <element key="value">text</element> 403 <element>text</element>tail 404 <empty-element /> 405 </root> 406 """
407 408 # doesn't work with lxml.etree 409 del simpleparsefile 410
def iterparse():
    """
    Test iterparse interface.

    >>> iterparse = ElementTree.iterparse

    >>> context = iterparse("samples/simple.xml")
    >>> for action, elem in context:
    ...     print("%s %s" % (action, elem.tag))
    end element
    end element
    end empty-element
    end root
    >>> context.root.tag
    'root'

    >>> context = iterparse("samples/simple-ns.xml")
    >>> for action, elem in context:
    ...     print("%s %s" % (action, elem.tag))
    end {http://namespace/}element
    end {http://namespace/}element
    end {http://namespace/}empty-element
    end {http://namespace/}root

    >>> events = ()
    >>> context = iterparse("samples/simple.xml", events)
    >>> for action, elem in context:
    ...     print("%s %s" % (action, elem.tag))

    >>> events = ()
    >>> context = iterparse("samples/simple.xml", events=events)
    >>> for action, elem in context:
    ...     print("%s %s" % (action, elem.tag))

    >>> events = ("start", "end")
    >>> context = iterparse("samples/simple.xml", events)
    >>> for action, elem in context:
    ...     print("%s %s" % (action, elem.tag))
    start root
    start element
    end element
    start element
    end element
    start empty-element
    end empty-element
    end root

    >>> events = ("start", "end", "start-ns", "end-ns")
    >>> context = iterparse("samples/simple-ns.xml", events)
    >>> for action, elem in context:
    ...     if action in ("start", "end"):
    ...         print("%s %s" % (action, elem.tag))
    ...     else:
    ...         print("%s %s" % (action, elem))
    start-ns ('', 'http://namespace/')
    start {http://namespace/}root
    start {http://namespace/}element
    end {http://namespace/}element
    start {http://namespace/}element
    end {http://namespace/}element
    start {http://namespace/}empty-element
    end {http://namespace/}empty-element
    end {http://namespace/}root
    end-ns None

    """
477
# Doctest: parses samples/simple.xml with XMLTreeBuilder.FancyTreeBuilder, then with a subclass whose start/end callbacks print each tag.
# NOTE(review): leading whitespace of the expected XML output and of the "..." continuation lines appears collapsed in this listing - restore it from the upstream file before running.
478 -def fancyparsefile():
479 """ 480 Test the "fancy" parser. 481 482 Sanity check. 483 >>> from elementtree import XMLTreeBuilder 484 >>> parser = XMLTreeBuilder.FancyTreeBuilder() 485 >>> tree = ElementTree.parse("samples/simple.xml", parser) 486 >>> normalize_crlf(tree) 487 >>> tree.write(sys.stdout) 488 <root> 489 <element key="value">text</element> 490 <element>text</element>tail 491 <empty-element /> 492 </root> 493 494 Callback check. 495 >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder): 496 ... def start(self, elem): 497 ... print("START %s" % elem.tag) 498 ... def end(self, elem): 499 ... print("END %s" % elem.tag) 500 >>> parser = MyFancyParser() 501 >>> tree = ElementTree.parse("samples/simple.xml", parser) 502 START root 503 START element 504 END element 505 START element 506 END element 507 START empty-element 508 END empty-element 509 END root 510 """
511 512 # doesn't work with lxml.etree 513 del fancyparsefile 514
def writefile():
    """
    Test serializing an element with text and a subelement.

    >>> elem = ElementTree.Element("tag")
    >>> elem.text = "text"
    >>> serialize(elem)
    '<tag>text</tag>'
    >>> ElementTree.SubElement(elem, "subtag").text = "subtext"
    >>> serialize(elem)
    '<tag>text<subtag>subtext</subtag></tag>'

    ## Test tag suppression
    ## >>> elem.tag = None
    ## >>> serialize(elem)
    ## 'text<subtag>subtext</subtag>'
    """
530
def writestring():
    """
    Test ``tostring()`` on elements built by ``XML()`` and ``fromstring()``.

    >>> elem = ElementTree.XML("<html><body>text</body></html>")
    >>> print(repr(ElementTree.tostring(elem)).lstrip('b'))
    '<html><body>text</body></html>'
    >>> elem = ElementTree.fromstring("<html><body>text</body></html>")
    >>> print(repr(ElementTree.tostring(elem)).lstrip('b'))
    '<html><body>text</body></html>'
    """
540
def encoding():
    r"""
    Test encoding issues.

    >>> elem = ElementTree.Element("tag")
    >>> elem.text = u'abc'
    >>> serialize(elem)
    '<tag>abc</tag>'
    >>> serialize(elem, encoding="utf-8")
    '<tag>abc</tag>'
    >>> serialize(elem, encoding="us-ascii")
    '<tag>abc</tag>'
    >>> serialize(elem, encoding="iso-8859-1").lower()
    "<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>abc</tag>"

    >>> elem.text = "<&\"\'>"
    >>> serialize(elem)
    '<tag>&lt;&amp;"\'&gt;</tag>'
    >>> serialize(elem, encoding="utf-8")
    '<tag>&lt;&amp;"\'&gt;</tag>'
    >>> serialize(elem, encoding="us-ascii") # cdata characters
    '<tag>&lt;&amp;"\'&gt;</tag>'
    >>> serialize(elem, encoding="iso-8859-1").lower()
    '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag>&lt;&amp;"\'&gt;</tag>'

    >>> elem.attrib["key"] = "<&\"\'>"
    >>> elem.text = None
    >>> serialize(elem)
    '<tag key="&lt;&amp;&quot;\'&gt;"/>'
    >>> serialize(elem, encoding="utf-8")
    '<tag key="&lt;&amp;&quot;\'&gt;"/>'
    >>> serialize(elem, encoding="us-ascii")
    '<tag key="&lt;&amp;&quot;\'&gt;"/>'
    >>> serialize(elem, encoding="iso-8859-1").lower()
    '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="&lt;&amp;&quot;\'&gt;"/>'

    >>> elem.text = u'\xe5\xf6\xf6<>'
    >>> elem.attrib.clear()
    >>> serialize(elem)
    '<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
    >>> serialize(elem, encoding="utf-8")
    '<tag>\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;</tag>'
    >>> serialize(elem, encoding="us-ascii")
    '<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
    >>> serialize(elem, encoding="iso-8859-1").lower()
    "<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>\xe5\xf6\xf6&lt;&gt;</tag>"

    >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>'
    >>> elem.text = None
    >>> serialize(elem)
    '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
    >>> serialize(elem, encoding="utf-8")
    '<tag key="\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;"/>'
    >>> serialize(elem, encoding="us-ascii")
    '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
    >>> serialize(elem, encoding="iso-8859-1").lower()
    '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="\xe5\xf6\xf6&lt;&gt;"/>'
    """


# On Python 3 the unicode-literal prefix is stripped from the doctest text.
# The __doc__ check keeps the module importable when docstrings are
# stripped (python -OO), where __doc__ is None.
if sys.version_info[0] >= 3 and encoding.__doc__:
    encoding.__doc__ = encoding.__doc__.replace("u'", "'")

def methods():
    r"""
    Test serialization methods.

    >>> e = ET.XML("<html><link/><script>1 &lt; 2</script></html>")
    >>> e.tail = "\n"
    >>> serialize(e)
    '<html><link /><script>1 &lt; 2</script></html>\n'
    >>> serialize(e, method=None)
    '<html><link /><script>1 &lt; 2</script></html>\n'
    >>> serialize(e, method="xml")
    '<html><link /><script>1 &lt; 2</script></html>\n'
    >>> serialize(e, method="html")
    '<html><link><script>1 < 2</script></html>\n'
    >>> serialize(e, method="text")
    '1 < 2\n'

    """
621 622 # doesn't work with lxml.etree 623 del methods 624
def iterators():
    """
    Test iterators.

    >>> e = ET.XML("<html><body>this is a <i>paragraph</i>.</body>..</html>")
    >>> summarize_list(e.iter())
    ['html', 'body', 'i']
    >>> summarize_list(e.find("body").iter())
    ['body', 'i']
    >>> "".join(e.itertext())
    'this is a paragraph...'
    >>> "".join(e.find("body").itertext())
    'this is a paragraph.'
    """
# ENTITY_XML: document text with an external parameter entity, parsed by the entity() doctest.
# NOTE(review): line breaks and indentation inside the literal appear collapsed in this listing - confirm against the upstream file before reformatting.
639 640 ENTITY_XML = """\ 641 <!DOCTYPE points [ 642 <!ENTITY % user-entities SYSTEM 'user-entities.xml'> 643 %user-entities; 644 ]> 645 <document>&entity;</document> 646 """ 647
def entity():
    """
    Test entity handling.

    1) bad entities

    >>> ElementTree.XML("<document>&entity;</document>")
    Traceback (most recent call last):
    ExpatError: undefined entity: line 1, column 10

    >>> ElementTree.XML(ENTITY_XML)
    Traceback (most recent call last):
    ExpatError: undefined entity &entity;: line 5, column 10

    (add more tests here)

    """
665 666 # doesn't work with lxml.etree 667 del entity 668
def error(xml):
    """
    Test error handling.

    Parses *xml* with ``ET.XML`` and returns the ``ET.ParseError`` that was
    raised, or ``None`` when parsing succeeds.

    >>> error("foo").position
    (1, 0)
    >>> error("<tag>&foo;</tag>").position
    (1, 5)
    >>> error("foobar<").position
    (1, 6)

    """
    try:
        ET.XML(xml)
    except ET.ParseError as exc:
        # Bind the exception directly; sys.exc_value is not available on
        # Python 3.
        return exc
    return None
685 686 # doesn't work with lxml.etree -> different positions 687 del error 688
def namespace():
    """
    Test namespace issues.

    1) xml namespace

    >>> elem = ElementTree.XML("<tag xml:lang='en' />")
    >>> serialize(elem) # 1.1
    '<tag xml:lang="en"/>'

    2) other "well-known" namespaces

    >>> elem = ElementTree.XML("<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' />")
    >>> serialize(elem) # 2.1
    '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>'

    >>> elem = ElementTree.XML("<html:html xmlns:html='http://www.w3.org/1999/xhtml' />")
    >>> serialize(elem) # 2.2
    '<html:html xmlns:html="http://www.w3.org/1999/xhtml"/>'

    >>> elem = ElementTree.XML("<soap:Envelope xmlns:soap='http://schemas.xmlsoap.org/soap/envelope' />")
    >>> serialize(elem) # 2.3
    '<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope"/>'

    3) unknown namespaces

    """
716
def qname():
    """
    Test QName handling.

    1) decorated tags

    >>> elem = ElementTree.Element("{uri}tag")
    >>> serialize(elem) # 1.1
    '<ns0:tag xmlns:ns0="uri"/>'
    >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag"))
    >>> serialize(elem) # 1.2
    '<ns0:tag xmlns:ns0="uri"/>'
    >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag"))
    >>> serialize(elem) # 1.3
    '<ns0:tag xmlns:ns0="uri"/>'

    # ns/attribute order ...

    ## 2) decorated attributes

    ## >>> elem.clear()
    ## >>> elem.attrib["{uri}key"] = "value"
    ## >>> serialize(elem) # 2.1
    ## '<ns0:tag ns0:key="value" xmlns:ns0="uri"/>'

    ## >>> elem.clear()
    ## >>> elem.attrib[ElementTree.QName("{uri}key")] = "value"
    ## >>> serialize(elem) # 2.2
    ## '<ns0:tag ns0:key="value" xmlns:ns0="uri"/>'

    ## 3) decorated values are not converted by default, but the
    ## QName wrapper can be used for values

    ## >>> elem.clear()
    ## >>> elem.attrib["{uri}key"] = "{uri}value"
    ## >>> serialize(elem) # 3.1
    ## '<ns0:tag ns0:key="{uri}value" xmlns:ns0="uri"/>'

    ## >>> elem.clear()
    ## >>> elem.attrib["{uri}key"] = ElementTree.QName("{uri}value")
    ## >>> serialize(elem) # 3.2
    ## '<ns0:tag ns0:key="ns0:value" xmlns:ns0="uri"/>'

    ## >>> elem.clear()
    ## >>> subelem = ElementTree.Element("tag")
    ## >>> subelem.attrib["{uri1}key"] = ElementTree.QName("{uri2}value")
    ## >>> elem.append(subelem)
    ## >>> elem.append(subelem)
    ## >>> serialize(elem) # 3.3
    ## '<ns0:tag xmlns:ns0="uri"><tag ns1:key="ns2:value" xmlns:ns1="uri1" xmlns:ns2="uri2"/><tag ns1:key="ns2:value" xmlns:ns1="uri1" xmlns:ns2="uri2"/></ns0:tag>'

    """
769
def xpath_tokenizer(p):
    """
    Test the XPath tokenizer.

    Returns a flat list with one entry per ``(op, tag)`` pair yielded by
    ``ElementPath.xpath_tokenizer(p)``: the operator when it is non-empty,
    otherwise the tag.

    >>> # tests from the xml specification
    >>> xpath_tokenizer("*")
    ['*']
    >>> xpath_tokenizer("text()")
    ['text', '()']
    >>> xpath_tokenizer("@name")
    ['@', 'name']
    >>> xpath_tokenizer("@*")
    ['@', '*']
    >>> xpath_tokenizer("para[1]")
    ['para', '[', '1', ']']
    >>> xpath_tokenizer("para[last()]")
    ['para', '[', 'last', '()', ']']
    >>> xpath_tokenizer("*/para")
    ['*', '/', 'para']
    >>> xpath_tokenizer("/doc/chapter[5]/section[2]")
    ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']']
    >>> xpath_tokenizer("chapter//para")
    ['chapter', '//', 'para']
    >>> xpath_tokenizer("//para")
    ['//', 'para']
    >>> xpath_tokenizer("//olist/item")
    ['//', 'olist', '/', 'item']
    >>> xpath_tokenizer(".")
    ['.']
    >>> xpath_tokenizer(".//para")
    ['.', '//', 'para']
    >>> xpath_tokenizer("..")
    ['..']
    >>> xpath_tokenizer("../@lang")
    ['..', '/', '@', 'lang']
    >>> xpath_tokenizer("chapter[title]")
    ['chapter', '[', 'title', ']']
    >>> xpath_tokenizer("employee[@secretary and @assistant]")
    ['employee', '[', '@', 'secretary', '', 'and', '', '@', 'assistant', ']']

    >>> # additional tests
    >>> xpath_tokenizer("{http://spam}egg")
    ['{http://spam}egg']
    >>> xpath_tokenizer("./spam.egg")
    ['.', '/', 'spam.egg']
    >>> xpath_tokenizer(".//{http://spam}egg")
    ['.', '//', '{http://spam}egg']
    """
    return [op or tag for op, tag in ElementPath.xpath_tokenizer(p)]
# XINCLUDE: maps resource names (href values) to their text content; xinclude_loader() looks documents up here.
# NOTE(review): line breaks and indentation inside the string literals appear collapsed in this listing - confirm against the upstream file before reformatting.
822 823 # 824 # xinclude tests (samples from appendix C of the xinclude specification) 825 826 XINCLUDE = { 827 "C1.xml": """\ 828 <?xml version='1.0'?> 829 <document xmlns:xi="http://www.w3.org/2001/XInclude"> 830 <p>120 Mz is adequate for an average home user.</p> 831 <xi:include href="disclaimer.xml"/> 832 </document> 833 """, "disclaimer.xml": """\ 834 <?xml version='1.0'?> 835 <disclaimer> 836 <p>The opinions represented herein represent those of the individual 837 and should not be interpreted as official policy endorsed by this 838 organization.</p> 839 </disclaimer> 840 """, 841 "C2.xml": """\ 842 <?xml version='1.0'?> 843 <document xmlns:xi="http://www.w3.org/2001/XInclude"> 844 <p>This document has been accessed 845 <xi:include href="count.txt" parse="text"/> times.</p> 846 </document> 847 """, "count.txt": "324387", "C3.xml": """\ 848 <?xml version='1.0'?> 849 <document xmlns:xi="http://www.w3.org/2001/XInclude"> 850 <p>The following is the source of the "data.xml" resource:</p> 851 <example><xi:include href="data.xml" parse="text"/></example> 852 </document> 853 """, "data.xml": """\ 854 <?xml version='1.0'?> 855 <data> 856 <item><![CDATA[Brooks & Shields]]></item> 857 </data> 858 """, 859 "C5.xml": """\ 860 <?xml version='1.0'?> 861 <div xmlns:xi="http://www.w3.org/2001/XInclude"> 862 <xi:include href="example.txt" parse="text"> 863 <xi:fallback> 864 <xi:include href="fallback-example.txt" parse="text"> 865 <xi:fallback><a href="mailto:bob@example.org">Report error</a></xi:fallback> 866 </xi:include> 867 </xi:fallback> 868 </xi:include> 869 </div> 870 """, 871 "default.xml": """\ 872 <?xml version='1.0'?> 873 <document xmlns:xi="http://www.w3.org/2001/XInclude"> 874 <p>Example.</p> 875 <xi:include href="samples/simple.xml"/> 876 </document> 877 """} 878 879
def xinclude_loader(href, parse="xml", encoding=None):
    """Load an XInclude resource from the in-memory ``XINCLUDE`` table.

    *href* is the key to look up.  When *parse* is ``"xml"`` the stored
    text is parsed with ``ElementTree.XML`` and the resulting element is
    returned; for any other *parse* value the stored text is returned
    unchanged.  *encoding* is accepted but not used.

    Raises ``IOError("resource not found")`` when *href* is not in the
    table.
    """
    try:
        data = XINCLUDE[href]
    except KeyError:
        raise IOError("resource not found")
    if parse == "xml":
        return ElementTree.XML(data)
    return data
888
# Doctest: runs ElementInclude.include() with xinclude_loader on the C1, C2 and C3 sample documents and prints the serialized result.
# NOTE(review): leading whitespace of the expected XML output appears collapsed in this listing - restore it from the upstream file before running.
889 -def xinclude():
890 r""" 891 Basic inclusion example (XInclude C.1) 892 893 >>> document = xinclude_loader("C1.xml") 894 >>> ElementInclude.include(document, xinclude_loader) 895 >>> print(serialize(document)) # C1 896 <document> 897 <p>120 Mz is adequate for an average home user.</p> 898 <disclaimer> 899 <p>The opinions represented herein represent those of the individual 900 and should not be interpreted as official policy endorsed by this 901 organization.</p> 902 </disclaimer> 903 </document> 904 905 Textual inclusion example (XInclude C.2) 906 907 >>> document = xinclude_loader("C2.xml") 908 >>> ElementInclude.include(document, xinclude_loader) 909 >>> print(serialize(document)) # C2 910 <document> 911 <p>This document has been accessed 912 324387 times.</p> 913 </document> 914 915 Textual inclusion of XML example (XInclude C.3) 916 917 >>> document = xinclude_loader("C3.xml") 918 >>> ElementInclude.include(document, xinclude_loader) 919 >>> print(serialize(document)) # C3 920 <document> 921 <p>The following is the source of the "data.xml" resource:</p> 922 <example>&lt;?xml version='1.0'?&gt; 923 &lt;data&gt; 924 &lt;item&gt;&lt;![CDATA[Brooks &amp; Shields]]&gt;&lt;/item&gt; 925 &lt;/data&gt; 926 </example> 927 </document> 928 929 ## Fallback example (XInclude C.5) 930 ## Note! Fallback support is not yet implemented 931 932 ## >>> document = xinclude_loader("C5.xml") 933 ## >>> ElementInclude.include(document, xinclude_loader) 934 ## Traceback (most recent call last): 935 ## IOError: resource not found 936 ## >>> # print(serialize(document)) # C5 937 938 """
939
# Doctest: runs ElementInclude.include() on "default.xml" without passing a loader and prints the serialized result.
# NOTE(review): leading whitespace of the expected XML output appears collapsed in this listing - restore it from the upstream file before running.
940 -def xinclude_default():
941 """ 942 >>> document = xinclude_loader("default.xml") 943 >>> ElementInclude.include(document) 944 >>> print(serialize(document)) # default 945 <document> 946 <p>Example.</p> 947 <root> 948 <element key="value">text</element> 949 <element>text</element>tail 950 <empty-element/> 951 </root> 952 </document> 953 """
954 955 # 956 # xmlwriter 957
def xmlwriter():
    r"""
    Test writing a small HTML document with SimpleXMLWriter.XMLWriter.

    >>> file = BytesIO()
    >>> w = SimpleXMLWriter.XMLWriter(file)
    >>> html = w.start("html")
    >>> x = w.start("head")
    >>> w.element("title", "my document")
    >>> w.data("\n")
    >>> w.element("meta", name="hello", value="goodbye")
    >>> w.data("\n")
    >>> w.end()
    >>> x = w.start("body")
    >>> w.element("h1", "this is a heading")
    >>> w.data("\n")
    >>> w.element("p", u"this is a paragraph")
    >>> w.data("\n")
    >>> w.element("p", u"reserved characters: <&>")
    >>> w.data("\n")
    >>> w.element("p", u"detta är också ett stycke")
    >>> w.data("\n")
    >>> w.close(html)
    >>> print(file.getvalue())
    <html><head><title>my document</title>
    <meta name="hello" value="goodbye" />
    </head><body><h1>this is a heading</h1>
    <p>this is a paragraph</p>
    <p>reserved characters: &lt;&amp;&gt;</p>
    <p>detta &#228;r ocks&#229; ett stycke</p>
    </body></html>
    """
988 989 # doesn't work with lxml.etree 990 del xmlwriter 991 992 # -------------------------------------------------------------------- 993 # reported bugs 994
def bug_xmltoolkit21():
    """
    marshaller gives obscure errors for non-string values

    >>> elem = ElementTree.Element(123)
    >>> serialize(elem) # tag
    Traceback (most recent call last):
    TypeError: cannot serialize 123 (type int)
    >>> elem = ElementTree.Element("elem")
    >>> elem.text = 123
    >>> serialize(elem) # text
    Traceback (most recent call last):
    TypeError: cannot serialize 123 (type int)
    >>> elem = ElementTree.Element("elem")
    >>> elem.tail = 123
    >>> serialize(elem) # tail
    Traceback (most recent call last):
    TypeError: cannot serialize 123 (type int)
    >>> elem = ElementTree.Element("elem")
    >>> elem.set(123, "123")
    >>> serialize(elem) # attribute key
    Traceback (most recent call last):
    TypeError: cannot serialize 123 (type int)
    >>> elem = ElementTree.Element("elem")
    >>> elem.set("123", 123)
    >>> serialize(elem) # attribute value
    Traceback (most recent call last):
    TypeError: cannot serialize 123 (type int)

    """
1025 1026 # doesn't work with lxml.etree 1027 del bug_xmltoolkit21 1028
def bug_xmltoolkit25():
    """
    typo in ElementTree.findtext

    >>> tree = ElementTree.ElementTree(SAMPLE_XML)
    >>> tree.findtext("tag")
    'text'
    >>> tree.findtext("section/tag")
    'subtext'
    """
1039
def bug_xmltoolkit28():
    """
    .//tag causes exceptions

    >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
    >>> summarize_list(tree.findall(".//thead"))
    []
    >>> summarize_list(tree.findall(".//tbody"))
    ['tbody']
    """
1050
def bug_xmltoolkitX1():
    """
    dump() doesn't flush the output buffer

    >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
    >>> ElementTree.dump(tree); sys.stdout.write("tail")
    <doc><table><tbody /></table></doc>
    tail
    """
1060 1061 # doesn't work with lxml.etree 1062 del bug_xmltoolkitX1 1063
def bug_xmltoolkit39():
    """
    non-ascii element and attribute names doesn't work

    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg />")
    >>> ElementTree.tostring(tree, "utf-8")
    '<t\\xc3\\xa4g />'

    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><tag ättr='v&#228;lue' />")
    >>> tree.attrib
    {u'\\xe4ttr': u'v\\xe4lue'}
    >>> ElementTree.tostring(tree, "utf-8")
    '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'

    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg>text</täg>")
    >>> ElementTree.tostring(tree, "utf-8")
    '<t\\xc3\\xa4g>text</t\\xc3\\xa4g>'

    >>> tree = ElementTree.Element(u"täg")
    >>> ElementTree.tostring(tree, "utf-8")
    '<t\\xc3\\xa4g />'

    >>> tree = ElementTree.Element("tag")
    >>> tree.set(u"ättr", u"välue")
    >>> ElementTree.tostring(tree, "utf-8")
    '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'

    """
1092 1093 # doesn't work with lxml.etree 1094 del bug_xmltoolkit39 1095
def bug_xmltoolkit45():
    """
    problems parsing mixed unicode/non-ascii html documents

    Each case feeds a differently encoded spelling of a small paragraph to
    HTMLTreeBuilder.TreeBuilder and expects serialize() to write every
    non-ASCII character as a numeric character reference.

    latin-1 text
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>välue</p>")
    >>> serialize(p.close())
    '<p>v&#228;lue</p>'

    utf-8 text
    >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
    >>> p.feed("<p>v\xc3\xa4lue</p>")
    >>> serialize(p.close())
    '<p>v&#228;lue</p>'

    utf-8 text using meta tag
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><p>v\xc3\xa4lue</p></html>")
    >>> serialize(p.close().find("p"))
    '<p>v&#228;lue</p>'

    latin-1 character references
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>v&#228;lue</p>")
    >>> serialize(p.close())
    '<p>v&#228;lue</p>'

    latin-1 character entities
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>v&auml;lue</p>")
    >>> serialize(p.close())
    '<p>v&#228;lue</p>'

    mixed latin-1 text and unicode entities
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>&#8221;välue&#8221;</p>")
    >>> serialize(p.close())
    '<p>&#8221;v&#228;lue&#8221;</p>'

    mixed unicode and latin-1 entities
    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>&#8221;v&auml;lue&#8221;</p>")
    >>> serialize(p.close())
    '<p>&#8221;v&#228;lue&#8221;</p>'

    """
# doesn't work with lxml.etree; deleting the function keeps doctest
# from collecting its examples when this module is tested
del bug_xmltoolkit45
def bug_xmltoolkit46():
    """
    problems parsing open BR tags

    Regression check: an unclosed br tag inside a paragraph must be
    serialized as an empty element followed by the remaining text.

    >>> p = HTMLTreeBuilder.TreeBuilder()
    >>> p.feed("<p>key<br>value</p>")
    >>> serialize(p.close())
    '<p>key<br />value</p>'

    """
# doesn't work with lxml.etree; deleting the function keeps doctest
# from collecting its examples when this module is tested
del bug_xmltoolkit46
def bug_xmltoolkit54():
    """
    problems handling internally defined entities

    The entity declared in the internal DTD subset stands for the
    character with code point hex 8230; serialize() is expected to write
    it as the decimal character reference 33328.

    >>> e = ElementTree.XML("<!DOCTYPE doc [<!ENTITY ldots '&#x8230;'>]><doc>&ldots;</doc>")
    >>> serialize(e)
    '<doc>&#33328;</doc>'
    """
# doesn't work with lxml.etree; deleting the function keeps doctest
# from collecting its examples when this module is tested
del bug_xmltoolkit54
def bug_xmltoolkit55():
    """
    make sure we're reporting the first error, not the last

    The document references three undefined entities; the error that is
    raised must name the first of them (ldots).

    >>> e = ElementTree.XML("<!DOCTYPE doc SYSTEM 'doc.dtd'><doc>&ldots;&ndots;&rdots;</doc>")
    Traceback (most recent call last):
    ParseError: undefined entity &ldots;: line 1, column 36
    """
# doesn't work with lxml.etree; deleting the function keeps doctest
# from collecting its examples when this module is tested
del bug_xmltoolkit55
# NOTE(review): the leading whitespace of the expected output lines below
# could not be recovered from this listing -- confirm it against
# samples/simple.xml before re-enabling this test.
def bug_200708_version():
    """
    Check that XMLParser exposes a version string and that a document fed
    to it in a single call parses and serializes as expected.

    >>> parser = ET.XMLParser()
    >>> parser.version
    'Expat 2.0.0'
    >>> parser.feed(open("samples/simple.xml").read())
    >>> print(serialize(parser.close()))
    <root>
    <element key="value">text</element>
    <element>text</element>tail
    <empty-element />
    </root>
    """
# doesn't work with lxml.etree; deleting the function keeps doctest
# from collecting its examples when this module is tested
del bug_200708_version
# NOTE(review): runs of spaces inside the string literals below may have
# been collapsed by this listing -- confirm against the original file.
def bug_200708_newline():
    r"""

    Preserve newlines in attributes.

    Line feeds inside an attribute value must be written as numeric
    character references so that the value survives a serialize/parse
    round trip unchanged.

    >>> e = ET.Element('SomeTag', text="def _f():\n return 3\n")
    >>> ET.tostring(e)
    '<SomeTag text="def _f():&#10; return 3&#10;" />'
    >>> ET.XML(ET.tostring(e)).get("text")
    'def _f():\n return 3\n'
    >>> ET.tostring(ET.XML(ET.tostring(e)))
    '<SomeTag text="def _f():&#10; return 3&#10;" />'
    """
# doesn't work with lxml.etree; deleting the function keeps doctest
# from collecting its examples when this module is tested
del bug_200708_newline
def bug_200709_default_namespace():
    """
    Serializing with the default_namespace option.

    Case 1: names in the default namespace are written without a prefix.
    Case 2: names in any other namespace get a generated prefix.
    Case 3: unqualified names are rejected with ValueError.

    >>> e = ET.Element("{default}elem")
    >>> s = ET.SubElement(e, "{default}elem")
    >>> serialize(e, default_namespace="default") # 1
    '<elem xmlns="default"><elem /></elem>'

    >>> e = ET.Element("{default}elem")
    >>> s = ET.SubElement(e, "{default}elem")
    >>> s = ET.SubElement(e, "{not-default}elem")
    >>> serialize(e, default_namespace="default") # 2
    '<elem xmlns="default" xmlns:ns1="not-default"><elem /><ns1:elem /></elem>'

    >>> e = ET.Element("{default}elem")
    >>> s = ET.SubElement(e, "{default}elem")
    >>> s = ET.SubElement(e, "elem") # unprefixed name
    >>> serialize(e, default_namespace="default") # 3
    Traceback (most recent call last):
    ValueError: cannot use non-qualified names with default_namespace option

    """
# doesn't work with lxml.etree; deleting the function keeps doctest
# from collecting its examples when this module is tested
del bug_200709_default_namespace

# --------------------------------------------------------------------

if __name__ == "__main__":
    # Run every doctest that is still defined in this module and report the
    # totals.  testmod() without an argument tests the running __main__
    # module directly, so the file is not executed a second time and does
    # not have to be importable under the hard-coded name "selftest".
    import doctest
    failed, tested = doctest.testmod()
    print("%d tests ok." % (tested - failed))
    if failed > 0:
        # Signal failure to the calling process (e.g. a test runner).
        print("%d tests failed. Exiting with non-zero return code." % failed)
        sys.exit(1)