elementtree.ElementTree

1 # 2 # ElementTree 3 # $Id: ElementTree.py 3276 2007-09-12 06:52:30Z fredrik $ 4 # 5 # light-weight XML support for Python 2.2 and later. 6 # 7 # history: 8 # 2001-10-20 fl created (from various sources) 9 # 2001-11-01 fl return root from parse method 10 # 2002-02-16 fl sort attributes in lexical order 11 # 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup 12 # 2002-05-01 fl finished TreeBuilder refactoring 13 # 2002-07-14 fl added basic namespace support to ElementTree.write 14 # 2002-07-25 fl added QName attribute support 15 # 2002-10-20 fl fixed encoding in write 16 # 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding 17 # 2002-11-27 fl accept file objects or file names for parse/write 18 # 2002-12-04 fl moved XMLTreeBuilder back to this module 19 # 2003-01-11 fl fixed entity encoding glitch for us-ascii 20 # 2003-02-13 fl added XML literal factory 21 # 2003-02-21 fl added ProcessingInstruction/PI factory 22 # 2003-05-11 fl added tostring/fromstring helpers 23 # 2003-05-26 fl added ElementPath support 24 # 2003-07-05 fl added makeelement factory method 25 # 2003-07-28 fl added more well-known namespace prefixes 26 # 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch) 27 # 2003-09-04 fl fall back on emulator if ElementPath is not installed 28 # 2003-10-31 fl markup updates 29 # 2003-11-15 fl fixed nested namespace bug 30 # 2004-03-28 fl added XMLID helper 31 # 2004-06-02 fl added default support to findtext 32 # 2004-06-08 fl fixed encoding of non-ascii element/attribute names 33 # 2004-08-23 fl take advantage of post-2.1 expat features 34 # 2004-09-03 fl made Element class visible; removed factory 35 # 2005-02-01 fl added iterparse implementation 36 # 2005-03-02 fl fixed iterparse support for pre-2.2 versions 37 # 2005-11-12 fl added tostringlist/fromstringlist helpers 38 # 2006-07-05 fl merged in selected changes from the 1.3 sandbox 39 # 2006-07-05 fl removed support for 2.1 and earlier 40 # 2007-06-21 fl added 
deprecation/future warnings 41 # 2007-08-25 fl added doctype hook, added parser version attribute etc 42 # 2007-08-26 fl added new serializer code (better namespace handling, etc) 43 # 2007-08-27 fl warn for broken /tag searches on tree level 44 # 2007-09-02 fl added html/text methods to serializer (experimental) 45 # 2007-09-05 fl added method argument to tostring/tostringlist 46 # 2007-09-06 fl improved error handling 47 # 48 # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. 49 # 50 # fredrik@pythonware.com 51 # http://www.pythonware.com 52 # 53 # -------------------------------------------------------------------- 54 # The ElementTree toolkit is 55 # 56 # Copyright (c) 1999-2007 by Fredrik Lundh 57 # 58 # By obtaining, using, and/or copying this software and/or its 59 # associated documentation, you agree that you have read, understood, 60 # and will comply with the following terms and conditions: 61 # 62 # Permission to use, copy, modify, and distribute this software and 63 # its associated documentation for any purpose and without fee is 64 # hereby granted, provided that the above copyright notice appears in 65 # all copies, and that both that copyright notice and this permission 66 # notice appear in supporting documentation, and that the name of 67 # Secret Labs AB or the author not be used in advertising or publicity 68 # pertaining to distribution of the software without specific, written 69 # prior permission. 70 # 71 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD 72 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- 73 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR 74 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 75 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 76 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 77 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 78 # OF THIS SOFTWARE. 
# --------------------------------------------------------------------

from __future__ import generators

# Names exported by "from ElementTree import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLParser", "XMLTreeBuilder",
    ]

##
# The Element type is a flexible container object, designed to
# store hierarchical data structures in memory. The type can be
# described as a cross between a list and a dictionary.
#
# Each element has a number of properties associated with it:
# <ul>
# <li>a tag. This is a string identifying what kind of data
# this element represents (the element type, in other words).</li>
# <li>a number of attributes, stored in a Python dictionary.</li>
# <li>a text string.</li>
# <li>an optional tail string.</li>
# <li>a number of child elements, stored in a Python sequence</li>
# </ul>
#
# To create an element instance, use the {@link #Element} constructor
# or the {@link #SubElement} factory function.
#
# The {@link #ElementTree} class can be used to wrap an element
# structure, and convert it from and to XML.
##

import re
import sys


125 -class _SimpleElementPath(object):

126 # emulate pre-1.2 find/findtext/findall behaviour

127 - def find(self, element, tag):

128 for elem in element: 129 if elem.tag == tag: 130 return elem 131 return None

132 - def findtext(self, element, tag, default=None):

133 for elem in element: 134 if elem.tag == tag: 135 return elem.text or "" 136 return default

137 - def findall(self, element, tag):

138 if tag[:3] == ".//": 139 return element.getiterator(tag[3:]) 140 result = [] 141 for elem in element: 142 if elem.tag == tag: 143 result.append(elem) 144 return result


# Use the real ElementPath module when it is importable; otherwise fall
# back on the simple emulation defined in this module.
try:
    import ElementPath
except ImportError:
    # FIXME: issue warning in this case?
    ElementPath = _SimpleElementPath()

VERSION = "1.3a2"


class ParseError(SyntaxError):
    """Exception type exported by this module for parse errors."""


# --------------------------------------------------------------------

##
# Checks if an object appears to be a valid element object.
#
# @param element An element instance.
# @return A true value if this is an element object.
# @defreturn flag


def iselement(element):
    """Return a true value if *element* looks like an element object.

    An object qualifies if it is an Element instance or simply has a
    "tag" attribute.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    return isinstance(element, Element) or hasattr(element, "tag")


##
# Element class. This class defines the Element interface, and
# provides a reference implementation of this interface.
#
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param tag The element name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @see Element
# @see SubElement
# @see Comment
# @see ProcessingInstruction


class Element(object):
    """Reference implementation of the Element interface."""

    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary. Where possible, use
    # {@link #Element.get},
    # {@link #Element.set},
    # {@link #Element.keys}, and
    # {@link #Element.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement. This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag. This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    def __init__(self, tag, attrib={}, **extra):
        # the shared default dictionary is never mutated: the argument is
        # copied before the keyword attributes are merged in
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (repr(self.tag), id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    def __nonzero__(self):
        import warnings
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning
            )
        return len(self._children) != 0 # emulate old behaviour

    ##
    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # materialize first: a one-shot iterable (e.g. a generator) would
        # otherwise be exhausted by the validation loop below
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = elements

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If a sequence member is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Appends subelements from a sequence.
    #
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a subelement is not a valid object.
    # @since 1.3

    def extend(self, elements):
        # materialize first: a one-shot iterable (e.g. a generator) would
        # otherwise be exhausted by the validation loop below
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children.extend(elements)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement. Unlike the find methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # (Deprecated) Returns all subelements. The elements are returned
    # in document order.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        import warnings
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning
            )
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found. Note that if the element
    #     is found, but has no text content, this method returns an
    #     empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #     in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element. This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets a list of attribute names. The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as a sequence. The attributes are
    # returned in an arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator. The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    #
    # If the tree structure is modified during iteration, new or removed
    # elements may or may not be included. To get a stable set, use the
    # list() function on the iterator, and loop over the resulting list.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return An iterator containing all the matching elements.
    # @defreturn iterator

    def iter(self, tag=None):
        # "*" is a wildcard and means the same as no tag filter
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for child in self._children:
            for match in child.iter(tag):
                yield match

    # compatibility (FIXME: preserve list behaviour too? see below)
    getiterator = iter

    # def getiterator(self, tag=None):
    #     return list(tag)

    ##
    # Creates a text iterator. The iterator loops over this element
    # and all subelements, in document order, and returns all inner
    # text.
    #
    # @return An iterator containing all inner text.
    # @defreturn iterator

    def itertext(self):
        if self.text:
            yield self.text
        for child in self:
            for fragment in child.itertext():
                yield fragment
            # a child's tail text belongs to this element's inner text
            if child.tail:
                yield child.tail


# compatibility
_Element = _ElementInterface = Element

##
# Subelement factory. This function creates an element instance, and
# appends it to an existing element.
#
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param parent The parent element.
# @param tag The subelement name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element


def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element via parent.makeelement and append it to *parent*.

    The attributes are a copy of *attrib* merged with the keyword
    arguments; the caller's dictionary is left untouched. Returns the
    new element.
    """
    merged = attrib.copy()
    merged.update(extra)
    element = parent.makeelement(tag, merged)
    parent.append(element)
    return element


##
# Comment element factory. This factory function creates a special
# element that will be serialized as an XML comment by the standard
# serializer.
#
# The comment string can be either an 8-bit ASCII string or a Unicode
# string.
#
# @param text A string containing the comment string.
# @return An element instance, representing a comment.
# @defreturn Element


def Comment(text=None):
    """Create a comment element holding *text*.

    The factory function itself is used as the element's tag; the
    serializers test for "tag is Comment" to recognize such elements.
    """
    element = Element(Comment)
    element.text = text
    return element


##
# PI element factory. This factory function creates a special element
# that will be serialized as an XML processing instruction by the standard
# serializer.
#
# @param target A string containing the PI target.
# @param text A string containing the PI contents, if any.
# @return An element instance, representing a PI.
# @defreturn Element


def ProcessingInstruction(target, text=None):
    """Create a processing-instruction element.

    The factory function itself is used as the element's tag. The
    element text is the target, followed by a space and *text* when
    *text* is given and non-empty.
    """
    element = Element(ProcessingInstruction)
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element


# short alias for the processing instruction factory
PI = ProcessingInstruction

##
# QName wrapper. This can be used to wrap a QName attribute value, in
# order to get proper namespace handling on output.
#
# @param text A string containing the QName value, in the form {uri}local,
#     or, if the tag argument is given, the URI part of a QName.
# @param tag Optional tag. If given, the first argument is interpreted as
#     an URI, and this argument is interpreted as a local name.
# @return An opaque object, representing the QName.


class QName(object):
    """Wrapper for a qualified name, stored as "{uri}local" in .text."""

    def __init__(self, text_or_uri, tag=None):
        # with a tag, the first argument is the namespace URI
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the underlying text
        return hash(self.text)

    def __cmp__(self, other):
        # compare by text, against either another QName or a plain value
        if isinstance(other, QName):
            return cmp(self.text, other.text)
        return cmp(self.text, other)


# --------------------------------------------------------------------

##
# ElementTree wrapper class. This class represents an entire element
# hierarchy, and adds some extra support for serialization to and from
# standard XML.
#
# @param element Optional root element.
# @keyparam file Optional file handle or file name. If given, the
#     tree is initialized with the contents of this XML file.


class ElementTree(object):
    """Wrapper around a root element, with parse and write support."""

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree. This discards the
    # current contents of the tree, and replaces it with the given
    # element. Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    #
    # @param source A file name or file object.
    # @keyparam parser An optional parser instance. If not given, the
    #     standard {@link XMLParser} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        # only close the source if it was opened here from a file name;
        # file objects passed in by the caller stay open
        close_source = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            close_source = True
        try:
            if not parser:
                parser = XMLParser(target=TreeBuilder())
            # feed the parser in 32 KiB chunks until the source is exhausted
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    ##
    # Creates a tree iterator for the root element. The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def iter(self, tag=None):
        assert self._root is not None
        return self._root.iter(tag)

    getiterator = iter

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        # a leading "/" is rewritten to a search relative to the root
        if path[:1] == "/":
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag. Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found. Note that if the element
    #     is found, but has no text content, this method returns an
    #     empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        # a leading "/" is rewritten to a search relative to the root
        if path[:1] == "/":
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #     in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        # a leading "/" is rewritten to a search relative to the root
        if path[:1] == "/":
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    #
    # @param file A file name, or a file object opened for writing.
    # @keyparam encoding Optional output encoding (default is US-ASCII).
    # @keyparam xml_declaration Controls if an XML declaration should
    #     be added to the file. Use False for never, True for always,
    #     None for only if not US-ASCII or UTF-8. None is default.
    # @keyparam default_namespace Optional namespace URI that is
    #     serialized without a prefix.
    # @keyparam method Optional output method ("xml", "html" or "text";
    #     default is "xml").
    # @exception ValueError If the output method is not known.

    def write(self, file,
              # keyword arguments
              encoding="us-ascii",
              xml_declaration=None,
              default_namespace=None,
              method=None):
        assert self._root is not None
        # only close the file if it was opened here from a file name;
        # file objects passed in by the caller stay open
        close_file = False
        if not hasattr(file, "write"):
            file = open(file, "wb")
            close_file = True
        try:
            write = file.write
            if not method:
                method = "xml"
            if not encoding:
                encoding = "us-ascii"
            elif xml_declaration or (xml_declaration is None and
                                     encoding not in ("utf-8", "us-ascii")):
                write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            if method == "text":
                _serialize_text(write, self._root, encoding)
            else:
                qnames, namespaces = _namespaces(
                    self._root, encoding, default_namespace
                    )
                if method == "xml":
                    _serialize_xml(
                        write, self._root, encoding, qnames, namespaces
                        )
                elif method == "html":
                    _serialize_html(
                        write, self._root, encoding, qnames, namespaces
                        )
                else:
                    raise ValueError("unknown method %r" % method)
        finally:
            if close_file:
                file.close()


# --------------------------------------------------------------------
# serialization support


def _namespaces(elem, encoding, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walks *elem* and its subelements and returns a (qnames, namespaces)
    tuple: qnames maps every tag, attribute name and QName value to its
    encoded "prefix:local" form, and namespaces maps each namespace URI
    to the prefix chosen for it. Raises ValueError if default_namespace
    is given and a non-qualified name is found, and TypeError (via
    _raise_serialization_error) for names that cannot be serialized.
    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    # prefer a registered prefix, else generate "nsN"
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    # the "xml" prefix is never added to the table
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag) # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = encode(qname)
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator # cET compatibility
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName):
            # test membership separately: an already registered QName tag
            # must not fall through to the serialization error below
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces


def _serialize_xml(write, elem, encoding, qnames, namespaces):
    """Serialize *elem* and its subelements as XML.

    Output goes through the *write* callable. *qnames* maps names to
    their encoded serialized form; *namespaces* (URI to prefix) is
    emitted as xmlns declarations on this element only, since the
    recursive calls for subelements pass None.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # no tag: emit only the text and the subelements
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_xml(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                for k, v in sorted(items): # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v, encoding)
                    write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    declarations = sorted(
                        namespaces.items(), key=lambda x: x[1]
                        ) # sort on prefix
                    for v, k in declarations:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
            if text or len(elem):
                write(">")
                if text:
                    write(_escape_cdata(text, encoding))
                for e in elem:
                    _serialize_xml(write, e, encoding, qnames, None)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))


# HTML elements that are written without an end tag by the html serializer
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

# use a set where the builtin exists; otherwise keep the tuple
try:
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    pass


def _serialize_html(write, elem, encoding, qnames, namespaces):
    """Serialize *elem* and its subelements as HTML.

    Differs from the XML serializer in three ways visible here: the
    start tag is always closed with ">", the text of script and style
    elements is written unescaped, and elements listed in HTML_EMPTY
    get no end tag.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # no tag: emit only the text and the subelements
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                for k, v in sorted(items): # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    declarations = sorted(
                        namespaces.items(), key=lambda x: x[1]
                        ) # sort on prefix
                    for v, k in declarations:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
            write(">")
            # element names are compared in lower case from here on
            tag = tag.lower()
            if text:
                if tag == "script" or tag == "style":
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
            if tag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))

955

956 -def _serialize_text(write, elem, encoding):

957 for part in elem.itertext(): 958 write(part.encode(encoding)) 959 if elem.tail: 960 write(elem.tail.encode(encoding))


##
# Registers a namespace prefix. The registry is global, and any
# existing mapping for either the given prefix or the namespace URI
# will be removed.
#
# @param prefix Namespace prefix.
# @param uri Namespace uri. Tags and attributes in this namespace
#     will be serialized with the given prefix, if at all possible.
# @raise ValueError If the prefix is reserved, or is otherwise
#     invalid.


def register_namespace(prefix, uri):
    """Register *prefix* for namespace *uri* in the global registry.

    Any existing entry for either the prefix or the URI is removed
    first. Raises ValueError for prefixes of the form "ns<digits>",
    which are reserved for generated prefixes.
    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # iterate over a snapshot: entries are deleted inside the loop
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

980 981 _namespace_map = { 982 # "well-known" namespace prefixes 983 "http://www.w3.org/XML/1998/namespace": "xml", 984 "http://www.w3.org/1999/xhtml": "html", 985 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", 986 "http://schemas.xmlsoap.org/wsdl/": "wsdl", 987 # xml schema 988 "http://www.w3.org/2001/XMLSchema": "xs", 989 "http://www.w3.org/2001/XMLSchema-instance": "xsi", 990 # dublic core 991 "http://purl.org/dc/elements/1.1/": "dc", 992 } 993

994 -def _raise_serialization_error(text):

995 raise TypeError( 996 "cannot serialize %r (type %s)" % (text, type(text).__name__) 997 )

998

999 -def _encode(text, encoding):

1000 try: 1001 return text.encode(encoding, "xmlcharrefreplace") 1002 except (TypeError, AttributeError): 1003 _raise_serialization_error(text)

1004

1005 -def _escape_cdata(text, encoding):

1006 # escape character data 1007 try: 1008 # it's worth avoiding do-nothing calls for strings that are 1009 # shorter than 500 character, or so. assume that's, by far, 1010 # the most common case in most applications. 1011 if "&" in text: 1012 text = text.replace("&", "&") 1013 if "<" in text: 1014 text = text.replace("<", "<") 1015 if ">" in text: 1016 text = text.replace(">", ">") 1017 return text.encode(encoding, "xmlcharrefreplace") 1018 except (TypeError, AttributeError): 1019 _raise_serialization_error(text)

1020

1021 -def _escape_attrib(text, encoding):

1022 # escape attribute value 1023 try: 1024 if "&" in text: 1025 text = text.replace("&", "&") 1026 if "<" in text: 1027 text = text.replace("<", "<") 1028 if ">" in text: 1029 text = text.replace(">", ">") 1030 if "\"" in text: 1031 text = text.replace("\"", """) 1032 if "\n" in text: 1033 text = text.replace("\n", "
") 1034 return text.encode(encoding, "xmlcharrefreplace") 1035 except (TypeError, AttributeError): 1036 _raise_serialization_error(text)

1037

1038 -def _escape_attrib_html(text, encoding):

1039 # escape attribute value 1040 try: 1041 if "&" in text: 1042 text = text.replace("&", "&") 1043 if ">" in text: 1044 text = text.replace(">", ">") 1045 if "\"" in text: 1046 text = text.replace("\"", """) 1047 return text.encode(encoding, "xmlcharrefreplace") 1048 except (TypeError, AttributeError): 1049 _raise_serialization_error(text)

1050 1051 # -------------------------------------------------------------------- 1052 1053 ## 1054 # Generates a string representation of an XML element, including all 1055 # subelements. 1056 # 1057 # @param element An Element instance. 1058 # @return An encoded string containing the XML data. 1059 # @defreturn string 1060

def tostring(element, encoding=None, method=None):
    """Serialize *element* and return the output as one string.

    The element is wrapped in an ElementTree and written to an in-memory
    object whose ``write`` collects the fragments; *encoding* and *method*
    are passed on to ``ElementTree.write``.
    """
    class _Sink:
        pass
    fragments = []
    sink = _Sink()
    sink.write = fragments.append  # every write() call appends one fragment
    tree = ElementTree(element)
    tree.write(sink, encoding, method=method)
    return "".join(fragments)

##
# Generates a string representation of an XML element, including all
# subelements.  The string is returned as a sequence of string fragments.
#
# @param element An Element instance.
# @return A sequence object containing the XML data.
# @defreturn sequence
# @since 1.3

def tostringlist(element, encoding=None, method=None):
    """Serialize *element* and return the output as a list of fragments.

    The element is wrapped in an ElementTree and written to an in-memory
    object whose ``write`` appends to the returned list.

    *encoding* and *method* are passed on to ``ElementTree.write``, the
    same way tostring() passes them.  *method* defaults to None, so
    existing two-argument calls keep working.
    """
    class dummy:
        pass
    data = []
    sink = dummy()
    sink.write = data.append  # every write() call appends one fragment
    ElementTree(element).write(sink, encoding, method=method)
    # FIXME: merge small fragments into larger parts
    return data

##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
#
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file.
#
# @param elem An element tree or an individual element.

def dump(elem):
    """Write *elem* to sys.stdout, ending the output with a newline.

    *elem* may be an ElementTree or a single element; a single element is
    wrapped in an ElementTree first.  A newline is added unless the root
    element's tail already ends with one.
    """
    # debugging
    if isinstance(elem, ElementTree):
        tree = elem
    else:
        tree = ElementTree(elem)
    tree.write(sys.stdout)
    tail = tree.getroot().tail
    if tail and tail[-1] == "\n":
        return
    sys.stdout.write("\n")

1106 1107 # -------------------------------------------------------------------- 1108 # parsing 1109 1110 ## 1111 # Parses an XML document into an element tree. 1112 # 1113 # @param source A filename or file object containing XML data. 1114 # @param parser An optional parser instance. If not given, the 1115 # standard {@link XMLParser} parser is used. 1116 # @return An ElementTree instance 1117

def parse(source, parser=None):
    """Create an ElementTree, load *source* into it, and return the tree.

    *source* and *parser* are handed unchanged to ``ElementTree.parse``.
    """
    document = ElementTree()
    document.parse(source, parser)
    return document

1122 1123 ## 1124 # Parses an XML document into an element tree incrementally, and reports 1125 # what's going on to the user. 1126 # 1127 # @param source A filename or file object containing XML data. 1128 # @param events A list of events to report back. If omitted, only "end" 1129 # events are reported. 1130 # @param parser An optional parser instance. If not given, the 1131 # standard {@link XMLParser} parser is used. 1132 # @return A (event, elem) iterator. 1133

def iterparse(source, events=None, parser=None):
    """Return an iterator that parses *source* incrementally.

    *source* is used as is when it has a ``read`` attribute; otherwise it
    is treated as a file name and opened in binary mode.  When no parser
    is supplied, an XMLParser feeding a TreeBuilder is created.
    """
    if hasattr(source, "read"):
        stream = source
    else:
        stream = open(source, "rb")
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    return _IterParseIterator(stream, events, parser)

1140

class _IterParseIterator(object):
    """Iterator returned by iterparse().

    Reads the source file in chunks, feeds them to the parser, and yields
    the items that the installed expat handlers put in the event buffer.
    """

    def __init__(self, source, events, parser):
        """Store the source and parser, and install the event handlers.

        source -- object with a read() method.
        events -- list of event names, or None for ["end"].  Only "start",
            "end", "start-ns" and "end-ns" are acted upon; any other name
            is ignored without error.
        parser -- parser object; its underlying expat parser is reached
            through the ``_parser`` attribute.
        """
        self._file = source
        self._events = []  # buffer of pending (event, value) items
        self._index = 0  # position of the next item to return from the buffer
        self.root = self._root = None  # root is only published once iteration is finished
        self._parser = parser
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # each handler binds event/append (and the parser method it wraps)
        # as default arguments, so it keeps the values of this iteration
        for event in events:
            if event == "start":
                try:
                    # list-style attribute reporting; if these attributes
                    # cannot be set, fall back to the dictionary-based handler
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    # use the ascii-encoded uri when possible, else keep it as is
                    try:
                        uri = uri.encode("ascii")
                    except UnicodeError:
                        pass
                    # a missing prefix is reported as the empty string
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def next(self):
        """Return the next buffered item, reading more input when needed.

        When the buffer is empty, up to 16384 bytes are read from the file
        and fed to the parser.  An empty read closes the parser and keeps
        its result.  Once the parser is closed and the buffer is used up,
        that result is stored in ``self.root`` and StopIteration is raised.
        """
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # parser already closed and nothing left to report
                    self.root = self._root
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    # end of input: closing the parser returns the tree root
                    self._root = self._parser.close()
                    self._parser = None
            else:
                self._index = self._index + 1
                return item

    def __iter__(self):
        """Return self; the object is its own iterator."""
        return self

##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param text A string containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An Element instance.
# @defreturn Element

def XML(text, parser=None):
    """Feed *text* to a parser and return what the parser's close() returns.

    When no parser is supplied, an XMLParser feeding a TreeBuilder is
    created.
    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()

##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.
#
# @param text A string containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text, parser=None):
    """Parse *text* and return ``(root, ids)``.

    ``ids`` maps each non-empty "id" attribute value found while walking
    ``root.getiterator()`` to the element carrying it; when a value occurs
    more than once, the last element visited wins.  When no parser is
    supplied, an XMLParser feeding a TreeBuilder is created.
    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    root = active.close()
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            by_id[key] = node
    return root, by_id

##
# Parses an XML document from a string constant.  Same as {@link #XML}.
#
# @def fromstring(text)
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

# plain alias: both names refer to the same function object
fromstring = XML

##
# Parses an XML document from a sequence of string fragments.
#
# @param sequence A list or other sequence containing XML data fragments.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An Element instance.
# @defreturn Element
# @since 1.3

def fromstringlist(sequence, parser=None):
    """Feed every fragment of *sequence* to a parser, in order, and return
    what the parser's close() returns.

    When no parser is supplied, an XMLParser feeding a TreeBuilder is
    created.
    """
    active = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active.feed(fragment)
    return active.close()

1274 1275 # -------------------------------------------------------------------- 1276 1277 ## 1278 # Generic element structure builder. This builder converts a sequence 1279 # of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link 1280 # #TreeBuilder.end} method calls to a well-formed element structure. 1281 #  1282 # You can use this class to build an element structure using a custom XML 1283 # parser, or a parser for some other XML-like format. 1284 # 1285 # @param element_factory Optional element factory. This factory 1286 # is called to create new Element instances, as necessary. 1287

class TreeBuilder(object):
    """Builds an element structure from start(), data() and end() calls.

    Elements are created by calling ``element_factory(tag, attrs)``;
    children are attached with the parent's ``append`` method, and text is
    stored in the ``text`` and ``tail`` attributes of the created objects.
    """

    def __init__(self, element_factory=None):
        """Set up empty buffers; *element_factory* defaults to Element."""
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    ##
    # Flushes the builder buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test: an element class with its own __eq__/__ne__ must
        # not be able to make a real element look like "no element"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        """Move collected character data onto the last element.

        The joined data becomes the element's ``tail`` when the last call
        was end(), and its ``text`` when the last call was start().  Data
        collected before any element exists is left in the buffer.
        """
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
                self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # attach to the innermost element that is still open
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last

1364 1365 ## 1366 # Element structure builder for XML source data, based on the 1367 # expat parser. 1368 # 1369 # @keyparam target Target object. If omitted, the builder uses an 1370 # instance of the standard {@link #TreeBuilder} class. 1371 # @keyparam html Predefine HTML entities. This flag is not supported 1372 # by the current implementation. 1373 # @keyparam encoding Optional encoding. If given, the value overrides 1374 # the encoding specified in the XML file. 1375 # @see #ElementTree 1376 # @see #TreeBuilder 1377

class XMLParser(object):
    """Expat-based parser that forwards parse events to a target object.

    The target's start(), end() and data() methods are called as the
    document is parsed, and its close() result is returned by close().
    If the target has a doctype() method, it receives the document type
    declaration.  Undefined entities are looked up in the ``entity``
    dictionary.
    """

    def __init__(self, html=0, target=None, encoding=None):
        """Create the expat parser and install the callbacks.

        html -- not read anywhere in this method.
        target -- object receiving the events; a TreeBuilder is created
            when omitted.
        encoding -- passed to expat.ParserCreate.

        Raises ImportError when neither xml.parsers.expat nor pyexpat can
        be imported.
        """
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat; expat = pyexpat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is passed as the second ParserCreate argument; _fixname()
        # later prepends "{" to every name that contains "}"
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None  # list of doctype tokens while inside a declaration, else None
        self.entity = {}  # entity name -> replacement text, consulted by _default()
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _raiseerror(self, value):
        """Raise a ParseError built from the expat error *value*.

        The new error carries ``code`` and ``position`` (lineno, offset),
        copied from *value*.
        """
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixtext(self, text):
        """Return *text* encoded as ascii, or unchanged if that fails."""
        # convert text string to ascii, if possible
        try:
            return text.encode("ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        """Return the converted form of name *key*, caching the result.

        A name containing "}" gets a leading "{"; the name is then passed
        through _fixtext().
        """
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        """Start-element handler for attributes given as a dictionary."""
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = fixtext(value)
        return self.target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        """Start-element handler for attributes given as a flat sequence
        of alternating names and values."""
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            # even positions hold names, the following odd positions their values
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = fixtext(attrib_in[i+1])
        return self.target.start(tag, attrib)

    def _data(self, text):
        """Character-data handler: forward the converted text to the target."""
        return self.target.data(self._fixtext(text))

    def _end(self, tag):
        """End-element handler: forward the converted name to the target."""
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        """Default handler: resolves entities and collects the doctype.

        Text starting with "&" is looked up in ``self.entity`` (without
        its first and last character) and the replacement is sent to the
        target as data; an unknown name raises an expat error with code 11.

        Text starting with "<!DOCTYPE" starts token collection.  When the
        collected tokens form a PUBLIC (4 tokens) or SYSTEM (3 tokens)
        declaration, the target's doctype() method is called, if it has one.
        """
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self.target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self._parser.ErrorLineNumber
                err.offset = self._parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                # end of the declaration: stop collecting
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    return
                # [1:-1] drops the first and last character of the
                # identifier -- presumably the surrounding quotes
                if pubid:
                    pubid = pubid[1:-1]
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        # NOTE(review): "except X, v" is Python 2-only syntax
        try:
            self._parser.Parse(data, 0)
        except self._error, v:
            self._raiseerror(v)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
        except self._error, v:
            self._raiseerror(v)
        tree = self.target.close()
        del self.target, self._parser # get rid of circular references
        return tree

# compatibility: XMLTreeBuilder is a second name for the XMLParser class
XMLTreeBuilder = XMLParser

Source Code for Module elementtree.ElementTree