Package lxml :: Package html
[hide private]
[frames | no frames]

Source Code for Package lxml.html

   1  # Copyright (c) 2004 Ian Bicking. All rights reserved. 
   2  # 
   3  # Redistribution and use in source and binary forms, with or without 
   4  # modification, are permitted provided that the following conditions are 
   5  # met: 
   6  # 
   7  # 1. Redistributions of source code must retain the above copyright 
   8  # notice, this list of conditions and the following disclaimer. 
   9  # 
  10  # 2. Redistributions in binary form must reproduce the above copyright 
  11  # notice, this list of conditions and the following disclaimer in 
  12  # the documentation and/or other materials provided with the 
  13  # distribution. 
  14  # 
  15  # 3. Neither the name of Ian Bicking nor the names of its contributors may 
  16  # be used to endorse or promote products derived from this software 
  17  # without specific prior written permission. 
  18  # 
  19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
  22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 
  23  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  24  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  25  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  26  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  27  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  28  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  29  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  import sys 
  35  import re 
  36  try: 
  37      from urlparse import urljoin 
  38  except ImportError: 
  39      # Python 3 
  40      from urllib.parse import urljoin 
  41  import copy 
  42  from lxml import etree 
  43  from lxml.html import defs 
  44  from lxml.html._setmixin import SetMixin 
  45  try: 
  46      from collections import MutableMapping as DictMixin 
  47  except ImportError: 
  48      # Python < 2.6 
  49      from UserDict import DictMixin 
  50  try: 
  51      set 
  52  except NameError: 
  53      # Python 2.3 
  54      from sets import Set as set 
  55  try: 
  56      bytes 
  57  except NameError: 
  58      # Python < 2.6 
  59      bytes = str 
  60  try: 
  61      unicode 
  62  except NameError: 
  63      # Python 3 
  64      unicode = str 
  65  try: 
  66      basestring 
  67  except NameError: 
  68      # Python 3 
  69      basestring = (str, bytes) 
  70   
def __fix_docstring(s):
    """
    Strip a string-literal prefix from the start of each line of ``s``.

    On Python 3 a leading ``u'`` (after optional whitespace) is reduced to
    ``'``; on older versions the ``b'`` prefix is reduced instead.  A falsy
    ``s`` (``None`` or the empty string) is returned unchanged.
    """
    if not s:
        return s
    import sys
    # Only the prefix that is foreign to the running interpreter is removed.
    if sys.version_info[0] >= 3:
        prefix_pattern = r"^(\s*)u'"
    else:
        prefix_pattern = r"^(\s*)b'"
    prefix_re = re.compile(prefix_pattern, re.M)
    return prefix_re.sub(r"\1'", s)

# Public names exported by ``from lxml.html import *``.
# ('open_in_browser' used to be listed twice; each name appears once now.)
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each element query matches both the plain
# tag and the same tag in the XHTML namespace (bound to the prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches $class_name as one whitespace-separated token of @class.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# CSS ``url(...)`` with a double-quoted, single-quoted or bare argument.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One run of non-space characters.
_archive_re = re.compile(r'[^ ]+')

def _unquote_match(s, pos):
    """
    Strip one pair of matching quotes from ``s``.

    Returns ``(s[1:-1], pos + 1)`` when ``s`` both starts and ends with the
    same quote character (``"`` or ``'``); otherwise returns ``(s, pos)``
    unchanged.
    """
    first, last = s[:1], s[-1:]
    for quote in ('"', "'"):
        if first == quote and last == quote:
            return s[1:-1], pos + 1
    return s, pos
110
def _transform_result(typ, result):
    """Turn ``result`` back into the type the caller passed in.

    ``typ`` is the type of the original input.  For a ``bytes`` subclass the
    result is serialised with ``tostring(..., encoding='utf-8')``, for a
    ``unicode`` subclass with ``encoding='unicode'``; for any other type
    ``result`` is returned as it is.
    """
    if issubclass(typ, bytes):
        target_encoding = 'utf-8'
    elif issubclass(typ, unicode):
        target_encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=target_encoding)
120
def _nons(tag):
    """
    Return ``tag`` without a leading XHTML namespace.

    A string tag that starts with ``{`` followed by ``XHTML_NAMESPACE`` is
    reduced to the part after the last ``}``.  Every other value, including
    non-string tags, is returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
126
127 -class HtmlMixin(object):
128
129 - def base_url(self):
130 """ 131 Returns the base URL, given when the page was parsed. 132 133 Use with ``urlparse.urljoin(el.base_url, href)`` to get 134 absolute URLs. 135 """ 136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__) 138
139 - def forms(self):
140 """ 141 Return a list of all the forms 142 """ 143 return _forms_xpath(self)
144 forms = property(forms, doc=forms.__doc__) 145
146 - def body(self):
147 """ 148 Return the <body> element. Can be called from a child element 149 to get the document's head. 150 """ 151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__) 153
154 - def head(self):
155 """ 156 Returns the <head> element. Can be called from a child 157 element to get the document's head. 158 """ 159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__) 161
162 - def _label__get(self):
163 """ 164 Get or set any <label> element associated with this element. 165 """ 166 id = self.get('id') 167 if not id: 168 return None 169 result = _label_xpath(self, id=id) 170 if not result: 171 return None 172 else: 173 return result[0]
174 - def _label__set(self, label):
175 id = self.get('id') 176 if not id: 177 raise TypeError( 178 "You cannot set a label for an element (%r) that has no id" 179 % self) 180 if _nons(label.tag) != 'label': 181 raise TypeError( 182 "You can only assign label to a label element (not %r)" 183 % label) 184 label.set('for', id)
185 - def _label__del(self):
186 label = self.label 187 if label is not None: 188 del label.attrib['for']
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__) 190
191 - def drop_tree(self):
192 """ 193 Removes this element from the tree, including its children and 194 text. The tail text is joined to the previous element or 195 parent. 196 """ 197 parent = self.getparent() 198 assert parent is not None 199 if self.tail: 200 previous = self.getprevious() 201 if previous is None: 202 parent.text = (parent.text or '') + self.tail 203 else: 204 previous.tail = (previous.tail or '') + self.tail 205 parent.remove(self)
206
207 - def drop_tag(self):
208 """ 209 Remove the tag, but not its children or text. The children and text 210 are merged into the parent. 211 212 Example:: 213 214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 215 >>> h.find('.//b').drop_tag() 216 >>> print(tostring(h, encoding='unicode')) 217 <div>Hello World!</div> 218 """ 219 parent = self.getparent() 220 assert parent is not None 221 previous = self.getprevious() 222 if self.text and isinstance(self.tag, basestring): 223 # not a Comment, etc. 224 if previous is None: 225 parent.text = (parent.text or '') + self.text 226 else: 227 previous.tail = (previous.tail or '') + self.text 228 if self.tail: 229 if len(self): 230 last = self[-1] 231 last.tail = (last.tail or '') + self.tail 232 elif previous is None: 233 parent.text = (parent.text or '') + self.tail 234 else: 235 previous.tail = (previous.tail or '') + self.tail 236 index = parent.index(self) 237 parent[index:index+1] = self[:]
238 246
247 - def find_class(self, class_name):
248 """ 249 Find any elements with the given class name. 250 """ 251 return _class_xpath(self, class_name=class_name)
252
253 - def get_element_by_id(self, id, *default):
254 """ 255 Get the first element in a document with the given id. If none is 256 found, return the default argument if provided or raise KeyError 257 otherwise. 258 259 Note that there can be more than one element with the same id, 260 and this isn't uncommon in HTML documents found in the wild. 261 Browsers return only the first match, and this function does 262 the same. 263 """ 264 try: 265 # FIXME: should this check for multiple matches? 266 # browsers just return the first one 267 return _id_xpath(self, id=id)[0] 268 except IndexError: 269 if default: 270 return default[0] 271 else: 272 raise KeyError(id)
273
274 - def text_content(self):
275 """ 276 Return the text content of the tag (and the text in any children). 277 """ 278 return _collect_string_content(self)
279
280 - def cssselect(self, expr, translator='html'):
281 """ 282 Run the CSS expression on this element and its children, 283 returning a list of the results. 284 285 Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 286 -- note that pre-compiling the expression can provide a substantial 287 speedup. 288 """ 289 # Do the import here to make the dependency optional. 290 from lxml.cssselect import CSSSelector 291 return CSSSelector(expr, translator=translator)(self)
292 293 ######################################## 294 ## Link functions 295 ######################################## 296 327 elif handle_failures == 'discard': 328 def link_repl(href): 329 try: 330 return urljoin(base_url, href) 331 except ValueError: 332 return None
333 elif handle_failures is None: 334 def link_repl(href): 335 return urljoin(base_url, href) 336 else: 337 raise ValueError( 338 "unexpected value for handle_failures: %r" % handle_failures) 339 340 self.rewrite_links(link_repl) 341
342 - def resolve_base_href(self, handle_failures=None):
343 """ 344 Find any ``<base href>`` tag in the document, and apply its 345 values to all links found in the document. Also remove the 346 tag once it has been applied. 347 348 If ``handle_failures`` is None (default), a failure to process 349 a URL will abort the processing. If set to 'ignore', errors 350 are ignored. If set to 'discard', failing URLs will be removed. 351 """ 352 base_href = None 353 basetags = self.xpath('//base[@href]|//x:base[@href]', 354 namespaces={'x': XHTML_NAMESPACE}) 355 for b in basetags: 356 base_href = b.get('href') 357 b.drop_tree() 358 if not base_href: 359 return 360 self.make_links_absolute(base_href, resolve_base_href=False, 361 handle_failures=handle_failures)
362 441 490 491
class _MethodFunc(object):
    """
    Expose an element method as a module-level function.

    The resulting callable accepts either an element or an HTML string as
    its first argument.  It returns whatever the method returns; when the
    method works in place (returns None), the document itself is returned,
    converted back to the type of the input (see ``_transform_result``).
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Borrow the docstring of the wrapped method.
        wrapped = getattr(source_class, name)
        self.__doc__ = wrapped.__doc__
    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # An explicit ``copy`` keyword overrides the configured default.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place method: hand back the (possibly re-serialised) document.
        return _transform_result(result_type, doc)

# Function forms of the HtmlMixin link/class helpers.  The ones created with
# ``copy=True`` deep-copy an element argument before calling the method.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also provides the ``HtmlMixin`` methods."""
536
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also provides the ``HtmlMixin`` methods."""
539
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that also provides the ``HtmlMixin`` methods."""
542
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also provides the ``HtmlMixin`` methods."""
545 546
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> Element class; filled in by the element classes of this
    # module as they are defined.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys, so a
            # dict (as the class docstring describes) is expanded to pairs;
            # an iterable of pairs keeps working as before.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # '*' applies the mixin to every class known so far.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node, or None for the normal lookup.

        Elements are resolved by lower-cased tag name, falling back to
        ``HtmlElement``; comments, processing instructions and entities map
        to their ``Html*`` classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

################################################################################
# parsing
################################################################################

# Bound ``match`` methods that succeed when the input starts, after optional
# whitespace, with ``<html`` or ``<!doctype`` (case-insensitive).  One matcher
# is compiled from a text pattern and one from a byte-string pattern, so that
# each kind of input can be tested with a pattern of its own type.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match

def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a whole document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  Raises
    ``etree.ParserError`` when parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
605
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parse a sequence of HTML elements and return them as a list.

    If the input starts with non-whitespace text, that text becomes the
    first item of the list (as a string).  With ``no_leading_text`` set,
    such text raises ``etree.ParserError`` instead, so the list holds
    elements only.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that is not already a full document gets wrapped in one.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            opening = '<html><body>'.encode('ascii')
            closing = '</body></html>'.encode('ascii')
            html = opening + html + closing
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    leading_text = body.text
    if leading_text and leading_text.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading_text)
        elements.append(leading_text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
644
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element.

    Without ``create_parent`` it is an error (``etree.ParserError``) if the
    input holds no element, more than one element, or any non-whitespace
    text before or after the element.

    If ``create_parent`` is true, all parsed content (leading text
    included) is placed inside a new parent element, which is returned.
    The parent's tag is ``create_parent`` itself when that is a string and
    ``'div'`` otherwise.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a parent will hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent
        if not isinstance(parent_tag, basestring):
            parent_tag = 'div'
        new_root = Element(parent_tag)
        if elements:
            first = elements[0]
            if isinstance(first, basestring):
                new_root.text = first
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        found = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)" % found)
    el = elements[0]
    trailing = el.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % trailing)
    el.tail = None
    return el
690
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse ``html`` and return a single element or document root.

    The input may be a full document or a fragment.  A full document (one
    starting with ``<html`` or a doctype) is returned as parsed.  For a
    fragment, the smallest sensible wrapper is returned: the document if a
    <head> is present, the lone element if there is exactly one, or
    otherwise the <body> renamed to ``div`` or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        looks_full = _looks_like_full_html_bytes
    else:
        looks_full = _looks_like_full_html_unicode
    is_full_html = looks_full(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = None
    if bodies:
        body = bodies[0]
        # Somehow there may be multiple bodies, which is bad, but just
        # smash them into one body
        for extra_body in bodies[1:]:
            extra_text = extra_body.text
            if extra_text:
                if len(body):
                    last = body[-1]
                    last.tail = (last.tail or '') + extra_text
                else:
                    body.text = (body.text or '') + extra_text
            body.extend(extra_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            extra_body.drop_tree()

    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for extra_head in heads[1:]:
            head.extend(extra_head)
            # We don't care about text or tail in a head
            extra_head.drop_tree()
        return doc

    if body is None:
        return doc

    if len(body) == 1:
        only_child = body[0]
        has_leading_text = body.text and body.text.strip()
        has_trailing_text = only_child.tail and only_child.tail.strip()
        if not has_leading_text and not has_trailing_text:
            # The body has just one element, so it was probably a single
            # element passed in
            return only_child

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
757
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse ``filename_or_url`` (a filename, URL or file-like object) with
    ``etree.parse()`` and return the resulting tree.

    The return value is a tree, not an element; call ``.getroot()`` on it
    to get the document root.

    ``parser`` defaults to the module's ``html_parser``.  ``base_url``
    overrides the base URL, which is most useful when parsing from a
    file-like object.
    """
    if parser is None:
        parser = html_parser
    tree = etree.parse(filename_or_url, parser, base_url=base_url, **kw)
    return tree
770
def _contains_block_level_tag(el):
    """
    Tell whether ``el.iter(etree.Element)`` yields any element whose tag
    (XHTML namespace stripped) is listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    for node in el.iter(etree.Element):
        if _nons(node.tag) in defs.block_tags:
            return True
    return False
778
def _element_name(el):
    """
    Return a short name for ``el``: ``'comment'`` for comment nodes,
    ``'string'`` for text items, otherwise the tag without XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
786 787 ################################################################################ 788 # form handling 789 ################################################################################ 790
class FormElement(HtmlElement):
    """
    Element class for <form>, adding access to its fields and attributes.
    """

    def inputs(self):
        """
        An `InputGetter` giving access to the input elements of this form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like view (`FieldsDict`) of the form's fields, keyed by
        field name.  Setting a value in it changes the form.

        Assigning a mapping to ``fields`` sets every given field and sets
        all other named fields of the form to None.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        fields = self.fields
        stale_keys = fields.keys()
        for key, field_value in value.items():
            if key in stale_keys:
                stale_keys.remove(key)
            fields[key] = field_value
        # Whatever was not mentioned in the new mapping gets cleared.
        for key in stale_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Return a label for this form: its ``name`` attribute, else ``#``
        plus its ``id``, else its index among the forms in the body.
        """
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        body = self.body
        forms = list(body.iter('form'))
        if not forms:
            forms = list(body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return the form's field values as a list of ``(name, value)``
        tuples, suitable to be passed to ``urllib.urlencode()``.

        Unnamed fields, unchecked checkable inputs, inputs of type submit,
        image or reset, and fields whose value is None are left out.  A
        multi-select contributes one tuple per selected value.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    results.extend([(name, v) for v in value])
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set/delete the form's ``action`` attribute.

        When read, the action is joined onto the document's base URL if
        both are available; otherwise the raw attribute value (possibly
        None) is returned.
        """
        base_url = self.base_url
        action = self.get('action')
        if action is None or not base_url:
            return action
        return urljoin(base_url, action)
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        method = self.get('method', 'GET')
        return method.upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Make the default class lookup return FormElement for <form> tags.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener returns;
    with the default opener this is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its items are appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (for example 'GET' or 'POST'), the URL
    is the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # A form without an action is submitted to the document's own URL.
    url = form.action
    if not url:
        url = form.base_url
    return open_http(form.method, url, values)
934
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form()``, built on ``urllib``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns the result of ``urlopen()``.  Raises ``ValueError`` when ``url``
    is empty or None.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # urlencode() returns text on Python 3, but urlopen() only
            # accepts bytes as request data; the encoded form is pure ASCII.
            data = data.encode('ASCII')
    return urlopen(url, data)
954
class FieldsDict(DictMixin):
    """
    Dictionary-like view of a form's field values.

    Wraps an inputs accessor (``inputs``): reading or writing a key reads
    or writes the ``.value`` of the input found under that name.  Keys
    cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs
    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value
    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value
    def __delitem__(self, item):
        # Fields cannot be removed through this view.
        raise KeyError(
            "You cannot remove keys from ElementDict")
    def keys(self):
        return self.inputs.keys()
    def __contains__(self, item):
        return item in self.inputs
    def __iter__(self):
        names = self.inputs.keys()
        return iter(names)
    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)
979
class InputGetter(object):

    """
    Accessor for all the input fields (input, select, textarea) of a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing one name come back together as a
    `CheckboxGroup` (which also allows value setting); several radio
    inputs sharing one name come back as a `RadioGroup`.

    Iterating yields every input element individually, so the result
    differs from looking up all the names: checkboxes and radio elements
    are not grouped.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        first = results[0]
        if len(results) > 1:
            # The type of the first match decides how the matches are grouped.
            input_type = first.get('type')
            if input_type == 'radio':
                group = RadioGroup(results)
                group.name = name
                return group
            if input_type == 'checkbox':
                group = CheckboxGroup(results)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        return bool(matches)

    def keys(self):
        names = set()
        for field in self:
            names.add(field.name)
        # Unnamed inputs report None as their name; that is not a key.
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)
1045
class InputMixin(object):

    """
    Behaviour shared by all input elements (input, select, and textarea).
    """

    def _name__get(self):
        """
        Get/set/delete the ``name`` attribute of the element.
        """
        return self.get('name')
    def _name__set(self, value):
        self.set('name', value)
    def _name__del(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Subclasses may or may not define ``type``; show it when truthy.
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        else:
            type_part = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
1073
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read and changed through ``.value``.
    """

    def _value__get(self):
        """
        Get/set/delete the value, which is the content of this element:
        its text followed by the serialisation of any child elements.
        """
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        serialisation_method = 'html'
        if self.tag.startswith(xhtml_prefix):
            serialisation_method = 'xml'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(
                child, method=serialisation_method, encoding='unicode')
        return content
    def _value__set(self, value):
        # Drop any child elements, then store the new value as plain text.
        del self[:]
        self.text = value
    def _value__del(self):
        self.text = ''
        del self[:]
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

# Make the default class lookup return TextareaElement for <textarea> tags.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _option_value(self, option):
        # Value of one <option>: its ``value`` attribute, or its text when
        # the attribute is missing; surrounding whitespace is stripped.
        value = option.get('value')
        if value is None:
            value = option.text or ''
        if value:
            value = value.strip()
        return value

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select, reading returns the value of the first option
        carrying a ``selected`` attribute, or None if there is none.
        Assigning None deselects all options; assigning a value that no
        option has raises ``ValueError``.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for el in _options_xpath(self):
            if el.get('selected') is not None:
                return self._option_value(el)
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        checked_option = None
        if value is not None:
            # Only search for a matching option when there is a value to
            # select.  None means "deselect everything" and must not be
            # rejected just because no option can ever equal None.
            value = value.strip()
            for el in _options_xpath(self):
                if self._option_value(el) == value:
                    checked_option = el
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for el in _options_xpath(self):
            if 'selected' in el.attrib:
                del el.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the option text
        where that attribute is missing).
        """
        return [self._option_value(el) for el in _options_xpath(self)]
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib
    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']
    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

# Make the default class lookup return SelectElement for <select> tags.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        # ``select`` is the <select> element whose options are exposed.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    options = property(options)

    @staticmethod
    def _option_value(option):
        # Value of one <option>: its ``value`` attribute, or its text when
        # the attribute is absent.  Non-empty results are stripped.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = option.text or ''
        if opt_value:
            opt_value = opt_value.strip()
        return opt_value

    def _find_option(self, item):
        # Return the first <option> whose value equals ``item``; raise
        # ValueError when the select has no such option.
        for option in self.options:
            if self._option_value(option) == item:
                return option
        raise ValueError(
            "There is no option with the value %r" % item)

    def __iter__(self):
        """Yield the value of every option that is currently selected."""
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value is ``item``.

        Raises ValueError if no option has that value.
        """
        self._find_option(item).set('selected', '')

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``.

        Raises ValueError if no option has that value, or if the
        matching option is not currently selected.
        """
        option = self._find_option(item)
        if 'selected' not in option.attrib:
            raise ValueError(
                "The option %r is not currently selected" % item)
        del option.attrib['selected']

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1268
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, ``.value`` reads or changes
    which radio is checked, and ``.value_options`` lists the values
    that can be chosen.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        # First radio carrying ``checked`` wins; None when none is checked.
        return next(
            (radio.get('value') for radio in self
             if 'checked' in radio.attrib),
            None)

    def _value__set(self, value):
        target = None
        if value is not None:
            # Look the target up first so that an unknown value raises
            # before any radio has been modified.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            else:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio in the group.
        self.value = None

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        choices = []
        for radio in self:
            choices.append(radio.get('value'))
        return choices

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, list.__repr__(self))
1320
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before mutating, so a rejected assignment leaves the
        # current check state intact.  Strings are rejected explicitly:
        # on Python 3 they have ``__iter__`` and would otherwise be
        # treated as a sequence of single-character values.
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            # An empty group has no element to take the name from.
            name = self[0].name if self else None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        # Materialise first: ``value`` may be a live view of this very
        # group (``group.value = group.value``) or a one-shot iterator.
        wanted = list(value)
        checked = self.value
        checked.clear()
        checked.update(wanted)

    def _value__del(self):
        # Deleting the value unchecks every checkbox in the group.
        self.value.clear()

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1359
class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group
    of checkboxes that share one name.
    """

    def __init__(self, group):
        # ``group`` is the list of checkbox elements being viewed.
        self.group = group

    def _checkbox_for(self, value):
        # Return the first checkbox whose ``value`` attribute equals
        # ``value``; raise KeyError when the group has none.
        for box in self.group:
            if box.get('value') == value:
                return box
        raise KeyError("No checkbox with value %r" % value)

    def __iter__(self):
        # Build the full list first so the returned iterator is a
        # snapshot of the checked values.
        checked = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked.append(box.get('value'))
        return iter(checked)

    def add(self, value):
        self._checkbox_for(value).set('checked', '')

    def remove(self, value):
        box = self._checkbox_for(value)
        if 'checked' not in box.attrib:
            raise KeyError(
                "The checkbox with value %r was already unchecked" % value)
        del box.attrib['checked']

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)
1402
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truthiness of ``value`` drives the
        # checked state; only string values are stored in the attribute.
        turn_on = bool(value)
        self.checked = turn_on
        if turn_on and isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set,
                       doc=_checked__get.__doc__)


HtmlElementClassLookup._default_element_classes['input'] = InputElement
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # Pass an explicit default so that a ``for`` attribute naming an
        # unknown id yields None, as documented, rather than a KeyError.
        # NOTE(review): relies on get_element_by_id(id, default) as
        # described in the lxml.html documentation -- confirm.
        return self.body.get_element_by_id(target_id, None)

    def _for_element__set(self, other):
        # The label can only reference an element that carries an id.
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link lives in this label's ``for`` attribute, so that is
        # the attribute to remove (not the label's own ``id``).
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set,
                           _for_element__del,
                           doc=_for_element__get.__doc__)


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################
def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, converting the tree to XHTML in place.

    Accepts either an element or an object with a ``getroot()`` method.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # No getroot(): use the argument itself as the root.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in html.iter(etree.Element):
        name = element.tag
        if name[0] == '{':
            # Already namespaced; leave untouched.
            continue
        element.tag = xhtml_prefix + name
1537
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree,
    converting the tree to plain HTML in place.

    Accepts either an element or an object with a ``getroot()`` method.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # No getroot(): use the argument itself as the root.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    skip = len(xhtml_prefix)
    # Only elements in the XHTML namespace are visited and renamed.
    for element in xhtml.iter(xhtml_prefix + "*"):
        qualified = element.tag
        element.tag = qualified[skip:]
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Both names are bound ``sub`` methods of a compiled pattern, called as
# ``replace(replacement, text)``.  One pattern is built from text and one
# from ASCII-encoded bytes, so either kind of serialised output can be
# processed.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)
    if method != 'html' or include_meta_content_type:
        return serialised
    # Strip the Content-Type <meta> tag, using the replacer that matches
    # the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards; this is
    mainly meant for debugging.
    """
    import os
    import tempfile
    import webbrowser
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    fd, path = tempfile.mkstemp(suffix='.html')
    # The temporary file itself is left on disk, but the handle is
    # always closed, even if writing fails.
    with os.fdopen(fd, 'wb') as out:
        doc.write(out, method="html",
                  encoding=encoding or doc.docinfo.encoding or "UTF-8")
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1654 1655 ################################################################################ 1656 # configure Element class lookup 1657 ################################################################################ 1658
class HTMLParser(etree.HTMLParser):
    """An HTML parser whose element class lookup is an
    ``HtmlElementClassLookup``, so that it returns lxml.html Element
    objects.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to the base parser.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1666
class XHTMLParser(etree.XMLParser):
    """An XML parser whose element class lookup is an
    ``HtmlElementClassLookup``, so that it returns lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to the base parser.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1687
def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are passed on to ``makeelement()`` of the module's
    ``html_parser``.  This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)
# Module-level parser instances, created once at import time.
# ``Element()`` builds its elements through ``html_parser``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()