Package lxml :: Package html
[hide private]
[frames | no frames]

Source Code for Package lxml.html

   1  # Copyright (c) 2004 Ian Bicking. All rights reserved. 
   2  # 
   3  # Redistribution and use in source and binary forms, with or without 
   4  # modification, are permitted provided that the following conditions are 
   5  # met: 
   6  # 
   7  # 1. Redistributions of source code must retain the above copyright 
   8  # notice, this list of conditions and the following disclaimer. 
   9  # 
  10  # 2. Redistributions in binary form must reproduce the above copyright 
  11  # notice, this list of conditions and the following disclaimer in 
  12  # the documentation and/or other materials provided with the 
  13  # distribution. 
  14  # 
  15  # 3. Neither the name of Ian Bicking nor the names of its contributors may 
  16  # be used to endorse or promote products derived from this software 
  17  # without specific prior written permission. 
  18  # 
  19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
  22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 
  23  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  24  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  25  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  26  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  27  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  28  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  29  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  import sys 
  35  import re 
  36  try: 
  37      from urlparse import urljoin 
  38  except ImportError: 
  39      # Python 3 
  40      from urllib.parse import urljoin 
  41  import copy 
  42  from lxml import etree 
  43  from lxml.html import defs 
  44  from lxml.html._setmixin import SetMixin 
  45  try: 
  46      from collections import MutableMapping as DictMixin 
  47  except ImportError: 
  48      # Python < 2.6 
  49      from UserDict import DictMixin 
  50  try: 
  51      set 
  52  except NameError: 
  53      # Python 2.3 
  54      from sets import Set as set 
  55  try: 
  56      bytes 
  57  except NameError: 
  58      # Python < 2.6 
  59      bytes = str 
  60  try: 
  61      unicode 
  62  except NameError: 
  63      # Python 3 
  64      unicode = str 
  65  try: 
  66      basestring 
  67  except NameError: 
  68      # Python 3 
  69      basestring = (str, bytes) 
  70   
71 -def __fix_docstring(s):
72 if not s: 73 return s 74 import sys 75 if sys.version_info[0] >= 3: 76 sub = re.compile(r"^(\s*)u'", re.M).sub 77 else: 78 sub = re.compile(r"^(\s*)b'", re.M).sub 79 return sub(r"\1'", s)

# Public names of the module.  (Each name is listed once; the original list
# repeated 'open_in_browser'.)
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML tag
# and the same tag in the XHTML namespace (bound to the prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose whitespace-separated ``class`` attribute contains
# ``$class_name`` as a whole word.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Iterators over ``url(...)`` references and double-quoted ``@import "..."``
# statements in CSS text.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters.
_archive_re = re.compile(r'[^ ]+')

105 -def _unquote_match(s, pos):
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 107 return s[1:-1], pos+1 108 else: 109 return s,pos
110
def _transform_result(typ, result):
    """
    Convert ``result`` back into the type the caller passed in.

    For a ``bytes`` input type the result is serialised as UTF-8, for a
    ``unicode`` input type as a unicode string; for any other type the
    result is returned as it is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)
120
def _nons(tag):
    """
    Return ``tag`` without a leading ``{XHTML namespace}`` qualifier.

    Tags in other namespaces, unqualified tags and non-string tags are
    returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
126
127 -class HtmlMixin(object):
128
@property
def base_url(self):
    """
    Returns the base URL, as stored in the ``docinfo.URL`` of the tree
    this element belongs to.

    Use with ``urlparse.urljoin(el.base_url, href)`` to get
    absolute URLs.
    """
    tree = self.getroottree()
    return tree.docinfo.URL

@property
def forms(self):
    """
    Return a list of all the forms (whatever ``_forms_xpath`` finds
    starting from this element).
    """
    return _forms_xpath(self)

@property
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    The lookup is an absolute XPath (``//body``), matching the tag both
    with and without the XHTML namespace.  The first match is returned;
    indexing an empty result raises IndexError.
    """
    matches = self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})
    return matches[0]

@property
def head(self):
    """
    Returns the <head> element.  Because the lookup is an absolute XPath
    (``//head``), this can be called from a child element to get the
    document's head.
    """
    matches = self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})
    return matches[0]

def _label__get(self):
    """
    Get or set the <label> element whose ``for`` attribute refers to this
    element's ``id``.  Reading gives None when the element has no id or
    no such label is found.
    """
    elem_id = self.get('id')
    if not elem_id:
        return None
    matches = _label_xpath(self, id=elem_id)
    if matches:
        return matches[0]
    return None

def _label__set(self, label):
    # Point ``label`` at this element; both sides are validated first.
    elem_id = self.get('id')
    if not elem_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if _nons(label.tag) != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', elem_id)

def _label__del(self):
    # Detach the current label (if any) by dropping its ``for`` attribute.
    current = self.label
    if current is not None:
        del current.attrib['for']

label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

def drop_tree(self):
    """
    Removes this element from the tree, including its children and
    text.  The tail text is kept: it is appended to the tail of the
    previous sibling or, when there is none, to the parent's text.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
206
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print(tostring(h, encoding='unicode'))
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    before = self.getprevious()

    # The element's own text goes in front of its children; skipped for
    # nodes whose tag is not a string (not a Comment, etc.).
    if self.text and isinstance(self.tag, basestring):
        if before is not None:
            before.tail = (before.tail or '') + self.text
        else:
            parent.text = (parent.text or '') + self.text

    # The tail goes after the last child, or, if there are no children,
    # to the same place the text went.
    if self.tail:
        if len(self):
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + self.tail
        elif before is not None:
            before.tail = (before.tail or '') + self.tail
        else:
            parent.text = (parent.text or '') + self.tail

    # Replace this element in its parent by its children.
    position = parent.index(self)
    parent[position:position+1] = self[:]
238 246
def find_class(self, class_name):
    """
    Find any elements (this one or its descendants) whose ``class``
    attribute contains the given class name.
    """
    return _class_xpath(self, class_name=class_name)
252
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        return _id_xpath(self, id=id)[0]
    except IndexError:
        if not default:
            raise KeyError(id)
        return default[0]
273
def text_content(self):
    """
    Return the text content of the tag (and the text in any children),
    computed as the XPath ``string()`` value of this element.
    """
    return _collect_string_content(self)
279
def cssselect(self, expr, translator='html'):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
    -- note that pre-compiling the expression can provide a substantial
    speedup.
    """
    # Do the import here to make the dependency optional.
    from lxml.cssselect import CSSSelector
    return CSSSelector(expr, translator=translator)(self)
292 293 ######################################## 294 ## Link functions 295 ######################################## 296 327 elif handle_failures == 'discard': 328 def link_repl(href): 329 try: 330 return urljoin(base_url, href) 331 except ValueError: 332 return None
333 elif handle_failures is None: 334 def link_repl(href): 335 return urljoin(base_url, href) 336 else: 337 raise ValueError( 338 "unexpected value for handle_failures: %r" % handle_failures) 339 340 self.rewrite_links(link_repl) 341
def resolve_base_href(self, handle_failures=None):
    """
    Find any ``<base href>`` tag in the document, and apply its
    values to all links found in the document.  Also remove the
    tag once it has been applied.

    ``handle_failures`` is passed on to ``make_links_absolute()``:
    None (default) lets a failure to process a URL abort the
    processing, 'ignore' ignores errors, and 'discard' removes
    failing URLs.
    """
    base_tags = self.xpath('//base[@href]|//x:base[@href]',
                           namespaces={'x': XHTML_NAMESPACE})
    base_href = None
    # Every <base> tag is removed; the href of the last one is used.
    for tag in base_tags:
        base_href = tag.get('href')
        tag.drop_tree()
    if base_href:
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)
362 454 503 504
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Borrow the documentation of the wrapped method.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            # String input is parsed first; there is nothing to copy.
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # Element input: work on a deep copy if requested per call,
            # or else according to the default given at construction.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        method = getattr(doc, self.name)
        result = method(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place method: hand back the document in the input's type.
        return _transform_result(result_type, doc)

# Function forms of the HtmlMixin methods of the same name.  ``copy`` is the
# wrapper's default for whether an element argument is deep-copied before
# the method is called on it (see ``_MethodFunc.__call__``).
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also provides the ``HtmlMixin`` methods."""
549
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also provides the ``HtmlMixin`` methods."""
552
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that also provides the ``HtmlMixin`` methods."""
555
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also provides the ``HtmlMixin`` methods."""
558 559
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be a mapping or an iterable of ``(tag name, class)``
    pairs.
    """
    # Tag name -> Element class; filled in as the element classes of this
    # module are defined.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Accept the documented mapping as well as the sequence of
            # pairs that the loop below unpacks.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # Applies to every tag that has a class registered.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type, or None
        to fall back to the normal lookup.  Element classes are looked
        up by lower-cased tag name.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

################################################################################
# parsing
################################################################################

# Matchers that test whether the input starts (after optional whitespace)
# with ``<html`` or ``<!doctype``, ignoring case: one for text input and
# one for bytes input.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """
    Parse ``html`` as a whole document and return the root element.

    ``parser`` defaults to ``html_parser``; extra keyword arguments are
    passed to ``etree.fromstring()``.  If ``ensure_head_body`` is true,
    a missing ``head`` or ``body`` child is added to the root.
    Raises ``etree.ParserError`` if parsing produced no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
622
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (only when it contains
    something other than whitespace).  If no_leading_text is true, then
    it will be an error if there is leading text, and it will always be
    a list of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a full document is wrapped in one.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            html = ('<html><body>'.encode('ascii') + html +
                    '</body></html>'.encode('ascii'))
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    fragments = []
    if body.text and body.text.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % body.text)
        fragments.append(body.text)
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments
661
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.  A true value that is not a string creates
    a ``div``.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    wrap = bool(create_parent)

    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not wrap,
        base_url=base_url, **kw)

    if wrap:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            # A leading string item becomes the text of the new parent.
            if isinstance(elements[0], basestring):
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    element = elements[0]
    if element.tail and element.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % element.tail)
    element.tail = None
    return element
709
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        looks_full = _looks_like_full_html_bytes
    else:
        looks_full = _looks_like_full_html_unicode
    is_full_html = looks_full(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = None
    if bodies:
        body = bodies[0]
        # Somehow there may be multiple bodies, which is bad, but just
        # smash them into one body
        for extra_body in bodies[1:]:
            if extra_body.text:
                if len(body):
                    body[-1].tail = (body[-1].tail or '') + extra_body.text
                else:
                    body.text = (body.text or '') + extra_body.text
            body.extend(extra_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            extra_body.drop_tree()

    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for extra_head in heads[1:]:
            head.extend(extra_head)
            # We don't care about text or tail in a head
            extra_head.drop_tree()
        return doc

    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
776
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.

    ``parser`` defaults to ``html_parser``; other keyword arguments are
    passed on to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
789
def _contains_block_level_tag(el):
    """
    Tell whether any element yielded by ``el.iter(etree.Element)`` has a
    tag listed in ``defs.block_tags`` (XHTML namespace ignored).
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    for node in el.iter(etree.Element):
        if _nons(node.tag) in defs.block_tags:
            return True
    return False
797
def _element_name(el):
    """
    Return a short name for a parsed fragment item, for use in error
    messages: 'comment' for comments, 'string' for text items, and the
    tag name (without XHTML namespace) for anything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
805 806 ################################################################################ 807 # form handling 808 ################################################################################ 809
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        # Assign every given field, then reset the remaining named fields
        # to None.
        fields = self.fields
        leftover = fields.keys()
        for key, field_value in value.items():
            if key in leftover:
                leftover.remove(key)
            fields[key] = field_value
        for key in leftover:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        # Label used in reprs: the name, else '#' + id, else the index of
        # this form among the forms in the document body.
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Skipped are: inputs without a name, checkable inputs that are
        not checked, inputs of type submit/image/reset, and fields
        whose value is None.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # one pair per selected option
                    results.extend([(name, v) for v in value])
                elif value is not None:
                    results.append((name, el.value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset'):
                continue
            value = el.value
            if value is not None:
                results.append((name, el.value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.  When the document has
        a base URL, the value read is joined with it.
        """
        base_url = self.base_url
        action = self.get('action')
        if action is None or not base_url:
            return action
        return urljoin(base_url, action)

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def _method__set(self, value):
        self.set('method', value.upper())

    method = property(_method__get, _method__set, doc=_method__get.__doc__)


HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its items are appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's ``method`` (``'GET'`` unless the form says
    otherwise, always upper-case), the URL is the target URL as a string
    (the form's action, or the document's base URL if the form has no
    action), and the values are a sequence of ``(name, value)`` tuples
    with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # Fall back to the document URL for forms without an action.
    url = form.action or form.base_url
    return open_http(form.method, url, values)
953
def open_http_urllib(method, url, values):
    """
    Default ``open_http`` implementation for `submit_form`, based on urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns whatever ``urlopen()`` returns.  Raises ValueError if ``url``
    is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        # urlencode() returns text on Python 3, but urlopen() requires the
        # request body to be bytes there.
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)
973
class FieldsDict(DictMixin):
    """
    Dictionary-like view of the field values of a form.

    It wraps an inputs accessor (as returned by ``form.inputs``): items
    are looked up by field name, and reading or assigning an item reads
    or assigns the ``value`` of the matching input.  Fields cannot be
    removed.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the keys, so that the length agrees with iteration.  (The
        # inputs accessor itself is not required to support len().)
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
998
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # select/input/textarea descendants of the form: with a given name,
    # and all of them.
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        input_type = matches[0].get('type')
        several = len(matches) > 1
        if input_type == 'radio' and several:
            group = RadioGroup(matches)
        elif input_type == 'checkbox' and several:
            group = CheckboxGroup(matches)
        else:
            # I don't like throwing away elements like this
            return matches[0]
        group.name = name
        return group

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # Distinct names of all inputs; unnamed inputs are left out.
        names = set([el.name for el in self])
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))
1064
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element (its ``name`` attribute)
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # The type is only shown for elements that have a (non-empty) one.
        input_type = getattr(self, 'type', None)
        type_part = ''
        if input_type:
            type_part = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
1092
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Child elements are serialised as XML for XHTML-namespaced
        # textareas and as HTML otherwise.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(
                child, method=serialisation_method, encoding='unicode')
        return content

    def _value__set(self, value):
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select, the value is that of the first option
        carrying a ``selected`` attribute, or None if there is none.
        Assigning None (or deleting the value) unselects all options;
        assigning a value that no option has raises ValueError.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for el in _options_xpath(self):
            if el.get('selected') is not None:
                value = el.get('value')
                if value is None:
                    # options without a value attribute use their text
                    value = el.text or ''
                if value:
                    value = value.strip()
                return value
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        checked_option = None
        # Only search for an option when there is a value to select;
        # None means "select nothing" and must not raise.
        if value is not None:
            value = value.strip()
            for el in _options_xpath(self):
                opt_value = el.get('value')
                if opt_value is None:
                    opt_value = el.text or ''
                if opt_value:
                    opt_value = opt_value.strip()
                if opt_value == value:
                    checked_option = el
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then select the option found (if any).
        for el in _options_xpath(self):
            if 'selected' in el.attrib:
                del el.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped
        option text for options without that attribute).
        """
        options = []
        for el in _options_xpath(self):
            value = el.get('value')
            if value is None:
                value = el.text or ''
            if value:
                value = value.strip()
            options.append(value)
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    from it to unselect the option.
    """

    def __init__(self, select):
        # The ``<select>`` element whose options this object manipulates.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    @staticmethod
    def _option_value(option):
        """
        Return the effective value of an ``<option>`` element: its
        ``value`` attribute, or its text when the attribute is missing.
        Non-empty values are whitespace-stripped.
        """
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = option.text or ''
        if opt_value:
            opt_value = opt_value.strip()
        return opt_value

    def __iter__(self):
        """
        Yield the value of every option that carries a ``selected``
        attribute.
        """
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ValueError if that option is not currently selected, or
        if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Message typo fixed ("not option" -> "no option") to match add().
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1287
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share the same name.

    Besides behaving like a list, the group offers a ``.value`` property
    to check/uncheck the inputs and a ``.value_options`` property that
    lists the possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        chosen = None
        if value is not None:
            # Find the radio to check before changing anything, so an
            # unknown value leaves the group untouched.
            matching = [radio for radio in self if radio.get('value') == value]
            if not matching:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
            chosen = matching[0]
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if chosen is not None:
            chosen.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio in the group.
        self.value = None

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for radio in self:
            options.append(radio.get('value'))
        return options

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, items)

1339
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before mutating: rejecting a bad argument must not
        # leave the group with all of its checkboxes unchecked.
        if not hasattr(value, '__iter__'):
            # An empty group has no element to take the name from.
            name = self[0].name if self else None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        # Deleting the value unchecks every checkbox in the group.
        self.value.clear()

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1378
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group
    of checkboxes that share the same name.
    """

    def __init__(self, group):
        # The group (an iterable of checkbox elements) this view works on.
        self.group = group

    def __iter__(self):
        # Collect eagerly so the snapshot is unaffected by later changes
        # to the ``checked`` attributes.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose value equals ``value``.

        Raises KeyError if there is no such checkbox.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            box.set('checked', '')
            return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose value equals ``value``.

        Raises KeyError if that checkbox is already unchecked or if
        there is no such checkbox.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)

1421
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type, lower-cased and defaulting to
    ``'text'``.

    ``.value`` gets and sets the value.

    Checkboxes and radios have ``input.checkable == True`` (it is false
    for every other type) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value yields ``'on'``;
        an unchecked one yields None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truthiness of ``value`` drives the
        # checked state; only string values are stored in the attribute.
        if value:
            self.checked = True
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set,
                       doc=_checked__get.__doc__)


HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if the
        label has no (non-empty) ``for`` attribute.
        """
        id = self.get('for')
        if not id:
            return None
        return self.body.get_element_by_id(id)

    def _for_element__set(self, other):
        # The link is expressed through the target's id, so the target
        # must have one.
        id = other.get('id')
        if not id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', id)

    def _for_element__del(self):
        # Unlink the label by removing its ``for`` attribute.  The
        # previous code removed the label's own ``id`` attribute, which
        # left the link in place and damaged an unrelated attribute.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, converting the tree to XHTML in place.

    ``html`` may be an element or an object providing ``getroot()``.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # No getroot(): use the argument itself as the root.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in html.iter(etree.Element):
        name = element.tag
        if name[0] == '{':
            # Already namespaced; leave it alone.
            continue
        element.tag = xhtml_prefix + name

1556
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree,
    converting the tree to HTML in place.

    ``xhtml`` may be an element or an object providing ``getroot()``.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # No getroot(): use the argument itself as the root.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(xhtml_prefix)
    # Only elements in the XHTML namespace are visited.
    for element in xhtml.iter(xhtml_prefix + "*"):
        element.tag = element.tag[cut:]

# Not a general-purpose pattern: it only targets the exact form of the
# Content-Type meta tag that libxml2 writes when serialising.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    if method != 'html' or include_meta_content_type:
        return serialised
    # Remove the Content-Type meta tag, using the pattern that matches
    # the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an ElementTree; an element is wrapped
    in an ElementTree first.  The file is written with ``encoding`` if
    given, otherwise with the document's own encoding, falling back to
    UTF-8.  The resulting ``file://`` URL is printed before the browser
    is launched.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    f = os.fdopen(handle, 'wb')
    try:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    path = fn.replace(os.path.sep, '/')
    # A file URL needs an absolute path starting with '/'.  Paths that
    # begin with a drive letter (Windows) would otherwise end up with
    # the drive in the host position ("file://C:/...").
    if not path.startswith('/'):
        path = '/' + path
    url = 'file://' + path
    print(url)
    webbrowser.open(url)

1673 1674 ################################################################################ 1675 # configure Element class lookup 1676 ################################################################################ 1677
class HTMLParser(etree.HTMLParser):
    """HTML parser preconfigured with ``HtmlElementClassLookup``, so
    that it returns lxml.html Element objects.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to the base parser.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1685
class XHTMLParser(etree.XMLParser):
    """XML parser preconfigured with ``HtmlElementClassLookup``, so
    that it returns lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to the base parser.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1706
def Element(*args, **kw):
    """Create a new HTML Element.

    The element is built by the module-level ``html_parser``; all
    arguments are passed through to its ``makeelement()``.

    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)


# Default parser instances shared by this module.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
