Package lxml :: Package html
[hide private]
[frames | no frames]

Source Code for Package lxml.html

   1  # Copyright (c) 2004 Ian Bicking. All rights reserved. 
   2  # 
   3  # Redistribution and use in source and binary forms, with or without 
   4  # modification, are permitted provided that the following conditions are 
   5  # met: 
   6  # 
   7  # 1. Redistributions of source code must retain the above copyright 
   8  # notice, this list of conditions and the following disclaimer. 
   9  # 
  10  # 2. Redistributions in binary form must reproduce the above copyright 
  11  # notice, this list of conditions and the following disclaimer in 
  12  # the documentation and/or other materials provided with the 
  13  # distribution. 
  14  # 
  15  # 3. Neither the name of Ian Bicking nor the names of its contributors may 
  16  # be used to endorse or promote products derived from this software 
  17  # without specific prior written permission. 
  18  # 
  19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
  22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 
  23  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  24  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  25  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  26  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  27  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  28  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  29  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  import sys 
  35  import re 
  36  try: 
  37      from urlparse import urljoin 
  38  except ImportError: 
  39      # Python 3 
  40      from urllib.parse import urljoin 
  41  import copy 
  42  from lxml import etree 
  43  from lxml.html import defs 
  44  from lxml.html._setmixin import SetMixin 
  45  try: 
  46      from collections import MutableMapping as DictMixin 
  47  except ImportError: 
  48      # Python < 2.6 
  49      from UserDict import DictMixin 
  50  try: 
  51      set 
  52  except NameError: 
  53      # Python 2.3 
  54      from sets import Set as set 
  55  try: 
  56      bytes 
  57  except NameError: 
  58      # Python < 2.6 
  59      bytes = str 
  60  try: 
  61      unicode 
  62  except NameError: 
  63      # Python 3 
  64      unicode = str 
  65  try: 
  66      basestring 
  67  except NameError: 
  68      # Python 3 
  69      basestring = (str, bytes) 
  70   
def __fix_docstring(s):
    """Strip the string-literal prefix that differs between Python versions.

    On Python 3 a ``u'`` that starts a line (after optional whitespace) is
    rewritten to a bare ``'``; on Python 2 the same is done for ``b'``.
    A falsy ``s`` (``None`` or the empty string) is returned unchanged.
    """
    if not s:
        return s
    import sys
    # Only the prefix letter depends on the running interpreter.
    if sys.version_info[0] >= 3:
        pattern = re.compile(r"^(\s*)u'", re.M)
    else:
        pattern = re.compile(r"^(\s*)b'", re.M)
    return pattern.sub(r"\1'", s)
# Public names exported by ``from lxml.html import *``.
# ('open_in_browser' used to be listed twice; the duplicate is dropped.)
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML tag
# and the same tag in the XHTML namespace (bound to the prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose whitespace-separated ``class`` list contains
# ``$class_name`` as a whole word.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Iterators over ``url(...)`` references and ``@import "..."`` rules in CSS text.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# A run of non-space characters.
_archive_re = re.compile(r'[^ ]+')
def _unquote_match(s, pos):
    """Remove one pair of matching surrounding quotes from ``s``.

    Returns ``(unquoted, pos + 1)`` when ``s`` starts and ends with the same
    quote character (``"`` or ``'``), otherwise ``(s, pos)`` unchanged.  The
    position is advanced by one to account for the dropped opening quote.
    """
    opening, closing = s[:1], s[-1:]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos
110
def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``typ`` is the type of the caller's original input.  A bytes input is
    answered with a UTF-8 serialisation of ``result``, a unicode input with
    a unicode serialisation; for any other type ``result`` is returned as is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)
120
def _nons(tag):
    """Return ``tag`` without an XHTML namespace prefix.

    A string tag of the form ``{<XHTML namespace>}name`` is reduced to
    ``name``.  Any other string, and any non-string tag, is returned
    unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
126
127 -class HtmlMixin(object):
128
def base_url(self):
    """
    The base URL recorded for the document this element belongs to
    (``docinfo.URL`` of the element's root tree).

    Combine it with ``urlparse.urljoin(el.base_url, href)`` to turn a
    relative link into an absolute URL.
    """
    tree = self.getroottree()
    return tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    All form elements found at or below this element, as returned by the
    pre-compiled ``_forms_xpath`` expression (plain and XHTML forms).
    """
    found = _forms_xpath(self)
    return found
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    The lookup uses a document-wide XPath (``//body``, plain or XHTML),
    and the first match is returned.  Raises ``IndexError`` if the
    document has no <body> element.
    """
    return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    The document's <head> element (plain or XHTML).  Works from any
    element of the document, since the search is document-wide; the
    first match is returned.
    """
    matches = self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})
    return matches[0]
head = property(head, doc=head.__doc__)
def _label__get(self):
    """
    Get or set the <label> element associated with this element.

    The association is made through this element's ``id`` and the
    label's ``for`` attribute.  Reading gives ``None`` when the element
    has no id or no label refers to it.
    """
    own_id = self.get('id')
    if own_id:
        matches = _label_xpath(self, id=own_id)
        if matches:
            return matches[0]
    return None

def _label__set(self, label):
    # A label can only point at an element that has an id.
    own_id = self.get('id')
    if not own_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if _nons(label.tag) != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', own_id)

def _label__del(self):
    # Detach the current label (if any) by removing its ``for`` attribute.
    current = self.label
    if current is None:
        return
    del current.attrib['for']

label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
def drop_tree(self):
    """
    Remove this element, together with its children and text, from
    the tree.  Its tail text is kept: it is appended to the tail of the
    previous sibling or, if there is none, to the parent's text.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
206
def drop_tag(self):
    """
    Remove the tag itself while keeping its children and text, which
    are spliced into the parent at the tag's former position.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print(tostring(h, encoding='unicode'))
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()

    def attach_before(chunk):
        # Text that sat at the start of this element now follows the
        # previous sibling, or opens the parent if there is no sibling.
        if previous is not None:
            previous.tail = (previous.tail or '') + chunk
        else:
            parent.text = (parent.text or '') + chunk

    text, tail = self.text, self.tail
    if text and isinstance(self.tag, basestring):
        # not a Comment, etc.
        attach_before(text)
    if tail:
        if len(self):
            # The tail follows the last child once the children move up.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + tail
        else:
            attach_before(tail)
    position = parent.index(self)
    parent[position:position + 1] = self[:]
238 246
def find_class(self, class_name):
    """
    Return the elements, at or below this one, whose ``class``
    attribute contains ``class_name`` as one of its space-separated
    entries.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
252
def get_element_by_id(self, id, *default):
    """
    Return the first element with the given id, searching this
    element and its descendants.  When nothing matches, the optional
    default argument is returned if one was given; otherwise KeyError
    is raised.

    Ids are frequently duplicated in real-world HTML.  Like browsers,
    this returns only the first match.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        first = _id_xpath(self, id=id)[0]
    except IndexError:
        if not default:
            raise KeyError(id)
        return default[0]
    return first
273
def text_content(self):
    """
    Return the text of this element including the text of all its
    children (the XPath ``string()`` value of the element).
    """
    text = _collect_string_content(self)
    return text
279
def cssselect(self, expr, translator='html'):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
    -- note that pre-compiling the expression can provide a substantial
    speedup.

    ``translator`` is passed through to ``CSSSelector`` unchanged and
    defaults to ``'html'``.
    """
    # Do the import here to make the dependency optional.
    from lxml.cssselect import CSSSelector
    return CSSSelector(expr, translator=translator)(self)
292 293 ######################################## 294 ## Link functions 295 ######################################## 296 327 elif handle_failures == 'discard': 328 def link_repl(href): 329 try: 330 return urljoin(base_url, href) 331 except ValueError: 332 return None
333 elif handle_failures is None: 334 def link_repl(href): 335 return urljoin(base_url, href) 336 else: 337 raise ValueError( 338 "unexpected value for handle_failures: %r" % handle_failures) 339 340 self.rewrite_links(link_repl) 341
def resolve_base_href(self, handle_failures=None):
    """
    Apply the document's ``<base href>`` to every link and remove the
    ``<base>`` tag(s).  If several such tags exist, all are removed and
    the href of the last one is used.  Nothing else happens when no
    non-empty base href is found.

    ``handle_failures`` is forwarded to ``make_links_absolute``: None
    (default) aborts on a URL that cannot be processed, 'ignore'
    ignores such errors and 'discard' removes the failing URLs.
    """
    last_href = None
    for base_el in self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE}):
        last_href = base_el.get('href')
        base_el.drop_tree()
    if last_href:
        self.make_links_absolute(last_href, resolve_base_href=False,
                                 handle_failures=handle_failures)
362 454 503 504
class _MethodFunc(object):
    """
    Wraps an element method so that it can be called as a plain function
    on either an element or an HTML string.

    A string argument is parsed first.  The call returns whatever the
    method returns; if the method returns None (it worked in place), the
    document is returned instead, converted back to the input type.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Reuse the wrapped method's documentation.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        input_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            # Work on a deep copy so the caller's tree stays untouched.
            doc = copy.deepcopy(doc)
        outcome = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if outcome is not None:
            return outcome
        # Then return what we got in
        return _transform_result(input_type, doc)
# Function-style entry points for the HtmlMixin methods of the same name.
# ``copy=True`` is used for the wrappers whose names indicate that they
# rewrite the document, so by default they operate on a deep copy of an
# element passed in.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also provides the ``HtmlMixin`` methods."""
    pass
549
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also provides the ``HtmlMixin`` methods."""
    pass
552
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class that also provides the ``HtmlMixin`` methods."""
    pass
555
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also provides the ``HtmlMixin`` methods."""
    pass
558 559
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Shared registry of tag name -> element class; copied per instance
    # when no ``classes`` argument is given.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # The documented form is a mapping, but the loop below needs
            # pairs; iterating a dict directly would yield only its keys.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # Applies to every tag that has a registered class.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the default.

        Element classes are looked up by lower-cased tag name, falling
        back to ``HtmlElement``.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
################################################################################
# parsing
################################################################################

# Matchers that tell whether markup starts (after optional whitespace) with
# an ``<html`` tag or a ``<!doctype`` declaration, case-insensitively.  One
# is compiled from a unicode pattern and one from a bytes pattern so that
# each kind of input can be tested without conversion.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """
    Parse ``html`` as a whole document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are handed to ``etree.fromstring``.  With
    ``ensure_head_body`` true, a missing ``head`` is inserted as the
    first child and a missing ``body`` is appended.

    Raises ``etree.ParserError`` when parsing yields no document.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
622
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parse a run of HTML elements and return them as a list.

    If the markup starts with non-whitespace text, that text is the
    first list item (as a string).  With ``no_leading_text`` true such
    text is an error instead, so the list holds elements only.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Markup that is not already a full document is wrapped in one, using
    # a wrapper of the same string type as the input.
    is_bytes = isinstance(html, bytes)
    if is_bytes and not _looks_like_full_html_bytes(html):
        # can't use %-formatting in early Py3 versions
        html = ('<html><body>'.encode('ascii') + html +
                '</body></html>'.encode('ascii'))
    elif not is_bytes and not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading = body.text
    fragments = []
    if leading and leading.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading)
        fragments.append(leading)
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments
661
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element.  More than one element, or any
    non-whitespace text before or after the element, is an error.

    With ``create_parent`` true, everything parsed is instead wrapped in
    a new parent element, and leading or trailing text is allowed.  A
    string value names the parent's tag; any other true value gives a
    ``div``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    wants_parent = bool(create_parent)

    parts = fragments_fromstring(
        html, parser=parser, no_leading_text=not wants_parent,
        base_url=base_url, **kw)

    if create_parent:
        if isinstance(create_parent, basestring):
            parent_tag = create_parent
        else:
            parent_tag = 'div'
        wrapper = Element(parent_tag)
        # A leading string item becomes the wrapper's text.
        if parts and isinstance(parts[0], basestring):
            wrapper.text = parts.pop(0)
        wrapper.extend(parts)
        return wrapper

    if not parts:
        raise etree.ParserError('No elements found')
    if len(parts) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in parts]))
    only = parts[0]
    if only.tail and only.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % only.tail)
    # Whitespace-only tail is dropped.
    only.tail = None
    return only
707
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The whole document root is returned when the input starts with an
    ``<html>`` tag or a doctype, when the parsed result has a head, or
    when it has no body.  Otherwise, a body holding exactly one element
    and no surrounding text yields that element; any other body is
    returned itself, retagged as ``div`` (if it contains a block-level
    tag) or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # Decide up front, on the raw input, whether this is a full document.
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc
    # otherwise, let's parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    # Keep the extra body's leading text right after the
                    # content collected so far.
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so let's keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
774
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The result is a tree rather than an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    useful when parsing from a file-like object.  ``parser`` defaults
    to the module's ``html_parser``.
    """
    if parser is None:
        chosen_parser = html_parser
    else:
        chosen_parser = parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
787
def _contains_block_level_tag(el):
    """Tell whether ``el`` or any element below it is a block-level tag.

    Tags are compared against ``defs.block_tags`` after stripping an
    XHTML namespace.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    return any(_nons(node.tag) in block_tags
               for node in el.iter(etree.Element))
795
def _element_name(el):
    """Return a short name for ``el`` for use in error messages.

    Comments give ``'comment'``, strings give ``'string'``; anything else
    gives its tag without an XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
803 804 ################################################################################ 805 # form handling 806 ################################################################################ 807
class FormElement(HtmlElement):
    """
    Element class used for <form> tags.
    """

    def inputs(self):
        """
        An accessor object covering every input element of this form.

        See `InputGetter` for what the accessor offers.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like view of the form's fields.  Assigning to a key
        of this object changes the value of the matching form field.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        fields = self.fields
        # Names that exist in the form but are absent from the new mapping.
        leftover = fields.keys()
        for key, new_value in value.items():
            if key in leftover:
                leftover.remove(key)
            fields[key] = new_value
        for key in leftover:
            # Case of an unnamed input; these aren't really
            # expressed in form_values() anyway.
            if key is not None:
                fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        # Prefer the name attribute, then '#' + id, and finally the form's
        # position among the forms below the document body.
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        body = self.body
        forms = list(body.iter('form'))
        if not forms:
            forms = list(body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Collect the form's current field values as a list of
        ``(name, value)`` tuples, in a shape that can be handed to
        ``urllib.urlencode()``.

        Skipped are inputs without a name, unchecked checkable inputs,
        inputs of type submit, image or reset, and fields whose value
        is None.
        """
        pairs = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                pairs.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # One pair per selected option.
                    pairs.extend([(name, v) for v in value])
                elif value is not None:
                    pairs.append((name, el.value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if ((el.checkable and not el.checked)
                    or el.type in ('submit', 'image', 'reset')):
                continue
            value = el.value
            if value is not None:
                pairs.append((name, el.value))
        return pairs

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.  When read, the action
        is joined with the element's base URL if both are available.
        """
        action = self.get('action')
        base = self.base_url
        if action is None or not base:
            return action
        return urljoin(base, action)

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        attrs = self.attrib
        if 'action' in attrs:
            del attrs['action']

    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  The value read is always upper-case
        and is ``'GET'`` when the attribute is missing.
        """
        declared = self.get('method', 'GET')
        return declared.upper()

    def _method__set(self, value):
        self.set('method', value.upper())

    method = property(_method__get, _method__set, doc=_method__get.__doc__)

HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` adds further form data; it may be a mapping or a
    sequence of ``(name, value)`` tuples and is appended after the form's
    own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (``'GET'`` unless the form says
    otherwise), the URL is the target URL as a string, and the values are
    a sequence of ``(name, value)`` tuples with the form data.  The URL is
    the form's action, or the form's base URL if it has no action.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # A form without an action submits to the document's own URL.
    url = form.action or form.base_url
    return open_http(form.method, url, values)
951
def open_http_urllib(method, url, values):
    """
    Default ``open_http`` implementation for `submit_form`, based on the
    standard library's ``urlopen``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns whatever ``urlopen`` returns.

    Raises ``ValueError`` if ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3's urlopen() rejects a str request body; the
            # url-encoded form data is pure ASCII, so encoding is safe.
            data = data.encode('ASCII')
    return urlopen(url, data)
971
class FieldsDict(DictMixin):
    """
    Dictionary-like view of a form's field values.

    Wraps an inputs accessor: item access reads or writes the ``value``
    of the input stored under that name.  Keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # The message used to name a class called "ElementDict", which
        # does not exist; report this class instead.
        raise KeyError(
            "You cannot remove keys from FieldsDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
996
class InputGetter(object):

    """
    Accessor for all the input fields of one form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing a name come back as one `CheckboxGroup`
    (list-like, and it supports setting values); radio inputs sharing a
    name are grouped the same way.

    Iterating yields every input element.  That is not the same as
    looking up each name, because checkboxes and radio elements are
    yielded one by one.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        first = matches[0]
        if len(matches) > 1:
            # Several inputs share the name: radios and checkboxes are
            # grouped, decided by the type of the first match.
            input_type = first.get('type')
            if input_type == 'radio':
                group_class = RadioGroup
            elif input_type == 'checkbox':
                group_class = CheckboxGroup
            else:
                group_class = None
            if group_class is not None:
                group = group_class(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # Distinct names of all inputs; unnamed inputs are left out.
        names = set()
        for el in self:
            names.add(el.name)
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))
1062
class InputMixin(object):

    """
    Shared behaviour of all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the element's ``name`` attribute
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting a missing name is a no-op.
        attrs = self.attrib
        if 'name' in attrs:
            del attrs['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Not every input class has a ``type``; leave it out if empty.
        kind = getattr(self, 'type', None)
        if kind:
            suffix = ' type=%r' % kind
        else:
            suffix = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
1090
class TextareaElement(InputMixin, HtmlElement):
    """
    Element class for ``<textarea>``.  ``.name`` gives the field name and
    ``.value`` reads or replaces the content.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Child elements are serialised as XML for an XHTML textarea and
        # as HTML otherwise.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(
                child, method=method, encoding='unicode')
        return content

    def _value__set(self, value):
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    Element class for ``<select>``.  ``.name`` gives the field name.

    ``.value`` is the value of the selected option.  For a multi-select
    (``<select multiple>``) it is a set-like object instead.  Either way,
    ``.value_options`` lists the values that can be chosen.

    ``.multiple`` is a boolean that tells whether this is a multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        For a multi-select this is a set-like object holding all the
        selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # An option without a value attribute stands for its text.
            selected = option.get('value')
            if selected is None:
                selected = option.text or ''
            if selected:
                selected = selected.strip()
            return selected
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        if value is not None:
            value = value.strip()
            # Find the option to select before touching any attribute, so
            # an unknown value leaves the current selection intact.
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = option.text or ''
                if candidate:
                    candidate = candidate.strip()
                if candidate == value:
                    checked_option = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            attrs = option.attrib
            if 'selected' in attrs:
                del attrs['selected']
        if value is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the values this select can take: the ``value`` attribute of
        each ``<option>`` element, or the option's text where that
        attribute is missing.
        """
        values = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = option.text or ''
            if candidate:
                candidate = candidate.strip()
            values.append(candidate)
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    from it to unselect the option.
    """

    def __init__(self, select):
        # The <select> element whose options this object reads and modifies.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def _option_value(self, option):
        # Value of one <option>: its ``value`` attribute, falling back to
        # its text; non-empty values are whitespace-stripped.
        value = option.get('value')
        if value is None:
            value = option.text or ''
        if value:
            value = value.strip()
        return value

    def _find_option(self, item):
        # Return the first <option> whose value equals ``item``;
        # raise ValueError if there is none.
        for option in self.options:
            if self._option_value(option) == item:
                return option
        raise ValueError(
            "There is no option with the value %r" % item)

    def __iter__(self):
        """Yield the value of every currently selected option."""
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value is ``item``.

        Raises ValueError if no option has that value.
        """
        self._find_option(item).set('selected', '')

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``.

        Raises ValueError if no option has that value, or if the matching
        option is not currently selected.
        """
        option = self._find_option(item)
        if 'selected' not in option.attrib:
            raise ValueError(
                "The option %r is not currently selected" % item)
        del option.attrib['selected']

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1285
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property
    checks/unchecks the inputs and ``.value_options`` lists the
    possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' not in radio.attrib:
                continue
            return radio.get('value')
        return None

    def _value__set(self, value):
        chosen = None
        if value is not None:
            # Locate the target first so that an unknown value raises
            # before any radio button is unchecked.
            for radio in self:
                if radio.get('value') == value:
                    chosen = radio
                    break
            else:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if value is not None:
            chosen.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio in the group.
        self.value = None

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, items)

1337
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate *before* touching any checkbox, so that a rejected
        # value does not leave the group cleared.  Strings are iterable on
        # Python 3 but are never a meaningful "sequence of values" here,
        # so they are rejected explicitly (as they were implicitly on
        # Python 2, where str has no __iter__).
        if not hasattr(value, '__iter__') or isinstance(value, basestring):
            # An empty group has no element to take the name from.
            name = None
            if self:
                name = self[0].name
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        # Deleting the value unchecks every checkbox in the group.
        self.value.clear()

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1376
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # The list of checkbox elements this object reads and modifies.
        self.group = group

    def __iter__(self):
        """Iterate over the ``value`` of every checked checkbox."""
        return iter([
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib])

    def add(self, value):
        """
        Check the first checkbox whose ``value`` attribute equals ``value``.

        Raises KeyError if the group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose ``value`` attribute equals ``value``.

        Raises KeyError if there is no such checkbox or if it is already
        unchecked.
        """
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        "The checkbox with value %r was already unchecked" % value)
                break
        else:
            raise KeyError(
                "No checkbox with value %r" % value)

    def __repr__(self):
        # The group is a plain list of elements and does not itself carry
        # a ``name``; fall back to the name of its first checkbox so that
        # repr() does not fail with AttributeError.
        group = self.group
        name = getattr(group, 'name', None)
        if name is None and group:
            name = group[0].name
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            name)

1419
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type, lower-cased and defaulting to
    ``'text'``.

    ``.value`` reads and writes the value.

    Checkboxes and radios have ``input.checkable == True`` (it is false
    for every other type) together with a boolean ``.checked``
    attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        # A true value checks the input; only string values are also
        # stored in the ``value`` attribute.
        self.checked = True
        if isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ['checkbox', 'radio']
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set,
                       doc=_checked__get.__doc__)


# Make the HTML element class lookup return this class for <input> tags.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # Pass an explicit default so that a dangling ``for`` reference
        # yields None, as documented, instead of raising KeyError.
        return self.body.get_element_by_id(target_id, None)

    def _for_element__set(self, other):
        # The link is made through the target's id, so the target must
        # have one.
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link lives in the label's ``for`` attribute; the label's
        # own ``id`` must be left alone.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)


# Make the HTML element class lookup return this class for <label> tags.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, converting the tree to XHTML in place.

    Accepts either an element or an object with a ``getroot()`` method.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # no getroot(): work on the object as passed in
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in html.iter(etree.Element):
        name = element.tag
        if name[0] == '{':
            # already carries a namespace, leave it alone
            continue
        element.tag = xhtml_prefix + name

1554
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree,
    converting the tree to plain HTML in place.

    Accepts either an element or an object with a ``getroot()`` method.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # no getroot(): work on the object as passed in
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    local_name_start = len(xhtml_prefix)
    # Only tags in the XHTML namespace are visited.
    for element in xhtml.iter(xhtml_prefix + "*"):
        element.tag = element.tag[local_name_start:]


# Substitution helpers used by tostring() to strip the Content-Type
# <meta> tag from serialised output.  Each is the bound ``.sub`` method of
# a compiled pattern, so it is called as ``helper(replacement, text)``.
# One variant works on text strings, the other on byte strings.
#
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: unless ``include_meta_content_type`` is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag found in the output of
    the 'html' method is stripped from the result.  Output of the other
    methods is returned as serialised.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    if method != 'html' or include_meta_content_type:
        return serialised
    # Pick the substitution helper that matches the type of the result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


# Post-process the docstring above through the module's docstring fixer.
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an ElementTree.  ``encoding`` selects
    the encoding of the written file; when it is not given, the
    encoding recorded in the document is used, falling back to UTF-8.
    The ``file://`` URL of the temporary file is printed before the
    browser is launched.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    f = os.fdopen(handle, 'wb')
    try:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    path = fn.replace(os.path.sep, '/')
    if not path.startswith('/'):
        # A Windows drive path such as "C:/Temp/x.html" needs a leading
        # slash; otherwise "C:" ends up in the host part of the URL.
        path = '/' + path
    url = 'file://' + path
    print(url)
    webbrowser.open(url)

1671 1672 ################################################################################ 1673 # configure Element class lookup 1674 ################################################################################ 1675
class HTMLParser(etree.HTMLParser):
    """An HTML parser preconfigured so that the elements it creates are
    lxml.html Element objects.
    """
    def __init__(self, **kwargs):
        # All keyword arguments go straight to etree.HTMLParser.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1683
class XHTMLParser(etree.XMLParser):
    """An XML parser preconfigured so that the elements it creates are
    lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        # All keyword arguments go straight to etree.XMLParser.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1704
def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are handed to ``makeelement()`` of the module-level
    ``html_parser``.  This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)


# Module-level parser instances, created once at import time.
# Element() above builds its elements through ``html_parser``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
