Package lxml :: Package html
[hide private]
[frames] | [no frames]

Source Code for Package lxml.html

   1  # Copyright (c) 2004 Ian Bicking. All rights reserved. 
   2  # 
   3  # Redistribution and use in source and binary forms, with or without 
   4  # modification, are permitted provided that the following conditions are 
   5  # met: 
   6  # 
   7  # 1. Redistributions of source code must retain the above copyright 
   8  # notice, this list of conditions and the following disclaimer. 
   9  # 
  10  # 2. Redistributions in binary form must reproduce the above copyright 
  11  # notice, this list of conditions and the following disclaimer in 
  12  # the documentation and/or other materials provided with the 
  13  # distribution. 
  14  # 
  15  # 3. Neither the name of Ian Bicking nor the names of its contributors may 
  16  # be used to endorse or promote products derived from this software 
  17  # without specific prior written permission. 
  18  # 
  19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
  22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 
  23  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  24  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  25  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  26  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  27  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  28  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  29  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  from __future__ import absolute_import 
  35   
  36  __all__ = [ 
  37      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  38      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  39      'find_rel_links', 'find_class', 'make_links_absolute', 
  40      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 
  41   
  42   
  43  import copy 
  44  import sys 
  45  import re 
  46  from functools import partial 
  47   
  48  try: 
  49      from collections.abc import MutableMapping, MutableSet 
  50  except ImportError: 
  51      from collections import MutableMapping, MutableSet 
  52   
  53  from .. import etree 
  54  from . import defs 
  55  from ._setmixin import SetMixin 
  56   
  57  try: 
  58      from urlparse import urljoin 
  59  except ImportError: 
  60      # Python 3 
  61      from urllib.parse import urljoin 
  62   
  63  try: 
  64      unicode 
  65  except NameError: 
  66      # Python 3 
  67      unicode = str 
  68  try: 
  69      basestring 
  70  except NameError: 
  71      # Python 3 
  72      basestring = (str, bytes) 
73 74 75 -def __fix_docstring(s):
76 if not s: 77 return s 78 if sys.version_info[0] >= 3: 79 sub = re.compile(r"^(\s*)u'", re.M).sub 80 else: 81 sub = re.compile(r"^(\s*)b'", re.M).sub 82 return sub(r"\1'", s)
# Namespace URI of XHTML elements; bound to the prefix ``x`` in the XPath
# expressions below.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  The link, option, form and label ones
# match both the plain tag and its XHTML-namespaced counterpart.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches $class_name as a whole, space-delimited word of @class.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath ``string()``: the string value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterator over ``url(...)`` occurrences; group 1 is the argument, still
# including its quotes when it was quoted.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterator over ``@import "..."`` occurrences (double-quoted form only).
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters.
_archive_re = re.compile(r'[^ ]+')
# Searches text of the form ``<anything>; [url=]<url>`` and captures the
# remainder as the named group ``url``.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
104 105 106 -def _unquote_match(s, pos):
107 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 108 return s[1:-1], pos+1 109 else: 110 return s,pos
111
112 113 -def _transform_result(typ, result):
114 """Convert the result back into the input type. 115 """ 116 if issubclass(typ, bytes): 117 return tostring(result, encoding='utf-8') 118 elif issubclass(typ, unicode): 119 return tostring(result, encoding='unicode') 120 else: 121 return result
122
123 124 -def _nons(tag):
125 if isinstance(tag, basestring): 126 if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: 127 return tag.split('}')[-1] 128 return tag
129
class Classes(MutableSet):
    """Set-like view of the ``class`` attribute of an element.

    The view wraps an attribute mapping and reads or rewrites its
    ``'class'`` entry on every operation.  Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        self._attributes = attributes
        # Callable returning the current attribute text ('' when unset).
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        # A class name must be non-empty and free of whitespace.
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add the class ``value``; a class that is already present is
        left alone.
        """
        self._check_name(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove the class ``value`` when present; otherwise do nothing.

        The attribute itself is deleted once no class is left.
        """
        self._check_name(value)
        remaining = [
            name for name in self._get_class_value().split()
            if name != value
        ]
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove the class ``value``, which must be present.

        Raises KeyError when it is not.
        """
        self._check_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        text = self._get_class_value()
        if name not in text:
            # cheap substring test before splitting
            return False
        return name in text.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add every name in 'values' that is not present yet.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add the class ``value`` if it is missing, remove it if present.

        Returns true when the class was added (it is now enabled) and
        false when it was removed (it is now disabled).
        """
        self._check_name(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
237
238 239 -class HtmlMixin(object):
240
241 - def set(self, key, value=None):
242 """set(self, key, value=None) 243 244 Sets an element attribute. If no value is provided, or if the value is None, 245 creates a 'boolean' attribute without value, e.g. "<form novalidate></form>" 246 for ``form.set('novalidate')``. 247 """ 248 super(HtmlElement, self).set(key, value)
249 250 @property
251 - def classes(self):
252 """ 253 A set-like wrapper around the 'class' attribute. 254 """ 255 return Classes(self.attrib)
256 257 @classes.setter
258 - def classes(self, classes):
259 assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc. 260 value = classes._get_class_value() 261 if value: 262 self.set('class', value) 263 elif self.get('class') is not None: 264 del self.attrib['class']
265 266 @property
267 - def base_url(self):
268 """ 269 Returns the base URL, given when the page was parsed. 270 271 Use with ``urlparse.urljoin(el.base_url, href)`` to get 272 absolute URLs. 273 """ 274 return self.getroottree().docinfo.URL
275 276 @property
277 - def forms(self):
278 """ 279 Return a list of all the forms 280 """ 281 return _forms_xpath(self)
282 283 @property
284 - def body(self):
285 """ 286 Return the <body> element. Can be called from a child element 287 to get the document's head. 288 """ 289 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
290 291 @property
292 - def head(self):
293 """ 294 Returns the <head> element. Can be called from a child 295 element to get the document's head. 296 """ 297 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
298 299 @property
300 - def label(self):
301 """ 302 Get or set any <label> element associated with this element. 303 """ 304 id = self.get('id') 305 if not id: 306 return None 307 result = _label_xpath(self, id=id) 308 if not result: 309 return None 310 else: 311 return result[0]
312 313 @label.setter
314 - def label(self, label):
315 id = self.get('id') 316 if not id: 317 raise TypeError( 318 "You cannot set a label for an element (%r) that has no id" 319 % self) 320 if _nons(label.tag) != 'label': 321 raise TypeError( 322 "You can only assign label to a label element (not %r)" 323 % label) 324 label.set('for', id)
325 326 @label.deleter
327 - def label(self):
328 label = self.label 329 if label is not None: 330 del label.attrib['for']
331
332 - def drop_tree(self):
333 """ 334 Removes this element from the tree, including its children and 335 text. The tail text is joined to the previous element or 336 parent. 337 """ 338 parent = self.getparent() 339 assert parent is not None 340 if self.tail: 341 previous = self.getprevious() 342 if previous is None: 343 parent.text = (parent.text or '') + self.tail 344 else: 345 previous.tail = (previous.tail or '') + self.tail 346 parent.remove(self)
347
348 - def drop_tag(self):
349 """ 350 Remove the tag, but not its children or text. The children and text 351 are merged into the parent. 352 353 Example:: 354 355 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 356 >>> h.find('.//b').drop_tag() 357 >>> print(tostring(h, encoding='unicode')) 358 <div>Hello World!</div> 359 """ 360 parent = self.getparent() 361 assert parent is not None 362 previous = self.getprevious() 363 if self.text and isinstance(self.tag, basestring): 364 # not a Comment, etc. 365 if previous is None: 366 parent.text = (parent.text or '') + self.text 367 else: 368 previous.tail = (previous.tail or '') + self.text 369 if self.tail: 370 if len(self): 371 last = self[-1] 372 last.tail = (last.tail or '') + self.tail 373 elif previous is None: 374 parent.text = (parent.text or '') + self.tail 375 else: 376 previous.tail = (previous.tail or '') + self.tail 377 index = parent.index(self) 378 parent[index:index+1] = self[:]
379 387
388 - def find_class(self, class_name):
389 """ 390 Find any elements with the given class name. 391 """ 392 return _class_xpath(self, class_name=class_name)
393
394 - def get_element_by_id(self, id, *default):
395 """ 396 Get the first element in a document with the given id. If none is 397 found, return the default argument if provided or raise KeyError 398 otherwise. 399 400 Note that there can be more than one element with the same id, 401 and this isn't uncommon in HTML documents found in the wild. 402 Browsers return only the first match, and this function does 403 the same. 404 """ 405 try: 406 # FIXME: should this check for multiple matches? 407 # browsers just return the first one 408 return _id_xpath(self, id=id)[0] 409 except IndexError: 410 if default: 411 return default[0] 412 else: 413 raise KeyError(id)
414
415 - def text_content(self):
416 """ 417 Return the text content of the tag (and the text in any children). 418 """ 419 return _collect_string_content(self)
420
421 - def cssselect(self, expr, translator='html'):
422 """ 423 Run the CSS expression on this element and its children, 424 returning a list of the results. 425 426 Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 427 -- note that pre-compiling the expression can provide a substantial 428 speedup. 429 """ 430 # Do the import here to make the dependency optional. 431 from lxml.cssselect import CSSSelector 432 return CSSSelector(expr, translator=translator)(self)
433 434 ######################################## 435 ## Link functions 436 ######################################## 437 468 elif handle_failures == 'discard': 469 def link_repl(href): 470 try: 471 return urljoin(base_url, href) 472 except ValueError: 473 return None
474 elif handle_failures is None: 475 def link_repl(href): 476 return urljoin(base_url, href) 477 else: 478 raise ValueError( 479 "unexpected value for handle_failures: %r" % handle_failures) 480 481 self.rewrite_links(link_repl) 482
483 - def resolve_base_href(self, handle_failures=None):
484 """ 485 Find any ``<base href>`` tag in the document, and apply its 486 values to all links found in the document. Also remove the 487 tag once it has been applied. 488 489 If ``handle_failures`` is None (default), a failure to process 490 a URL will abort the processing. If set to 'ignore', errors 491 are ignored. If set to 'discard', failing URLs will be removed. 492 """ 493 base_href = None 494 basetags = self.xpath('//base[@href]|//x:base[@href]', 495 namespaces={'x': XHTML_NAMESPACE}) 496 for b in basetags: 497 base_href = b.get('href') 498 b.drop_tree() 499 if not base_href: 500 return 501 self.make_links_absolute(base_href, resolve_base_href=False, 502 handle_failures=handle_failures)
503 593 642
class _MethodFunc(object):
    """
    Function-style wrapper around an element method.

    The callable accepts either an element or an HTML string as its
    first argument.  It returns whatever the method returns; when the
    method returns None (i.e. it worked in place), the document is
    returned instead, serialised for string input.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Borrow the docstring of the wrapped method.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # An explicit copy=... keyword overrides the instance default.
            if kw.pop('copy', self.copy):
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
# Module-level function forms of the ``HtmlMixin`` methods of the same
# name.  ``copy`` is the default for whether an element argument is
# deep-copied before the method is called on it.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the ``HtmlMixin`` methods."""
690
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that combines ``etree.ElementBase`` with ``HtmlMixin``."""
    # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
    # ``etree.ElementBase`` is listed first in the bases, so the mixin's
    # versions are bound explicitly on the class.
    cssselect = HtmlMixin.cssselect
    set = HtmlMixin.set
696
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that also carries the ``HtmlMixin`` methods."""
700
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the ``HtmlMixin`` methods."""
704
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.  A ``classes`` mapping passed by the
    caller is stored as it is, and the mixed classes are written into it.
    """
    # Tag name -> Element class; copied when no ``classes`` are passed.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, which
            # cannot be unpacked into (name, mixin); use its items.
            if hasattr(mixins, 'items'):
                pairs = mixins.items()
            else:
                pairs = mixins
            mixers = {}
            for name, value in pairs:
                if name == '*':
                    # applies to every tag that has an Element class
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins go first so that they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class for a node, or None to fall back to the
        normal lookup.

        Elements are looked up by lower-cased tag name, with
        ``HtmlElement`` as the default.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
################################################################################
# parsing
################################################################################

# Matchers that tell whether the input starts, after optional whitespace
# and ignoring case, with ``<html`` or ``<!doctype``.  One is compiled
# from a text pattern and one from a byte pattern, to suit either input.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse *html* as a document and return the parsed root.

    ``parser`` defaults to ``html_parser``; extra keywords are passed to
    ``etree.fromstring()``.  With ``ensure_head_body`` true, a ``head``
    and/or ``body`` child is added to the result when ``find()`` does
    not locate one.

    Raises ``etree.ParserError`` when parsing yields nothing.
    """
    chosen_parser = html_parser if parser is None else parser
    root = etree.fromstring(html, chosen_parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
770
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parse several HTML elements and return them as a list.

    The first item of the list may be a string (leading text).  When
    no_leading_text is true, leading text is an error instead, so the
    list contains elements only.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            opening = '<html><body>'.encode('ascii')
            closing = '</body></html>'.encode('ascii')
            html = opening + html + closing
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading = body.text
    has_leading_text = bool(leading and leading.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % leading)
    elements = [leading] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
809
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse a single HTML element.  More than one element, or anything
    other than whitespace before or after the element, is an error.

    If ``create_parent`` is true (or is a tag name) the parsed content
    is wrapped in a new parent element (a ``div`` unless a tag name was
    given).  In that case leading or trailing text is allowed, and so
    are multiple elements.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    wrap = bool(create_parent)

    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not wrap,
        base_url=base_url, **kw)

    if create_parent:
        tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(tag)
        if elements:
            if isinstance(elements[0], basestring):
                # leading text becomes the text of the new parent
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % names)
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el
858
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The result is one of:

    * the whole parsed document, when the input starts with ``<html`` or
      ``<!doctype``, when the parsed document has a head, or when it has
      no body;
    * the single child of the body, when the body holds exactly one
      element and no non-whitespace text around it;
    * otherwise the body itself, renamed to ``div`` if it contains a
      block-level tag and to ``span`` if not.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # Decide from the raw input whether it looks like a full document.
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        # fall back to the XHTML-namespaced tag
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    # The extra body's text goes after the last child of
                    # the first body, or onto its text if it has none.
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        # fall back to the XHTML-namespaced tag
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
926
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: the result is a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    The ``base_url`` keyword overrides the base URL, which is most
    useful when parsing from a file-like object.  ``parser`` defaults
    to ``html_parser``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
940
def _contains_block_level_tag(el):
    """Return True if *el* or any element below it has a tag listed in
    ``defs.block_tags``."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))
949
def _element_name(el):
    """Return a short name for *el*: ``'comment'`` for comment nodes,
    ``'string'`` for strings, and otherwise the tag without an XHTML
    namespace."""
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
958
################################################################################
# form handling
################################################################################

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        An accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  Setting values in it changes the form.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Keys still in this list after the first loop were not given a
        # new value and get reset to None.
        untouched = current.keys()
        for key, new_value in value.items():
            if key in untouched:
                untouched.remove(key)
            current[key] = new_value
        for key in untouched:
            # Case of an unnamed input; these aren't really
            # expressed in form_values() anyway.
            if key is not None:
                current[key] = None

    def _name(self):
        # Identify the form by name, then by '#id', then by its position
        # among the forms found below the document body.
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of ``(name, value)`` tuples with the field values
        of the form.  This is suitable to be passed to
        ``urllib.urlencode()``.

        Skipped are inputs without a name, disabled inputs, unchecked
        checkable inputs, inputs of type submit/image/reset/file, and
        values that are None.
        """
        pairs = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                pairs.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # one pair per selected value
                    pairs.extend((name, v) for v in value)
                elif value is not None:
                    pairs.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = el.value
            if value is not None:
                pairs.append((name, value))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        On reading, the attribute is joined with the element's base URL
        when both are available.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        return action

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())
# Register FormElement as the default Element class for the 'form' tag.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  With the default opener the
    result is a file-like object, as from ``urllib.urlopen()``.  This
    object also has a ``.geturl()`` function, which shows the URL if
    there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    tuples) is appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is
    the target URL as a string (the form's action or, without one, its
    base URL), and the values are a sequence of ``(name, value)`` tuples
    with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_pairs = extra_values.items()
        else:
            extra_pairs = extra_values
        values.extend(extra_pairs)
    opener = open_http_urllib if open_http is None else open_http
    url = form.action or form.base_url
    return opener(form.method, url, values)
1119
def open_http_urllib(method, url, values):
    """Open *url* with urllib, sending *values* as form data.

    For ``'GET'`` the url-encoded values are appended to the URL as a
    query string; for any other method they are sent as the request
    body, encoded to ASCII bytes.  Returns what ``urlopen()`` returns.

    Raises ValueError when *url* is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        url = url + separator + encoded
        data = None
    else:
        data = encoded
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)
1142
class FieldsDict(MutableMapping):
    """Mapping from field name to field value, backed by an inputs
    accessor: each lookup or assignment goes through the ``value`` of
    ``inputs[name]``.  Keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        names = self.inputs.keys()
        return iter(names)

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)
1168
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.

    ``len()`` counts the same elements that iteration yields.
    """

    # select/input/textarea elements below the form with a given name ...
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    # ... and all of them, named or not.
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """Return the input(s) called *name*.

        Several radio inputs or several checkboxes sharing the name are
        returned as a `RadioGroup` / `CheckboxGroup`; otherwise the first
        matching element is returned.  Raises KeyError without a match.
        """
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides how a group is handled.
        input_type = results[0].get('type')
        if input_type == 'radio' and len(results) > 1:
            group = RadioGroup(results)
            group.name = name
            return group
        if input_type == 'checkbox' and len(results) > 1:
            group = CheckboxGroup(results)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """Return a list of the distinct input names (unnamed inputs
        are left out)."""
        names = set(el.name for el in self)
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))

    def __len__(self):
        # Needed by FieldsDict.__len__, which calls len() on this object;
        # consistent with __iter__.
        return len(self._all_xpath(self.form))
1235
class InputMixin(object):
    """
    Behaviour shared by all input elements (input, select, and textarea).
    """

    @property
    def name(self):
        """
        Get/set/delete the ``name`` attribute of the element.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        # Deleting an absent name is a silent no-op.
        if 'name' not in self.attrib:
            return
        del self.attrib['name']

    def __repr__(self):
        # Only some input classes expose ``type``; omit it when missing/empty.
        kind = getattr(self, 'type', None)
        type_part = ' type=%r' % kind if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
1266
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    value can be read and written through ``.value``.
    """

    @property
    def value(self):
        """
        Get/set the value, i.e. the content of this element.
        """
        # Children in the XHTML namespace are serialised as XML, others as HTML.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        result = self.text or ''
        # it's rare that we actually get here, so let's not use ''.join()
        for child in self:
            result += etree.tostring(
                child, method=method, encoding='unicode')
        return result

    @value.setter
    def value(self, value):
        # Drop all child elements, then store the new text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, except for a
    multi-select (``<select multiple>``), where it is a set-like object.
    ``.value_options`` lists the possible values in both cases.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        For a multi-select this is a set-like object representing all
        the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        # The last option carrying ``selected`` wins; without any, fall
        # back to the first option that is not disabled.
        for option in reversed(options):
            if option.get('selected') is not None:
                chosen = option
                break
        else:
            for option in options:
                if option.get('disabled') is None:
                    chosen = option
                    break
            else:
                return None

        result = chosen.get('value')
        if result is not None:
            return result
        # No value attribute: the stripped option text is the value.
        return (chosen.text or '').strip()

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return

        target = None
        if value is not None:
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = (option.text or '').strip()
                if candidate == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)

        # Unselect everything first, then select the requested option.
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of each ``<option>`` element, or its stripped text
        when the attribute is missing).
        """
        collected = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = (option.text or '').strip()
            collected.append(candidate)
        return collected

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    from it to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @staticmethod
    def _option_value(option):
        # The value of an option is its ``value`` attribute, or its
        # stripped text when that attribute is missing.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = (option.text or '').strip()
        return opt_value

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        # Yields the values of the options that are currently selected.
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ValueError if that option is not currently selected, or
        if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1465
class RadioGroup(list):
    """
    A list of several ``<input type=radio>`` elements sharing one name.

    Besides behaving like a list, the ``.value`` property checks and
    unchecks the inputs, and ``.value_options`` gives the possible
    values.
    """

    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        checked = [el for el in self if 'checked' in el.attrib]
        if not checked:
            return None
        # The first checked input provides the value.
        return checked[0].get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            matching = [el for el in self if el.get('value') == value]
            if not matching:
                raise ValueError("There is no radio input with the value %r" % value)
            target = matching[0]
        # Uncheck everything before checking the requested input (if any).
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for el in self:
            possible.append(el.get('value'))
        return possible

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__,
            list.__repr__(self))
1518
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before touching any checkbox, so that a rejected
        # assignment leaves the current check state intact.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1562
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes that share a name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the full list first so the iterator is a snapshot that
        # is unaffected by checkboxes being (un)checked while iterating.
        checked_values = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked_values.append(el.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox carrying this value.
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox carrying this value; it must be checked.
        for el in self.group:
            if el.get('value') != value:
                continue
            if 'checked' not in el.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del el.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
1605
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the type (lower-cased, defaulting to ``'text'``).

    ``.value`` gets and sets the value.

    Checkboxes and radios have ``input.checkable == True`` (it is false
    for all other types) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value yields ``'on'``; an
        unchecked checkbox or radio yields None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truthiness of the value drives the
        # checked state; only a non-empty string also replaces the
        # ``value`` attribute.
        turn_on = bool(value)
        self.checked = turn_on
        if turn_on and isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        if 'checked' in self.attrib:
            del self.attrib['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Returns None when the label has no (or an empty) ``for``
        attribute; otherwise the result of looking that id up with
        ``self.body.get_element_by_id``.  Assigning an element stores its
        ``id`` in ``for`` (TypeError if it has no id); deleting removes
        the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): what happens when no element has this id is
        # decided by get_element_by_id, which is defined outside this
        # block -- confirm whether it returns None or raises.
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        other_id = other.get('id')
        if not other_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', other_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in ``for``; the label's own ``id`` attribute
        # must be left untouched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement
############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    Accepts either an element or an object with ``getroot()``.  The tree
    is modified in place; tags that already start with ``{`` are left
    alone.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # Not a tree wrapper: treat the argument itself as the root.
        root = html
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in root.iter(etree.Element):
        name = element.tag
        if name[0] == '{':
            continue
        element.tag = xhtml_prefix + name
1757
def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    Accepts either an element or an object with ``getroot()``.  The tree
    is modified in place.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # Not a tree wrapper: treat the argument itself as the root.
        root = xhtml
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip_from = len(xhtml_prefix)
    # Only elements inside the XHTML namespace are visited.
    for element in root.iter(xhtml_prefix + "*"):
        qualified = element.tag
        element.tag = qualified[strip_from:]
# This isn't a general match, but it's a match for what libxml2
# specifically serialises.  One ``sub`` callable is kept per string
# type so both text and byte output can be filtered:
__meta_content_type_pattern = r'<meta http-equiv="Content-Type"[^>]*>'
__str_replace_meta_content_type = re.compile(
    __meta_content_type_pattern).sub
__bytes_replace_meta_content_type = re.compile(
    __meta_content_type_pattern.encode('ASCII')).sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This is
    mainly meant for debugging.

    ``encoding`` overrides the output encoding; otherwise the document's
    own encoding is used, falling back to UTF-8.
    """
    import os
    import tempfile
    import webbrowser
    if isinstance(doc, etree._ElementTree):
        tree = doc
    else:
        tree = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as f:
        tree.write(f, method="html", encoding=encoding or tree.docinfo.encoding or "UTF-8")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1879
################################################################################
# configure Element class lookup
################################################################################

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Each parser instance gets its own lookup object.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1892
class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """

    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Each parser instance gets its own lookup object.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1914
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    handed to ``makeelement`` of the module-level ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)


# Shared default parser instances.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()