Package lxml :: Package html
[hide private]
[frames | no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  try: 
   7      from urlparse import urljoin 
   8  except ImportError: 
   9      # Python 3 
  10      from urllib.parse import urljoin 
  11  import copy 
  12  from lxml import etree 
  13  from lxml.html import defs 
  14  from lxml import cssselect 
  15  from lxml.html._setmixin import SetMixin 
  16  try: 
  17      from UserDict import DictMixin 
  18  except ImportError: 
  19      # DictMixin was introduced in Python 2.4 
  20      from lxml.html._dictmixin import DictMixin 
  21  try: 
  22      set 
  23  except NameError: 
  24      # Python 2.3 
  25      from sets import Set as set 
  26  try: 
  27      bytes = __builtins__["bytes"] 
  28  except (KeyError, NameError): 
  29      # Python < 2.6 
  30      bytes = str 
  31  try: 
  32      unicode = __builtins__["unicode"] 
  33  except (KeyError, NameError): 
  34      # Python 3 
  35      unicode = str 
  36  try: 
  37      basestring = __builtins__["basestring"] 
  38  except (KeyError, NameError): 
  39      # Python 3 
  40      basestring = (str, bytes) 
  41   
42 -def __fix_docstring(s):
43 if not s: 44 return s 45 import sys 46 if sys.version_info[0] >= 3: 47 sub = re.compile(r"^(\s*)u'", re.M).sub 48 else: 49 sub = re.compile(r"^(\s*)b'", re.M).sub 50 return sub(r"\1'", s)
# Public names exported by ``from lxml.html import *``.  Each name is listed
# once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML tag
# and the same tag in the XHTML namespace (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose whitespace-separated ``class`` list contains
# ``$class_name`` as a whole word.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# ``url(...)`` in CSS, with the argument double-quoted, single-quoted or bare.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Runs of non-space characters.
_archive_re = re.compile(r'[^ ]+')
76 -def _unquote_match(s, pos):
77 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 78 return s[1:-1], pos+1 79 else: 80 return s,pos
81
def _transform_result(typ, result):
    """Convert ``result`` back into the type the caller passed in.

    When ``typ`` is a byte-string type the tree is serialised as UTF-8, when
    it is a unicode type it is serialised to a unicode string, and for any
    other type ``result`` is handed back unchanged.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, unicode):
        return tostring(result, encoding=unicode)
    return result
91
def _nons(tag):
    """Return ``tag`` without an XHTML namespace prefix.

    A string tag of the form ``{<XHTML namespace>}name`` is reduced to the
    part after the last ``}``.  Any other value, including non-string tags,
    is returned as it is.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
97
98 -class HtmlMixin(object):
99
def base_url(self):
    """
    The base URL of the document, as recorded on the tree's ``docinfo.URL``.

    Combine it with a relative link through
    ``urlparse.urljoin(el.base_url, href)`` to obtain an absolute URL.
    """
    docinfo = self.getroottree().docinfo
    return docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    List of all ``<form>`` elements (plain or XHTML-namespaced) found at or
    below this element.
    """
    found = _forms_xpath(self)
    return found
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    The document's ``<body>`` element.  May be read from any child
    element to reach the body of the document it belongs to.

    Raises IndexError when the document has no body.
    """
    matches = self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})
    return matches[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    The document's ``<head>`` element.  May be read from any child
    element to reach the head of the document it belongs to.

    Raises IndexError when the document has no head.
    """
    matches = self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})
    return matches[0]
head = property(head, doc=head.__doc__)
def _label__get(self):
    """
    Get or set the ``<label>`` element associated with this element.

    The association goes through this element's ``id`` and the label's
    ``for`` attribute.  Reading gives the first matching label, or None
    when the element has no id or no label refers to it.
    """
    element_id = self.get('id')
    if not element_id:
        return None
    labels = _label_xpath(self, id=element_id)
    if labels:
        return labels[0]
    return None

def _label__set(self, label):
    # A label can only point at an element that has an id.
    element_id = self.get('id')
    if not element_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if _nons(label.tag) != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', element_id)

def _label__del(self):
    # Detach the current label (if any) by dropping its ``for`` attribute.
    current = self.label
    if current is not None:
        del current.attrib['for']

label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
def drop_tree(self):
    """
    Remove this element, with its children and text, from the tree.

    The element's tail text is kept: it is appended to the tail of the
    previous sibling, or to the parent's text when there is no previous
    sibling.  The element must have a parent.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
177
def drop_tag(self):
    """
    Remove this element's tag while keeping its children and text, which
    are merged into the parent at the element's position.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print(tostring(h, encoding=unicode))
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()
    text = self.text
    # Only real elements have text to keep; comments etc. have a
    # non-string tag.
    if text and isinstance(self.tag, basestring):
        if previous is None:
            parent.text = (parent.text or '') + text
        else:
            previous.tail = (previous.tail or '') + text
    tail = self.tail
    if tail:
        if len(self):
            # The tail follows the last child once the children move up.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + tail
        elif previous is None:
            parent.text = (parent.text or '') + tail
        else:
            previous.tail = (previous.tail or '') + tail
    position = parent.index(self)
    # Replace this element by its children in the parent's child list.
    parent[position:position+1] = self[:]
209 217
def find_class(self, class_name):
    """
    Return the elements at or below this one whose ``class`` attribute
    contains ``class_name`` as one of its space-separated names.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
223
def get_element_by_id(self, id, *default):
    """
    Return the first element at or below this one that has the given id.

    When nothing matches, the first extra positional argument is
    returned as the default if one was passed; otherwise KeyError is
    raised.

    Ids are often duplicated in HTML found in the wild.  Like browsers,
    this returns only the first match.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        found = _id_xpath(self, id=id)[0]
    except IndexError:
        if not default:
            raise KeyError(id)
        found = default[0]
    return found
244
def text_content(self):
    """
    Return the XPath ``string()`` value of this element, i.e. its own text
    together with the text of its descendants.
    """
    content = _collect_string_content(self)
    return content
250
def cssselect(self, expr):
    """
    Evaluate the CSS expression against this element and its children
    and return a list of the results.

    Same as ``lxml.cssselect.CSSSelector(expr)(self)``.  The selector is
    compiled on every call, so pre-compiling it yourself can be
    substantially faster for repeated use.
    """
    selector = cssselect.CSSSelector(expr)
    return selector(self)
261 262 ######################################## 263 ## Link functions 264 ######################################## 265 285 self.rewrite_links(link_repl)
286
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to all links, then remove it.

    Every ``<base>`` tag that has an ``href`` is dropped from the tree;
    the value of the last one is used to make the links absolute.
    Nothing else happens when no non-empty href was found.
    """
    href = None
    for base_tag in self.xpath('//base[@href]|//x:base[@href]',
                               namespaces={'x':XHTML_NAMESPACE}):
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        # The base tags are gone already, so do not resolve them again.
        self.make_links_absolute(href, resolve_base_href=False)
301 380 427 428
class _MethodFunc(object):
    """
    Exposes a method of an element class as a plain function.

    The function accepts either an element or an HTML string as its first
    argument.  It returns whatever the method returns; when the method
    works in place (and so returns None), the processed document is
    returned instead, converted back to the type that was passed in.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        # Default for whether element inputs are deep-copied before the call.
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        input_type = type(doc)
        if isinstance(doc, basestring):
            # A string is always parsed into a fresh tree, so 'copy' makes
            # no sense here.
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            if kw.pop('copy', self.copy):
                doc = copy.deepcopy(doc)
        method = getattr(doc, self.name)
        outcome = method(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if outcome is not None:
            return outcome
        # In-place method: give back the document in the input's form.
        return _transform_result(input_type, doc)
# Function forms of the HtmlMixin methods with the same names.  Each accepts
# an element or an HTML string as its first argument (see _MethodFunc).
# ``copy=True`` means an element argument is deep-copied before the method
# runs, unless the caller passes ``copy=False``.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment class that adds the ``HtmlMixin`` API to ``etree.CommentBase``."""
    pass
473
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that adds the ``HtmlMixin`` API to ``etree.ElementBase``."""
    pass
476
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class that adds the ``HtmlMixin`` API to ``etree.PIBase``."""
    pass
479
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity class that adds the ``HtmlMixin`` API to ``etree.EntityBase``."""
    pass
482 483
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    ``mixins`` may also be given as an iterable of ``(tag name, Mixin class)``
    pairs, which allows several mixins for the same tag.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes that have an entry in ``classes``.
    """
    # Shared tag name -> class registry.  It is copied per instance when no
    # explicit ``classes`` mapping is passed.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys, so
            # take the (name, class) pairs from items() when available.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are looked up by lower-cased tag name and fall back to
        ``HtmlElement``.  Comments, processing instructions and entities
        get their respective Html* classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
524 525 ################################################################################ 526 # parsing 527 ################################################################################ 528
def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a document and return its root element.

    Uses the module's ``html_parser`` when no parser is given; extra
    keyword arguments go to ``etree.fromstring``.  Raises
    ``etree.ParserError`` when parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
537
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    ``html`` may be a text string or a byte string.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    Raises ``etree.ParserError`` when leading text is found although
    no_leading_text is true.
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    if isinstance(start, bytes):
        # Byte input cannot be compared with text markers on Python 3;
        # latin-1 decodes any byte sequence without error.
        start = start.decode('latin-1')
    if not start.startswith('<html') and not start.startswith('<!doctype'):
        # Wrap the fragment in a document, keeping the input's string type
        # ('%s' formatting of bytes would embed their repr on Python 3).
        if isinstance(html, bytes):
            html = ('<html><body>'.encode('ascii') + html
                    + '</body></html>'.encode('ascii'))
        else:
            html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    if no_leading_text and body.text and body.text.strip():
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    if body.text and body.text.strip():
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
571
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element.

    Without ``create_parent`` it is an error (``etree.ParserError``) if the
    input holds no element, more than one element, or any non-whitespace
    text before or after the element.

    If ``create_parent`` is true, a wrapping element is created that holds
    everything that was parsed; a string value is used as the tag name,
    any other true value means ``'div'``.  Leading and trailing text is
    allowed in that case.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a parent will hold it.
    leading_text_ok = bool(create_parent)

    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not leading_text_ok,
        base_url=base_url, **kw)

    if not create_parent:
        if not elements:
            raise etree.ParserError('No elements found')
        if len(elements) > 1:
            raise etree.ParserError(
                "Multiple elements found (%s)"
                % ', '.join([_element_name(e) for e in elements]))
        single = elements[0]
        if single.tail and single.tail.strip():
            raise etree.ParserError(
                "Element followed by text: %r" % single.tail)
        single.tail = None
        return single

    if not isinstance(create_parent, basestring):
        create_parent = 'div'
    wrapper = Element(create_parent)
    if elements:
        # A leading string item becomes the wrapper's text.
        if isinstance(elements[0], basestring):
            wrapper.text = elements[0]
            del elements[0]
        wrapper.extend(elements)
    return wrapper
617
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.  ``html`` may be a text string or a byte
    string.

    The result is the whole document when the input starts with ``<html``
    or a doctype, when it contains a head, or when no body could be found.
    It is the single element when the body holds just one element with no
    surrounding text.  Otherwise it is the body itself, renamed to ``div``
    or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if isinstance(start, bytes):
        # Byte input cannot be compared with text markers on Python 3;
        # latin-1 decodes any byte sequence without error.
        start = start.decode('latin-1')
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither head nor body was found: there is nothing to unwrap, so
        # hand back what the parser produced instead of failing on len(None).
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
680
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.

    The return value is a tree, not an element; call
    ``parse(...).getroot()`` for the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    useful when parsing from a file-like object.  The module's
    ``html_parser`` is used when no parser is passed.
    """
    if parser is not None:
        return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
    return etree.parse(filename_or_url, html_parser, base_url=base_url, **kw)
693
def _contains_block_level_tag(el):
    """Tell whether ``el`` or any of its descendants has a tag listed in
    ``defs.block_tags`` (XHTML namespace ignored)."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    for node in el.iter():
        tag = _nons(node.tag)
        if tag in defs.block_tags:
            return True
    return False
701
def _element_name(el):
    """Return a short name for a parsed item, for use in error messages:
    ``'comment'`` for comments, ``'string'`` for text items, otherwise the
    tag without an XHTML namespace."""
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
709 710 ################################################################################ 711 # form handling 712 ################################################################################ 713
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.

        Assigning a mapping sets every field named in it and sets all
        other named fields to None.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        fields = self.fields
        # Work on a private list of the current names: names that are not
        # mentioned in ``value`` get reset below.
        prev_keys = list(fields.keys())
        # items() exists on Python 2 and 3 mappings (iteritems() does not).
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        # Label used in reprs: the form's name, else '#' + id, else its
        # index among the forms found below the document body.
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Skipped are: inputs without a name, unchecked checkable inputs,
        inputs of type submit/image/reset, and fields whose value is None.
        A multi-select contributes one tuple per selected value.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, el.value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, el.value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the value read is the action
        joined onto that base URL.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def _method__set(self, value):
        self.set('method', value.upper())

    method = property(_method__get, _method__set, doc=_method__get.__doc__)


# Use FormElement for every <form> tag by default.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)`` pairs)
    is appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (normally 'GET' or 'POST'), the URL is
    the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.  The target is the form's
    action, or the document's base URL when the form has no action.
    """
    values = form.form_values()
    if extra_values:
        pairs = extra_values
        if hasattr(pairs, 'items'):
            pairs = pairs.items()
        values.extend(pairs)
    if open_http is None:
        open_http = open_http_urllib
    target = form.action
    if not target:
        target = form.base_url
    return open_http(form.method, target, values)
857
def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, built on ``urlopen``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string.  For any other method they are sent as the request body.
    Returns the object returned by ``urlopen``.  Raises ValueError when
    ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        # urlencode() gives text on Python 3, but the request body has to
        # be bytes there; its output is plain ASCII.
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)
877
class FieldsDict(DictMixin):
    """
    Dictionary-like view of a form's fields.

    Keys are input names.  Reading or writing an item reads or writes the
    ``value`` of whatever the wrapped inputs accessor returns for that
    name.  Items cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, form_name)
898
class InputGetter(object):

    """
    Accessor for all the input fields (``input``, ``select`` and
    ``textarea`` elements) of a form.

    Fields are fetched by name with ``form.inputs['field_name']``.  When
    several checkboxes share the name, they come back together as a list
    (a `CheckboxGroup`, which also allows value setting).  Radio inputs
    are handled the same way.

    Iterating yields every input element.  That is not the same as
    fetching every name, because checkboxes and radio elements are
    yielded one by one.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        # The first match decides how the whole set is treated.
        input_type = matches[0].get('type')
        if len(matches) > 1:
            if input_type == 'radio':
                group = RadioGroup(matches)
                group.name = name
                return group
            if input_type == 'checkbox':
                group = CheckboxGroup(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        return bool(matches)

    def keys(self):
        # Distinct names of all inputs; unnamed inputs (None) are left out.
        names = set([el.name for el in self])
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))
964
class InputMixin(object):

    """
    Mix-in shared by the input element classes (input, select, and
    textarea).  Adds the ``name`` property and a descriptive repr.
    """

    def _name__get(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        Deleting removes the attribute if it is present.
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        if 'name' in self.attrib:
            del self.attrib['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Only classes that define a (non-empty) ``type`` show it.
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        else:
            type_part = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
992
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name``, and
    ``.value`` gets/sets the value.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Child elements are serialised as XML for XHTML textareas and as
        # HTML otherwise.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content = content + etree.tostring(
                child, method=method, encoding=unicode)
        return content

    def _value__set(self, value):
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)


# Use TextareaElement for every <textarea> tag by default.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select, the value of the first option carrying a
        ``selected`` attribute is returned, or None when no option is
        selected.  Assigning None (or deleting the value) unselects all
        options.  Assigning a value that no option has raises ValueError.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for el in _options_xpath(self):
            if el.get('selected') is not None:
                value = el.get('value')
                if value is None:
                    # No value attribute: the option text is the value.
                    value = el.text or ''
                if value:
                    value = value.strip()
                return value
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        checked_option = None
        if value is not None:
            # Find the option to select before touching any state, so a
            # bad value leaves the current selection alone.
            value = value.strip()
            for el in _options_xpath(self):
                opt_value = el.get('value')
                if opt_value is None:
                    opt_value = el.text or ''
                if opt_value:
                    opt_value = opt_value.strip()
                if opt_value == value:
                    checked_option = el
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for el in _options_xpath(self):
            if 'selected' in el.attrib:
                del el.attrib['selected']
        # None means "nothing selected": only clear, select nothing.
        if checked_option is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the option text
        where there is no such attribute).
        """
        options = []
        for el in _options_xpath(self):
            value = el.get('value')
            if value is None:
                value = el.text or ''
            if value:
                value = value.strip()
            options.append(value)
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)


# Use SelectElement for every <select> tag by default.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Set-like view of the selected options of a ``<select multiple>``
    element.

    Adding a value selects the option with that value, removing a value
    unselects it.  An option's value is its ``value`` attribute or, when
    that is missing, its text.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def __iter__(self):
        # Yield the value of every option that carries ``selected``.
        for option in self.options:
            if 'selected' not in option.attrib:
                continue
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            if opt_value:
                opt_value = opt_value.strip()
            yield opt_value

    def add(self, item):
        # Select the first option whose value equals ``item``.
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            if opt_value:
                opt_value = opt_value.strip()
            if opt_value != item:
                continue
            option.set('selected', '')
            return
        raise ValueError(
            "There is no option with the value %r" % item)

    def remove(self, item):
        # Only the first option whose value equals ``item`` is considered;
        # it must currently be selected.
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            if opt_value:
                opt_value = opt_value.strip()
            if opt_value != item:
                continue
            if 'selected' not in option.attrib:
                raise ValueError(
                    "The option %r is not currently selected" % item)
            del option.attrib['selected']
            return
        raise ValueError(
            "There is not option with the value %r" % item)

    def __repr__(self):
        selected = ', '.join([repr(v) for v in self])
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__, selected, self.select.name)
1186
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property reads or
    changes which input is checked, and ``.value_options`` lists the
    values that can be chosen.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' not in radio.attrib:
                continue
            return radio.get('value')
        return None

    def _value__set(self, value):
        # Locate the input to check first, so that an unknown value
        # raises before any existing state is modified.
        target = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        # Uncheck everything, then check the chosen input (if any).
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value is the same as assigning None: nothing checked.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, list.__repr__(self))
1238
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        """
        Replace the set of checked checkboxes with ``value``.

        Raises ValueError if ``value`` has no ``__iter__`` attribute; in
        that case the current state is left untouched.
        """
        # Validate before clearing, so a rejected assignment does not
        # uncheck every checkbox as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        # Take a snapshot first: ``value`` may be a live view of this very
        # group (e.g. ``group.value = group.value``), which would be empty
        # once the group has been cleared.
        new_values = list(value)
        checked = self.value
        checked.clear()
        checked.update(new_values)

    def _value__del(self):
        # Deleting the value unchecks every checkbox in the group.
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1270
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # The list-like group of checkbox elements this view reads and edits.
        self.group = group

    def __iter__(self):
        """
        Iterate over the ``value`` attribute of every checked checkbox.
        """
        return iter([
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib])

    def add(self, value):
        """
        Check the first checkbox whose ``value`` attribute equals ``value``.

        Raises KeyError if the group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose ``value`` attribute equals ``value``.

        Raises KeyError if that checkbox is already unchecked, or if the
        group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        "The checkbox with value %r was already unchecked" % value)
                break
        else:
            raise KeyError(
                "No checkbox with value %r" % value)

    def __repr__(self):
        group = self.group
        # The group is a plain list subclass and may not carry a ``name``
        # attribute of its own; fall back to the name of its first
        # checkbox so that repr() never raises AttributeError.
        name = getattr(group, 'name', None)
        if name is None and group:
            name = getattr(group[0], 'name', None)
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            name)
1313
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type, lower-cased, with ``'text'`` as the
    default.

    ``.value`` reads and writes the value.

    For checkboxes and radios ``input.checkable`` is true (it is false
    for every other type), and they additionally offer a boolean
    ``.checked`` attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # Only string values are written to the attribute; any other true
        # value just checks the input.
        if isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)
# Map the 'input' tag to InputElement in the lookup's default element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Returns None if the
        label has no (or an empty) ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): what happens when no element carries this id is
        # decided by get_element_by_id(), which is defined elsewhere --
        # confirm whether it returns None or raises.
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        """
        Point this label at ``other`` by copying its ``id`` into ``for``.

        Raises TypeError if ``other`` has no (or an empty) ``id`` attribute.
        """
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # Unlinking the label means dropping its ``for`` attribute.  The
        # label's own ``id`` is unrelated to the link and must be kept.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)
# Map the 'label' tag to LabelElement in the lookup's default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every tag of an HTML tree into the XHTML namespace.

    ``html`` may be an element or an object with a ``getroot()`` method;
    the tree is modified in place.  Tags that already start with ``{``
    are left as they are.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # No getroot(): work on the object that was passed in.
        root = html
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in root.iter():
        name = element.tag
        if not isinstance(name, basestring):
            # Non-string tags (presumably comments and similar nodes)
            # are skipped.
            continue
        if name[0] != '{':
            element.tag = xhtml_prefix + name
1449
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree.

    ``xhtml`` may be an element or an object with a ``getroot()`` method;
    the tree is modified in place.  Only elements matched by the
    ``{XHTML namespace}*`` filter are renamed.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # No getroot(): work on the object that was passed in.
        root = xhtml
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(xhtml_prefix)
    for element in root.iter(xhtml_prefix + "*"):
        element.tag = element.tag[cut:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Both names are bound ``sub`` methods of a compiled pattern for the
# Content-Type meta tag: one built from a text pattern, the other from the
# same pattern encoded to ASCII bytes.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail, doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Use the substitution helper that matches the type of the result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

# Adapt the example output in the docstring to the running interpreter:
# __fix_docstring drops a leading u'' prefix on Python 3 and a leading b''
# prefix on Python 2.
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This is
    mainly meant for debugging.

    The output encoding is ``encoding`` if given, otherwise the
    document's own encoding, otherwise UTF-8.
    """
    import os
    import tempfile
    import webbrowser
    tree = doc
    if not isinstance(tree, etree._ElementTree):
        tree = etree.ElementTree(tree)
    fd, path = tempfile.mkstemp(suffix='.html')
    out = os.fdopen(fd, 'wb')
    try:
        output_encoding = encoding or tree.docinfo.encoding or "UTF-8"
        tree.write(out, method="html", encoding=output_encoding)
    finally:
        # we leak the file itself here, but we should at least close it
        out.close()
    # Build the URL with forward slashes regardless of the platform separator.
    url = 'file://' + '/'.join(path.split(os.path.sep))
    print(url)
    webbrowser.open(url)
1566 1567 ################################################################################ 1568 # configure Element class lookup 1569 ################################################################################ 1570
class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.
    """
    def __init__(self, **kwargs):
        # Keyword arguments are handed to the base parser unchanged.
        super(HTMLParser, self).__init__(**kwargs)
        # Install a fresh lookup so parsed elements use the lxml.html classes.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1578
class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        # Keyword arguments are handed to the base parser unchanged.
        super(XHTMLParser, self).__init__(**kwargs)
        # Install a fresh lookup so parsed elements use the lxml.html classes.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1599
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are passed
    straight to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances.  Element() creates its elements through
# html_parser.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
