Package lxml :: Package html
[hide private]
[frames] | no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  try: 
   7      from urlparse import urljoin 
   8  except ImportError: 
   9      # Python 3 
  10      from urllib.parse import urljoin 
  11  import copy 
  12  from lxml import etree 
  13  from lxml.html import defs 
  14  from lxml import cssselect 
  15  from lxml.html._setmixin import SetMixin 
  16  try: 
  17      from UserDict import DictMixin 
  18  except ImportError: 
  19      # DictMixin was introduced in Python 2.4 
  20      from lxml.html._dictmixin import DictMixin 
  21  try: 
  22      set 
  23  except NameError: 
  24      # Python 2.3 
  25      from sets import Set as set 
  26  try: 
  27      bytes = __builtins__["bytes"] 
  28  except (KeyError, NameError): 
  29      # Python < 2.6 
  30      bytes = str 
  31  try: 
  32      unicode = __builtins__["unicode"] 
  33  except (KeyError, NameError): 
  34      # Python 3 
  35      unicode = str 
  36  try: 
  37      basestring = __builtins__["basestring"] 
  38  except (KeyError, NameError): 
  39      # Python 3 
  40      basestring = (str, bytes) 
  41   
def __fix_docstring(s):
    """Strip the version-specific string prefix from line starts in ``s``.

    On Python 3 a ``u'`` at the start of a line (after optional
    whitespace) is rewritten to ``'``; on Python 2 the same is done for
    ``b'``.  Falsy input (``None`` or an empty string) is returned
    unchanged, so the function is safe to call on a missing docstring.
    """
    if not s:
        # e.g. docstrings are None when running under ``python -OO``
        return s
    import sys
    if sys.version_info[0] >= 3:
        sub = re.compile(r"^(\s*)u'", re.M).sub
    else:
        sub = re.compile(r"^(\s*)b'", re.M).sub
    return sub(r"\1'", s)

# Public API of the package ('open_in_browser' used to be listed twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions and regular expressions shared by the
# functions and classes of this module.  Each element query matches both the
# plain HTML tag and its XHTML-namespaced counterpart (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
_archive_re = re.compile(r'[^ ]+')

def _transform_result(typ, result):
    """Serialise ``result`` to match the caller's input type ``typ``.

    A byte-string type yields UTF-8 encoded output, a unicode type yields
    unicode output; for any other type ``result`` is returned untouched.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, unicode):
        return tostring(result, encoding=unicode)
    return result
83
def _nons(tag):
    """Return ``tag`` without a leading ``{XHTML namespace}`` qualifier.

    Non-string tags and tags in any other (or no) namespace are returned
    unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
89
90 -class HtmlMixin(object):
91
def base_url(self):
    """
    The URL stored in the ``docinfo`` of this element's root tree.

    Combine it with ``urlparse.urljoin(el.base_url, href)`` to turn
    relative links into absolute URLs.
    """
    root_tree = self.getroottree()
    return root_tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)

def forms(self):
    """
    List of every form element found at or below this element.
    """
    found = _forms_xpath(self)
    return found
forms = property(forms, doc=forms.__doc__)

def body(self):
    """
    The document's <body> element.  May be read from any child
    element to reach the body of the document it belongs to.
    """
    ns_map = {'x':XHTML_NAMESPACE}
    matches = self.xpath('//body|//x:body', namespaces=ns_map)
    return matches[0]
body = property(body, doc=body.__doc__)

def head(self):
    """
    The document's <head> element.  May be read from any child
    element to reach the head of the document it belongs to.
    """
    ns_map = {'x':XHTML_NAMESPACE}
    matches = self.xpath('//head|//x:head', namespaces=ns_map)
    return matches[0]
head = property(head, doc=head.__doc__)

def _label__get(self):
    """
    Get or set any <label> element associated with this element.

    Reading returns the first label whose ``for`` attribute equals this
    element's ``id``, or None when the element has no id or no such
    label exists.  Assigning points the given label at this element;
    deleting removes the ``for`` attribute of the current label.
    """
    element_id = self.get('id')
    if not element_id:
        return None
    matches = _label_xpath(self, id=element_id)
    if matches:
        return matches[0]
    return None
def _label__set(self, label):
    element_id = self.get('id')
    if not element_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if _nons(label.tag) != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', element_id)
def _label__del(self):
    current = self.label
    if current is None:
        return
    del current.attrib['for']
label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

def drop_tree(self):
    """
    Detach this element, together with its children and text, from
    its parent.  Any tail text is kept: it is appended to the tail of
    the preceding sibling, or to the parent's text when there is no
    preceding sibling.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
169
def drop_tag(self):
    """
    Remove only the tag itself: the element's text, children and tail
    are spliced into the parent at the element's position.

    Example::

    >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
    >>> h.find('.//b').drop_tag()
    >>> print(tostring(h, encoding=unicode))
    <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    before = self.getprevious()
    text = self.text
    tail = self.tail
    if text and isinstance(self.tag, basestring):
        # not a Comment, etc.
        if before is not None:
            before.tail = (before.tail or '') + text
        else:
            parent.text = (parent.text or '') + text
    if tail:
        if len(self):
            # the tail follows the last child once the children move up
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + tail
        elif before is not None:
            before.tail = (before.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    position = parent.index(self)
    parent[position:position + 1] = self[:]
201 209
def find_class(self, class_name):
    """
    Return the elements at or below this one whose ``class``
    attribute contains ``class_name`` as a whitespace-separated word.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
215
def get_element_by_id(self, id, *default):
    """
    Return the first element at or below this one with the given id.

    When nothing matches, the first extra positional argument is
    returned if one was given; otherwise KeyError is raised.

    Ids are frequently duplicated in real-world HTML.  Browsers use
    the first match, and so does this method.
    """
    # FIXME: should this check for multiple matches?
    # browsers just return the first one
    matches = _id_xpath(self, id=id)
    if matches:
        return matches[0]
    if default:
        return default[0]
    raise KeyError(id)
236
def text_content(self):
    """
    Return the XPath ``string()`` value of this element, i.e. its own
    text concatenated with the text of all of its descendants.
    """
    content = _collect_string_content(self)
    return content
242
def cssselect(self, expr):
    """
    Evaluate the CSS selector ``expr`` against this element and its
    children and return the list of results.

    Same as ``lxml.cssselect.CSSSelector(expr)(self)``.  The selector
    is compiled on every call, so pre-compiling an expression that is
    used repeatedly can provide a substantial speedup.
    """
    selector = cssselect.CSSSelector(expr)
    return selector(self)
253 254 ######################################## 255 ## Link functions 256 ######################################## 257 277 self.rewrite_links(link_repl)
278
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to all of its links.

    Every ``<base>`` tag carrying an ``href`` is removed from the
    document; the href of the last one found is then used to make
    the links absolute.  Nothing else happens when no non-empty href
    was found.
    """
    href = None
    ns_map = {'x':XHTML_NAMESPACE}
    for base_tag in self.xpath('//base[@href]|//x:base[@href]', namespaces=ns_map):
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        self.make_links_absolute(href, resolve_base_href=False)
293 349 396 397
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        """
        ``name`` is the method to look up on the document, ``copy`` the
        default for whether element inputs are deep-copied before the
        call, and ``source_class`` the class the docstring is taken from.
        """
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        A string ``doc`` is parsed with ``fromstring`` first.  For an
        element, the ``copy`` keyword (default: the value given to the
        constructor) decides whether the method runs on a deep copy.
        Passing ``copy`` together with a string input raises TypeError.
        """
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be called ``copy``: that would shadow the
            # ``copy`` module and break the deepcopy() call below.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result

# Function forms of the HtmlMixin link/lookup methods.  The ones that modify
# the document work on a copy of element inputs by default.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment class that combines ``etree.CommentBase`` with ``HtmlMixin``."""
442
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that combines ``etree.ElementBase`` with ``HtmlMixin``."""
445
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class that combines ``etree.PIBase`` with ``HtmlMixin``."""
448
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity class that combines ``etree.EntityBase`` with ``HtmlMixin``."""
451 452
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # tag name -> Element class; filled in by the element classes of this
    # module as they are defined
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            if hasattr(mixins, 'items'):
                # A mapping, as documented: iterating it directly would
                # yield only the keys and break the pair unpacking below.
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # mixins come first so that they take precedence
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type; elements
        are looked up by lower-cased tag name and default to HtmlElement.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
493 494 ################################################################################ 495 # parsing 496 ################################################################################ 497
def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a whole document and return the resulting root.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are handed to ``etree.fromstring``.  Raises
    ``etree.ParserError`` when parsing yields no element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
506
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parse a run of HTML elements and return them as a list.

    Leading text that is not just whitespace becomes the first list
    item (a string).  With ``no_leading_text`` set, such text raises
    ``etree.ParserError`` instead, so the list only holds elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    prefix = html[:20].lstrip().lower()
    looks_complete = prefix.startswith('<html') or prefix.startswith('<!doctype')
    if not looks_complete:
        # wrap bare fragments so that the parser sees a full document
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading_text = body.text
    elements = []
    if leading_text and leading_text.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading_text)
        elements.append(leading_text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
540
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element and return it.

    ``etree.ParserError`` is raised when no element is found, when
    there is more than one, or when non-whitespace text surrounds the
    element.

    A true ``create_parent`` wraps the HTML in a parent element first:
    the given tag name if it is a string, ``div`` otherwise.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if create_parent:
        wrapper_tag = create_parent
        if not isinstance(wrapper_tag, basestring):
            wrapper_tag = 'div'
        wrapped = '<%s>%s</%s>' % (wrapper_tag, html, wrapper_tag)
        return fragment_fromstring(wrapped, parser=parser,
                                   base_url=base_url, **kw)
    found = fragments_fromstring(html, parser=parser, no_leading_text=True,
                                 base_url=base_url, **kw)
    if not found:
        raise etree.ParserError(
            "No elements found")
    if len(found) > 1:
        names = [_element_name(item) for item in found]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    element = found[0]
    trailing = element.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % trailing)
    element.tail = None
    return element
576
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.  The result is

    * the whole document when the input starts like a full document, when
      the parsed tree contains a head, or when it has neither head nor body;
    * the single child of the body when the body holds exactly one element
      and no surrounding non-whitespace text;
    * otherwise the body itself, renamed to ``div`` or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither head nor body: there is nothing to unwrap, and the
        # len(body) test below would fail on None.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
639
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The return value is a tree rather than an element; call
    ``parse(...).getroot()`` for the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    useful when parsing from a file-like object.
    """
    if parser is None:
        parser = html_parser
    tree = etree.parse(filename_or_url, parser, base_url=base_url, **kw)
    return tree
652
def _contains_block_level_tag(el):
    """
    Tell whether ``el`` or any node yielded by ``el.iter()`` has a tag
    listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    for node in el.iter():
        if _nons(node.tag) in defs.block_tags:
            return True
    return False
660
def _element_name(el):
    """
    Short name of a parsed item for error messages: ``'comment'``,
    ``'string'``, or the tag name without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
668 669 ################################################################################ 670 # form handling 671 ################################################################################ 672
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.

        Assigning a mapping sets every field it names and resets all
        other named fields to None.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        fields = self.fields
        # Work on a real list so that remove() is available whatever
        # kind of object keys() returns.
        prev_keys = list(fields.keys())
        # items() is available on Python 2 and 3 (iteritems() is not).
        for key, item in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = item
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Identify the form for use in reprs: its ``name``, else ``#`` plus
        its ``id``, else its index among the form children of the body.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        forms = self.body.findall('form')
        if not forms:
            forms = self.body.findall('{%s}form' % XHTML_NAMESPACE)
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Skipped are unnamed inputs, unchecked checkable inputs, inputs
        of type submit/image/reset, and fields whose value is None.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the action is returned joined
        to it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  With the default opener the
    result is a file-like object, as from ``urllib.urlopen()``, whose
    ``.geturl()`` shows the final URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    pairs) is appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method, the URL is the form's action, and
    the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    fields = form.form_values()
    if extra_values:
        extras = extra_values
        if hasattr(extras, 'items'):
            extras = extras.items()
        fields.extend(extras)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    return opener(form.method, form.action, fields)
812
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form()``, based on the standard library.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns the file-like response object of ``urlopen()``.

    Raises ValueError when ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:
        # Python 3: the functions moved into urllib submodules
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3 requires the request body to be bytes
            data = data.encode('ASCII')
    return urlopen(url, data)
826
class FieldsDict(DictMixin):
    """
    Dictionary-style view on the values of a form's inputs: reading
    and writing an item reads and writes the ``value`` of the input
    with that name.  Items cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        names = self.inputs.keys()
        return names

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)
847
class InputGetter(object):

    """
    Accessor for the input fields (``input``, ``select`` and
    ``textarea`` elements) of a form.

    ``form.inputs['field_name']`` looks a field up by name.  Several
    radio buttons or several checkboxes sharing one name come back as
    a list-like `RadioGroup` or `CheckboxGroup`, which also supports
    setting the value.

    Iterating yields every input element individually, so grouped
    radio buttons and checkboxes show up one by one -- unlike the
    per-name lookup.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        first = matches[0]
        if len(matches) > 1:
            # only the type of the first match decides about grouping
            input_type = first.get('type')
            group_class = None
            if input_type == 'radio':
                group_class = RadioGroup
            elif input_type == 'checkbox':
                group_class = CheckboxGroup
            if group_class is not None:
                group = group_class(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        return bool(matches)

    def keys(self):
        names = set()
        for field in self:
            names.add(field.name)
        # unnamed inputs have no key
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
913
class InputMixin(object):

    """
    Shared behaviour of the form field elements (input, select, and
    textarea).
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        type_part = ''
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
941
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  ``.name`` gives the field name and
    ``.value`` reads or writes the field value.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        text = self.text
        if text:
            return text
        return ''

    def _value__set(self, value):
        self.text = value

    def _value__del(self):
        self.text = ''
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  ``.name`` gives the field name.

    ``.value`` is the value of the selected option; for a multi-select
    element (``<select multiple>``) it is a set-like object instead.
    ``.value_options`` lists the possible values in both cases.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        options = _options_xpath(self)
        chosen = None
        if value is not None:
            # find the option first, so that nothing changes on failure
            for option in options:
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in options:
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
            return
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        values = []
        for option in _options_xpath(self):
            values.append(option.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']
    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def __iter__(self):
        """
        Iterate over the values of the currently selected options.
        """
        for option in self.options:
            # Only selected options are members of the set; yielding every
            # option would make the set claim unselected values (which
            # remove() then refuses to unselect).
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose value is ``item``; raises
        ValueError when there is no such option.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``; raises
        ValueError when that option is not selected or does not exist.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is not option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1093
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the list behaviour, the ``.value`` property checks and
    unchecks the inputs, and ``.value_options`` gives the possible
    values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        chosen = None
        if value is not None:
            # find the radio first, so that nothing changes on failure
            for radio in self:
                if radio.get('value') == value:
                    chosen = radio
                    break
            if chosen is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if chosen is not None:
            chosen.set('checked', '')

    def _value__del(self):
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, list.__repr__(self))
1145
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection;
        assigning something without ``__iter__`` raises ValueError and
        leaves the checkboxes untouched.  Deleting clears the selection.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before clearing, so that a rejected argument does not
        # silently uncheck every box in the group.
        if not hasattr(value, '__iter__'):
            # An empty group has no element to take the name from.
            if self:
                name = self[0].name
            else:
                name = None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        values = self.value
        values.clear()
        values.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1177
class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group
    of same-named checkboxes.  Adding or removing a value checks or
    unchecks the matching checkbox element.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the complete list up front and iterate over that
        # snapshot rather than over the live elements.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose ``value`` equals ``value``;
        raise KeyError when there is none.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            box.set('checked', '')
            return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose ``value`` equals ``value``;
        raise KeyError when it is already unchecked or does not exist.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        cls_name = self.__class__.__name__
        rendered = ', '.join([repr(v) for v in self])
        # NOTE(review): relies on the group object exposing ``name``;
        # presumably set by whoever builds the group -- confirm.
        group_name = self.group.name
        return '<%s {%s} for checkboxes name=%r>' % (
            cls_name, rendered, group_name)
1220
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type (lower-cased, ``'text'`` when the
    attribute is absent) and ``.value`` reads or writes the value.

    Checkboxes and radios report ``input.checkable == True`` (every
    other type reports false) and additionally offer the boolean
    attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a ``value`` reports
        ``'on'``; an unchecked checkbox or radio reports None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # Only string values are written to the ``value`` attribute; any
        # other true value just checks the input.
        if isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the lower-cased ``type`` attribute (``'text'`` if unset).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute reflecting the presence of the ``checked``
        attribute; assigning to it adds or removes that attribute.

        Only available on checkable input types; any other type raises
        AttributeError.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set,
                       doc=_checked__get.__doc__)

# Record InputElement under the 'input' key of the lookup's default
# class table (presumably consulted when elements are created for
# ``<input>`` tags -- confirm against HtmlElementClassLookup).
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.

        Reading returns None when the label has no (or an empty)
        ``for`` attribute; otherwise the id is looked up through
        ``self.body.get_element_by_id``.  Assigning an element stores
        its ``id`` in ``for`` and raises TypeError when the element has
        no id.  Deleting removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): what happens for an id that matches no element
        # is decided by get_element_by_id, which is not visible here --
        # confirm whether it returns None or raises.
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link is stored in ``for`` (see getter/setter); the label's
        # own ``id`` attribute must not be touched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set,
                           _for_element__del,
                           doc=_for_element__get.__doc__)

# Record LabelElement under the 'label' key of the lookup's default
# class table (presumably consulted when elements are created for
# ``<label>`` tags -- confirm against HtmlElementClassLookup).
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, in place.

    ``html`` may be an element or anything offering ``getroot()``.
    Nodes whose tag is not a string, and tags that already start with
    ``{``, are left alone.
    """
    try:
        root = html.getroot()
    except AttributeError:
        root = html
    ns_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter():
        name = node.tag
        if not isinstance(name, basestring):
            continue
        if name[0] == '{':
            continue
        node.tag = ns_prefix + name
1356
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree that
    carries it, in place.

    ``xhtml`` may be an element or anything offering ``getroot()``.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        root = xhtml
    ns_prefix = "{%s}" % XHTML_NAMESPACE
    skip = len(ns_prefix)
    # iter() is asked only for tags inside the XHTML namespace.
    for node in root.iter(ns_prefix + "*"):
        qualified = node.tag
        node.tag = qualified[skip:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub
# Byte-string twin of the pattern above.  On Python 3 a str pattern
# cannot be applied to bytes, so serialised bytes need their own regex.
__replace_meta_content_type_bytes = re.compile(
    r'<meta http-equiv="Content-Type".*?>'.encode('ascii')).sub


def _strip_meta_content_type(html):
    """Remove ``<meta http-equiv="Content-Type" ...>`` tags from
    serialised markup, whether it is a byte string or a text string.
    """
    if isinstance(html, bytes):
        return __replace_meta_content_type_bytes(''.encode('ascii'), html)
    return __replace_meta_content_type('', html)


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Unless ``include_meta_content_type`` is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag found in the
    serialised output is removed before it is returned.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding)
    if not include_meta_content_type:
        # The result is bytes or text depending on ``encoding``; the
        # helper picks the regex of the matching type.
        html = _strip_meta_content_type(html)
    return html

# Rewrite the string prefixes in the docstring examples: __fix_docstring
# drops a leading ``u'`` on Python 3 and a leading ``b'`` on Python 2.
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be anything with a ``write(filename, method=...)``
    method; otherwise it is wrapped in an ``etree.ElementTree`` first.
    The temporary file is deliberately left on disk for the browser to
    read, and its ``file://`` URL is printed.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # os.tempnam() is race-prone and no longer exists on Python 3;
    # mkstemp() creates the file atomically.  The descriptor is closed
    # right away because the document is written by file name below.
    handle, fn = tempfile.mkstemp(suffix='.html')
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1435 1436 ################################################################################ 1437 # configure Element class lookup 1438 ################################################################################ 1439
class HTMLParser(etree.HTMLParser):
    """
    ``etree.HTMLParser`` subclass that installs an
    ``HtmlElementClassLookup`` as its element class lookup.
    """

    def __init__(self, **kwargs):
        # All keyword arguments go straight to the base parser.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1444
class XHTMLParser(etree.XMLParser):
    """
    ``etree.XMLParser`` subclass that installs an
    ``HtmlElementClassLookup`` as its element class lookup.
    """

    def __init__(self, **kwargs):
        # All keyword arguments go straight to the base parser.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1449
def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are handed to ``html_parser.makeelement``.  This can
    also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances; Element() builds its elements through
# html_parser.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
