Package lxml :: Package html
[hide private]
[frames] | [no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  try: 
   7      from urlparse import urljoin 
   8  except ImportError: 
   9      # Python 3 
  10      from urllib.parse import urljoin 
  11  import copy 
  12  from lxml import etree 
  13  from lxml.html import defs 
  14  from lxml import cssselect 
  15  from lxml.html._setmixin import SetMixin 
  16  try: 
  17      from UserDict import DictMixin 
  18  except ImportError: 
  19      # DictMixin was introduced in Python 2.4 
  20      from lxml.html._dictmixin import DictMixin 
  21  try: 
  22      set 
  23  except NameError: 
  24      # Python 2.3 
  25      from sets import Set as set 
  26  try: 
  27      bytes = __builtins__["bytes"] 
  28  except (KeyError, NameError): 
  29      # Python < 2.6 
  30      bytes = str 
  31  try: 
  32      unicode = __builtins__["unicode"] 
  33  except (KeyError, NameError): 
  34      # Python 3 
  35      unicode = str 
  36  try: 
  37      basestring = __builtins__["basestring"] 
  38  except (KeyError, NameError): 
  39      # Python 3 
  40      basestring = (str, bytes) 
  41   
def __fix_docstring(s):
    """Strip a string-literal prefix from the start of lines in *s*.

    On Python 3 a ``u'`` at the beginning of a line (after optional
    whitespace) is rewritten to ``'``; on Python 2 the same is done for
    ``b'``.  Empty or ``None`` input is returned unchanged.
    """
    import sys
    if not s:
        return s
    if sys.version_info[0] >= 3:
        pattern = r"^(\s*)u'"
    else:
        pattern = r"^(\s*)b'"
    # re.M makes ``^`` match at the start of every line, not only the first.
    return re.compile(pattern, re.M).sub(r"\1'", s)

# Public names of this module ('open_in_browser' was listed twice in the
# original list; the duplicate entry has been removed).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Where a tag name is matched, both the
# un-namespaced and the XHTML-namespaced (prefix ``x``) form are covered.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose whitespace-separated ``class`` attribute contains
# the ``$class_name`` token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# ``url(...)`` in CSS text (case-insensitive); group 1 is the argument.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# ``@import "..."`` in CSS text; group 1 is the quoted target.
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# A run of non-space characters.
_archive_re = re.compile(r'[^ ]+')

def _transform_result(typ, result):
    """Convert *result* back into the kind of object that was passed in.

    If *typ* is a byte-string type the result is serialised with
    ``tostring`` as UTF-8; if it is a unicode type it is serialised to
    unicode.  For any other *typ*, *result* is returned unchanged.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = unicode
    else:
        return result
    return tostring(result, encoding=encoding)
85
def _nons(tag):
    """Return *tag* without a leading XHTML namespace.

    String tags of the form ``{<XHTML namespace>...}name`` are reduced to
    the part after the last ``}``.  Everything else, including non-string
    tags, is returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
91
92 -class HtmlMixin(object):
93
def base_url(self):
    """
    The base URL of the document, read from the root tree's
    ``docinfo.URL``.

    Use with ``urlparse.urljoin(el.base_url, href)`` to get
    absolute URLs.
    """
    tree = self.getroottree()
    return tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    All ``<form>`` elements found at or below this element, as a list.
    """
    found = _forms_xpath(self)
    return found
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    The first match of a document-wide search is returned; indexing
    the search result raises IndexError when there is no match.
    """
    # The original docstring said "head" here, a copy/paste slip that was
    # also exposed as the property's documentation.
    return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    The document's <head> element.  May be read from any element of
    the document, since the search is document-wide.
    """
    matches = self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})
    return matches[0]
head = property(head, doc=head.__doc__)
def _label__get(self):
    """
    Get or set any <label> element associated with this element.
    """
    element_id = self.get('id')
    if element_id:
        labels = _label_xpath(self, id=element_id)
        if labels:
            return labels[0]
    # No id, or no label pointing at it.
    return None

def _label__set(self, label):
    element_id = self.get('id')
    if not element_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if _nons(label.tag) != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    # Link the label to this element through its ``for`` attribute.
    label.set('for', element_id)

def _label__del(self):
    current = self.label
    if current is None:
        return
    del current.attrib['for']

label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
def drop_tree(self):
    """
    Remove this element, with its children and text, from its parent.

    Any tail text is kept: it is appended to the tail of the previous
    sibling or, when there is no previous sibling, to the parent's text.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
171
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print(tostring(h, encoding=unicode))
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()

    def _merge_before(chunk):
        # Attach text to whatever directly precedes this element.
        if previous is None:
            parent.text = (parent.text or '') + chunk
        else:
            previous.tail = (previous.tail or '') + chunk

    if self.text and isinstance(self.tag, basestring):
        # not a Comment, etc.
        _merge_before(self.text)
    if self.tail:
        if len(self):
            # The tail follows the last child once the children move up.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + self.tail
        else:
            _merge_before(self.tail)
    position = parent.index(self)
    # Replace this element in the parent with its own children.
    parent[position:position+1] = self[:]
203 211
def find_class(self, class_name):
    """
    Return the elements at or below this one whose ``class``
    attribute contains *class_name* as one of its tokens.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
217
def get_element_by_id(self, id, *default):
    """
    Return the first element at or below this one with the given id.

    When nothing matches, the first extra positional argument is
    returned if one was given; otherwise KeyError is raised.

    Ids are not guaranteed to be unique in real-world HTML.  Like
    browsers, this function only returns the first match.
    """
    # FIXME: should this check for multiple matches?
    # browsers just return the first one
    matches = _id_xpath(self, id=id)
    try:
        first = matches[0]
    except IndexError:
        if not default:
            raise KeyError(id)
        return default[0]
    return first
238
def text_content(self):
    """
    Return the XPath ``string()`` value of this element, i.e. its text
    together with the text of its children.
    """
    content = _collect_string_content(self)
    return content
244
def cssselect(self, expr):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
    that pre-compiling the expression can provide a substantial
    speedup.
    """
    # The original docstring named a ``CSSSelect`` class; the class
    # actually used below is ``CSSSelector``.
    return cssselect.CSSSelector(expr)(self)
255 256 ######################################## 257 ## Link functions 258 ######################################## 259 279 self.rewrite_links(link_repl)
280
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to all of its links.

    Every ``<base>`` tag carrying an ``href`` is removed from the
    document.  If more than one exists, the href of the last one is
    used.  Nothing else happens when no non-empty href was found.
    """
    href = None
    for base_tag in self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE}):
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        # The base tags are already gone, so do not resolve them again.
        self.make_links_absolute(href, resolve_base_href=False)
295 351 398 399
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        """
        :param name: name of the method to look up on the document.
        :param copy: default for whether element inputs are deep-copied
            before the method is called.
        :param source_class: class whose method of that name supplies
            the docstring of this object.
        """
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on *doc*.

        String input is parsed with ``fromstring`` first and may not be
        combined with the ``copy`` keyword (TypeError).  Element input
        is deep-copied first when ``copy`` (keyword, or the instance
        default) is true.
        """
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # Keep the flag in its own name: binding it to ``copy`` (as the
            # original code did) shadows the ``copy`` module, so that
            # ``copy.deepcopy`` was looked up on a bool and failed.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result

# Function forms of the HtmlMixin methods of the same name.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the ``HtmlMixin`` methods."""
444
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the ``HtmlMixin`` methods."""
447
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class that also carries the ``HtmlMixin`` methods."""
450
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity class that also carries the ``HtmlMixin`` methods."""
453 454
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> Element class; filled in by the element classes of this
    # module as they are defined.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # The docstring promises a mapping, but iterating a dict yields
            # only its keys, so the ``name, value`` unpacking below would
            # fail.  Use the items of a mapping; plain iterables of pairs
            # (what the original loop required) are still accepted as-is.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # Applies to every tag that currently has a class.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup."""
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
495 496 ################################################################################ 497 # parsing 498 ################################################################################ 499
def document_fromstring(html, parser=None, **kw):
    """
    Parse *html* with ``etree.fromstring`` and return the result.

    The module's ``html_parser`` is used when no *parser* is given;
    extra keyword arguments are passed on to ``etree.fromstring``.
    Raises ``etree.ParserError`` if parsing yields ``None``.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
508
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parse a sequence of HTML elements and return them as a list.

    If the input starts with text that is not just whitespace, that
    text is the first list item (a string).  With ``no_leading_text``
    set, such text raises ``etree.ParserError`` instead, so the list
    then only ever contains elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    looks_like_document = (start.startswith('<html')
                           or start.startswith('<!doctype'))
    if not looks_like_document:
        # Wrap bare fragments so that the parser sees a whole document.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading = body.text
    if leading and leading.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading)
        elements = [leading]
    else:
        elements = []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
542
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element and return it.

    ``etree.ParserError`` is raised when no element or more than one
    element is found, or when non-whitespace text precedes or follows
    the element.

    If create_parent is true (or is a tag name) the input is wrapped
    in a parent element first (``div`` unless a tag name was given),
    and that parent is what gets returned.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if create_parent:
        if isinstance(create_parent, basestring):
            parent_tag = create_parent
        else:
            parent_tag = 'div'
        wrapped = '<%s>%s</%s>' % (parent_tag, html, parent_tag)
        return fragment_fromstring(wrapped,
                                   parser=parser, base_url=base_url, **kw)
    elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
                                    base_url=base_url, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        names = [_element_name(e) for e in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    el = elements[0]
    trailing = el.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % trailing)
    # Only whitespace (if anything) follows the element; discard it.
    el.tail = None
    return el
578
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The result is one of:

    * the parsed document, when the input starts like a full document,
      when a ``<head>`` was found, or when neither ``<head>`` nor
      ``<body>`` was found;
    * the single element inside ``<body>``, when there is exactly one
      and no surrounding non-whitespace text;
    * otherwise the ``<body>`` element itself, re-tagged as ``div`` (if
      it contains a block-level tag) or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body> was found.  The code below would call
        # len() on None and fail with a TypeError, so hand back the
        # parsed document as it is.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
641
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object with ``etree.parse``.

    Note that the result is a tree, not an element; call
    ``parse(...).getroot()`` to get the document root.

    The ``base_url`` keyword overrides the base URL, which is most
    useful when parsing from a file-like object.  The module's
    ``html_parser`` is used unless a *parser* is given.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
654
def _contains_block_level_tag(el):
    """
    Return True if any node yielded by ``el.iter()`` has a tag
    (ignoring the XHTML namespace) listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.iter():
        if _nons(node.tag) in block_tags:
            return True
    return False
662
def _element_name(el):
    """
    Return a short name for an item of a fragment list: ``'comment'``
    for comments, ``'string'`` for text, otherwise the tag name without
    the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
670 671 ################################################################################ 672 # form handling 673 ################################################################################ 674
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        # Assigning a mapping sets every field it names and resets all
        # other named fields to None.
        fields = self.fields
        # Work on a private list: keys are removed from it below.
        prev_keys = list(fields.keys())
        # ``items()`` exists on Python 2 and 3 alike; the original
        # ``iteritems()`` call fails on Python 3 dicts.  The loop variable
        # also no longer shadows the ``value`` argument.
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Return a string identifying this form: its ``name``, else
        ``#`` plus its ``id``, else its index among the ``form``
        elements found directly under the body.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        forms = self.body.findall('form')
        if not forms:
            forms = self.body.findall('{%s}form' % XHTML_NAMESPACE)
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed inputs, unchecked checkable inputs, inputs of type
        submit/image/reset and inputs whose value is None are left out.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # One (name, value) pair per value of the set.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, el.value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, el.value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            # Resolve the action against the document's base URL.
            return urljoin(base_url, action)
        else:
            return action

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def _method__set(self, value):
        self.set('method', value.upper())

    method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Have <form> tags parsed into FormElement instances.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Submit a form and return whatever the HTTP opener returns.

    With the default opener (``open_http_urllib``) that is a file-like
    object, as from ``urllib.urlopen()``.  This object also has a
    ``.geturl()`` function, which shows the URL if there were any
    redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    pairs) is appended to the values collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method, the URL is the form's action, and
    the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        pairs = extra_values
        if hasattr(pairs, 'items'):
            pairs = pairs.items()
        values.extend(pairs)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    return opener(form.method, form.action, values)
814
def open_http_urllib(method, url, values):
    """
    Default HTTP opener for ``submit_form``, built on the standard library.

    For ``'GET'`` the url-encoded *values* are appended to *url* as a
    query string; for any other method they are sent as the request
    body.  Returns the object produced by ``urlopen``.
    """
    # ``urlencode``/``urlopen`` live in ``urllib`` on Python 2 but were
    # moved on Python 3, where ``urllib.urlencode`` does not exist.
    try:
        from urllib import urlencode, urlopen
    except ImportError:
        # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    ## FIXME: should test that it's not a relative URL or something
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3 requires the request body to be bytes.
            data = data.encode('ASCII')
    return urlopen(url, data)
828
class FieldsDict(DictMixin):
    """
    Dictionary-like view of a form's fields, mapping each input name
    to the ``value`` of the corresponding entry of an inputs accessor.
    """

    def __init__(self, inputs):
        # The accessor (indexable by name) that backs this dictionary.
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Fields cannot be removed through this view.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (
            self.__class__.__name__,
            form_name)
849
class InputGetter(object):

    """
    Accessor for the input fields (``input``, ``select`` and
    ``textarea`` elements) of a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several radio inputs sharing a name come back as a `RadioGroup`,
    several checkboxes sharing a name as a `CheckboxGroup` (both are
    lists that also allow value setting).

    Iterating yields every input element individually, so it does not
    correspond one-to-one to the names returned by ``keys()``.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        form_name = self.form._name()
        return '<%s for form %s>' % (
            self.__class__.__name__,
            form_name)

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        first = results[0]
        if len(results) > 1:
            # The type of the first match decides how a group is wrapped.
            input_type = first.get('type')
            if input_type == 'radio':
                group = RadioGroup(results)
            elif input_type == 'checkbox':
                group = CheckboxGroup(results)
            else:
                # I don't like throwing away elements like this
                return first
            group.name = name
            return group
        return first

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        """Return the distinct names of the inputs; unnamed inputs are skipped."""
        names = set()
        for el in self:
            names.add(el.name)
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)
915
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea).

    Provides the ``name`` property and a ``repr`` showing the class
    name, object id, name and (if the object has a truthy ``type``
    attribute) the type.
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting a missing name is silently accepted.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        input_type = getattr(self, 'type', None)
        type_suffix = ''
        if input_type:
            type_suffix = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_suffix)
943
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  ``.name`` gives the name, ``.value``
    reads and writes the element's text.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        text = self.text
        if text:
            return text
        return ''

    def _value__set(self, value):
        self.text = value

    def _value__del(self):
        # Deleting the value empties the element.
        self.text = ''

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

# Have <textarea> tags parsed into TextareaElement instances.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, or, for a
    multi-select element (``<select multiple>``), a set-like object.
    ``.value_options`` lists the possible values in either case.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        options = _options_xpath(self)
        chosen = None
        if value is not None:
            # Find the option first, so that an unknown value leaves the
            # current selection untouched.
            for option in options:
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in options:
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        return [option.get('value') for option in _options_xpath(self)]
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

# Have <select> tags parsed into SelectElement instances.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def __iter__(self):
        # Only selected options are members of the set.  The original
        # code yielded the value of every option, so that iterating the
        # "selected options" reported unselected ones as well.
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """Select the first option whose value is *item* (ValueError if none)."""
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is *item*.

        Raises ValueError if there is no such option or if it is not
        currently selected.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is not option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1095
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides behaving like a list, it offers the ``.value`` property
    to check/uncheck inputs, and ``.value_options`` to get the
    possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # Locate the radio first, so that an unknown value leaves the
            # current state untouched.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (
            self.__class__.__name__,
            items)
1147
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before touching any checkbox, so that a rejected
        # assignment does not leave the whole group unchecked.
        # Strings are refused explicitly: whether they expose
        # ``__iter__`` differs between Python 2 and Python 3, and
        # iterating one would treat every character as a value.
        if isinstance(value, basestring) or not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked_values = self.value
        checked_values.clear()
        checked_values.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1179
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # ``group`` is the sequence of checkbox elements being viewed.
        self.group = group

    def __iter__(self):
        """Iterate over the values of the checkboxes that are checked."""
        return iter([
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib])

    def add(self, value):
        """
        Check the first checkbox whose value equals ``value``.

        Raises KeyError when the group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose value equals ``value``.

        Raises KeyError when that checkbox is already unchecked, or
        when the group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        "The checkbox with value %r was already unchecked" % value)
                break
        else:
            raise KeyError(
                "No checkbox with value %r" % value)

    def __repr__(self):
        group = self.group
        try:
            # Honour a group object that does provide its own name.
            name = group.name
        except AttributeError:
            # CheckboxGroup is a plain list without a ``name``; fall
            # back to the name of its first checkbox, the same source
            # CheckboxGroup uses in its own error message.
            if group:
                name = group[0].name
            else:
                name = None
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            name)
1222
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the lower-cased input type (``'text'`` when the
    attribute is absent), and ``.value`` reads or writes the value.

    For checkboxes and radios ``.checkable`` is true (false for every
    other type) and the boolean ``.checked`` reflects the ``checked``
    attribute.
    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reports ``'on'``;
        an unchecked one reports None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # Only a string replaces the stored value; any other true
        # value merely checks the input.
        if isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the lower-cased ``type`` attribute, ``'text'`` if unset.
        """
        declared_type = self.get('type', 'text')
        return declared_type.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: is this a checkbox or a radio input?
        """
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        Raises AttributeError on inputs that are not checkable.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

HtmlElementClassLookup._default_element_classes['input'] = InputElement
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        try:
            return self.body.get_element_by_id(target_id)
        except KeyError:
            # NOTE(review): per the lxml.html documentation,
            # get_element_by_id() raises KeyError for an unknown id
            # when no default is given; map that to the None promised
            # by the docstring above.
            return None

    def _for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link is stored in ``for`` (see the setter); the label's
        # own ``id`` attribute must be left alone.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, in place.

    Accepts either an element or an object with ``getroot()``.
    """
    try:
        root = html.getroot()
    except AttributeError:
        root = html
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in root.iter():
        name = element.tag
        # Skip nodes whose tag is not a string (presumably comments and
        # processing instructions) and tags that already carry a namespace.
        if isinstance(name, basestring) and name[0] != '{':
            element.tag = xhtml_prefix + name
1358
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree, in
    place, turning it into a plain HTML tree.

    Accepts either an element or an object with ``getroot()``.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        root = xhtml
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip_from = len(xhtml_prefix)
    # The "{namespace}*" filter restricts iteration to XHTML elements.
    for element in root.iter(xhtml_prefix + "*"):
        element.tag = element.tag[strip_from:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
# The same pattern compiled from bytes: on Python 3 a text pattern
# cannot be applied to the byte string that serialisation returns
# unless a unicode result was requested.
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding)
    if not include_meta_content_type:
        # Pick the pattern matching the type of the serialised result.
        if isinstance(html, unicode):
            html = __replace_meta_content_type('', html)
        else:
            html = __bytes_replace_meta_content_type(bytes(), html)
    return html

tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be anything with a ``write`` method, or an element,
    which is wrapped in an ``ElementTree`` for writing.  The temporary
    file is left on disk so that the browser can read it.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp() creates the file itself, unlike the race-prone
    # os.tempnam(), which no longer exists on Python 3.
    handle, fn = tempfile.mkstemp(suffix='.html')
    f = os.fdopen(handle, 'wb')
    try:
        write_doc(f, method="html")
    finally:
        # Only the handle is released; the file must outlive this call.
        f.close()
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1439 1440 ################################################################################ 1441 # configure Element class lookup 1442 ################################################################################ 1443
class HTMLParser(etree.HTMLParser):
    """An ``etree.HTMLParser`` that installs an ``HtmlElementClassLookup``
    as its element class lookup.
    """

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        element_lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(element_lookup)
1448
class XHTMLParser(etree.XMLParser):
    """An ``etree.XMLParser`` that installs an ``HtmlElementClassLookup``
    as its element class lookup.
    """

    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        element_lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(element_lookup)
1453
def Element(*args, **kw):
    """Create a new HTML Element through the module's default
    ``html_parser``; all arguments are passed on to ``makeelement()``.

    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)

# Default parser instances shared by the module.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
