Package lxml :: Package html
[hide private]
[frames] | [no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  try: 
   7      from urlparse import urljoin 
   8  except ImportError: 
   9      # Python 3 
  10      from urllib.parse import urljoin 
  11  import copy 
  12  from lxml import etree 
  13  from lxml.html import defs 
  14  from lxml import cssselect 
  15  from lxml.html._setmixin import SetMixin 
  16  try: 
  17      from UserDict import DictMixin 
  18  except ImportError: 
  19      # DictMixin was introduced in Python 2.4 
  20      from lxml.html._dictmixin import DictMixin 
  21  try: 
  22      set 
  23  except NameError: 
  24      # Python 2.3 
  25      from sets import Set as set 
  26  try: 
  27      bytes = __builtins__["bytes"] 
  28  except (KeyError, NameError): 
  29      # Python < 2.6 
  30      bytes = str 
  31  try: 
  32      unicode = __builtins__["unicode"] 
  33  except (KeyError, NameError): 
  34      # Python 3 
  35      unicode = str 
  36  try: 
  37      basestring = __builtins__["basestring"] 
  38  except (KeyError, NameError): 
  39      # Python 3 
  40      basestring = (str, bytes) 
  41   
def __fix_docstring(s):
    """Strip version-specific string prefixes from doctest output lines.

    On Python 3 a ``u'`` that starts a line (after optional whitespace) is
    rewritten to ``'``; on Python 2 the same is done for ``b'``.  A falsy
    argument (``None`` or the empty string) is returned unchanged.
    """
    if s:
        import sys
        if sys.version_info[0] >= 3:
            prefix = "u"
        else:
            prefix = "b"
        strip_prefix = re.compile(r"^(\s*)" + prefix + "'", re.M).sub
        return strip_prefix(r"\1'", s)
    return s
# Public API of the package.  ('open_in_browser' used to be listed twice.)
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML
# element and its XHTML-namespaced counterpart (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Whole-word match of $class_name inside the whitespace-normalised @class.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# ``url(...)`` in CSS, with the argument double-quoted, single-quoted or bare.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Runs of non-space characters.
_archive_re = re.compile(r'[^ ]+')

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(s[1:-1], pos + 1)`` when ``s`` both starts and ends with the
    same quote character (``"`` or ``'``); otherwise returns ``(s, pos)``
    unchanged.
    """
    first, last = s[:1], s[-1:]
    for quote in ('"', "'"):
        if first == quote and last == quote:
            return s[1:-1], pos + 1
    return s, pos
81
def _transform_result(typ, result):
    """Convert ``result`` back into the kind of object the caller passed in.

    ``typ`` is the type of the original input.  For a bytes type the result
    is serialised with ``tostring`` as UTF-8, for a unicode type it is
    serialised to unicode text, and for any other type ``result`` is
    returned as is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = unicode
    else:
        return result
    return tostring(result, encoding=encoding)
91
def _nons(tag):
    """Return ``tag`` with a leading ``{XHTML_NAMESPACE}`` qualifier removed.

    Tags that are not strings, or that are not in the XHTML namespace, are
    returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
97
98 -class HtmlMixin(object):
99
100 - def base_url(self):
101 """ 102 Returns the base URL, given when the page was parsed. 103 104 Use with ``urlparse.urljoin(el.base_url, href)`` to get 105 absolute URLs. 106 """ 107 return self.getroottree().docinfo.URL
108 base_url = property(base_url, doc=base_url.__doc__) 109
110 - def forms(self):
111 """ 112 Return a list of all the forms 113 """ 114 return _forms_xpath(self)
115 forms = property(forms, doc=forms.__doc__) 116
117 - def body(self):
118 """ 119 Return the <body> element. Can be called from a child element 120 to get the document's head. 121 """ 122 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
123 body = property(body, doc=body.__doc__) 124
125 - def head(self):
126 """ 127 Returns the <head> element. Can be called from a child 128 element to get the document's head. 129 """ 130 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
131 head = property(head, doc=head.__doc__) 132
133 - def _label__get(self):
134 """ 135 Get or set any <label> element associated with this element. 136 """ 137 id = self.get('id') 138 if not id: 139 return None 140 result = _label_xpath(self, id=id) 141 if not result: 142 return None 143 else: 144 return result[0]
145 - def _label__set(self, label):
146 id = self.get('id') 147 if not id: 148 raise TypeError( 149 "You cannot set a label for an element (%r) that has no id" 150 % self) 151 if _nons(label.tag) != 'label': 152 raise TypeError( 153 "You can only assign label to a label element (not %r)" 154 % label) 155 label.set('for', id)
156 - def _label__del(self):
157 label = self.label 158 if label is not None: 159 del label.attrib['for']
160 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__) 161
162 - def drop_tree(self):
163 """ 164 Removes this element from the tree, including its children and 165 text. The tail text is joined to the previous element or 166 parent. 167 """ 168 parent = self.getparent() 169 assert parent is not None 170 if self.tail: 171 previous = self.getprevious() 172 if previous is None: 173 parent.text = (parent.text or '') + self.tail 174 else: 175 previous.tail = (previous.tail or '') + self.tail 176 parent.remove(self)
177
178 - def drop_tag(self):
179 """ 180 Remove the tag, but not its children or text. The children and text 181 are merged into the parent. 182 183 Example:: 184 185 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 186 >>> h.find('.//b').drop_tag() 187 >>> print(tostring(h, encoding=unicode)) 188 <div>Hello World!</div> 189 """ 190 parent = self.getparent() 191 assert parent is not None 192 previous = self.getprevious() 193 if self.text and isinstance(self.tag, basestring): 194 # not a Comment, etc. 195 if previous is None: 196 parent.text = (parent.text or '') + self.text 197 else: 198 previous.tail = (previous.tail or '') + self.text 199 if self.tail: 200 if len(self): 201 last = self[-1] 202 last.tail = (last.tail or '') + self.tail 203 elif previous is None: 204 parent.text = (parent.text or '') + self.tail 205 else: 206 previous.tail = (previous.tail or '') + self.tail 207 index = parent.index(self) 208 parent[index:index+1] = self[:]
209 217
218 - def find_class(self, class_name):
219 """ 220 Find any elements with the given class name. 221 """ 222 return _class_xpath(self, class_name=class_name)
223
224 - def get_element_by_id(self, id, *default):
225 """ 226 Get the first element in a document with the given id. If none is 227 found, return the default argument if provided or raise KeyError 228 otherwise. 229 230 Note that there can be more than one element with the same id, 231 and this isn't uncommon in HTML documents found in the wild. 232 Browsers return only the first match, and this function does 233 the same. 234 """ 235 try: 236 # FIXME: should this check for multiple matches? 237 # browsers just return the first one 238 return _id_xpath(self, id=id)[0] 239 except IndexError: 240 if default: 241 return default[0] 242 else: 243 raise KeyError(id)
244
245 - def text_content(self):
246 """ 247 Return the text content of the tag (and the text in any children). 248 """ 249 return _collect_string_content(self)
250
251 - def cssselect(self, expr):
252 """ 253 Run the CSS expression on this element and its children, 254 returning a list of the results. 255 256 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note 257 that pre-compiling the expression can provide a substantial 258 speedup. 259 """ 260 return cssselect.CSSSelector(expr)(self)
261 262 ######################################## 263 ## Link functions 264 ######################################## 265 285 self.rewrite_links(link_repl)
286
287 - def resolve_base_href(self):
288 """ 289 Find any ``<base href>`` tag in the document, and apply its 290 values to all links found in the document. Also remove the 291 tag once it has been applied. 292 """ 293 base_href = None 294 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE}) 295 for b in basetags: 296 base_href = b.get('href') 297 b.drop_tree() 298 if not base_href: 299 return 300 self.make_links_absolute(base_href, resolve_base_href=False)
301 359 406 407
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # ``name``: the method to look up on the parsed document/element.
        # ``copy``: default for whether element inputs are deep-copied first.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        ``doc`` may be an HTML string (parsed with ``fromstring``) or an
        element.  For element inputs the keyword ``copy`` overrides the
        instance default; passing ``copy`` with a string input raises
        TypeError.
        """
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # Must not be called ``copy``: that would shadow the ``copy``
            # module and make ``copy.deepcopy`` fail on a bool.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result

# Function forms of the HtmlMixin link methods.  The read-only ones work on
# the element passed in; the rewriting ones default to working on a copy.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
iterlinks = _MethodFunc('iterlinks', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the ``HtmlMixin`` methods."""
452
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the ``HtmlMixin`` methods."""
455
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class that also carries the ``HtmlMixin`` methods."""
458
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity class that also carries the ``HtmlMixin`` methods."""
461 462
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as a sequence of
    ``(tag name, mixin class)`` pairs.
    """
    # Registry of tag name -> Element class, shared by all instances that
    # do not pass their own ``classes``.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys, so
            # normalise mappings to (name, mixin) pairs first.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup."""
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

503 504 ################################################################################ 505 # parsing 506 ################################################################################ 507
def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a whole document and return its root element.

    Uses the module's ``html_parser`` unless ``parser`` is given; extra
    keyword arguments are passed on to ``etree.fromstring``.  Raises
    ``etree.ParserError`` if parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
516
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Match on the whole leading whitespace run: a fixed-size slice misses
    # documents that start with a lot of whitespace.
    if not re.match(r'\s*<(?:html|!doctype)', html, re.I):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    has_leading_text = body.text and body.text.strip()
    if no_leading_text and has_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    if has_leading_text:
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
550
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if create_parent:
        # Any true non-string value means "use the default wrapper tag".
        if isinstance(create_parent, basestring):
            wrapper_tag = create_parent
        else:
            wrapper_tag = 'div'
        wrapped = '<%s>%s</%s>' % (wrapper_tag, html, wrapper_tag)
        return fragment_fromstring(wrapped, parser=parser,
                                   base_url=base_url, **kw)
    elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
                                    base_url=base_url, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % names)
    el = elements[0]
    tail = el.tail
    if tail and tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % tail)
    el.tail = None
    return el
586
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # Match on the whole leading whitespace run: a fixed 10-character slice
    # misses e.g. '<!doctype' preceded by two or more whitespace characters.
    is_full_html = re.match(r'\s*<(?:html|!doctype)', html, re.I)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        # Looks like a full HTML document
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither head nor body: nothing to unwrap, hand back the document
        # instead of failing on len(None) below.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
649
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
662
def _contains_block_level_tag(el):
    """Tell whether ``el`` or anything below it has a tag in ``defs.block_tags``."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.iter():
        if _nons(node.tag) in block_tags:
            return True
    return False
670
def _element_name(el):
    """Return a short label for ``el``, for use in error messages.

    Comments give ``'comment'``, strings give ``'string'``, anything else
    gives its tag name without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
678 679 ################################################################################ 680 # form handling 681 ################################################################################ 682
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        fields = self.fields
        # Copy into a list: entries are removed from it while assigning.
        prev_keys = list(fields.keys())
        # ``items()`` works on Python 2 and 3 mappings; ``iteritems()``
        # exists on Python 2 only.
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        # Every field that was not mentioned is reset.
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        # Identifier used in reprs: the name, else '#id', else the form's
        # index among the forms in the document body.
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed inputs, unchecked checkable inputs, and inputs of type
        submit, image or reset are left out.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # One (name, value) pair per selected option.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the action is returned joined
        to it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method in upper case (``'GET'`` by default),
    the URL is the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    return open_http(form.method, form.action, values)
822
def open_http_urllib(method, url, values):
    """
    Default requester for ``submit_form``, built on urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns whatever ``urlopen`` returns.  Raises ValueError if ``url`` is
    empty or None.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        # On Python 3 urlencode() returns text, but urlopen() only accepts
        # a bytes request body.
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)
840
class FieldsDict(DictMixin):
    """
    Dictionary-style view of a form's field values, backed by an
    inputs accessor: reading or assigning a key reads or assigns the
    ``.value`` of the input with that name.  Keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        details = (self.__class__.__name__, self.inputs.form._name())
        return '<%s for form %s>' % details

861
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # Both expressions test local-name() so that namespaced (XHTML)
    # elements match as well.
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        details = (self.__class__.__name__, self.form._name())
        return '<%s for form %s>' % details

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        # The first match decides how a set of same-named inputs is grouped.
        input_type = matches[0].get('type')
        if len(matches) > 1 and input_type in ('radio', 'checkbox'):
            if input_type == 'radio':
                group = RadioGroup(matches)
            else:
                group = CheckboxGroup(matches)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        if self._name_xpath(self.form, name=name):
            return True
        return False

    def keys(self):
        names = set()
        for el in self:
            names.add(el.name)
        # Unnamed inputs are not addressable by key.
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)

927
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting a missing name is a no-op rather than an error.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Only classes that define ``type`` show it in their repr.
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        else:
            type_part = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)

955
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        text = self.text
        if text:
            return text
        return ''

    def _value__set(self, value):
        self.text = value

    def _value__del(self):
        # Deleting the value empties the textarea.
        self.text = ''

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # An option without a value attribute uses its text instead.
            selected = option.get('value')
            if selected is None:
                selected = option.text or ''
            if selected:
                selected = selected.strip()
            return selected
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        chosen = None
        if value is not None:
            value = value.strip()
            # Find the option first, so that nothing is changed when the
            # value turns out to be unknown.
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = option.text or ''
                if candidate:
                    candidate = candidate.strip()
                if candidate == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements.
        """
        collected = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = option.text or ''
            if candidate:
                candidate = candidate.strip()
            collected.append(candidate)
        return collected
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def _option_value(option):
        # Same rule as SelectElement: an option without a ``value``
        # attribute is identified by its (stripped) text.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = option.text or ''
        if opt_value:
            opt_value = opt_value.strip()
        return opt_value
    _option_value = staticmethod(_option_value)

    def __iter__(self):
        # Only the *selected* options are members of the set; yielding
        # every option would make iteration report unselected values.
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """Select the option with the value ``item``; ValueError if none."""
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the option with the value ``item``.  Raises ValueError if
        there is no such option or if it is not currently selected.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1123
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # Locate the radio first so an unknown value changes nothing.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio in the group.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__,
            list.__repr__(self))

1175
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before mutating: a rejected value must not leave the
        # group with every checkbox already unchecked.
        if not hasattr(value, '__iter__'):
            # An empty group has no element to take the name from.
            if self:
                name = self[0].name
            else:
                name = None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        # Deleting the value unchecks every checkbox in the group.
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1207
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # The group (an iterable of checkbox elements) this set views.
        self.group = group

    def __iter__(self):
        """
        Iterate over the ``value`` of every checkbox that has the
        ``checked`` attribute.
        """
        return iter([
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib])

    def add(self, value):
        """
        Check the first checkbox whose ``value`` equals ``value``.

        Raises KeyError if the group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose ``value`` equals ``value``.

        Raises KeyError if that checkbox is already unchecked, or if the
        group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        "The checkbox with value %r was already unchecked" % value)
                break
        else:
            raise KeyError(
                "No checkbox with value %r" % value)

    def __repr__(self):
        group = self.group
        # A list-based group has no ``name`` attribute of its own, so
        # fall back to the name of its first checkbox instead of raising
        # AttributeError from repr().
        name = getattr(group, 'name', None)
        if name is None:
            for el in group:
                name = getattr(el, 'name', None)
                break
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            name)
1250
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the lower-cased ``type`` attribute, defaulting to
    ``'text'``.

    ``.value`` reads and writes the element's value.

    For checkboxes and radios ``input.checkable`` is true (false for
    every other type), and the boolean ``.checked`` attribute reflects
    the ``checked`` attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truthiness of ``value`` drives the
        # checked state; only a string also replaces the value attribute.
        turn_on = bool(value)
        self.checked = turn_on
        if turn_on and isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ['checkbox', 'radio']
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

# Map the 'input' tag name to InputElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # Pass an explicit default so a dangling ``for`` reference gives
        # None, as documented, instead of raising KeyError.
        return self.body.get_element_by_id(target_id, None)

    def _for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link is stored in ``for`` (see getter/setter); the label's
        # own ``id`` attribute must be left alone.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

# Map the 'label' tag name to LabelElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every plain HTML tag in the tree into the XHTML namespace.

    ``html`` may be an element or anything with a ``getroot()`` method.
    Tags that already start with ``{`` and nodes whose tag is not a
    string are left untouched.  The tree is modified in place.
    """
    try:
        root = html.getroot()
    except AttributeError:
        root = html
    ns_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter():
        name = node.tag
        if not isinstance(name, basestring) or name[0] == '{':
            continue
        node.tag = ns_prefix + name
1386
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag in the tree.

    ``xhtml`` may be an element or anything with a ``getroot()``
    method.  Only elements in the XHTML namespace are renamed; the tree
    is modified in place.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        root = xhtml
    ns_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(ns_prefix)
    for node in root.iter(ns_prefix + "*"):
        node.tag = node.tag[cut:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Two bound ``sub`` functions over the same pattern: one compiled from
# the text pattern, one from its ASCII-encoded form, so that tostring()
# can strip the tag from either kind of serialised output.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialized = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialized
    # Pick the substitution whose pattern type matches the output type.
    if isinstance(serialized, str):
        return __str_replace_meta_content_type('', serialized)
    return __bytes_replace_meta_content_type(bytes(), serialized)

# Run the docstring through __fix_docstring, which strips the u'/b'
# prefix (depending on the Python major version) from the start of the
# example output lines.
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be an object with a ``write()`` method or a bare
    element, which is wrapped in an ``ElementTree`` first.  The
    temporary file is not deleted afterwards, and its ``file://`` URL
    is printed before the browser is launched.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # os.tempnam() is race-prone and does not exist on Python 3;
    # mkstemp() creates the file securely and returns its path.
    handle, fn = tempfile.mkstemp(suffix='.html')
    # The document is written by path below, so release the descriptor.
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1472 1473 ################################################################################ 1474 # configure Element class lookup 1475 ################################################################################ 1476
class HTMLParser(etree.HTMLParser):
    """``etree.HTMLParser`` subclass that installs an
    ``HtmlElementClassLookup`` as its element class lookup.
    """

    def __init__(self, **kwargs):
        # Keyword arguments are passed through to the base parser.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1481
class XHTMLParser(etree.XMLParser):
    """``etree.XMLParser`` subclass that installs an
    ``HtmlElementClassLookup`` as its element class lookup.
    """

    def __init__(self, **kwargs):
        # Keyword arguments are passed through to the base parser.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1486
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    handed to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances created with default options;
# Element() builds its elements through html_parser.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
