Package lxml :: Package html
[frames | no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  import urlparse 
   7  import copy 
   8  from lxml import etree 
   9  from lxml.html import defs 
  10  from lxml import cssselect 
  11  from lxml.html._setmixin import SetMixin 
  12  try: 
  13      from UserDict import DictMixin 
  14  except ImportError: 
  15      # DictMixin was introduced in Python 2.4 
  16      from lxml.html._dictmixin import DictMixin 
  17  import sets 
  18   
  19  __all__ = [ 
  20      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  21      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  22      'find_rel_links', 'find_class', 'make_links_absolute', 
  23      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] 
  24   
  25  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") 
  26  #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 
  27  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  28  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  29  _collect_string_content = etree.XPath("string()") 
  30  _css_url_re = re.compile(r'url\((.*?)\)', re.I) 
  31  _css_import_re = re.compile(r'@import "(.*?)"') 
  32  _label_xpath = etree.XPath("//label[@for=$id]") 
  33  _archive_re = re.compile(r'[^ ]+') 
  34   
35 -class HtmlMixin(object):
36
def base_url(self):
    """
    The URL recorded in the docinfo of the tree this element belongs to
    (i.e. the base URL given when the page was parsed).

    Use with ``urlparse.urljoin(el.base_url, href)`` to get
    absolute URLs.
    """
    tree = self.getroottree()
    return tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    Return a list of all the <form> elements yielded by
    ``self.getiterator('form')``.
    """
    return [form for form in self.getiterator('form')]
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    The lookup uses the absolute XPath ``//body`` and returns the first
    match; indexing an empty result raises IndexError when the document
    has no <body>.
    """
    return self.xpath('//body')[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    Returns the <head> element.  Can be called from a child
    element to get the document's head.
    """
    matches = self.xpath('//head')
    return matches[0]
head = property(head, doc=head.__doc__)
def label__get(self):
    """
    Get or set any <label> element associated with this element.
    """
    element_id = self.get('id')
    if element_id:
        # Labels are matched through their ``for`` attribute.
        matches = _label_xpath(self, id=element_id)
        if matches:
            return matches[0]
    return None
def label__set(self, label):
    element_id = self.get('id')
    # A label can only point at an element through that element's id.
    if not element_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if label.tag != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', element_id)
def label__del(self):
    current = self.label
    if current is None:
        return
    # Detach the label by dropping its ``for`` attribute.
    del current.attrib['for']
label = property(label__get, label__set, label__del, doc=label__get.__doc__)
def drop_tree(self):
    """
    Removes this element from the tree, including its children and
    text.  The tail text is joined to the previous element or
    parent.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            # Tail text follows the preceding sibling once we are gone.
            sibling.tail = (sibling.tail or '') + tail
        else:
            # No preceding sibling: the tail becomes part of the parent's text.
            parent.text = (parent.text or '') + tail
    parent.remove(self)
114
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print tostring(h)
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()

    def merge_before(chunk):
        # Attach ``chunk`` to the text slot that directly precedes this
        # element: the previous sibling's tail, or the parent's text when
        # there is no previous sibling.
        if previous is None:
            parent.text = (parent.text or '') + chunk
        else:
            previous.tail = (previous.tail or '') + chunk

    if self.text and isinstance(self.tag, basestring):
        # not a Comment, etc.
        merge_before(self.text)
    if self.tail:
        if len(self):
            # With children, the tail follows the last child.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + self.tail
        else:
            merge_before(self.tail)
    # Splice the children into the parent at this element's position.
    position = parent.index(self)
    parent[position:position+1] = self[:]
146 154
def find_class(self, class_name):
    """
    Find any elements with the given class name, searching this element
    and its descendants with the pre-compiled class XPath.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
160
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        first = _id_xpath(self, id=id)[0]
    except IndexError:
        # Nothing matched: fall back to the optional default.
        if not default:
            raise KeyError(id)
        return default[0]
    else:
        return first
181
def text_content(self):
    """
    Return the text content of the tag (and the text in any children),
    computed as the XPath ``string()`` value of this element.
    """
    content = _collect_string_content(self)
    return content
187
def cssselect(self, expr):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
    that pre-compiling the expression can provide a substantial
    speedup.
    """
    # The selector is compiled on every call.
    selector = cssselect.CSSSelect(expr)
    return selector(self)
198 199 ######################################## 200 ## Link functions 201 ######################################## 202 222 self.rewrite_links(link_repl)
223
def resolve_base_href(self):
    """
    Find any ``<base href>`` tag in the document, and apply its
    values to all links found in the document.  Also remove the
    tag once it has been applied.
    """
    href = None
    for base_tag in self.xpath('//base[@href]'):
        # When several <base> tags exist the last one wins; all are removed.
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        # The base tags are gone already, so do not resolve them again.
        self.make_links_absolute(href, resolve_base_href=False)
238 293 340 341
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # name: name of the method to look up on the parsed/passed document.
        # copy: default for whether element inputs are deep-copied first.
        # source_class: class whose method of that name supplies the docstring.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        ``doc`` is either an HTML string (parsed with ``fromstring``, to
        which the keyword arguments are also passed) or an element.  For
        element input the keyword ``copy`` overrides the default given to
        the constructor; it is rejected with TypeError for string input.
        """
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be called ``copy``: that would shadow the
            # ``copy`` module and make ``copy.deepcopy`` fail on a bool.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # In-place method: hand back the document in the form we got it.
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result

# Function forms of the HtmlMixin link/class helpers.  Those that modify the
# document work on a copy by default.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the HtmlMixin helpers."""
390
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the HtmlMixin helpers."""
393
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that also carries the HtmlMixin helpers."""
396
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the HtmlMixin helpers."""
399 400
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as a sequence of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> element class; filled in by the element class definitions.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            # Work on a copy so the shared defaults are never modified.
            classes = self._default_element_classes.copy()
        if mixins:
            # The documented form is a mapping; iterating a dict directly
            # would yield only its keys, so normalise to pairs first.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # '*' applies to every tag that has a class registered
                    # in ``classes``.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first in the bases so they take precedence.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type, or None to
        fall back to the normal lookup.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
441 442 ################################################################################ 443 # parsing 444 ################################################################################ 445
def document_fromstring(html, **kw):
    """
    Parse ``html`` as a complete document with the module's ``html_parser``
    and return the result of ``etree.HTML``.

    Extra keyword arguments are passed on to ``etree.HTML``.  Raises
    ``etree.ParserError`` when parsing yields nothing.
    """
    root = etree.HTML(html, html_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
452
def fragments_fromstring(html, no_leading_text=False, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (only when the leading
    text is not pure whitespace).  If no_leading_text is true, then it
    will be an error if there is leading text, and it will always be a
    list of only elements.
    """
    # FIXME: check what happens when you give html with a body, head, etc.
    prefix = html[:20].lstrip().lower()
    looks_like_document = (prefix.startswith('<html')
                           or prefix.startswith('<!doctype'))
    if not looks_like_document:
        # Wrap bare fragments so the parser produces a predictable skeleton.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, **kw)
    assert doc.tag == 'html'
    bodies = [child for child in doc if child.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading_text = body.text
    has_leading_text = bool(leading_text and leading_text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % leading_text)
    fragments = []
    if has_leading_text:
        fragments.append(leading_text)
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments
481
def fragment_fromstring(html, create_parent=False, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    Extra keyword arguments are passed on to ``fragments_fromstring``.
    Raises ``etree.ParserError`` when no element, several elements, or
    trailing text is found.
    """
    if create_parent:
        if not isinstance(create_parent, basestring):
            # Any true non-string value selects the default wrapper tag.
            create_parent = 'div'
        return fragment_fromstring('<%s>%s</%s>' % (
            create_parent, html, create_parent), **kw)
    # Forward the keyword arguments; they were previously dropped here.
    elements = fragments_fromstring(html, no_leading_text=True, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Only whitespace can be left in the tail at this point; discard it.
    el.tail = None
    return el
510
def fromstring(html, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    Keyword arguments are passed on to ``document_fromstring``.  The result
    is the whole document when the input looks like one (or when a <head>
    was produced, or no <body> was), the single element when the body holds
    exactly one element and no other text, and otherwise the body itself
    retagged as a <div> or <span> container.
    """
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body>: there is nothing to unwrap, and the
        # checks below would fail on None, so return the document as is.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
565
def parse(filename, parser=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document.

    When no ``parser`` is given the module's ``html_parser`` is used.

    You may pass the keyword argument ``base_url='http://...'`` to set
    the base URL.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename, chosen_parser, **kw)
576
def _contains_block_level_tag(el):
    """
    Return True if any element yielded by ``el.getiterator()`` has a tag
    listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.getiterator():
        if node.tag in block_tags:
            return True
    return False
584
def _element_name(el):
    """
    Return a short name for ``el`` for use in error messages: 'comment'
    for comment nodes, 'string' for text, otherwise the element's tag.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
592 593 ################################################################################ 594 # form handling 595 ################################################################################ 596
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)
    def fields__set(self, value):
        # Assign every given key, then reset the fields that were not
        # mentioned to None.
        prev_keys = self.fields.keys()
        for key, item in value.iteritems():
            if key in prev_keys:
                prev_keys.remove(key)
            self.fields[key] = item
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            self.fields[key] = None

    fields = property(fields__get, fields__set, doc=fields__get.__doc__)

    def _name(self):
        """
        Return a label for this form for use in reprs: its ``name``
        attribute, else ``#`` plus its ``id``, else its position among
        the forms found under the document body.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        # Search all descendants of <body>; findall('form') only sees
        # direct children, so index() raised ValueError for nested forms.
        forms = list(self.body.getiterator('form'))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                # Unnamed inputs are never submitted.
                continue
            if el.tag == 'textarea':
                results.append((name, el.value))
            elif el.tag == 'select':
                value = el.value
                if el.multiple:
                    # One (name, value) pair per item of the multi-select value.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, el.value))
            else:
                assert el.tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, el.value))
        return results

    def action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL the action is returned joined
        with it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urlparse.urljoin(base_url, action)
        else:
            return action
    def action__set(self, value):
        self.set('action', value)
    def action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(action__get, action__set, action__del, doc=action__get.__doc__)

    def method__get(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()
    def method__set(self, value):
        self.set('method', value.upper())
    method = property(method__get, method__set, doc=method__get.__doc__)

# Use FormElement for every <form> tag by default.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  With the default opener this
    returns a file-like object, as from ``urllib.urlopen()``.  This
    object also has a ``.geturl()`` function, which shows the URL if
    there were any redirects.

    You can use this like::

        >>> form = doc.forms[0]
        >>> form.inputs['foo'].value = 'bar' # etc
        >>> response = submit_form(form)
        >>> doc = parse(response)
        >>> doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; it is appended to the values collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    payload = form.form_values()
    if extra_values:
        extras = extra_values
        if hasattr(extras, 'items'):
            # Mapping input: flatten to (name, value) pairs.
            extras = extras.items()
        payload.extend(extras)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    return opener(form.method, form.action, payload)
732
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form``, built on ``urllib``.

    For 'GET' the url-encoded values are appended to the URL's query
    string; for any other method they are passed to ``urllib.urlopen``
    as the request data.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    if method != 'GET':
        return urllib.urlopen(url, urllib.urlencode(values))
    # Extend an existing query string, or start a new one.
    if '?' in url:
        separator = '&'
    else:
        separator = '?'
    return urllib.urlopen(url + separator + urllib.urlencode(values), None)
746
class FieldsDict(DictMixin):
    """
    Dictionary-like view of a form's fields: items are read from and
    written to the ``.value`` of the inputs returned by the wrapped
    inputs accessor.  Keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, form_name)
767
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # select/input/textarea descendants with a given name, and all of them.
    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        form_name = self.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, form_name)

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        first = matches[0]
        if len(matches) > 1:
            # Several inputs share the name: radios and checkboxes are
            # returned as a group, decided by the first match's type.
            input_type = first.get('type')
            if input_type == 'radio':
                group = RadioGroup(matches)
                group.name = name
                return group
            if input_type == 'checkbox':
                group = CheckboxGroup(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if matches:
            return True
        return False

    def keys(self):
        # Distinct names of all inputs; inputs without a name are skipped.
        names = sets.Set(
            [el.name for el in self if el.name is not None])
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
832
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')
    def name__set(self, value):
        self.set('name', value)
    def name__del(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(name__get, name__set, name__del, doc=name__get.__doc__)

    def __repr__(self):
        # Only mention the type when the element has a non-empty one.
        input_type = getattr(self, 'type', None)
        if input_type:
            suffix = ' type=%r' % input_type
        else:
            suffix = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
860
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        text = self.text
        if text:
            return text
        return ''
    def value__set(self, value):
        self.text = value
    def value__del(self):
        # Deleting the value empties the textarea.
        self.text = ''
    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

# Use TextareaElement for every <textarea> tag by default.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' not in option.attrib:
                continue
            # FIXME: If value is None, what to return?, get_text()?
            return option.get('value')
        return None

    def value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        chosen = None
        if value is not None:
            # Find the option to select before touching the current state.
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements.
        """
        values = []
        for option in self.getiterator('option'):
            values.append(option.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib
    def multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']
    multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)

# Use SelectElement for every <select> tag by default.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options)

    def __iter__(self):
        # Only selected options are members of the set.  Yielding every
        # option made each one look selected, and ``remove`` (which rejects
        # unselected options) would then fail on the bogus members.
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose value is ``item``; raise ValueError
        if there is no such option.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``; raise ValueError
        if there is no such option or if it is not currently selected.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1012
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """

    def value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        checked = [radio for radio in self if 'checked' in radio.attrib]
        if checked:
            # The first checked radio determines the value.
            return checked[0].get('value')
        return None

    def value__set(self, value):
        target = None
        if value is not None:
            # Resolve the radio to check first, so that an unknown value
            # leaves the group untouched.
            candidates = [radio for radio in self
                          if radio.get('value') == value]
            if not candidates:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
            target = candidates[0]
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def value__del(self):
        # Deleting the value unchecks every radio.
        self.value = None

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for radio in self:
            options.append(radio.get('value'))
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        contents = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, contents)
1064
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)
    def value__set(self, value):
        # Validate before clearing, so that a rejected value does not wipe
        # out the current selection.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)
    def value__del(self):
        # Deleting the value unchecks every checkbox.
        self.value.clear()
    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1096
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the values of the checkboxes that are currently checked.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox carrying this value.
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox carrying this value; it must be checked.
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)
1139
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type (lower-cased, defaulting to
    ``'text'``), and ``.value`` reads or writes the value.

    Checkboxes and radio buttons report ``input.checkable == True``
    (every other type reports false) and expose a boolean
    ``.checked`` attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def value__get(self):
        """
        Get/set the value of this element, backed by the ``value``
        attribute.

        For a checkbox or radio: an unchecked element yields None, and
        a checked element without a (non-empty) value yields ``'on'``.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Only a string is stored as the value; any other true
            # value (e.g. ``True``) merely checks the element.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def value__del(self):
        if self.checkable:
            # Deleting the value of a checkable input just unchecks it.
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def type__get(self):
        """
        The type of this element, read from the ``type`` attribute
        (lower-cased; ``'text'`` when the attribute is absent).
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    def type__set(self, value):
        self.set('type', value)

    type = property(type__get, type__set, doc=type__get.__doc__)

    def checkable__get(self):
        """
        Boolean: is this a checkbox or radio input?
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable__get, doc=checkable__get.__doc__)

    def checked__get(self):
        """
        Boolean attribute reflecting the presence of the ``checked``
        attribute; assigning to it adds or removes that attribute.

        Only valid on checkable input types; AttributeError is raised
        otherwise.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(checked__get, checked__set, doc=checked__get.__doc__)

# Map the ``input`` tag name to InputElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def for_element__get(self):
        """
        Get/set the element this label points to.

        Reading returns None when the label has no (non-empty) ``for``
        attribute; otherwise the element is looked up by id through
        ``self.body.get_element_by_id``.  Assigning an element stores
        its ``id`` in this label's ``for`` attribute and raises
        TypeError if the element has no id.  Deleting removes the
        ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): behaviour for an id that matches no element is
        # whatever get_element_by_id does -- confirm whether it raises.
        return self.body.get_element_by_id(target_id)

    def for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def for_element__del(self):
        # The link lives in the ``for`` attribute (see the getter and
        # setter); the label's own ``id`` must be left untouched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(for_element__get, for_element__set, for_element__del,
                           doc=for_element__get.__doc__)

# Map the ``label`` tag name to LabelElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# (The name is bound to the compiled pattern's ``sub`` method, so it is
# called as ``__replace_meta_content_type(replacement, html)``.)
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None):
    """
    Return the HTML string representation of the given document.

    ``doc`` is serialized with ``etree.tostring(..., method="html")``;
    ``pretty_print`` and ``encoding`` are passed through unchanged.

    Unless ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag found in the
    serialized output is stripped from the result.  When it is true,
    the serializer's output is returned untouched.
    """
    assert doc is not None
    serialized = etree.tostring(doc, method="html", pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialized
    return __replace_meta_content_type('', serialized)
1281
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be anything with a ``write`` method (such as an element
    tree); any other object is wrapped in an ``etree.ElementTree``
    first.  The ``file://`` URL of the temporary file is printed and
    then handed to ``webbrowser.open``.  The temporary file is not
    deleted afterwards.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp creates the file itself with a unique name, avoiding the
    # race between picking a name and creating the file that makes
    # os.tempnam() unsafe.  Only the name is needed here, so the
    # descriptor is closed straight away.
    handle, fn = tempfile.mkstemp(suffix='.html')
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1298 1299 ################################################################################ 1300 # configure Element class lookup 1301 ################################################################################ 1302
class HTMLParser(etree.HTMLParser):
    """
    An ``etree.HTMLParser`` that installs an ``HtmlElementClassLookup``
    as its element class lookup on construction.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded to etree.HTMLParser.
        super(HTMLParser, self).__init__(**kwargs)
        class_lookup = HtmlElementClassLookup()
        self.setElementClassLookup(class_lookup)
1307
def Element(*args, **kw):
    """
    Create a new element through the module-level ``html_parser``.

    All positional and keyword arguments are forwarded unchanged to
    ``html_parser.makeelement``, whose result is returned.
    """
    return html_parser.makeelement(*args, **kw)

# Shared module-level parser instance; ``Element()`` creates elements
# through its ``makeelement`` method.
html_parser = HTMLParser()
