Package lxml :: Package html
[hide private]
[frames | no frames]

Source Code for Package lxml.html

   1  import threading 
   2  import re 
   3  import urlparse 
   4  import copy 
   5  from lxml import etree 
   6  from lxml.html import defs 
   7  from lxml import cssselect 
   8  from lxml.html.setmixin import SetMixin 
   9  try: 
  10      from UserDict import DictMixin 
  11  except ImportError: 
  12      # DictMixin was introduced in Python 2.4 
  13      from lxml.html._dictmixin import DictMixin 
  14  import sets 
  15   
  16  __all__ = [ 
  17      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  18      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  19      'find_rel_links', 'find_class', 'make_links_absolute', 
  20      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] 
  21   
  22  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") 
  23  #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 
  24  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  25  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  26  _collect_string_content = etree.XPath("string()") 
  27  _css_url_re = re.compile(r'url\((.*?)\)', re.I) 
  28  _css_import_re = re.compile(r'@import "(.*?)"') 
  29  _label_xpath = etree.XPath("//label[@for=$id]") 
  30  _archive_re = re.compile(r'[^ ]+') 
  31   
32 -class HtmlMixin(object):
33
def base_url(self):
    """
    The URL recorded in the docinfo of this element's root tree.

    Use with ``urlparse.urljoin(el.base_url, href)`` to get
    absolute URLs.
    """
    docinfo = self.getroottree().docinfo
    return docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    List of every ``form`` element yielded by ``getiterator('form')``.
    """
    found = self.getiterator('form')
    return list(found)
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    Raises IndexError when the ``//body`` query matches nothing.
    """
    return self.xpath('//body')[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    The first element matched by ``//head``.  Works from any element of
    the document, not only the root.
    """
    matches = self.xpath('//head')
    return matches[0]
head = property(head, doc=head.__doc__)
def label__get(self):
    """
    Get or set the <label> element whose ``for`` attribute names this
    element's id.  Reads as None when the element has no id or no
    label matches.
    """
    element_id = self.get('id')
    if not element_id:
        return None
    matches = _label_xpath(self, id=element_id)
    if matches:
        return matches[0]
    return None

def label__set(self, label):
    # A label is attached by pointing its ``for`` attribute at our id, so
    # an id is required and the other element must really be a <label>.
    element_id = self.get('id')
    if not element_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if label.tag != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', element_id)

def label__del(self):
    # Detach by removing the ``for`` attribute from the current label.
    current = self.label
    if current is not None:
        del current.attrib['for']

label = property(label__get, label__set, label__del, doc=label__get.__doc__)
def drop_tree(self):
    """
    Remove this element, with its children and text, from its parent.

    Any tail text is kept: it is appended to the previous sibling's
    tail or, when there is no previous sibling, to the parent's text.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
111
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('//b').drop_tag()
        >>> print tostring(h)
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()
    text = self.text
    # Text is only merged when the tag is a string (not a Comment, etc.).
    if text and isinstance(self.tag, basestring):
        if previous is not None:
            previous.tail = (previous.tail or '') + text
        else:
            parent.text = (parent.text or '') + text
    tail = self.tail
    if tail:
        if len(self):
            # The tail follows our last child once the children move up.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + tail
        elif previous is None:
            parent.text = (parent.text or '') + tail
        else:
            previous.tail = (previous.tail or '') + tail
    position = parent.index(self)
    # Splice our children into the slot this element occupied.
    parent[position:position + 1] = self[:]
143 151
def find_class(self, class_name):
    """
    Return the result of the class-name XPath query for ``class_name``,
    evaluated on this element.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
157
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        return _id_xpath(self, id=id)[0]
    except IndexError:
        if default:
            return default[0]
        # Call form of raise: valid on Python 2 and on Python 3.
        raise KeyError(id)
178
def text_content(self):
    """
    Return the XPath ``string()`` value of this element, i.e. its text
    together with the text of its descendants.
    """
    content = _collect_string_content(self)
    return content
184
def cssselect(self, expr):
    """
    Evaluate the CSS selector ``expr`` against this element and return
    whatever the compiled selector returns.

    A new ``lxml.cssselect.CSSSelect`` is compiled on every call, so
    pre-compiling the expression yourself can provide a substantial
    speedup.
    """
    selector = cssselect.CSSSelect(expr)
    return selector(self)
195 196 ######################################## 197 ## Link functions 198 ######################################## 199 219 self.rewrite_links(link_repl)
220
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to its links and remove the tag.

    Every ``<base>`` element with an href is dropped; the href of the
    last one is passed to ``make_links_absolute``.  Nothing else happens
    when no non-empty href is found.
    """
    base_href = None
    for base_tag in self.xpath('//base[@href]'):
        base_href = base_tag.get('href')
        base_tag.drop_tree()
    if base_href:
        self.make_links_absolute(base_href, resolve_base_href=False)
235 290 337 338
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # name: name of the method to look up on the document/element.
        # copy: default for whether element inputs are deep-copied first.
        # source_class: class whose method supplies this object's docstring.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        A string ``doc`` is parsed with ``fromstring`` (the ``copy``
        keyword is rejected with TypeError in that case).  An element
        ``doc`` is deep-copied first when ``copy`` (keyword, defaulting
        to the value given at construction) is true.

        Returns the method's result; when that is None, returns the
        serialized document for string input or the (possibly copied)
        element for element input.
        """
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be called ``copy``: that would shadow the
            # ``copy`` module and make ``copy.deepcopy`` fail.
            make_a_copy = kw.pop('copy', self.copy)
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result

# Function forms of the HtmlMixin methods of the same name.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment class combining ``etree.CommentBase`` with ``HtmlMixin``."""
387
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class combining ``etree.ElementBase`` with ``HtmlMixin``."""
390
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class combining ``etree.PIBase`` with ``HtmlMixin``."""
393
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity class combining ``etree.EntityBase`` with ``HtmlMixin``."""
396 397
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may also be given as an iterable of ``(tag name, mixin)``
    pairs.
    """
    # Tag name -> Element class; filled in by the element classes that
    # register themselves further down in this module.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, which
            # cannot be unpacked into (name, value); use its items.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # '*' applies to every tag currently in ``classes``.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first in the bases, ahead of the element class.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type.

        Elements are looked up by lower-cased tag name, defaulting to
        ``HtmlElement``.  Returns None for unrecognized node types.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
438 439 ################################################################################ 440 # parsing 441 ################################################################################ 442
def document_fromstring(html, **kw):
    """
    Parse ``html`` with ``etree.HTML`` and the module's ``html_parser``.

    Extra keyword arguments are passed through to ``etree.HTML``.
    Raises ``etree.ParserError`` when the parse result is None.
    """
    root = etree.HTML(html, html_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
449
def fragments_fromstring(html, no_leading_text=False, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (the body's leading
    text, when it is not just whitespace).  If no_leading_text is true,
    such leading text raises ``etree.ParserError`` instead, so the list
    holds only elements.
    """
    # FIXME: check what happens when you give html with a body, head, etc.
    prefix = html[:20].lstrip().lower()
    looks_like_document = (prefix.startswith('<html')
                           or prefix.startswith('<!doctype'))
    if not looks_like_document:
        # Wrap the fragment so the parser sees a full document.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, **kw)
    assert doc.tag == 'html'
    bodies = [child for child in doc if child.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    has_leading_text = bool(body.text and body.text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    elements = []
    if has_leading_text:
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
478
def fragment_fromstring(html, create_parent=False, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    Extra keyword arguments are passed on to ``fragments_fromstring``.
    Raises ``etree.ParserError`` when there are no elements, several
    elements, or non-whitespace text after the element.
    """
    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        return fragment_fromstring('<%s>%s</%s>' % (
            create_parent, html, create_parent), **kw)
    # Forward **kw; it used to be dropped here, so parser options given
    # by the caller never reached the parse.
    elements = fragments_fromstring(html, no_leading_text=True, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Only whitespace can remain in the tail at this point; discard it.
    el.tail = None
    return el
507
def fromstring(html, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    Returns the whole document when the input looks like a full document,
    when the parsed tree has a <head>, or when it has no <body>.  Returns
    the single child of <body> when there is exactly one and no
    surrounding text.  Otherwise returns the <body> itself, renamed to
    ``div`` or ``span``.
    """
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body>: there is nothing to unwrap, and
        # len(None) below would raise TypeError.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
562
def parse(filename, parser=None, **kw):
    """
    Parse ``filename`` with ``etree.parse``, using the module's
    ``html_parser`` unless another ``parser`` is given.

    The source may be a filename, URL, or file-like object.  You may
    pass the keyword argument ``base_url='http://...'`` to set the base
    URL; all keyword arguments go to ``etree.parse``.
    """
    if parser is not None:
        return etree.parse(filename, parser, **kw)
    return etree.parse(filename, html_parser, **kw)
573
def _contains_block_level_tag(el):
    """
    Return True if any element yielded by ``el.getiterator()`` has a
    tag listed in ``defs.block_tags``, False otherwise.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.getiterator():
        if node.tag in block_tags:
            return True
    return False
581
def _element_name(el):
    """
    Short name for ``el`` for use in error messages: ``'comment'`` for
    comments, ``'string'`` for strings, otherwise the element's tag.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
589 590 ################################################################################ 591 # form handling 592 ################################################################################ 593
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)

    def fields__set(self, value):
        # Assign every given field, then reset to None each named field
        # that the new mapping did not mention.
        prev_keys = self.fields.keys()
        for key, field_value in value.iteritems():
            if key in prev_keys:
                prev_keys.remove(key)
            self.fields[key] = field_value
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            self.fields[key] = None

    fields = property(fields__get, fields__set, doc=fields__get.__doc__)

    def _name(self):
        """
        Label for this form used in reprs: its ``name``, else ``#`` plus
        its ``id``, else its index among the forms under <body>.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        # Search all descendants of <body>: findall('form') only sees its
        # direct children, so index() raised ValueError for nested forms.
        forms = list(self.body.getiterator('form'))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Skipped: inputs without a name, unchecked checkable inputs,
        inputs of type submit/image/reset, and values that are None.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            if el.tag == 'textarea':
                results.append((name, el.value))
            elif el.tag == 'select':
                value = el.value
                if el.multiple:
                    # One (name, value) pair per value of the multi-select.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert el.tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    def action__get(self):
        """
        Get/set the form's ``action`` attribute.

        On reading, the action is joined with ``base_url`` when both a
        base URL and an action are present.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urlparse.urljoin(base_url, action)
        else:
            return action

    def action__set(self, value):
        self.set('action', value)

    def action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    action = property(action__get, action__set, action__del, doc=action__get.__doc__)

    def method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def method__set(self, value):
        self.set('method', value.upper())

    method = property(method__get, method__set, doc=method__get.__doc__)

HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever ``open_http``
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        >>> form = doc.forms[0]
        >>> form.inputs['foo'].value = 'bar' # etc
        >>> response = submit_form(form)
        >>> doc = parse(response)
        >>> doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended after the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (upper-cased, ``'GET'`` by default),
    the URL is the form's action as a string, and the values are a
    sequence of ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    return open_http(form.method, form.action, values)
729
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form``, built on ``urllib``.

    For ``'GET'`` the url-encoded values are appended to the URL as a
    query string; for any other method they are sent as the request
    data.  Returns the result of ``urllib.urlopen``.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    if method != 'GET':
        return urllib.urlopen(url, urllib.urlencode(values))
    # Extend an existing query string, or start a new one.
    if '?' in url:
        separator = '&'
    else:
        separator = '?'
    return urllib.urlopen(url + separator + urllib.urlencode(values), None)
743
class FieldsDict(DictMixin):
    """
    Dictionary-like view of a form's fields, keyed by input name.

    Reading and writing an item reads and writes the ``.value`` of the
    matching entry of the wrapped inputs accessor.  Items cannot be
    deleted.
    """

    def __init__(self, inputs):
        # inputs: accessor supporting item lookup by name, ``keys()``,
        # ``in`` tests and a ``.form`` attribute.
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # The message used to name a nonexistent "ElementDict" class.
        raise KeyError(
            "You cannot remove keys from FieldsDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
764
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If several checkboxes share a name,
    they are returned together as a `CheckboxGroup`; several radio
    inputs sharing a name come back as a `RadioGroup`.  In every other
    case the first matching element is returned.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # select/input/textarea descendants whose name attribute equals $name.
    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    # Every select/input/textarea descendant.
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides how several matches group.
        input_type = matches[0].get('type')
        if len(matches) > 1:
            group = None
            if input_type == 'radio':
                group = RadioGroup(matches)
            elif input_type == 'checkbox':
                group = CheckboxGroup(matches)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        return bool(matches)

    def keys(self):
        # Distinct names of all inputs; the order is that of the set.
        names = sets.Set([el.name for el in self])
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)
828
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def name__set(self, value):
        self.set('name', value)

    def name__del(self):
        # Deleting an absent name is a no-op.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(name__get, name__set, name__del, doc=name__get.__doc__)

    def __repr__(self):
        # Not every class using this mix-in has a ``type`` attribute.
        input_type = getattr(self, 'type', None)
        type_part = ''
        if input_type:
            type_part = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
856
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        text = self.text
        if text:
            return text
        return ''

    def value__set(self, value):
        self.text = value

    def value__del(self):
        # Deleting the value empties the element text.
        self.text = ''

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.  Otherwise it is the
        ``value`` attribute of the first option carrying ``selected``,
        or None when no option does.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        # Find the option to select before touching any state, so an
        # unknown value leaves the current selection alone.
        target = None
        if value is not None:
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    def value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        options = self.getiterator('option')
        return [option.get('value') for option in options]
    value_options = property(value_options, doc=value_options.__doc__)

    def multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)

HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        # select: the <select> element whose options are represented.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options)

    def __iter__(self):
        # Only options carrying ``selected`` are members of the set;
        # yielding every option made all of them look selected.
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose value is ``item``.

        Raises ValueError when no option has that value.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``.

        Raises ValueError when that option is not selected or when no
        option has that value.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1008
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """

    def value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reads as the value of the first checked radio, or None when
        none is checked.  Setting None unchecks every radio.
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def value__set(self, value):
        # Locate the radio to check first, so that an unknown value
        # raises before any radio is unchecked.
        target = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def value__del(self):
        self.value = None

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, items)
1060
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def value__set(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not wipe the boxes that are currently checked.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def value__del(self):
        self.value.clear()

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1092
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # group: the checkboxes this object reads and modifies.
        self.group = group

    def __iter__(self):
        # Snapshot the values of the checked boxes into a list first.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose value is ``value``; raises
        KeyError when there is none.
        """
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose value is ``value``; raises
        KeyError when it is already unchecked or does not exist.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
1135
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reads as ``'on'``;
        an unchecked one reads as None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    def value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs a false value unchecks; a true value
        # checks and, if it is a string, also becomes the value.
        if value:
            self.checked = True
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def type__set(self, value):
        self.set('type', value)

    type = property(type__get, type__set, doc=type__get.__doc__)

    def checkable__get(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ['checkbox', 'radio']
    checkable = property(checkable__get, doc=checkable__get.__doc__)

    def checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; other types
        raise AttributeError.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    def checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(checked__get, checked__set, doc=checked__get.__doc__)

HtmlElementClassLookup._default_element_classes['input'] = InputElement
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): whether a missing id yields None here depends on
        # what get_element_by_id() does when nothing matches; that method
        # is not visible from this block -- confirm.
        return self.body.get_element_by_id(target_id)

    def for_element__set(self, other):
        """
        Point this label at `other` by copying its ``id`` into this
        label's ``for`` attribute.

        Raises TypeError if `other` has no (or an empty) ``id``.
        """
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def for_element__del(self):
        """
        Unlink this label by removing its ``for`` attribute, if present.
        """
        # The link is stored in ``for`` (see the getter and setter), so
        # that is the attribute to drop -- not the label's own ``id``.
        attributes = self.attrib
        if 'for' in attributes:
            del attributes['for']

    for_element = property(for_element__get, for_element__set, for_element__del,
                           doc=for_element__get.__doc__)

# Use LabelElement for <label> tags.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# (Bound ``sub`` method of the compiled pattern; tostring() calls it
# with an empty replacement to strip the tag.)
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None):
    """
    Return the HTML string representation of the given document.

    The document is serialized with ``etree.tostring(...,
    method="html")``, passing `pretty_print` and `encoding` through.

    Unless `include_meta_content_type` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag found in the
    serialized output is removed; when it is true the output is
    returned exactly as serialized.
    """
    assert doc is not None
    serialized = etree.tostring(doc, method="html",
                                pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialized
    return __replace_meta_content_type('', serialized)
1277
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    `doc` may be anything with a ``write`` method taking a filename
    and ``method="html"``; otherwise it is wrapped in an
    ``etree.ElementTree`` first.  The temporary file is not deleted
    afterwards.  The ``file://`` URL is printed before the browser is
    launched.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp() creates the file itself under a unique name, which
    # avoids the race inherent in os.tempnam() (a name that somebody
    # else may create before we do).  Only the name is needed here, so
    # the descriptor is closed right away and the document is written
    # by filename as before.
    handle, fn = tempfile.mkstemp(suffix='.html')
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1294 1295 ################################################################################ 1296 # configure Element class lookup 1297 ################################################################################ 1298
class HTMLParser(etree.HTMLParser):
    """
    An ``etree.HTMLParser`` that installs an ``HtmlElementClassLookup``
    as its element class lookup.
    """

    def __init__(self, **kwargs):
        # All keyword arguments go to etree.HTMLParser unchanged.
        super(HTMLParser, self).__init__(**kwargs)
        class_lookup = HtmlElementClassLookup()
        self.setElementClassLookup(class_lookup)
1303
def Element(*args, **kw):
    """
    Create a new element through the module-level ``html_parser``.

    All arguments are passed straight to ``html_parser.makeelement``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instance; Element() creates elements through its
# makeelement() method.
html_parser = HTMLParser()
