Package lxml :: Package html
[hide private]
[frames] | no frames]

Source Code for Package lxml.html

   1  import threading 
   2  import re 
   3  import urlparse 
   4  import copy 
   5  from lxml import etree 
   6  from lxml.html import defs 
   7  from lxml import cssselect 
   8  from lxml.html.setmixin import SetMixin 
   9  try: 
  10      from UserDict import DictMixin 
  11  except ImportError: 
  12      # DictMixin was introduced in Python 2.4 
  13      from lxml.html._dictmixin import DictMixin 
  14  import sets 
  15   
  16  __all__ = [ 
  17      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  18      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  19      'find_rel_links', 'find_class', 'make_links_absolute', 
  20      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] 
  21   
  22  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") 
  23  #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 
  24  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  25  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  26  _collect_string_content = etree.XPath("string()") 
  27  _css_url_re = re.compile(r'url\((.*?)\)', re.I) 
  28  _css_import_re = re.compile(r'@import "(.*?)"') 
  29  _label_xpath = etree.XPath("//label[@for=$id]") 
  30   
31 -class HtmlMixin(object):
32
def base_url(self):
    """
    The URL stored in the ``docinfo`` of this element's root tree.

    Combine it with ``urlparse.urljoin(el.base_url, href)`` to turn a
    relative link into an absolute URL.
    """
    document = self.getroottree()
    return document.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    Return a list of every ``form`` element produced by
    ``self.getiterator('form')``.
    """
    return [form for form in self.getiterator('form')]
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the first element matched by the XPath ``//body``.  Can be
    called from a child element to get the document's body.

    Raises IndexError when the XPath matches nothing.
    """
    matches = self.xpath('//body')
    return matches[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    Return the first element matched by the XPath ``//head``.  Can be
    called from a child element to get the document's head.

    Raises IndexError when the XPath matches nothing.
    """
    matches = self.xpath('//head')
    return matches[0]
head = property(head, doc=head.__doc__)
def label__get(self):
    """
    Get or set any <label> element associated with this element.

    Reading returns the first label whose ``for`` attribute equals this
    element's ``id``, or None when the element has no id or no label
    matches.  Assigning requires this element to have an id and the
    assigned element to be a ``label``; deleting removes the ``for``
    attribute from the current label, if there is one.
    """
    element_id = self.get('id')
    if element_id:
        matches = _label_xpath(self, id=element_id)
        if matches:
            return matches[0]
    return None

def label__set(self, label):
    element_id = self.get('id')
    if not element_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if label.tag != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    # The association is stored on the label, not on this element.
    label.set('for', element_id)

def label__del(self):
    current = self.label
    if current is not None:
        del current.attrib['for']
label = property(label__get, label__set, label__del, doc=label__get.__doc__)
def drop_tree(self):
    """
    Remove this element, together with its children and text, from its
    parent.  Any tail text is kept: it is appended to the tail of the
    previous sibling, or to the parent's text when there is no previous
    sibling.

    The element must have a parent (checked with ``assert``).
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is None:
            parent.text = (parent.text or '') + tail
        else:
            sibling.tail = (sibling.tail or '') + tail
    parent.remove(self)
110
def drop_tag(self):
    """
    Remove the tag, but keep its children and text, which are merged
    into the parent at the position the tag occupied.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print tostring(h)
        <div>Hello World!</div>

    The element must have a parent (checked with ``assert``).
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()

    def append_before(chunk):
        # Text that ends up in front of the children belongs to the tail
        # of the previous sibling, or to the parent's text if there is none.
        if previous is None:
            parent.text = (parent.text or '') + chunk
        else:
            previous.tail = (previous.tail or '') + chunk

    if self.text and isinstance(self.tag, basestring):
        # not a Comment, etc.
        append_before(self.text)
    if self.tail:
        if len(self):
            # With children, the tail follows the last child.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + self.tail
        else:
            append_before(self.tail)
    position = parent.index(self)
    parent[position:position+1] = self[:]
142 150
def find_class(self, class_name):
    """
    Return the result of evaluating the module's class-name XPath on
    this element, i.e. this element and its descendants whose ``class``
    attribute contains ``class_name`` as a whitespace-separated word.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
156
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        return _id_xpath(self, id=id)[0]
    except IndexError:
        if default:
            return default[0]
        # Call form of raise: same behaviour as ``raise KeyError, id``
        # but not tied to the Python 2-only statement syntax.
        raise KeyError(id)
177
def text_content(self):
    """
    Return the XPath ``string()`` value of this element: its own text
    together with the text of its children.
    """
    content = _collect_string_content(self)
    return content
183
def cssselect(self, expr):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
    that pre-compiling the expression can provide a substantial
    speedup.
    """
    selector = cssselect.CSSSelect(expr)
    return selector(self)
194 195 ######################################## 196 ## Link functions 197 ######################################## 198 218 self.rewrite_links(link_repl)
219
def resolve_base_href(self):
    """
    Find any ``<base href>`` tag in the document, and apply its
    values to all links found in the document.  Also remove the
    tag once it has been applied.

    Every matching tag is removed; the ``href`` of the last one is the
    value passed to ``make_links_absolute``.  Nothing is rewritten when
    no non-empty ``href`` is found.
    """
    base_href = None
    for base_tag in self.xpath('//base[@href]'):
        base_href = base_tag.get('href')
        base_tag.drop_tree()
    if base_href:
        # resolve_base_href=False: the base tags have already been handled.
        self.make_links_absolute(base_href, resolve_base_href=False)
234 259 306 307
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        """
        ``name`` is the method to call on the element, ``copy`` the
        default for the per-call ``copy`` keyword, and ``source_class``
        the class whose method docstring is reused for this object.
        """
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        A string ``doc`` is parsed with ``fromstring`` first, and passing
        ``copy`` alongside it raises TypeError.  An element ``doc`` is
        deep-copied first when ``copy`` (or the instance default) is true.
        When the method returns None, the document is returned instead,
        serialized with ``tostring`` if the input was a string.
        """
        return_string = isinstance(doc, basestring)
        if return_string:
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be called ``copy``: that name would hide
            # the ``copy`` module needed for deepcopy() below, and it would
            # be unbound on the string-input path.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        if return_string:
            return tostring(doc)
        return doc


find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the ``HtmlMixin`` helpers."""
356
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the ``HtmlMixin`` helpers."""
359
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class that also carries the ``HtmlMixin`` helpers."""
362
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the ``HtmlMixin`` helpers."""
365 366
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as a sequence of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> element class; filled in by the element classes below.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, so turn
            # it into (name, mixin) pairs; sequences of pairs pass through.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class for a node: the class registered for the
        lower-cased tag name (``HtmlElement`` by default) for elements,
        the matching Html* class for comments, processing instructions
        and entities, and None for any other node type.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
# Module-level parser used by the parsing helpers below.
html_parser = etree.HTMLParser()


def document_fromstring(html, **kw):
    """
    Parse ``html`` with ``etree.HTML`` and the module's ``html_parser``,
    passing any keyword arguments through, and return the result.

    Raises etree.ParserError when the parser returns None.
    """
    document = etree.HTML(html, html_parser, **kw)
    if document is not None:
        return document
    raise etree.ParserError(
        "Document is empty")
417
def fragments_fromstring(html, no_leading_text=False, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    Extra keyword arguments are passed on to ``document_fromstring``.
    Raises etree.ParserError for forbidden leading text.
    """
    # FIXME: check what happens when you give html with a body, head, etc.
    # Strip before slicing so that leading whitespace of any length
    # cannot hide the <html>/<!doctype> marker.
    start = html.lstrip()[:20].lower()
    if not start.startswith('<html') and not start.startswith('<!doctype'):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, **kw)
    assert doc.tag == 'html'
    bodies = [e for e in doc if e.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading_text = body.text
    has_leading_text = bool(leading_text and leading_text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % leading_text)
    elements = []
    if has_leading_text:
        elements.append(leading_text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
446
def fragment_fromstring(html, create_parent=False, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    Extra keyword arguments are passed on to ``fragments_fromstring``.
    Raises etree.ParserError when no element, several elements, or
    trailing text is found.
    """
    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        return fragment_fromstring('<%s>%s</%s>' % (
            create_parent, html, create_parent), **kw)
    # Forward **kw: it used to be dropped here, so parser options given
    # by the caller never reached the parser.
    elements = fragments_fromstring(html, no_leading_text=True, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Whatever tail is left is whitespace only.
    el.tail = None
    return el
475
def fromstring(html, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The whole document is returned when the input starts with ``<html``
    or ``<!doctype``, when the parsed result has a ``<head>``, or when it
    has no ``<body>``.  A body with exactly one element and no other
    non-whitespace text yields that element.  Otherwise the body itself
    is returned, renamed to ``div`` or ``span``.
    """
    # Strip before slicing so that leading whitespace of any length
    # cannot hide the <html>/<!doctype> marker.
    start = html.lstrip()[:10].lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        # Somehow there may be multiple bodies, which is bad, but just
        # smash them into one body
        for other_body in bodies[1:]:
            if other_body.text:
                if len(body):
                    body[-1].tail = (body[-1].tail or '') + other_body.text
                else:
                    body.text = (body.text or '') + other_body.text
            body.extend(other_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc
    if body is None:
        # No body to unwrap: hand back the document instead of failing
        # on len(None) below.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
530
def parse(filename, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document,
    using ``etree.parse`` with the module's ``html_parser``.

    You may pass the keyword argument ``base_url='http://...'`` to set
    the base URL.
    """
    document = etree.parse(filename, html_parser, **kw)
    return document
539
def _contains_block_level_tag(el):
    """
    Return True if any element yielded by ``el.getiterator()`` has a tag
    listed in ``defs.block_tags``, and False otherwise.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for candidate in el.getiterator():
        if candidate.tag in block_tags:
            return True
    return False
547
def _element_name(el):
    """
    Name an item for error messages: ``'comment'`` for comment nodes,
    ``'string'`` for strings, and the tag for anything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
555
def Element(*args, **kw):
    """
    Create an element with ``html_parser.makeelement``, forwarding all
    positional and keyword arguments.
    """
    return html_parser.makeelement(*args, **kw)
559
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.

        Assigning a mapping sets every field it names and sets all
        other named fields to None.
        """
        return FieldsDict(self.inputs)

    def fields__set(self, value):
        fields = self.fields
        remaining = fields.keys()
        # A separate loop variable keeps the ``value`` parameter intact.
        for key, field_value in value.iteritems():
            if key in remaining:
                remaining.remove(key)
            fields[key] = field_value
        for key in remaining:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(fields__get, fields__set, doc=fields__get.__doc__)

    def _name(self):
        """
        Return a label for this form: its ``name``, else ``#`` plus its
        ``id``, else its position among the forms found in the body.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        # Search all descendants of the body: findall('form') only sees
        # direct children and raised ValueError for a nested form.
        forms = list(self.body.getiterator('form'))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Inputs without a name, unchecked checkable inputs, inputs of
        type submit/image/reset and values that are None are left out.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            if el.tag == 'textarea':
                results.append((name, el.value))
            elif el.tag == 'select':
                value = el.value
                if el.multiple:
                    # One (name, value) pair per value in the set.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert el.tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    def action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When both a base URL and an action are present, the action is
        returned joined onto the base URL.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urlparse.urljoin(base_url, action)
        else:
            return action

    def action__set(self, value):
        self.set('action', value)

    def action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(action__get, action__set, action__del, doc=action__get.__doc__)

    def method__get(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def method__set(self, value):
        self.set('method', value.upper())
    method = property(method__get, method__set, doc=method__get.__doc__)


HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        >>> form = doc.forms[0]
        >>> form.inputs['foo'].value = 'bar' # etc
        >>> response = submit_form(form)
        >>> doc = parse(response)
        >>> doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    pairs; its entries are appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    pairs = form.form_values()
    if extra_values:
        extra_pairs = extra_values
        if hasattr(extra_pairs, 'items'):
            extra_pairs = extra_pairs.items()
        pairs.extend(extra_pairs)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    return opener(form.method, form.action, pairs)
695
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form``, built on ``urllib``.

    For ``'GET'`` the url-encoded values are appended to the URL, after
    ``&`` if it already contains ``?`` and after ``?`` otherwise.  For
    any other method they are passed to ``urllib.urlopen`` as the data.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    if method == 'GET':
        if '?' in url:
            separator = '&'
        else:
            separator = '?'
        url = url + separator + urllib.urlencode(values)
        data = None
    else:
        data = urllib.urlencode(values)
    return urllib.urlopen(url, data)
709
class FieldsDict(DictMixin):
    """
    Dictionary-like view over the values of a form's inputs.

    Item access reads and writes the ``value`` of the entry returned by
    the wrapped ``inputs`` accessor; keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # The message used to refer to a nonexistent "ElementDict" class.
        raise KeyError(
            "You cannot remove keys from FieldsDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
730
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the input named ``name``; raise KeyError if there is none.

        Several matches whose first element is a radio or a checkbox are
        returned as a `RadioGroup` or `CheckboxGroup` carrying ``name``.
        In every other case only the first match is returned.
        """
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        first = matches[0]
        if len(matches) > 1:
            input_type = first.get('type')
            group_class = None
            if input_type == 'radio':
                group_class = RadioGroup
            elif input_type == 'checkbox':
                group_class = CheckboxGroup
            if group_class is not None:
                group = group_class(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if matches:
            return True
        return False

    def keys(self):
        """
        Return the distinct ``name`` values of all inputs, as a list.
        """
        names = sets.Set([el.name for el in self])
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
794
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def name__set(self, value):
        self.set('name', value)

    def name__del(self):
        # Deleting an absent name is a no-op.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(name__get, name__set, name__del, doc=name__get.__doc__)

    def __repr__(self):
        # Only classes that define a truthy ``type`` get the type suffix.
        input_type = getattr(self, 'type', None)
        suffix = ''
        if input_type:
            suffix = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
822
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def value__get(self):
        """
        Get/set the value (which is the contents of this element)

        Reading returns ``''`` when the element has no text; deleting
        resets the text to ``''``.
        """
        text = self.text
        if text:
            return text
        return ''

    def value__set(self, value):
        self.text = value

    def value__del(self):
        self.text = ''
    value = property(value__get, value__set, value__del, doc=value__get.__doc__)


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select, reading returns the ``value`` attribute of
        the first option carrying ``selected``, or None if there is none.
        Setting an unknown value raises ValueError; setting None only
        unselects.  A multi-select rejects a plain string with TypeError.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        chosen = None
        if value is not None:
            # Find the target first so that an unknown value leaves the
            # current selection untouched.
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        values = []
        for option in self.getiterator('option'):
            values.append(option.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']
    multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options)

    def __iter__(self):
        # Only selected options are members of the set.  Yielding every
        # option made iteration report unselected values as members.
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose value is ``item``; raise ValueError
        if no option has that value.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``; raise
        ValueError if that option is not selected or does not exist.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
974
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """

    def value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reading returns the value of the first checked radio, or None.
        Setting None (or deleting) unchecks everything; setting a value
        no radio has raises ValueError.
        """
        checked = [el for el in self if 'checked' in el.attrib]
        if checked:
            return checked[0].get('value')
        return None

    def value__set(self, value):
        target = None
        if value is not None:
            # Find the target first so that an unknown value leaves the
            # current state untouched.
            for el in self:
                if el.get('value') == value:
                    target = el
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def value__del(self):
        self.value = None

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for el in self:
            options.append(el.get('value'))
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, list.__repr__(self))
1026
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning replaces the checked set with the given values and
        raises ValueError for an object without ``__iter__``.
        """
        return CheckboxValues(self)

    def value__set(self, value):
        # Validate before clearing, so a rejected assignment no longer
        # unchecks every checkbox as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def value__del(self):
        self.value.clear()
    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1058
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        checked_values = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked_values.append(el.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose value is ``value``; raise
        KeyError if there is no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose value is ``value``; raise
        KeyError if it is already unchecked or does not exist.
        """
        for el in self.group:
            if el.get('value') != value:
                continue
            if 'checked' not in el.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del el.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)
1101
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The input type is available as ``.type``; it is lower-cased and
    falls back to ``'text'`` when no ``type`` attribute is present.

    The value can be read, assigned and deleted through ``.value``.

    Checkboxes and radios report ``input.checkable == True`` (every
    other type reports false) and additionally expose the boolean
    attribute ``.checked``.
    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        For a checked checkbox or radio without a value this yields
        ``'on'``; for an unchecked checkbox or radio it yields None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            # A false value on a checkable input just unchecks it.
            self.checked = False
            return
        self.checked = True
        # Only string values are stored; other true values (e.g. True)
        # check the input without touching its ``value`` attribute.
        if isinstance(value, basestring):
            self.set('value', value)

    def value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(value__get, value__set, value__del,
                     doc=value__get.__doc__)

    def type__get(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased, with ``'text'`` as the default.
        """
        declared_type = self.get('type', 'text')
        return declared_type.lower()

    def type__set(self, value):
        self.set('type', value)

    type = property(type__get, type__set, doc=type__get.__doc__)

    def checkable__get(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable__get, doc=checkable__get.__doc__)

    def checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        Only valid on checkable input types; anything else raises
        ``AttributeError``.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(checked__get, checked__set, doc=checked__get.__doc__)

# Register InputElement under the 'input' key of the lookup's table of
# default element classes.
# NOTE(review): presumably this makes parsed <input> tags come back as
# InputElement instances -- confirm against HtmlElementClassLookup.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def for_element__get(self):
        """
        Get/set/delete the element this label points to.

        Getting returns None when the label has no (or an empty)
        ``for`` attribute; otherwise the lookup is delegated to
        ``self.body.get_element_by_id``.
        NOTE(review): what happens for an id that matches no element
        depends on ``get_element_by_id`` -- confirm it returns None
        rather than raising.

        Setting copies the ``id`` of the given element into this
        label's ``for`` attribute and raises ``TypeError`` if that
        element has no ``id``.

        Deleting removes this label's ``for`` attribute, if present.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    def for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def for_element__del(self):
        # The association lives in the ``for`` attribute (see the getter
        # and setter); the label's own ``id`` must be left untouched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(for_element__get, for_element__set,
                           for_element__del,
                           doc=for_element__get.__doc__)

# Register LabelElement under the 'label' key of the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# This isn't a general match, but it's a match for what libxml2
# specifically serialises.
#
# The name is bound to the compiled pattern's ``sub`` method, so it is
# called as ``__replace_meta_content_type(replacement, text)`` and
# substitutes every non-overlapping match of the pattern in ``text``.
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub

def tostring(doc, pretty_print=False, include_meta_content_type=False):
    """
    Return the HTML string representation of the given document.

    The document is serialised with ``etree.tostring(..., method="html")``;
    ``pretty_print`` is passed straight through to that call.

    Unless ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag found in the serialised
    output is removed before it is returned.  Note that this also drops
    such a tag if it was already present in the document.

    ``doc`` must not be None.
    """
    assert doc is not None
    serialized = etree.tostring(doc, method="html", pretty_print=pretty_print)
    if include_meta_content_type:
        return serialized
    return __replace_meta_content_type('', serialized)
1240
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be any object with a ``write(filename, method=...)``
    method; anything without a ``write`` attribute is wrapped in an
    ``etree.ElementTree`` first.

    The ``file://`` URL of the temporary file is printed before the
    browser is launched.  The temporary file is not deleted afterwards.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # os.tempnam() merely returns a name, leaving a window in which
    # another process can create that file first.  mkstemp() creates the
    # file itself, so the name is guaranteed to be ours.
    handle, fn = tempfile.mkstemp(suffix='.html')
    # Only the reserved name is needed; the document is written by path.
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    # Parenthesised single argument: prints identically under Python 2.
    print(url)
    webbrowser.open(url)

################################################################################
# configure Element class lookup

# Install a fresh HtmlElementClassLookup instance on the module-level
# html_parser.  This runs after the registrations above have filled in
# the lookup's table of default element classes.
# NOTE(review): presumably this is what makes elements produced by
# html_parser use the registered classes -- confirm.
html_parser.setElementClassLookup(HtmlElementClassLookup())
