Package lxml :: Package html
[frames | no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  import urlparse 
   7  import copy 
   8  from lxml import etree 
   9  from lxml.html import defs 
  10  from lxml import cssselect 
  11  from lxml.html._setmixin import SetMixin 
  12  try: 
  13      from UserDict import DictMixin 
  14  except ImportError: 
  15      # DictMixin was introduced in Python 2.4 
  16      from lxml.html._dictmixin import DictMixin 
  17  try: 
  18      set 
  19  except NameError: 
  20      from sets import Set as set 
  21   
  22  __all__ = [ 
  23      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  24      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  25      'find_rel_links', 'find_class', 'make_links_absolute', 
  26      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] 
  27   
  28  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") 
  29  #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 
  30  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  31  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  32  _collect_string_content = etree.XPath("string()") 
  33  _css_url_re = re.compile(r'url\((.*?)\)', re.I) 
  34  _css_import_re = re.compile(r'@import "(.*?)"') 
  35  _label_xpath = etree.XPath("//label[@for=$id]") 
  36  _archive_re = re.compile(r'[^ ]+') 
  37   
38 -class HtmlMixin(object):
39
def base_url(self):
    """
    The base URL recorded for this document (the root tree's
    ``docinfo.URL``).

    Use with ``urlparse.urljoin(el.base_url, href)`` to get
    absolute URLs.
    """
    docinfo = self.getroottree().docinfo
    return docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    List of every ``<form>`` element yielded by
    ``self.getiterator('form')``.
    """
    form_iter = self.getiterator('form')
    return list(form_iter)
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    Raises IndexError when the ``//body`` query finds nothing.
    """
    return self.xpath('//body')[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    Return the <head> element.  Works from any child element, because
    the ``//head`` query is evaluated against the whole document.
    """
    matches = self.xpath('//head')
    return matches[0]
head = property(head, doc=head.__doc__)
def _label__get(self):
    """
    Get or set the <label> element whose ``for`` attribute names this
    element's id (None when the element has no id or no such label).
    """
    own_id = self.get('id')
    if own_id:
        labels = _label_xpath(self, id=own_id)
        if labels:
            return labels[0]
    return None

def _label__set(self, label):
    # A label can only point at an element through that element's id.
    own_id = self.get('id')
    if not own_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    is_label = (label.tag == 'label')
    if not is_label:
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', own_id)

def _label__del(self):
    # Detach the current label (if any) by dropping its ``for`` attribute.
    current = self.label
    if current is None:
        return
    del current.attrib['for']
label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
def drop_tree(self):
    """
    Remove this element, its children and its text from the tree.
    Any tail text is kept: it is appended to the previous sibling's
    tail, or to the parent's text when there is no previous sibling.
    """
    parent = self.getparent()
    assert parent is not None
    trailing = self.tail
    if trailing:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + trailing
        else:
            parent.text = (parent.text or '') + trailing
    parent.remove(self)
117
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print tostring(h)
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()

    def _join_before(fragment):
        # Attach text to whatever precedes this element in the parent.
        if previous is None:
            parent.text = (parent.text or '') + fragment
        else:
            previous.tail = (previous.tail or '') + fragment

    # Comments etc. have a non-string tag; their text is not content.
    if self.text and isinstance(self.tag, basestring):
        _join_before(self.text)
    trailing = self.tail
    if trailing:
        if len(self):
            # The tail follows the last child once the children move up.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + trailing
        else:
            _join_before(trailing)
    position = parent.index(self)
    parent[position:position+1] = self[:]
149 157
def find_class(self, class_name):
    """
    Return the elements (this one or its descendants) matched by the
    module's class XPath for ``class_name``.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
163
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    # FIXME: should this check for multiple matches?
    # browsers just return the first one
    matches = _id_xpath(self, id=id)
    if matches:
        return matches[0]
    if default:
        return default[0]
    # Call syntax works on Python 2 and 3; ``raise KeyError, id`` does not.
    raise KeyError(id)
184
def text_content(self):
    """
    Return the result of the XPath ``string()`` function for this
    element: its text together with the text of its children.
    """
    content = _collect_string_content(self)
    return content
190
def cssselect(self, expr):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
    that pre-compiling the expression can provide a substantial
    speedup.
    """
    selector = cssselect.CSSSelector(expr)
    return selector(self)
201 202 ######################################## 203 ## Link functions 204 ######################################## 205 225 self.rewrite_links(link_repl)
226
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to all links and remove the
    ``<base>`` tags.  Every matching tag is dropped; when there are
    several, the href of the last one is the one applied.  Does
    nothing further when no non-empty href was found.
    """
    href = None
    for base_tag in self.xpath('//base[@href]'):
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        self.make_links_absolute(href, resolve_base_href=False)
241 296 343 344
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # name: method name looked up on the document at call time.
        # copy: default for whether element inputs are deep-copied first.
        # source_class: class whose method supplies this object's docstring.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        ``doc`` may be an HTML string (parsed with ``fromstring``) or an
        element.  For element input the ``copy`` keyword overrides the
        default given to the constructor.  Raises TypeError when ``copy``
        is passed together with a string input.
        """
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            # NOTE(review): the same keywords are also passed to the
            # method below, as in the original code.
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be named ``copy``: that would shadow the
            # ``copy`` module and break ``copy.deepcopy`` below.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result

find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class: ``etree.CommentBase`` combined with ``HtmlMixin``."""
393
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class: ``etree.ElementBase`` combined with ``HtmlMixin``."""
396
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class: ``etree.PIBase`` combined with ``HtmlMixin``."""
399
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class: ``etree.EntityBase`` combined with ``HtmlMixin``."""
402 403
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may also be a sequence of ``(tag name, mixin class)`` pairs,
    which allows several mixins for the same tag.
    """
    # Tag name -> element class; filled in by the element classes below.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, so a dict
            # has to be turned into (name, mixin) pairs first.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # '*' applies to every tag that currently has a class.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so their attributes take precedence.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node, or None to fall back to the
        normal lookup.  Element classes are looked up by lower-cased tag
        name and default to ``HtmlElement``.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
444 445 ################################################################################ 446 # parsing 447 ################################################################################ 448
def document_fromstring(html, **kw):
    """
    Parse ``html`` with ``etree.HTML`` and the module's ``html_parser``
    and return the result.  Raises ``etree.ParserError`` when the
    parser returns None.
    """
    root = etree.HTML(html, html_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
455
def fragments_fromstring(html, no_leading_text=False, base_url=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (leading text that is
    not just whitespace).  If no_leading_text is true, such leading
    text is an error and the list contains only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    # FIXME: check what happens when you give html with a body, head, etc.
    prefix = html[:20].lstrip().lower()
    looks_like_document = (prefix.startswith('<html')
                           or prefix.startswith('<!doctype'))
    if not looks_like_document:
        # Wrap bare fragments so the parser yields a predictable <body>.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, base_url=base_url, **kw)
    assert doc.tag == 'html'
    bodies = [child for child in doc if child.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading = body.text
    has_leading_text = bool(leading and leading.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    elements = []
    if has_leading_text:
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
486
def fragment_fromstring(html, create_parent=False, base_url=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if create_parent:
        # Any true non-string value means "use the default wrapper tag".
        wrapper = create_parent
        if not isinstance(wrapper, basestring):
            wrapper = 'div'
        wrapped = '<%s>%s</%s>' % (wrapper, html, wrapper)
        return fragment_fromstring(wrapped, base_url=base_url, **kw)
    elements = fragments_fromstring(html, no_leading_text=True, base_url=base_url, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        names = [_element_name(e) for e in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    only = elements[0]
    trailing = only.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % only.tail)
    only.tail = None
    return only
517
def fromstring(html, base_url=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    Returns the whole document when the input looks like a full document,
    when the parse produced a <head>, or when it produced neither <head>
    nor <body>.  Otherwise returns the single element inside the body, or
    the body itself renamed to ``div``/``span`` as a neutral container.
    """
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body>: nothing to unwrap, and len(None)
        # below would raise TypeError.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
574
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.

    When ``parser`` is None the module's ``html_parser`` is used.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
587
def _contains_block_level_tag(el):
    """
    Return True if any element yielded by ``el.getiterator()`` has a
    tag listed in ``defs.block_tags``, else False.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.getiterator():
        if node.tag in block_tags:
            return True
    return False
595
def _element_name(el):
    """
    Short description of a parsed item for error messages:
    ``'comment'`` for comments, ``'string'`` for text, else the tag.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
603 604 ################################################################################ 605 # form handling 606 ################################################################################ 607
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        # Assigning a mapping sets every field it names and resets every
        # other named field to None.
        fields = self.fields
        prev_keys = list(fields.keys())
        # A separate loop name keeps the ``value`` parameter intact.
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Label for this form in reprs: its ``name``, else ``#`` plus its
        ``id``, else its position among the forms under ``<body>``.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        # Search all descendants: findall('form') only sees direct children
        # of <body>, so a nested form made index() raise ValueError.
        all_forms = list(self.body.getiterator('form'))
        return str(all_forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed inputs, unchecked checkable inputs, and submit/image/reset
        inputs are left out, as are inputs whose value is None.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            if el.tag == 'textarea':
                results.append((name, el.value))
            elif el.tag == 'select':
                value = el.value
                if el.multiple:
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert el.tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the returned action is joined
        onto it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urlparse.urljoin(base_url, action)
        else:
            return action

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (``'GET'`` by default), the URL is the
    target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    return open_http(form.method, form.action, values)
743
def open_http_urllib(method, url, values):
    """
    Default requester for ``submit_form``: for ``'GET'`` the encoded
    values are appended to the URL's query string, for any other
    method they are passed to ``urllib.urlopen`` as the request data.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    if method != 'GET':
        return urllib.urlopen(url, urllib.urlencode(values))
    if '?' in url:
        separator = '&'
    else:
        separator = '?'
    target = url + separator + urllib.urlencode(values)
    return urllib.urlopen(target, None)
757
class FieldsDict(DictMixin):
    """
    Dictionary view over a form's inputs: keys are input names and
    values are the ``.value`` of the matching input.
    """

    def __init__(self, inputs):
        # inputs: the accessor used for lookups (an ``InputGetter`` when
        # created through ``FormElement.fields``).
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Fields belong to the form; they cannot be deleted through here.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)
778
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the input named ``name``: a `RadioGroup` or
        `CheckboxGroup` when several radios/checkboxes share the name,
        otherwise the first matching element.  Raises KeyError when
        nothing matches.
        """
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        if len(matches) > 1:
            # The type of the first match decides how the set is grouped.
            input_type = matches[0].get('type')
            if input_type == 'radio':
                group = RadioGroup(matches)
                group.name = name
                return group
            if input_type == 'checkbox':
                group = CheckboxGroup(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        return bool(matches)

    def keys(self):
        """Return the distinct names of all named inputs (unordered)."""
        names = set()
        for field in self:
            field_name = field.name
            if field_name is not None:
                names.add(field_name)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
843
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting a name that is not set is a no-op.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Only classes that define ``type`` get a type suffix.
        kind = getattr(self, 'type', None)
        suffix = ''
        if kind:
            suffix = ' type=%r' % kind
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
871
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        contents = self.text
        if contents:
            return contents
        return ''

    def _value__set(self, value):
        self.text = value

    def _value__del(self):
        # Deleting the value empties the element rather than removing it.
        self.text = ''
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' not in option.attrib:
                continue
            # FIXME: If value is None, what to return?, get_text()?
            return option.get('value')
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        chosen = None
        if value is not None:
            # Find the target before touching any attribute, so an unknown
            # value leaves the current selection untouched.
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        options = self.getiterator('option')
        return [option.get('value') for option in options]
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']
    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options)

    def __iter__(self):
        # Only selected options are members of the set; yielding every
        # option would report unselected values as selected.
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """Select the option whose value is ``item``; ValueError if none."""
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the option whose value is ``item``.  Raises ValueError
        when there is no such option or it is not currently selected.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1023
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' not in radio.attrib:
                continue
            return radio.get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # Find the radio first, so an unknown value changes nothing.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            else:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio in the group.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for radio in self:
            options.append(radio.get('value'))
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
1075
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before clearing, so a bad argument does not uncheck
        # every box as a side effect of the failed assignment.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    def _value__del(self):
        self.value.clear()
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1107
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values into a list before iterating.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox with this value; KeyError if none."""
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox with this value.  Raises KeyError when
        there is no such checkbox or it is already unchecked.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
1150
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the lower-cased ``type`` attribute, falling back
    to ``'text'`` when the attribute is absent.

    ``.value`` reads and writes the element's value.

    ``.checkable`` is true for checkboxes and radio buttons and false
    for every other type; checkable inputs additionally offer the
    boolean ``.checked`` attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        For a checkbox or radio that is checked, a missing or empty
        ``value`` attribute is reported as ``'on'``.  For a checkbox or
        radio that is not checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Only string values are stored in the attribute; any other
            # true value just checks the element.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the lower-cased ``type`` attribute of this element
        (``'text'`` if the attribute is not set).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: is this a checkbox or a radio button?
        """
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute reflecting the presence of the ``checked``
        attribute; assigning to it adds or removes that attribute.

        Raises AttributeError on input types that are not checkable.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

# Register InputElement under the 'input' key of the lookup's table of
# default element classes (presumably consulted by the class lookup to
# pick the Python class for each parsed tag).
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.

        Returns None when the label has no (or an empty) ``for``
        attribute.  Assigning an element stores that element's ``id``
        in the ``for`` attribute and raises TypeError if the element
        has no id.  Deleting the property removes the ``for``
        attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): the result for an id that matches no element is
        # whatever get_element_by_id() does in that case -- confirm.
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link lives in the ``for`` attribute (see getter and setter);
        # the label's own ``id`` must be left alone.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

# Register LabelElement under the 'label' key of the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# The bound ``sub`` method is stored, so the name is called as
# ``__replace_meta_content_type(replacement, text)``.
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    The document is serialised with ``etree.tostring``.  Unless
    ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag found in the
    serialised output is stripped from the result; when it is true the
    serialiser's output is returned unchanged.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        '<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        '<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        '<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialised = etree.tostring(doc, method=method, encoding=encoding,
                                pretty_print=pretty_print)
    if include_meta_content_type:
        return serialised
    # Drop the Content-Type meta tag(s) from the serialised output.
    return __replace_meta_content_type('', serialised)
1320
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be anything with a ``write(filename, method=...)``
    method; any other object is wrapped in an ``etree.ElementTree``
    first.  The ``file://`` URL of the temporary file is printed before
    the browser is launched.  The temporary file is not removed
    afterwards.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp() creates the file atomically with a unique name, unlike
    # os.tempnam(), which only suggests a name and is open to races.
    handle, fn = tempfile.mkstemp(suffix='.html')
    # Only the reserved name is needed; the document is written by path.
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1337 1338 ################################################################################ 1339 # configure Element class lookup 1340 ################################################################################ 1341
class HTMLParser(etree.HTMLParser):
    """
    An ``etree.HTMLParser`` that installs an ``HtmlElementClassLookup``
    as its element class lookup on construction.

    Keyword arguments are passed through to ``etree.HTMLParser``.
    """

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.setElementClassLookup(lookup)
1346
def Element(*args, **kw):
    """
    Create a new element through the module-level ``html_parser``.

    All arguments are handed to ``html_parser.makeelement`` unchanged.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instance; Element() creates its elements through
# this parser's makeelement().
html_parser = HTMLParser()
