1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 from __future__ import absolute_import
35
# Public names exported by ``from lxml.html import *``.
# Each name is listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
41
42
43 import copy
44 import sys
45 import re
46 from functools import partial
47
48 try:
49 from collections.abc import MutableMapping, MutableSet
50 except ImportError:
51 from collections import MutableMapping, MutableSet
52
53 from .. import etree
54 from . import defs
55 from ._setmixin import SetMixin
56
57 try:
58 from urlparse import urljoin
59 except ImportError:
60
61 from urllib.parse import urljoin
62
63 try:
64 unicode
65 except NameError:
66
67 unicode = str
68 try:
69 basestring
70 except NameError:
71
72 basestring = (str, bytes)
76 if not s:
77 return s
78 if sys.version_info[0] >= 3:
79 sub = re.compile(r"^(\s*)u'", re.M).sub
80 else:
81 sub = re.compile(r"^(\s*)b'", re.M).sub
82 return sub(r"\1'", s)
83
84
85 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
86
87 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
88 namespaces={'x':XHTML_NAMESPACE})
89 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
90 namespaces={'x':XHTML_NAMESPACE})
91 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
92 namespaces={'x':XHTML_NAMESPACE})
93
94 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
95 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
96 _collect_string_content = etree.XPath("string()")
97 _iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
98 _iter_css_imports = re.compile(r'@import "(.*?)"').finditer
99 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
100 namespaces={'x':XHTML_NAMESPACE})
101 _archive_re = re.compile(r'[^ ]+')
102 _parse_meta_refresh_url = re.compile(
103 r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
107 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
108 return s[1:-1], pos+1
109 else:
110 return s,pos
111
122
129
132 """Provides access to an element's class attribute as a set-like collection.
133 Usage::
134
135 >>> el = fromstring('<p class="hidden large">Text</p>')
136 >>> classes = el.classes # or: classes = Classes(el.attrib)
137 >>> classes |= ['block', 'paragraph']
138 >>> el.get('class')
139 'hidden large block paragraph'
140 >>> classes.toggle('hidden')
141 False
142 >>> el.get('class')
143 'large block paragraph'
144 >>> classes -= ('some', 'classes', 'block')
145 >>> el.get('class')
146 'large paragraph'
147 """
149 self._attributes = attributes
150 self._get_class_value = partial(attributes.get, 'class', '')
151
def add(self, value):
    """
    Append ``value`` to the names stored in the 'class' attribute.

    Nothing is written when the name is already listed.  An empty
    name, or one containing whitespace, raises ValueError.
    """
    is_valid = bool(value) and re.search(r'\s', value) is None
    if not is_valid:
        raise ValueError("Invalid class name: %r" % value)
    current = self._get_class_value().split()
    if value not in current:
        # New names go to the end so the existing order is kept.
        self._attributes['class'] = ' '.join(current + [value])
165
167 """
168 Remove a class if it is currently present.
169
170 If the class is not present, do nothing.
171 """
172 if not value or re.search(r'\s', value):
173 raise ValueError("Invalid class name: %r" % value)
174 classes = [name for name in self._get_class_value().split()
175 if name != value]
176 if classes:
177 self._attributes['class'] = ' '.join(classes)
178 elif 'class' in self._attributes:
179 del self._attributes['class']
180
182 """
183 Remove a class; it must currently be present.
184
185 If the class is not present, raise a KeyError.
186 """
187 if not value or re.search(r'\s', value):
188 raise ValueError("Invalid class name: %r" % value)
189 super(Classes, self).remove(value)
190
194
196 return iter(self._get_class_value().split())
197
199 return len(self._get_class_value().split())
200
201
202
204 """
205 Add all names from 'values'.
206 """
207 classes = self._get_class_value().split()
208 extended = False
209 for value in values:
210 if value not in classes:
211 classes.append(value)
212 extended = True
213 if extended:
214 self._attributes['class'] = ' '.join(classes)
215
217 """
218 Add a class name if it isn't there yet, or remove it if it exists.
219
220 Returns true if the class was added (and is now enabled) and
221 false if it was removed (and is now disabled).
222 """
223 if not value or re.search(r'\s', value):
224 raise ValueError("Invalid class name: %r" % value)
225 classes = self._get_class_value().split()
226 try:
227 classes.remove(value)
228 enabled = False
229 except ValueError:
230 classes.append(value)
231 enabled = True
232 if classes:
233 self._attributes['class'] = ' '.join(classes)
234 else:
235 del self._attributes['class']
236 return enabled
237
240
def set(self, key, value=None):
    """set(self, key, value=None)

    Assign ``value`` to the attribute ``key`` by delegating to the parent
    class implementation.  When ``value`` is left out or is None, a
    'boolean' attribute without value is created; ``form.set('novalidate')``
    for instance gives "<form novalidate></form>".
    """
    parent_set = super(HtmlElement, self).set
    parent_set(key, value)
249
250 @property
252 """
253 A set-like wrapper around the 'class' attribute.
254 """
255 return Classes(self.attrib)
256
257 @classes.setter
265
266 @property
268 """
269 Returns the base URL, given when the page was parsed.
270
271 Use with ``urlparse.urljoin(el.base_url, href)`` to get
272 absolute URLs.
273 """
274 return self.getroottree().docinfo.URL
275
276 @property
282
283 @property
285 """
286 Return the <body> element. Can be called from a child element
287 to get the document's body.
288 """
289 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
290
291 @property
293 """
294 Returns the <head> element. Can be called from a child
295 element to get the document's head.
296 """
297 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
298
299 @property
301 """
302 Get or set any <label> element associated with this element.
303 """
304 id = self.get('id')
305 if not id:
306 return None
307 result = _label_xpath(self, id=id)
308 if not result:
309 return None
310 else:
311 return result[0]
312
313 @label.setter
315 id = self.get('id')
316 if not id:
317 raise TypeError(
318 "You cannot set a label for an element (%r) that has no id"
319 % self)
320 if _nons(label.tag) != 'label':
321 raise TypeError(
322 "You can only assign label to a label element (not %r)"
323 % label)
324 label.set('for', id)
325
326 @label.deleter
331
333 """
334 Removes this element from the tree, including its children and
335 text. The tail text is joined to the previous element or
336 parent.
337 """
338 parent = self.getparent()
339 assert parent is not None
340 if self.tail:
341 previous = self.getprevious()
342 if previous is None:
343 parent.text = (parent.text or '') + self.tail
344 else:
345 previous.tail = (previous.tail or '') + self.tail
346 parent.remove(self)
347
349 """
350 Remove the tag, but not its children or text. The children and text
351 are merged into the parent.
352
353 Example::
354
355 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
356 >>> h.find('.//b').drop_tag()
357 >>> print(tostring(h, encoding='unicode'))
358 <div>Hello World!</div>
359 """
360 parent = self.getparent()
361 assert parent is not None
362 previous = self.getprevious()
363 if self.text and isinstance(self.tag, basestring):
364
365 if previous is None:
366 parent.text = (parent.text or '') + self.text
367 else:
368 previous.tail = (previous.tail or '') + self.text
369 if self.tail:
370 if len(self):
371 last = self[-1]
372 last.tail = (last.tail or '') + self.tail
373 elif previous is None:
374 parent.text = (parent.text or '') + self.tail
375 else:
376 previous.tail = (previous.tail or '') + self.tail
377 index = parent.index(self)
378 parent[index:index+1] = self[:]
379
381 """
382 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
383 """
384 rel = rel.lower()
385 return [el for el in _rel_links_xpath(self)
386 if el.get('rel').lower() == rel]
387
389 """
390 Find any elements with the given class name.
391 """
392 return _class_xpath(self, class_name=class_name)
393
395 """
396 Get the first element in a document with the given id. If none is
397 found, return the default argument if provided or raise KeyError
398 otherwise.
399
400 Note that there can be more than one element with the same id,
401 and this isn't uncommon in HTML documents found in the wild.
402 Browsers return only the first match, and this function does
403 the same.
404 """
405 try:
406
407
408 return _id_xpath(self, id=id)[0]
409 except IndexError:
410 if default:
411 return default[0]
412 else:
413 raise KeyError(id)
414
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its children, as computed by the XPath ``string()`` function.
    """
    content = _collect_string_content(self)
    return content
420
def cssselect(self, expr, translator='html'):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
    -- note that pre-compiling the expression can provide a substantial
    speedup.

    ``expr`` is the CSS selector expression; ``translator`` is handed
    through to ``CSSSelector`` unchanged and defaults to ``'html'``.
    """
    # Function-scope import: lxml.cssselect is only loaded when this
    # method is actually called.
    from lxml.cssselect import CSSSelector
    return CSSSelector(expr, translator=translator)(self)
433
434
435
436
437
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from), or if no ``base_url`` is given, then the ``.base_url``
    of the document.

    If ``resolve_base_href`` is true, then any ``<base href>``
    tags in the document are used *and* removed from the document.
    If it is false then any such tag is ignored.

    If ``handle_failures`` is None (default), a failure to process
    a URL will abort the processing.  If set to 'ignore', errors
    are ignored.  If set to 'discard', failing URLs will be removed.
    The same policy is handed on to ``resolve_base_href()``.

    Raises TypeError when neither ``base_url`` nor the document's own
    ``.base_url`` is available, and ValueError for an unknown
    ``handle_failures`` value.  Both checks run before the document is
    modified in any way.
    """
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")

    # Build the replacement callback (and reject bad arguments) first, so
    # that an invalid ``handle_failures`` cannot leave the tree with its
    # <base> tags already resolved and removed.
    if handle_failures is None:
        def link_repl(href):
            return urljoin(base_url, href)
    elif handle_failures == 'ignore' or handle_failures == 'discard':
        discard = handle_failures == 'discard'

        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # 'discard' drops the failing URL (None tells
                # rewrite_links() to remove it); 'ignore' keeps it as is.
                return None if discard else href
    else:
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)

    if resolve_base_href:
        # Use the caller's failure policy for the <base href> pass as well;
        # otherwise a bad URL would abort here even under 'ignore'/'discard'.
        self.resolve_base_href(handle_failures=handle_failures)

    self.rewrite_links(link_repl)
482
484 """
485 Find any ``<base href>`` tag in the document, and apply its
486 values to all links found in the document. Also remove the
487 tag once it has been applied.
488
489 If ``handle_failures`` is None (default), a failure to process
490 a URL will abort the processing. If set to 'ignore', errors
491 are ignored. If set to 'discard', failing URLs will be removed.
492 """
493 base_href = None
494 basetags = self.xpath('//base[@href]|//x:base[@href]',
495 namespaces={'x': XHTML_NAMESPACE})
496 for b in basetags:
497 base_href = b.get('href')
498 b.drop_tree()
499 if not base_href:
500 return
501 self.make_links_absolute(base_href, resolve_base_href=False,
502 handle_failures=handle_failures)
503
505 """
506 Yield (element, attribute, link, pos), where attribute may be None
507 (indicating the link is in the text). ``pos`` is the position
508 where the link occurs; often 0, but sometimes something else in
509 the case of links in stylesheets or style tags.
510
511 Note: <base href> is *not* taken into account in any way. The
512 link you get is exactly the link in the document.
513
514 Note: multiple links inside of a single text string or
515 attribute value are returned in reversed order. This makes it
516 possible to replace or delete them from the text string value
517 based on their reported text positions. Otherwise, a
518 modification at one text position can change the positions of
519 links reported later on.
520 """
521 link_attrs = defs.link_attrs
522 for el in self.iter(etree.Element):
523 attribs = el.attrib
524 tag = _nons(el.tag)
525 if tag == 'object':
526 codebase = None
527
528
529 if 'codebase' in attribs:
530 codebase = el.get('codebase')
531 yield (el, 'codebase', codebase, 0)
532 for attrib in ('classid', 'data'):
533 if attrib in attribs:
534 value = el.get(attrib)
535 if codebase is not None:
536 value = urljoin(codebase, value)
537 yield (el, attrib, value, 0)
538 if 'archive' in attribs:
539 for match in _archive_re.finditer(el.get('archive')):
540 value = match.group(0)
541 if codebase is not None:
542 value = urljoin(codebase, value)
543 yield (el, 'archive', value, match.start())
544 else:
545 for attrib in link_attrs:
546 if attrib in attribs:
547 yield (el, attrib, attribs[attrib], 0)
548 if tag == 'meta':
549 http_equiv = attribs.get('http-equiv', '').lower()
550 if http_equiv == 'refresh':
551 content = attribs.get('content', '')
552 match = _parse_meta_refresh_url(content)
553 url = (match.group('url') if match else content).strip()
554
555
556 if url:
557 url, pos = _unquote_match(
558 url, match.start('url') if match else content.find(url))
559 yield (el, 'content', url, pos)
560 elif tag == 'param':
561 valuetype = el.get('valuetype') or ''
562 if valuetype.lower() == 'ref':
563
564
565
566
567
568
569 yield (el, 'value', el.get('value'), 0)
570 elif tag == 'style' and el.text:
571 urls = [
572
573 _unquote_match(match.group(1), match.start(1))[::-1]
574 for match in _iter_css_urls(el.text)
575 ] + [
576 (match.start(1), match.group(1))
577 for match in _iter_css_imports(el.text)
578 ]
579 if urls:
580
581
582
583 urls.sort(reverse=True)
584 for start, url in urls:
585 yield (el, None, url, start)
586 if 'style' in attribs:
587 urls = list(_iter_css_urls(attribs['style']))
588 if urls:
589
590 for match in urls[::-1]:
591 url, start = _unquote_match(match.group(1), match.start(1))
592 yield (el, 'style', url, start)
593
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called (with surrounding
    whitespace stripped from the link), and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Further links that were
    found in the same, now removed, attribute or text are skipped.
    """
    if base_href is not None:
        # make_links_absolute() takes care of <base href> itself when
        # resolve_base_href is true.
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or element text content.
            if attrib is None:
                el.text = ''
            else:
                # pop() instead of del: one attribute can hold several
                # links, so it may already be gone.
                el.attrib.pop(attrib, None)
            continue

        if attrib is None:
            text = el.text
            if not text:
                # The text was cleared for an earlier link; the recorded
                # position no longer refers to anything.
                continue
            el.text = text[:pos] + new_link + text[pos+len(link):]
        else:
            cur = el.get(attrib)
            if cur is None:
                # The attribute was removed for an earlier link.
                continue
            if not pos and len(cur) == len(link):
                # The link is the whole attribute value.
                new = new_link
            else:
                new = cur[:pos] + new_link + cur[pos+len(link):]
            el.set(attrib, new)
642
645 """
646 An object that represents a method on an element as a function;
647 the function takes either an element or an HTML string. It
648 returns whatever the function normally returns, or if the function
649 works in-place (and so returns None) it returns a serialized form
650 of the resulting document.
651 """
657 result_type = type(doc)
658 if isinstance(doc, basestring):
659 if 'copy' in kw:
660 raise TypeError(
661 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
662 doc = fromstring(doc, **kw)
663 else:
664 if 'copy' in kw:
665 make_a_copy = kw.pop('copy')
666 else:
667 make_a_copy = self.copy
668 if make_a_copy:
669 doc = copy.deepcopy(doc)
670 meth = getattr(doc, self.name)
671 result = meth(*args, **kw)
672
673 if result is None:
674
675 return _transform_result(result_type, doc)
676 else:
677 return result
678
679
680 find_rel_links = _MethodFunc('find_rel_links', copy=False)
681 find_class = _MethodFunc('find_class', copy=False)
682 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
683 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
684 iterlinks = _MethodFunc('iterlinks', copy=False)
685 rewrite_links = _MethodFunc('rewrite_links', copy=True)
690
696
700
701
702 -class HtmlEntity(etree.EntityBase, HtmlMixin):
704
707 """A lookup scheme for HTML Element classes.
708
709 To create a lookup instance with different Element classes, pass a tag
710 name mapping of Element classes in the ``classes`` keyword argument and/or
711 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
712 The special key '*' denotes a Mixin class that should be mixed into all
713 Element classes.
714 """
715 _default_element_classes = {}
716
717 - def __init__(self, classes=None, mixins=None):
734
735 - def lookup(self, node_type, document, namespace, name):
746
747
748
749
750
751
752 _looks_like_full_html_unicode = re.compile(
753 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
754 _looks_like_full_html_bytes = re.compile(
755 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
770
774 """Parses several HTML elements, returning a list of elements.
775
776 The first item in the list may be a string.
777 If no_leading_text is true, then it will be an error if there is
778 leading text, and it will always be a list of only elements.
779
780 base_url will set the document's base_url attribute
781 (and the tree's docinfo.URL).
782 """
783 if parser is None:
784 parser = html_parser
785
786 if isinstance(html, bytes):
787 if not _looks_like_full_html_bytes(html):
788
789 html = ('<html><body>'.encode('ascii') + html +
790 '</body></html>'.encode('ascii'))
791 else:
792 if not _looks_like_full_html_unicode(html):
793 html = '<html><body>%s</body></html>' % html
794 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
795 assert _nons(doc.tag) == 'html'
796 bodies = [e for e in doc if _nons(e.tag) == 'body']
797 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
798 body = bodies[0]
799 elements = []
800 if no_leading_text and body.text and body.text.strip():
801 raise etree.ParserError(
802 "There is leading text: %r" % body.text)
803 if body.text and body.text.strip():
804 elements.append(body.text)
805 elements.extend(body)
806
807
808 return elements
809
813 """
814 Parses a single HTML element; it is an error if there is more than
815 one element, or if anything but whitespace precedes or follows the
816 element.
817
818 If ``create_parent`` is true (or is a tag name) then a parent node
819 will be created to encapsulate the HTML in a single element. In this
820 case, leading or trailing text is also allowed, as are multiple elements
821 as result of the parsing.
822
823 Passing a ``base_url`` will set the document's ``base_url`` attribute
824 (and the tree's docinfo.URL).
825 """
826 if parser is None:
827 parser = html_parser
828
829 accept_leading_text = bool(create_parent)
830
831 elements = fragments_fromstring(
832 html, parser=parser, no_leading_text=not accept_leading_text,
833 base_url=base_url, **kw)
834
835 if create_parent:
836 if not isinstance(create_parent, basestring):
837 create_parent = 'div'
838 new_root = Element(create_parent)
839 if elements:
840 if isinstance(elements[0], basestring):
841 new_root.text = elements[0]
842 del elements[0]
843 new_root.extend(elements)
844 return new_root
845
846 if not elements:
847 raise etree.ParserError('No elements found')
848 if len(elements) > 1:
849 raise etree.ParserError(
850 "Multiple elements found (%s)"
851 % ', '.join([_element_name(e) for e in elements]))
852 el = elements[0]
853 if el.tail and el.tail.strip():
854 raise etree.ParserError(
855 "Element followed by text: %r" % el.tail)
856 el.tail = None
857 return el
858
859
860 -def fromstring(html, base_url=None, parser=None, **kw):
926
927
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Build an HTML document tree from a filename, a URL, or a file-like
    object.  The result is a tree, not an element: call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    helpful when the input is a file-like object.  When no ``parser``
    is passed, the module-level ``html_parser`` is used.  Remaining
    keyword arguments are handed on to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
940
949
952 if isinstance(el, etree.CommentBase):
953 return 'comment'
954 elif isinstance(el, basestring):
955 return 'string'
956 else:
957 return _nons(el.tag)
958
1078
1079
1080 HtmlElementClassLookup._default_element_classes['form'] = FormElement
1119
1122 if not url:
1123 raise ValueError("cannot submit, no URL provided")
1124
1125 try:
1126 from urllib import urlencode, urlopen
1127 except ImportError:
1128 from urllib.request import urlopen
1129 from urllib.parse import urlencode
1130 if method == 'GET':
1131 if '?' in url:
1132 url += '&'
1133 else:
1134 url += '?'
1135 url += urlencode(values)
1136 data = None
1137 else:
1138 data = urlencode(values)
1139 if not isinstance(data, bytes):
1140 data = data.encode('ASCII')
1141 return urlopen(url, data)
1142
1145
1153 raise KeyError(
1154 "You cannot remove keys from ElementDict")
1158 return item in self.inputs
1163
1165 return '<%s for form %s>' % (
1166 self.__class__.__name__,
1167 self.inputs.form._name())
1168
1235
1266
1267
1268 -class TextareaElement(InputMixin, HtmlElement):
1269 """
1270 ``<textarea>`` element. You can get the name with ``.name`` and
1271 get/set the value with ``.value``
1272 """
1273 @property
1275 """
1276 Get/set the value (which is the contents of this element)
1277 """
1278 content = self.text or ''
1279 if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
1280 serialisation_method = 'xml'
1281 else:
1282 serialisation_method = 'html'
1283 for el in self:
1284
1285 content += etree.tostring(
1286 el, method=serialisation_method, encoding='unicode')
1287 return content
1288
1289 @value.setter
def value(self, value):
    """
    Replace the whole content of the element: ``value`` becomes its
    text and every child element is removed.
    """
    self.text = value
    del self[:]
1293
1294 @value.deleter
1296 self.text = ''
1297 del self[:]
1298
1299
1300 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1304 """
1305 ``<select>`` element. You can get the name with ``.name``.
1306
1307 ``.value`` will be the value of the selected option, unless this
1308 is a multi-select element (``<select multiple>``), in which case
1309 it will be a set-like object. In either case ``.value_options``
1310 gives the possible values.
1311
1312 The boolean attribute ``.multiple`` shows if this is a
1313 multi-select.
1314 """
1315 @property
1317 """
1318 Get/set the value of this select (the selected option).
1319
1320 If this is a multi-select, this is a set-like object that
1321 represents all the selected options.
1322 """
1323 if self.multiple:
1324 return MultipleSelectOptions(self)
1325 options = _options_xpath(self)
1326
1327 try:
1328 selected_option = next(el for el in reversed(options) if el.get('selected') is not None)
1329 except StopIteration:
1330 try:
1331 selected_option = next(el for el in options if el.get('disabled') is None)
1332 except StopIteration:
1333 return None
1334 value = selected_option.get('value')
1335 if value is None:
1336 value = (selected_option.text or '').strip()
1337 return value
1338
1339 @value.setter
def value(self, value):
    """
    Select the option whose value equals ``value`` and unselect all
    other options; ``None`` only unselects.

    For a multi-select, ``value`` must be a non-string sequence; the
    current selection is cleared and replaced by it (TypeError for a
    string).  For a single select, ValueError is raised, before
    anything is changed, when no option carries ``value``.
    """
    if self.multiple:
        if isinstance(value, basestring):
            raise TypeError("You must pass in a sequence")
        selection = self.value
        selection.clear()
        selection.update(value)
        return

    def option_value(option):
        # An option without a 'value' attribute is identified by its
        # stripped text.
        explicit = option.get('value')
        if explicit is not None:
            return explicit
        return (option.text or '').strip()

    target = None
    if value is not None:
        for option in _options_xpath(self):
            if option_value(option) == value:
                target = option
                break
        if target is None:
            raise ValueError(
                "There is no option with the value of %r" % value)
    for option in _options_xpath(self):
        if 'selected' in option.attrib:
            del option.attrib['selected']
    if target is not None:
        target.set('selected', '')
1365
1366 @value.deleter
1373
1374 @property
1387
1388 @property
1390 """
1391 Boolean attribute: is there a ``multiple`` attribute on this element.
1392 """
1393 return 'multiple' in self.attrib
1394
1395 @multiple.setter
1397 if value:
1398 self.set('multiple', '')
1399 elif 'multiple' in self.attrib:
1400 del self.attrib['multiple']
1401
1402
1403 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1407 """
1408 Represents all the selected options in a ``<select multiple>`` element.
1409
1410 You can add to this set-like object to select an option, or remove
1411 to unselect the option.
1412 """
1413
1415 self.select = select
1416
1417 @property
1419 """
1420 Iterator of all the ``<option>`` elements.
1421 """
1422 return iter(_options_xpath(self.select))
1423
1425 for option in self.options:
1426 if 'selected' in option.attrib:
1427 opt_value = option.get('value')
1428 if opt_value is None:
1429 opt_value = (option.text or '').strip()
1430 yield opt_value
1431
def add(self, item):
    """
    Mark the first option whose value equals ``item`` as selected.

    An option without a 'value' attribute is matched by its stripped
    text.  Raises ValueError when no option matches.
    """
    def option_value(option):
        explicit = option.get('value')
        if explicit is not None:
            return explicit
        return (option.text or '').strip()

    chosen = next(
        (option for option in self.options if option_value(option) == item),
        None)
    if chosen is None:
        raise ValueError(
            "There is no option with the value %r" % item)
    chosen.set('selected', '')
1443
1445 for option in self.options:
1446 opt_value = option.get('value')
1447 if opt_value is None:
1448 opt_value = (option.text or '').strip()
1449 if opt_value == item:
1450 if 'selected' in option.attrib:
1451 del option.attrib['selected']
1452 else:
1453 raise ValueError(
1454 "The option %r is not currently selected" % item)
1455 break
1456 else:
1457 raise ValueError(
1458 "There is not option with the value %r" % item)
1459
1461 return '<%s {%s} for select name=%r>' % (
1462 self.__class__.__name__,
1463 ', '.join([repr(v) for v in self]),
1464 self.select.name)
1465
1468 """
1469 This object represents several ``<input type=radio>`` elements
1470 that have the same name.
1471
1472 You can use this like a list, but also use the property
1473 ``.value`` to check/uncheck inputs. Also you can use
1474 ``.value_options`` to get the possible values.
1475 """
1476 @property
1478 """
1479 Get/set the value, which checks the radio with that value (and
1480 unchecks any other value).
1481 """
1482 for el in self:
1483 if 'checked' in el.attrib:
1484 return el.get('value')
1485 return None
1486
1487 @value.setter
def value(self, value):
    """
    Check the first radio input whose 'value' equals ``value`` and
    uncheck every other input of the group; ``None`` unchecks all.

    Raises ValueError, before anything is changed, when no input
    carries ``value``.
    """
    target = None
    if value is not None:
        target = next(
            (radio for radio in self if radio.get('value') == value), None)
        if target is None:
            raise ValueError("There is no radio input with the value %r" % value)
    for radio in self:
        radio.attrib.pop('checked', None)
    if target is not None:
        target.set('checked', '')
1502
1503 @value.deleter
1506
1507 @property
1509 """
1510 Returns a list of all the possible values.
1511 """
1512 return [el.get('value') for el in self]
1513
1515 return '%s(%s)' % (
1516 self.__class__.__name__,
1517 list.__repr__(self))
1518
1521 """
1522 Represents a group of checkboxes (``<input type=checkbox>``) that
1523 have the same name.
1524
1525 In addition to using this like a list, the ``.value`` attribute
1526 returns a set-like object that you can add to or remove from to
1527 check and uncheck checkboxes. You can also use ``.value_options``
1528 to get the possible values.
1529 """
1530 @property
1532 """
1533 Return a set-like object that can be modified to check or
1534 uncheck individual checkboxes according to their value.
1535 """
1536 return CheckboxValues(self)
1537
1538 @value.setter
1539 - def value(self, value):
1547
1548 @value.deleter
1551
1552 @property
1554 """
1555 Returns a list of all the possible values.
1556 """
1557 return [el.get('value') for el in self]
1558
1560 return '%s(%s)' % (
1561 self.__class__.__name__, list.__repr__(self))
1562
1565 """
1566 Represents the values of the checked checkboxes in a group of
1567 checkboxes with the same name.
1568 """
1569
1572
1574 return iter([
1575 el.get('value')
1576 for el in self.group
1577 if 'checked' in el.attrib])
1578
def add(self, value):
    """
    Check the first checkbox of the group whose 'value' attribute
    equals ``value``.  Raises KeyError when there is none.
    """
    for checkbox in self.group:
        if checkbox.get('value') != value:
            continue
        checkbox.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1586
1588 for el in self.group:
1589 if el.get('value') == value:
1590 if 'checked' in el.attrib:
1591 del el.attrib['checked']
1592 else:
1593 raise KeyError(
1594 "The checkbox with value %r was already unchecked" % value)
1595 break
1596 else:
1597 raise KeyError(
1598 "No checkbox with value %r" % value)
1599
1601 return '<%s {%s} for checkboxes name=%r>' % (
1602 self.__class__.__name__,
1603 ', '.join([repr(v) for v in self]),
1604 self.group.name)
1605
1699
1700
1701 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1705 """
1706 Represents a ``<label>`` element.
1707
1708 Label elements are linked to other elements with their ``for``
1709 attribute. You can access this element with ``label.for_element``.
1710 """
1711 @property
1713 """
1714 Get/set the element this label points to. Return None if it
1715 can't be found.
1716 """
1717 id = self.get('for')
1718 if not id:
1719 return None
1720 return self.body.get_element_by_id(id)
1721
1722 @for_element.setter
1724 id = other.get('id')
1725 if not id:
1726 raise TypeError(
1727 "Element %r has no id attribute" % other)
1728 self.set('for', id)
1729
1730 @for_element.deleter
1735
1736
1737 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1757
1760 """Convert all tags in an XHTML tree to HTML by removing their
1761 XHTML namespace.
1762 """
1763 try:
1764 xhtml = xhtml.getroot()
1765 except AttributeError:
1766 pass
1767 prefix = "{%s}" % XHTML_NAMESPACE
1768 prefix_len = len(prefix)
1769 for el in xhtml.iter(prefix + "*"):
1770 el.tag = el.tag[prefix_len:]
1771
1772
1773
1774
1775 __str_replace_meta_content_type = re.compile(
1776 r'<meta http-equiv="Content-Type"[^>]*>').sub
1777 __bytes_replace_meta_content_type = re.compile(
1778 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1779
1780
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc,
        method=method,
        pretty_print=pretty_print,
        encoding=encoding,
        with_tail=with_tail,
        doctype=doctype,
    )

    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised

    # The serialiser returns text or bytes depending on ``encoding``;
    # use the regex that matches the result type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1853
1854
# Rewrite the example output lines in the docstring for the running
# interpreter: __fix_docstring drops a leading u' prefix on Python 3 and a
# leading b' prefix on Python 2.
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1859 """
1860 Open the HTML document in a web browser, saving it to a temporary
1861 file to open it. Note that this does not delete the file after
1862 use. This is mainly meant for debugging.
1863 """
1864 import os
1865 import webbrowser
1866 import tempfile
1867 if not isinstance(doc, etree._ElementTree):
1868 doc = etree.ElementTree(doc)
1869 handle, fn = tempfile.mkstemp(suffix='.html')
1870 f = os.fdopen(handle, 'wb')
1871 try:
1872 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1873 finally:
1874
1875 f.close()
1876 url = 'file://' + fn.replace(os.path.sep, '/')
1877 print(url)
1878 webbrowser.open(url)
1879
1880
1881
1882
1883
1884
1885 -class HTMLParser(etree.HTMLParser):
1886 """An HTML parser that is configured to return lxml.html Element
1887 objects.
1888 """
1892
1895 """An XML parser that is configured to return lxml.html Element
1896 objects.
1897
1898 Note that this parser is not really XHTML aware unless you let it
1899 load a DTD that declares the HTML entities. To do this, make sure
1900 you have the XHTML DTDs installed in your catalogs, and create the
1901 parser like this::
1902
1903 >>> parser = XHTMLParser(load_dtd=True)
1904
1905 If you additionally want to validate the document, use this::
1906
1907 >>> parser = XHTMLParser(dtd_validation=True)
1908
1909 For catalog support, see http://www.xmlsoft.org/catalog.html.
1910 """
1914
1917 """Create a new HTML Element.
1918
1919 This can also be used for XHTML documents.
1920 """
1921 v = html_parser.makeelement(*args, **kw)
1922 return v
1923
1924
# Module-level parser instances, created once when the module is imported.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
1927