1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 from __future__ import absolute_import
35
# Public API of the ``lxml.html`` package.  Every exported name is listed
# exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
41
42
43 import copy
44 import sys
45 import re
46 from functools import partial
47
48 try:
49
50 from collections.abc import MutableMapping, MutableSet
51 except ImportError:
52 from collections import MutableMapping, MutableSet
53
54 from .. import etree
55 from . import defs
56 from ._setmixin import SetMixin
57
58 try:
59 from urlparse import urljoin
60 except ImportError:
61
62 from urllib.parse import urljoin
63
64 try:
65 unicode
66 except NameError:
67
68 unicode = str
69 try:
70 basestring
71 except NameError:
72
73 basestring = (str, bytes)
77 if not s:
78 return s
79 if sys.version_info[0] >= 3:
80 sub = re.compile(r"^(\s*)u'", re.M).sub
81 else:
82 sub = re.compile(r"^(\s*)b'", re.M).sub
83 return sub(r"\1'", s)
84
85
# Namespace URI used for the ``x:`` prefix in the XPath expressions below, so
# that each query matches both plain and XHTML-namespaced tags.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a rel="..."> elements at or below the context node.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements at or below the context node.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})

# Elements whose whitespace-normalised ``class`` attribute contains
# ``$class_name`` as a complete, space-delimited token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals ``$id``.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath ``string()`` value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over ``url(...)`` occurrences in CSS (case-insensitive); group 1 is
# the argument, including its surrounding quotes if there are any.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over ``@import "..."`` occurrences; only the double-quoted form is
# matched and group 1 is the text between the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label for="$id"> elements anywhere in the document.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One run of non-space characters (a single entry of a space-separated list).
_archive_re = re.compile(r'[^ ]+')
# Searches a ``<meta http-equiv="refresh">`` content value of the form
# ``<delay>; [url=]<target>``; the named group ``url`` is everything after
# the optional ``url=`` prefix.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
108 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
109 return s[1:-1], pos+1
110 else:
111 return s,pos
112
123
130
133 """Provides access to an element's class attribute as a set-like collection.
134 Usage::
135
136 >>> el = fromstring('<p class="hidden large">Text</p>')
137 >>> classes = el.classes # or: classes = Classes(el.attrib)
138 >>> classes |= ['block', 'paragraph']
139 >>> el.get('class')
140 'hidden large block paragraph'
141 >>> classes.toggle('hidden')
142 False
143 >>> el.get('class')
144 'large block paragraph'
145 >>> classes -= ('some', 'classes', 'block')
146 >>> el.get('class')
147 'large paragraph'
148 """
150 self._attributes = attributes
151 self._get_class_value = partial(attributes.get, 'class', '')
152
def add(self, value):
    """
    Append ``value`` to the element's class list.

    Nothing is written when the class is already present.  Raises
    ``ValueError`` for an empty name or one containing whitespace.
    """
    if not value or re.search(r'\s', value):
        raise ValueError("Invalid class name: %r" % value)
    names = self._get_class_value().split()
    if value not in names:
        # Re-joining also normalises the whitespace between existing names.
        self._attributes['class'] = ' '.join(names + [value])
166
168 """
169 Remove a class if it is currently present.
170
171 If the class is not present, do nothing.
172 """
173 if not value or re.search(r'\s', value):
174 raise ValueError("Invalid class name: %r" % value)
175 classes = [name for name in self._get_class_value().split()
176 if name != value]
177 if classes:
178 self._attributes['class'] = ' '.join(classes)
179 elif 'class' in self._attributes:
180 del self._attributes['class']
181
183 """
184 Remove a class; it must currently be present.
185
186 If the class is not present, raise a KeyError.
187 """
188 if not value or re.search(r'\s', value):
189 raise ValueError("Invalid class name: %r" % value)
190 super(Classes, self).remove(value)
191
195
197 return iter(self._get_class_value().split())
198
200 return len(self._get_class_value().split())
201
202
203
205 """
206 Add all names from 'values'.
207 """
208 classes = self._get_class_value().split()
209 extended = False
210 for value in values:
211 if value not in classes:
212 classes.append(value)
213 extended = True
214 if extended:
215 self._attributes['class'] = ' '.join(classes)
216
218 """
219 Add a class name if it isn't there yet, or remove it if it exists.
220
221 Returns true if the class was added (and is now enabled) and
222 false if it was removed (and is now disabled).
223 """
224 if not value or re.search(r'\s', value):
225 raise ValueError("Invalid class name: %r" % value)
226 classes = self._get_class_value().split()
227 try:
228 classes.remove(value)
229 enabled = False
230 except ValueError:
231 classes.append(value)
232 enabled = True
233 if classes:
234 self._attributes['class'] = ' '.join(classes)
235 else:
236 del self._attributes['class']
237 return enabled
238
241
242 @property
244 """
245 A set-like wrapper around the 'class' attribute.
246 """
247 return Classes(self.attrib)
248
249 @classes.setter
257
258 @property
260 """
261 Returns the base URL, given when the page was parsed.
262
263 Use with ``urlparse.urljoin(el.base_url, href)`` to get
264 absolute URLs.
265 """
266 return self.getroottree().docinfo.URL
267
268 @property
274
275 @property
277 """
278 Return the <body> element. Can be called from a child element
279 to get the document's head.
280 """
281 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
282
283 @property
285 """
286 Returns the <head> element. Can be called from a child
287 element to get the document's head.
288 """
289 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
290
291 @property
293 """
294 Get or set any <label> element associated with this element.
295 """
296 id = self.get('id')
297 if not id:
298 return None
299 result = _label_xpath(self, id=id)
300 if not result:
301 return None
302 else:
303 return result[0]
304
305 @label.setter
307 id = self.get('id')
308 if not id:
309 raise TypeError(
310 "You cannot set a label for an element (%r) that has no id"
311 % self)
312 if _nons(label.tag) != 'label':
313 raise TypeError(
314 "You can only assign label to a label element (not %r)"
315 % label)
316 label.set('for', id)
317
318 @label.deleter
323
325 """
326 Removes this element from the tree, including its children and
327 text. The tail text is joined to the previous element or
328 parent.
329 """
330 parent = self.getparent()
331 assert parent is not None
332 if self.tail:
333 previous = self.getprevious()
334 if previous is None:
335 parent.text = (parent.text or '') + self.tail
336 else:
337 previous.tail = (previous.tail or '') + self.tail
338 parent.remove(self)
339
341 """
342 Remove the tag, but not its children or text. The children and text
343 are merged into the parent.
344
345 Example::
346
347 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
348 >>> h.find('.//b').drop_tag()
349 >>> print(tostring(h, encoding='unicode'))
350 <div>Hello World!</div>
351 """
352 parent = self.getparent()
353 assert parent is not None
354 previous = self.getprevious()
355 if self.text and isinstance(self.tag, basestring):
356
357 if previous is None:
358 parent.text = (parent.text or '') + self.text
359 else:
360 previous.tail = (previous.tail or '') + self.text
361 if self.tail:
362 if len(self):
363 last = self[-1]
364 last.tail = (last.tail or '') + self.tail
365 elif previous is None:
366 parent.text = (parent.text or '') + self.tail
367 else:
368 previous.tail = (previous.tail or '') + self.tail
369 index = parent.index(self)
370 parent[index:index+1] = self[:]
371
373 """
374 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
375 """
376 rel = rel.lower()
377 return [el for el in _rel_links_xpath(self)
378 if el.get('rel').lower() == rel]
379
381 """
382 Find any elements with the given class name.
383 """
384 return _class_xpath(self, class_name=class_name)
385
387 """
388 Get the first element in a document with the given id. If none is
389 found, return the default argument if provided or raise KeyError
390 otherwise.
391
392 Note that there can be more than one element with the same id,
393 and this isn't uncommon in HTML documents found in the wild.
394 Browsers return only the first match, and this function does
395 the same.
396 """
397 try:
398
399
400 return _id_xpath(self, id=id)[0]
401 except IndexError:
402 if default:
403 return default[0]
404 else:
405 raise KeyError(id)
406
def text_content(self):
    """
    Return the XPath ``string()`` value of this element, i.e. its own
    text together with the text of all of its descendants.
    """
    content = _collect_string_content(self)
    return content
412
def cssselect(self, expr, translator='html'):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
    -- note that pre-compiling the expression can provide a substantial
    speedup.
    """
    # Imported here rather than at module level, so lxml.cssselect is only
    # loaded when this method is actually called.
    from lxml.cssselect import CSSSelector
    return CSSSelector(expr, translator=translator)(self)
425
426
427
428
429
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from), or if no ``base_url`` is given, then the ``.base_url``
    of the document.

    If ``resolve_base_href`` is true, then any ``<base href>``
    tags in the document are used *and* removed from the document.
    If it is false then any such tag is ignored.

    If ``handle_failures`` is None (default), a failure to process
    a URL will abort the processing.  If set to 'ignore', errors
    are ignored.  If set to 'discard', failing URLs will be removed.

    Raises ``ValueError`` for any other ``handle_failures`` value and
    ``TypeError`` if no base URL is available.  Both checks happen
    before the document is modified.
    """
    # Validate the arguments first, so that a bad call cannot leave the
    # document half-processed (e.g. with its <base> tag already removed).
    if handle_failures not in (None, 'ignore', 'discard'):
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")

    if resolve_base_href:
        # Apply the same failure policy while resolving <base href>.
        self.resolve_base_href(handle_failures=handle_failures)

    if handle_failures is None:
        def link_repl(href):
            return urljoin(base_url, href)
    else:
        keep_failing_links = handle_failures == 'ignore'

        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # 'ignore' keeps the link untouched, 'discard' removes it.
                return href if keep_failing_links else None

    # <base href> was either resolved above or is to be ignored, so
    # rewrite_links() must not resolve it (again) on its own.
    self.rewrite_links(link_repl, resolve_base_href=False)
474
476 """
477 Find any ``<base href>`` tag in the document, and apply its
478 values to all links found in the document. Also remove the
479 tag once it has been applied.
480
481 If ``handle_failures`` is None (default), a failure to process
482 a URL will abort the processing. If set to 'ignore', errors
483 are ignored. If set to 'discard', failing URLs will be removed.
484 """
485 base_href = None
486 basetags = self.xpath('//base[@href]|//x:base[@href]',
487 namespaces={'x': XHTML_NAMESPACE})
488 for b in basetags:
489 base_href = b.get('href')
490 b.drop_tree()
491 if not base_href:
492 return
493 self.make_links_absolute(base_href, resolve_base_href=False,
494 handle_failures=handle_failures)
495
497 """
498 Yield (element, attribute, link, pos), where attribute may be None
499 (indicating the link is in the text). ``pos`` is the position
500 where the link occurs; often 0, but sometimes something else in
501 the case of links in stylesheets or style tags.
502
503 Note: <base href> is *not* taken into account in any way. The
504 link you get is exactly the link in the document.
505
506 Note: multiple links inside of a single text string or
507 attribute value are returned in reversed order. This makes it
508 possible to replace or delete them from the text string value
509 based on their reported text positions. Otherwise, a
510 modification at one text position can change the positions of
511 links reported later on.
512 """
513 link_attrs = defs.link_attrs
514 for el in self.iter(etree.Element):
515 attribs = el.attrib
516 tag = _nons(el.tag)
517 if tag == 'object':
518 codebase = None
519
520
521 if 'codebase' in attribs:
522 codebase = el.get('codebase')
523 yield (el, 'codebase', codebase, 0)
524 for attrib in ('classid', 'data'):
525 if attrib in attribs:
526 value = el.get(attrib)
527 if codebase is not None:
528 value = urljoin(codebase, value)
529 yield (el, attrib, value, 0)
530 if 'archive' in attribs:
531 for match in _archive_re.finditer(el.get('archive')):
532 value = match.group(0)
533 if codebase is not None:
534 value = urljoin(codebase, value)
535 yield (el, 'archive', value, match.start())
536 else:
537 for attrib in link_attrs:
538 if attrib in attribs:
539 yield (el, attrib, attribs[attrib], 0)
540 if tag == 'meta':
541 http_equiv = attribs.get('http-equiv', '').lower()
542 if http_equiv == 'refresh':
543 content = attribs.get('content', '')
544 match = _parse_meta_refresh_url(content)
545 url = (match.group('url') if match else content).strip()
546
547
548 if url:
549 url, pos = _unquote_match(
550 url, match.start('url') if match else content.find(url))
551 yield (el, 'content', url, pos)
552 elif tag == 'param':
553 valuetype = el.get('valuetype') or ''
554 if valuetype.lower() == 'ref':
555
556
557
558
559
560
561 yield (el, 'value', el.get('value'), 0)
562 elif tag == 'style' and el.text:
563 urls = [
564
565 _unquote_match(match.group(1), match.start(1))[::-1]
566 for match in _iter_css_urls(el.text)
567 ] + [
568 (match.start(1), match.group(1))
569 for match in _iter_css_imports(el.text)
570 ]
571 if urls:
572
573
574
575 urls.sort(reverse=True)
576 for start, url in urls:
577 yield (el, None, url, start)
578 if 'style' in attribs:
579 urls = list(_iter_css_urls(attribs['style']))
580 if urls:
581
582 for match in urls[::-1]:
583 url, start = _unquote_match(match.group(1), match.start(1))
584 yield (el, 'style', url, start)
585
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Any further links that were
    found in the same (now removed) attribute or text are skipped.
    """
    if base_href is not None:
        # Done as a separate pass over the document before the actual
        # rewriting pass below.
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        end = pos + len(link)

        if attrib is None:
            # The link lives in the element's text (e.g. a <style> sheet).
            text = el.text or ''
            if text[pos:end] != link:
                # The text was cleared by an earlier removal; the reported
                # position no longer refers to this link.
                continue
            if new_link is None:
                el.text = ''
            else:
                el.text = text[:pos] + new_link + text[end:]
            continue

        cur = el.get(attrib)
        if cur is None:
            # The attribute held several links and an earlier one caused
            # the whole attribute to be removed.
            continue
        if new_link is None:
            del el.attrib[attrib]
        elif not pos and len(cur) == len(link):
            # The link is the complete attribute value.
            el.set(attrib, new_link)
        else:
            el.set(attrib, cur[:pos] + new_link + cur[end:])
634
637 """
638 An object that represents a method on an element as a function;
639 the function takes either an element or an HTML string. It
640 returns whatever the function normally returns, or if the function
641 works in-place (and so returns None) it returns a serialized form
642 of the resulting document.
643 """
649 result_type = type(doc)
650 if isinstance(doc, basestring):
651 if 'copy' in kw:
652 raise TypeError(
653 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
654 doc = fromstring(doc, **kw)
655 else:
656 if 'copy' in kw:
657 make_a_copy = kw.pop('copy')
658 else:
659 make_a_copy = self.copy
660 if make_a_copy:
661 doc = copy.deepcopy(doc)
662 meth = getattr(doc, self.name)
663 result = meth(*args, **kw)
664
665 if result is None:
666
667 return _transform_result(result_type, doc)
668 else:
669 return result
670
671
# Function forms of the element methods with the same names.  ``copy`` is the
# wrapper's default for the ``copy`` keyword: when true, an element argument
# is deep-copied before the method runs on it.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
682
687
691
692
693 -class HtmlEntity(etree.EntityBase, HtmlMixin):
695
698 """A lookup scheme for HTML Element classes.
699
700 To create a lookup instance with different Element classes, pass a tag
701 name mapping of Element classes in the ``classes`` keyword argument and/or
702 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
703 The special key '*' denotes a Mixin class that should be mixed into all
704 Element classes.
705 """
706 _default_element_classes = {}
707
708 - def __init__(self, classes=None, mixins=None):
725
726 - def lookup(self, node_type, document, namespace, name):
737
738
739
740
741
742
# Matchers (one for text input, one for bytes input) that succeed when the
# string starts, after optional whitespace, with ``<html`` or ``<!doctype``
# (case-insensitive).
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
761
765 """
766 Parses several HTML elements, returning a list of elements.
767
768 The first item in the list may be a string (though leading
769 whitespace is removed). If no_leading_text is true, then it will
770 be an error if there is leading text, and it will always be a list
771 of only elements.
772
773 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
774 """
775 if parser is None:
776 parser = html_parser
777
778 if isinstance(html, bytes):
779 if not _looks_like_full_html_bytes(html):
780
781 html = ('<html><body>'.encode('ascii') + html +
782 '</body></html>'.encode('ascii'))
783 else:
784 if not _looks_like_full_html_unicode(html):
785 html = '<html><body>%s</body></html>' % html
786 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
787 assert _nons(doc.tag) == 'html'
788 bodies = [e for e in doc if _nons(e.tag) == 'body']
789 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
790 body = bodies[0]
791 elements = []
792 if no_leading_text and body.text and body.text.strip():
793 raise etree.ParserError(
794 "There is leading text: %r" % body.text)
795 if body.text and body.text.strip():
796 elements.append(body.text)
797 elements.extend(body)
798
799
800 return elements
801
805 """
806 Parses a single HTML element; it is an error if there is more than
807 one element, or if anything but whitespace precedes or follows the
808 element.
809
810 If ``create_parent`` is true (or is a tag name) then a parent node
811 will be created to encapsulate the HTML in a single element. In this
812 case, leading or trailing text is also allowed, as are multiple elements
813 as result of the parsing.
814
815 Passing a ``base_url`` will set the document's ``base_url`` attribute
816 (and the tree's docinfo.URL).
817 """
818 if parser is None:
819 parser = html_parser
820
821 accept_leading_text = bool(create_parent)
822
823 elements = fragments_fromstring(
824 html, parser=parser, no_leading_text=not accept_leading_text,
825 base_url=base_url, **kw)
826
827 if create_parent:
828 if not isinstance(create_parent, basestring):
829 create_parent = 'div'
830 new_root = Element(create_parent)
831 if elements:
832 if isinstance(elements[0], basestring):
833 new_root.text = elements[0]
834 del elements[0]
835 new_root.extend(elements)
836 return new_root
837
838 if not elements:
839 raise etree.ParserError('No elements found')
840 if len(elements) > 1:
841 raise etree.ParserError(
842 "Multiple elements found (%s)"
843 % ', '.join([_element_name(e) for e in elements]))
844 el = elements[0]
845 if el.tail and el.tail.strip():
846 raise etree.ParserError(
847 "Element followed by text: %r" % el.tail)
848 el.tail = None
849 return el
850
851
852 -def fromstring(html, base_url=None, parser=None, **kw):
918
919
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse an HTML document from a filename, a URL or a file-like object.

    The result is a document *tree*, not an element; call
    ``parse(...).getroot()`` to obtain the root element.

    ``base_url`` overrides the base URL of the document, which is mainly
    useful when reading from a file-like object.  Without an explicit
    ``parser`` the module-level ``html_parser`` is used.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
932
941
944 if isinstance(el, etree.CommentBase):
945 return 'comment'
946 elif isinstance(el, basestring):
947 return 'string'
948 else:
949 return _nons(el.tag)
950
1070
1071
1072 HtmlElementClassLookup._default_element_classes['form'] = FormElement
1111
1114 if not url:
1115 raise ValueError("cannot submit, no URL provided")
1116
1117 try:
1118 from urllib import urlencode, urlopen
1119 except ImportError:
1120 from urllib.request import urlopen
1121 from urllib.parse import urlencode
1122 if method == 'GET':
1123 if '?' in url:
1124 url += '&'
1125 else:
1126 url += '?'
1127 url += urlencode(values)
1128 data = None
1129 else:
1130 data = urlencode(values)
1131 return urlopen(url, data)
1132
1135
1143 raise KeyError(
1144 "You cannot remove keys from ElementDict")
1148 return item in self.inputs
1153
1155 return '<%s for form %s>' % (
1156 self.__class__.__name__,
1157 self.inputs.form._name())
1158
1225
1256
1257
1258 -class TextareaElement(InputMixin, HtmlElement):
1259 """
1260 ``<textarea>`` element. You can get the name with ``.name`` and
1261 get/set the value with ``.value``
1262 """
1263 @property
1265 """
1266 Get/set the value (which is the contents of this element)
1267 """
1268 content = self.text or ''
1269 if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
1270 serialisation_method = 'xml'
1271 else:
1272 serialisation_method = 'html'
1273 for el in self:
1274
1275 content += etree.tostring(
1276 el, method=serialisation_method, encoding='unicode')
1277 return content
1278
1279 @value.setter
def value(self, value):
    # Setting the value replaces the complete content of the element:
    # drop all child elements first, then store the new text.
    del self[:]
    self.text = value
1283
1284 @value.deleter
1286 self.text = ''
1287 del self[:]
1288
1289
1290 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1294 """
1295 ``<select>`` element. You can get the name with ``.name``.
1296
1297 ``.value`` will be the value of the selected option, unless this
1298 is a multi-select element (``<select multiple>``), in which case
1299 it will be a set-like object. In either case ``.value_options``
1300 gives the possible values.
1301
1302 The boolean attribute ``.multiple`` shows if this is a
1303 multi-select.
1304 """
1305 @property
1307 """
1308 Get/set the value of this select (the selected option).
1309
1310 If this is a multi-select, this is a set-like object that
1311 represents all the selected options.
1312 """
1313 if self.multiple:
1314 return MultipleSelectOptions(self)
1315 for el in _options_xpath(self):
1316 if el.get('selected') is not None:
1317 value = el.get('value')
1318 if value is None:
1319 value = el.text or ''
1320 if value:
1321 value = value.strip()
1322 return value
1323 return None
1324
1325 @value.setter
def value(self, value):
    """
    Select the option that carries ``value``.

    For a multi-select, ``value`` must be a non-string iterable of values
    and replaces the current selection.  Otherwise ``value`` is stripped
    and compared against each option's ``value`` attribute (or its
    stripped text when the attribute is missing); ``None`` deselects
    everything.  Raises ``ValueError`` when no option matches.
    """
    if self.multiple:
        if isinstance(value, basestring):
            raise TypeError("You must pass in a sequence")
        selection = self.value
        selection.clear()
        selection.update(value)
        return

    def option_value(option):
        # The option's value attribute, falling back to its text content.
        text = option.get('value')
        if text is None:
            text = option.text or ''
        return text.strip() if text else text

    target = None
    if value is not None:
        value = value.strip()
        for option in _options_xpath(self):
            if option_value(option) == value:
                target = option
                break
        if target is None:
            raise ValueError(
                "There is no option with the value of %r" % value)

    # Clear the old selection only after the new value has been validated.
    for option in _options_xpath(self):
        if 'selected' in option.attrib:
            del option.attrib['selected']
    if target is not None:
        target.set('selected', '')
1354
1355 @value.deleter
1362
1363 @property
1378
1379 @property
1381 """
1382 Boolean attribute: is there a ``multiple`` attribute on this element.
1383 """
1384 return 'multiple' in self.attrib
1385
1386 @multiple.setter
1388 if value:
1389 self.set('multiple', '')
1390 elif 'multiple' in self.attrib:
1391 del self.attrib['multiple']
1392
1393
1394 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1398 """
1399 Represents all the selected options in a ``<select multiple>`` element.
1400
1401 You can add to this set-like option to select an option, or remove
1402 to unselect the option.
1403 """
1404
1406 self.select = select
1407
1408 @property
1410 """
1411 Iterator of all the ``<option>`` elements.
1412 """
1413 return iter(_options_xpath(self.select))
1414
1416 for option in self.options:
1417 if 'selected' in option.attrib:
1418 opt_value = option.get('value')
1419 if opt_value is None:
1420 opt_value = option.text or ''
1421 if opt_value:
1422 opt_value = opt_value.strip()
1423 yield opt_value
1424
def add(self, item):
    """
    Mark the first option whose value equals ``item`` as selected.

    An option's value is its ``value`` attribute or, when that is
    missing, its text; either is stripped before the comparison.
    Raises ``ValueError`` if no option matches.
    """
    for candidate in self.options:
        current = candidate.get('value')
        if current is None:
            current = candidate.text or ''
        current = current.strip() if current else current
        if current == item:
            candidate.set('selected', '')
            return
    raise ValueError(
        "There is no option with the value %r" % item)
1438
1440 for option in self.options:
1441 opt_value = option.get('value')
1442 if opt_value is None:
1443 opt_value = option.text or ''
1444 if opt_value:
1445 opt_value = opt_value.strip()
1446 if opt_value == item:
1447 if 'selected' in option.attrib:
1448 del option.attrib['selected']
1449 else:
1450 raise ValueError(
1451 "The option %r is not currently selected" % item)
1452 break
1453 else:
1454 raise ValueError(
1455 "There is not option with the value %r" % item)
1456
1458 return '<%s {%s} for select name=%r>' % (
1459 self.__class__.__name__,
1460 ', '.join([repr(v) for v in self]),
1461 self.select.name)
1462
1465 """
1466 This object represents several ``<input type=radio>`` elements
1467 that have the same name.
1468
1469 You can use this like a list, but also use the property
1470 ``.value`` to check/uncheck inputs. Also you can use
1471 ``.value_options`` to get the possible values.
1472 """
1473 @property
1475 """
1476 Get/set the value, which checks the radio with that value (and
1477 unchecks any other value).
1478 """
1479 for el in self:
1480 if 'checked' in el.attrib:
1481 return el.get('value')
1482 return None
1483
1484 @value.setter
def value(self, value):
    """
    Check the first radio input whose ``value`` attribute equals
    ``value`` and uncheck all others; ``None`` unchecks every input.

    Raises ``ValueError`` (leaving the group untouched) when no input
    carries the requested value.
    """
    chosen = None
    if value is not None:
        chosen = next(
            (radio for radio in self if radio.get('value') == value), None)
        if chosen is None:
            raise ValueError("There is no radio input with the value %r" % value)
    for radio in self:
        if 'checked' in radio.attrib:
            del radio.attrib['checked']
    if chosen is not None:
        chosen.set('checked', '')
1499
1500 @value.deleter
1503
1504 @property
1506 """
1507 Returns a list of all the possible values.
1508 """
1509 return [el.get('value') for el in self]
1510
1512 return '%s(%s)' % (
1513 self.__class__.__name__,
1514 list.__repr__(self))
1515
1518 """
1519 Represents a group of checkboxes (``<input type=checkbox>``) that
1520 have the same name.
1521
1522 In addition to using this like a list, the ``.value`` attribute
1523 returns a set-like object that you can add to or remove from to
1524 check and uncheck checkboxes. You can also use ``.value_options``
1525 to get the possible values.
1526 """
1527 @property
1529 """
1530 Return a set-like object that can be modified to check or
1531 uncheck individual checkboxes according to their value.
1532 """
1533 return CheckboxValues(self)
1534
1535 @value.setter
1536 - def value(self, value):
1544
1545 @value.deleter
1548
1549 @property
1551 """
1552 Returns a list of all the possible values.
1553 """
1554 return [el.get('value') for el in self]
1555
1557 return '%s(%s)' % (
1558 self.__class__.__name__, list.__repr__(self))
1559
1562 """
1563 Represents the values of the checked checkboxes in a group of
1564 checkboxes with the same name.
1565 """
1566
1569
1571 return iter([
1572 el.get('value')
1573 for el in self.group
1574 if 'checked' in el.attrib])
1575
def add(self, value):
    """
    Check the first checkbox of the group whose ``value`` attribute
    equals ``value``; raise ``KeyError`` if there is none.
    """
    match = next(
        (box for box in self.group if box.get('value') == value), None)
    if match is None:
        raise KeyError("No checkbox with value %r" % value)
    match.set('checked', '')
1583
1585 for el in self.group:
1586 if el.get('value') == value:
1587 if 'checked' in el.attrib:
1588 del el.attrib['checked']
1589 else:
1590 raise KeyError(
1591 "The checkbox with value %r was already unchecked" % value)
1592 break
1593 else:
1594 raise KeyError(
1595 "No checkbox with value %r" % value)
1596
1598 return '<%s {%s} for checkboxes name=%r>' % (
1599 self.__class__.__name__,
1600 ', '.join([repr(v) for v in self]),
1601 self.group.name)
1602
1696
1697
1698 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1702 """
1703 Represents a ``<label>`` element.
1704
1705 Label elements are linked to other elements with their ``for``
1706 attribute. You can access this element with ``label.for_element``.
1707 """
1708 @property
1710 """
1711 Get/set the element this label points to. Return None if it
1712 can't be found.
1713 """
1714 id = self.get('for')
1715 if not id:
1716 return None
1717 return self.body.get_element_by_id(id)
1718
1719 @for_element.setter
1721 id = other.get('id')
1722 if not id:
1723 raise TypeError(
1724 "Element %r has no id attribute" % other)
1725 self.set('for', id)
1726
1727 @for_element.deleter
1732
1733
1734 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1754
1757 """Convert all tags in an XHTML tree to HTML by removing their
1758 XHTML namespace.
1759 """
1760 try:
1761 xhtml = xhtml.getroot()
1762 except AttributeError:
1763 pass
1764 prefix = "{%s}" % XHTML_NAMESPACE
1765 prefix_len = len(prefix)
1766 for el in xhtml.iter(prefix + "*"):
1767 el.tag = el.tag[prefix_len:]
1768
1769
1770
1771
# ``sub`` functions (one for text, one for bytes) that replace every
# ``<meta http-equiv="Content-Type" ...>`` tag in a serialised document.
# The match is case-sensitive and expects exactly this attribute spelling.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1776
1777
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding, with_tail=with_tail,
                          doctype=doctype)
    # For HTML output, strip any <meta http-equiv="Content-Type" ...> tag
    # from the serialised result unless the caller asked to keep it.
    if method == 'html' and not include_meta_content_type:
        # The result is text or bytes depending on ``encoding``; use the
        # substitution helper of the matching type.
        if isinstance(html, str):
            html = __str_replace_meta_content_type('', html)
        else:
            html = __bytes_replace_meta_content_type(bytes(), html)
    return html
1850
1851
1852 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1856 """
1857 Open the HTML document in a web browser, saving it to a temporary
1858 file to open it. Note that this does not delete the file after
1859 use. This is mainly meant for debugging.
1860 """
1861 import os
1862 import webbrowser
1863 import tempfile
1864 if not isinstance(doc, etree._ElementTree):
1865 doc = etree.ElementTree(doc)
1866 handle, fn = tempfile.mkstemp(suffix='.html')
1867 f = os.fdopen(handle, 'wb')
1868 try:
1869 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1870 finally:
1871
1872 f.close()
1873 url = 'file://' + fn.replace(os.path.sep, '/')
1874 print(url)
1875 webbrowser.open(url)
1876
1877
1878
1879
1880
1881
1882 -class HTMLParser(etree.HTMLParser):
1883 """An HTML parser that is configured to return lxml.html Element
1884 objects.
1885 """
1889
1892 """An XML parser that is configured to return lxml.html Element
1893 objects.
1894
1895 Note that this parser is not really XHTML aware unless you let it
1896 load a DTD that declares the HTML entities. To do this, make sure
1897 you have the XHTML DTDs installed in your catalogs, and create the
1898 parser like this::
1899
1900 >>> parser = XHTMLParser(load_dtd=True)
1901
1902 If you additionally want to validate the document, use this::
1903
1904 >>> parser = XHTMLParser(dtd_validation=True)
1905
1906 For catalog support, see http://www.xmlsoft.org/catalog.html.
1907 """
1911
1914 """Create a new HTML Element.
1915
1916 This can also be used for XHTML documents.
1917 """
1918 v = html_parser.makeelement(*args, **kw)
1919 return v
1920
1921
1922 html_parser = HTMLParser()
1923 xhtml_parser = XHTMLParser()
1924