1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import sys
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
45 try:
46 from collections import MutableMapping as DictMixin
47 except ImportError:
48
49 from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Names exported by ``from lxml.html import *``.  Every name is listed
# exactly once; 'open_in_browser' must not be repeated.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
86
87 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
88
89 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
90 namespaces={'x':XHTML_NAMESPACE})
91 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
92 namespaces={'x':XHTML_NAMESPACE})
93 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
94 namespaces={'x':XHTML_NAMESPACE})
95
96 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
97 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
98 _collect_string_content = etree.XPath("string()")
99 _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
100 _css_import_re = re.compile(r'@import "(.*?)"')
101 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
102 namespaces={'x':XHTML_NAMESPACE})
103 _archive_re = re.compile(r'[^ ]+')
104
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
107 return s[1:-1], pos+1
108 else:
109 return s,pos
110
120
126
128
130 """
131 Returns the base URL, given when the page was parsed.
132
133 Use with ``urlparse.urljoin(el.base_url, href)`` to get
134 absolute URLs.
135 """
136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__)
138
144 forms = property(forms, doc=forms.__doc__)
145
147 """
148 Return the <body> element. Can be called from a child element
149 to get the document's body.
150 """
151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__)
153
155 """
156 Returns the <head> element. Can be called from a child
157 element to get the document's head.
158 """
159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__)
161
163 """
164 Get or set any <label> element associated with this element.
165 """
166 id = self.get('id')
167 if not id:
168 return None
169 result = _label_xpath(self, id=id)
170 if not result:
171 return None
172 else:
173 return result[0]
175 id = self.get('id')
176 if not id:
177 raise TypeError(
178 "You cannot set a label for an element (%r) that has no id"
179 % self)
180 if _nons(label.tag) != 'label':
181 raise TypeError(
182 "You can only assign label to a label element (not %r)"
183 % label)
184 label.set('for', id)
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
190
192 """
193 Removes this element from the tree, including its children and
194 text. The tail text is joined to the previous element or
195 parent.
196 """
197 parent = self.getparent()
198 assert parent is not None
199 if self.tail:
200 previous = self.getprevious()
201 if previous is None:
202 parent.text = (parent.text or '') + self.tail
203 else:
204 previous.tail = (previous.tail or '') + self.tail
205 parent.remove(self)
206
208 """
209 Remove the tag, but not its children or text. The children and text
210 are merged into the parent.
211
212 Example::
213
214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
215 >>> h.find('.//b').drop_tag()
216 >>> print(tostring(h, encoding='unicode'))
217 <div>Hello World!</div>
218 """
219 parent = self.getparent()
220 assert parent is not None
221 previous = self.getprevious()
222 if self.text and isinstance(self.tag, basestring):
223
224 if previous is None:
225 parent.text = (parent.text or '') + self.text
226 else:
227 previous.tail = (previous.tail or '') + self.text
228 if self.tail:
229 if len(self):
230 last = self[-1]
231 last.tail = (last.tail or '') + self.tail
232 elif previous is None:
233 parent.text = (parent.text or '') + self.tail
234 else:
235 previous.tail = (previous.tail or '') + self.tail
236 index = parent.index(self)
237 parent[index:index+1] = self[:]
238
240 """
241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
242 """
243 rel = rel.lower()
244 return [el for el in _rel_links_xpath(self)
245 if el.get('rel').lower() == rel]
246
248 """
249 Find any elements with the given class name.
250 """
251 return _class_xpath(self, class_name=class_name)
252
254 """
255 Get the first element in a document with the given id. If none is
256 found, return the default argument if provided or raise KeyError
257 otherwise.
258
259 Note that there can be more than one element with the same id,
260 and this isn't uncommon in HTML documents found in the wild.
261 Browsers return only the first match, and this function does
262 the same.
263 """
264 try:
265
266
267 return _id_xpath(self, id=id)[0]
268 except IndexError:
269 if default:
270 return default[0]
271 else:
272 raise KeyError(id)
273
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its descendants, as computed by the XPath ``string()`` function.
    """
    collected = _collect_string_content(self)
    return collected
279
def cssselect(self, expr, translator='html'):
    """
    Evaluate the CSS selector ``expr`` against this element and its
    descendants and return the list of matches.

    This is the same as calling
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``.
    The selector is compiled on every call, so pre-compiling a
    ``CSSSelector`` yourself can be substantially faster when the same
    expression is used repeatedly.
    """
    # Imported at call time rather than at module import time
    # (presumably so the cssselect dependency stays optional -- confirm).
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
292
293
294
295
296
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Turn every link in the document into an absolute URL.

    ``base_url`` is the full URL the document came from.  When it is
    not given, the document's own ``.base_url`` is used; a TypeError is
    raised if that is None as well.

    If ``resolve_base_href`` is true, ``self.resolve_base_href()`` is
    called first, so any ``<base href>`` tags are applied *and* removed.
    If it is false, such tags are ignored.

    ``handle_failures`` selects what happens when joining a URL raises
    ValueError:

    * None (default): the error propagates and processing aborts.
    * 'ignore': the link is left as it is.
    * 'discard': the link is removed.

    Any other value raises ValueError.
    """
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")
    if resolve_base_href:
        self.resolve_base_href()

    def join(href):
        # Strict variant: a ValueError from urljoin() reaches the caller.
        return urljoin(base_url, href)

    if handle_failures is None:
        link_repl = join
    elif handle_failures == 'ignore':
        def link_repl(href):
            try:
                return join(href)
            except ValueError:
                # Keep the unparsable link untouched.
                return href
    elif handle_failures == 'discard':
        def link_repl(href):
            try:
                return join(href)
            except ValueError:
                # None tells rewrite_links() to drop the link.
                return None
    else:
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)

    self.rewrite_links(link_repl)
341
343 """
344 Find any ``<base href>`` tag in the document, and apply its
345 values to all links found in the document. Also remove the
346 tag once it has been applied.
347
348 If ``handle_failures`` is None (default), a failure to process
349 a URL will abort the processing. If set to 'ignore', errors
350 are ignored. If set to 'discard', failing URLs will be removed.
351 """
352 base_href = None
353 basetags = self.xpath('//base[@href]|//x:base[@href]',
354 namespaces={'x': XHTML_NAMESPACE})
355 for b in basetags:
356 base_href = b.get('href')
357 b.drop_tree()
358 if not base_href:
359 return
360 self.make_links_absolute(base_href, resolve_base_href=False,
361 handle_failures=handle_failures)
362
364 """
365 Yield (element, attribute, link, pos), where attribute may be None
366 (indicating the link is in the text). ``pos`` is the position
367 where the link occurs; often 0, but sometimes something else in
368 the case of links in stylesheets or style tags.
369
370 Note: <base href> is *not* taken into account in any way. The
371 link you get is exactly the link in the document.
372
373 Note: multiple links inside of a single text string or
374 attribute value are returned in reversed order. This makes it
375 possible to replace or delete them from the text string value
376 based on their reported text positions. Otherwise, a
377 modification at one text position can change the positions of
378 links reported later on.
379 """
380 link_attrs = defs.link_attrs
381 for el in self.iter(etree.Element):
382 attribs = el.attrib
383 tag = _nons(el.tag)
384 if tag != 'object':
385 for attrib in link_attrs:
386 if attrib in attribs:
387 yield (el, attrib, attribs[attrib], 0)
388 elif tag == 'object':
389 codebase = None
390
391
392 if 'codebase' in attribs:
393 codebase = el.get('codebase')
394 yield (el, 'codebase', codebase, 0)
395 for attrib in 'classid', 'data':
396 if attrib in attribs:
397 value = el.get(attrib)
398 if codebase is not None:
399 value = urljoin(codebase, value)
400 yield (el, attrib, value, 0)
401 if 'archive' in attribs:
402 for match in _archive_re.finditer(el.get('archive')):
403 value = match.group(0)
404 if codebase is not None:
405 value = urljoin(codebase, value)
406 yield (el, 'archive', value, match.start())
407 if tag == 'param':
408 valuetype = el.get('valuetype') or ''
409 if valuetype.lower() == 'ref':
410
411
412
413
414
415
416 yield (el, 'value', el.get('value'), 0)
417 if tag == 'style' and el.text:
418 urls = [
419 _unquote_match(match.group(1), match.start(1))
420 for match in _css_url_re.finditer(el.text)
421 ] + [
422 (match.group(1), match.start(1))
423 for match in _css_import_re.finditer(el.text)
424 ]
425 if urls:
426
427 urls = [ (start, url) for (url, start) in urls ]
428 urls.sort()
429
430
431 urls.reverse()
432 for start, url in urls:
433 yield (el, None, url, start)
434 if 'style' in attribs:
435 urls = list(_css_url_re.finditer(attribs['style']))
436 if urls:
437
438 for match in urls[::-1]:
439 url, start = _unquote_match(match.group(1), match.start(1))
440 yield (el, 'style', url, start)
441
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite every link in the document.

    ``link_repl_func(link)`` is called with each link (surrounding
    whitespace stripped) and its return value replaces the old link.
    Returning the link unchanged leaves the document untouched;
    returning None removes the attribute, or empties the element text
    when the link was found in text.

    Links are not necessarily absolute (unless ``make_links_absolute()``
    was called first) and may be internal (e.g. ``'#anchor'``) or
    values such as ``'mailto:email'`` or ``'javascript:expr'``.

    If ``base_href`` is given, the links are first made absolute
    against it, so ``link_repl_func()`` sees the resolved values.
    Otherwise, if ``resolve_base_href`` is true, any ``<base href>``
    in the document is resolved first.
    """
    if base_href is not None:
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for element, attr_name, old_link, offset in self.iterlinks():
        replacement = link_repl_func(old_link.strip())
        if replacement == old_link:
            continue

        in_text = attr_name is None
        if replacement is None:
            # Drop the link entirely.
            if in_text:
                element.text = ''
            else:
                del element.attrib[attr_name]
            continue

        # End of the old link inside the text/attribute value.
        end = offset + len(old_link)
        if in_text:
            body = element.text
            element.text = body[:offset] + replacement + body[end:]
            continue

        current = element.get(attr_name)
        if offset or len(current) != len(old_link):
            # The link is only part of the attribute value: splice it in.
            replacement = current[:offset] + replacement + current[end:]
        element.set(attr_name, replacement)
490
491
493 """
494 An object that represents a method on an element as a function;
495 the function takes either an element or an HTML string. It
496 returns whatever the function normally returns, or if the function
497 works in-place (and so returns None) it returns a serialized form
498 of the resulting document.
499 """
505 result_type = type(doc)
506 if isinstance(doc, basestring):
507 if 'copy' in kw:
508 raise TypeError(
509 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
510 doc = fromstring(doc, **kw)
511 else:
512 if 'copy' in kw:
513 make_a_copy = kw.pop('copy')
514 else:
515 make_a_copy = self.copy
516 if make_a_copy:
517 doc = copy.deepcopy(doc)
518 meth = getattr(doc, self.name)
519 result = meth(*args, **kw)
520
521 if result is None:
522
523 return _transform_result(result_type, doc)
524 else:
525 return result
526
527 find_rel_links = _MethodFunc('find_rel_links', copy=False)
528 find_class = _MethodFunc('find_class', copy=False)
529 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
530 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
531 iterlinks = _MethodFunc('iterlinks', copy=False)
532 rewrite_links = _MethodFunc('rewrite_links', copy=True)
533
536
539
542
545
546
548 """A lookup scheme for HTML Element classes.
549
550 To create a lookup instance with different Element classes, pass a tag
551 name mapping of Element classes in the ``classes`` keyword argument and/or
552 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
553 The special key '*' denotes a Mixin class that should be mixed into all
554 Element classes.
555 """
556 _default_element_classes = {}
557
558 - def __init__(self, classes=None, mixins=None):
575
576 - def lookup(self, node_type, document, namespace, name):
587
588
589
590
591
592 _looks_like_full_html_unicode = re.compile(
593 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
594 _looks_like_full_html_bytes = re.compile(
595 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
596
605
608 """
609 Parses several HTML elements, returning a list of elements.
610
611 The first item in the list may be a string (though leading
612 whitespace is removed). If no_leading_text is true, then it will
613 be an error if there is leading text, and it will always be a list
614 of only elements.
615
616 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
617 """
618 if parser is None:
619 parser = html_parser
620
621 if isinstance(html, bytes):
622 if not _looks_like_full_html_bytes(html):
623
624 html = ('<html><body>'.encode('ascii') + html +
625 '</body></html>'.encode('ascii'))
626 else:
627 if not _looks_like_full_html_unicode(html):
628 html = '<html><body>%s</body></html>' % html
629 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
630 assert _nons(doc.tag) == 'html'
631 bodies = [e for e in doc if _nons(e.tag) == 'body']
632 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
633 body = bodies[0]
634 elements = []
635 if no_leading_text and body.text and body.text.strip():
636 raise etree.ParserError(
637 "There is leading text: %r" % body.text)
638 if body.text and body.text.strip():
639 elements.append(body.text)
640 elements.extend(body)
641
642
643 return elements
644
647 """
648 Parses a single HTML element; it is an error if there is more than
649 one element, or if anything but whitespace precedes or follows the
650 element.
651
652 If create_parent is true (or is a tag name) then a parent node
653 will be created to encapsulate the HTML in a single element. In
654 this case, leading or trailing text is allowed.
655
656 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
657 """
658 if parser is None:
659 parser = html_parser
660
661 accept_leading_text = bool(create_parent)
662
663 elements = fragments_fromstring(
664 html, parser=parser, no_leading_text=not accept_leading_text,
665 base_url=base_url, **kw)
666
667 if create_parent:
668 if not isinstance(create_parent, basestring):
669 create_parent = 'div'
670 new_root = Element(create_parent)
671 if elements:
672 if isinstance(elements[0], basestring):
673 new_root.text = elements[0]
674 del elements[0]
675 new_root.extend(elements)
676 return new_root
677
678 if not elements:
679 raise etree.ParserError('No elements found')
680 if len(elements) > 1:
681 raise etree.ParserError(
682 "Multiple elements found (%s)"
683 % ', '.join([_element_name(e) for e in elements]))
684 el = elements[0]
685 if el.tail and el.tail.strip():
686 raise etree.ParserError(
687 "Element followed by text: %r" % el.tail)
688 el.tail = None
689 return el
690
691 -def fromstring(html, base_url=None, parser=None, **kw):
757
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.

    Note that the result is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The module-level ``html_parser`` is used when no ``parser`` is
    given.  Pass ``base_url`` to override the base URL of the document,
    which is most useful when parsing from a file-like object.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
770
778
780 if isinstance(el, etree.CommentBase):
781 return 'comment'
782 elif isinstance(el, basestring):
783 return 'string'
784 else:
785 return _nons(el.tag)
786
787
788
789
790
895
896 HtmlElementClassLookup._default_element_classes['form'] = FormElement
897
934
936 if not url:
937 raise ValueError("cannot submit, no URL provided")
938
939 try:
940 from urllib import urlencode, urlopen
941 except ImportError:
942 from urllib.request import urlopen
943 from urllib.parse import urlencode
944 if method == 'GET':
945 if '?' in url:
946 url += '&'
947 else:
948 url += '?'
949 url += urlencode(values)
950 data = None
951 else:
952 data = urlencode(values)
953 return urlopen(url, data)
954
956
964 raise KeyError(
965 "You cannot remove keys from ElementDict")
969 return item in self.inputs
974
976 return '<%s for form %s>' % (
977 self.__class__.__name__,
978 self.inputs.form._name())
979
1045
1073
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    contents can be read, replaced or cleared through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Child elements are serialised as XML when this element lives
        # in the XHTML namespace, and as HTML otherwise.
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        method = 'xml' if self.tag.startswith(xhtml_prefix) else 'html'

        value = self.text or ''
        for child in self:
            value += etree.tostring(child, method=method, encoding='unicode')
        return value

    def _value__set(self, value):
        # Remove all child elements, then store the new text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Reset to an empty textarea: no text and no children.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)
1101
1102 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1103
1105 """
1106 ``<select>`` element. You can get the name with ``.name``.
1107
1108 ``.value`` will be the value of the selected option, unless this
1109 is a multi-select element (``<select multiple>``), in which case
1110 it will be a set-like object. In either case ``.value_options``
1111 gives the possible values.
1112
1113 The boolean attribute ``.multiple`` shows if this is a
1114 multi-select.
1115 """
1116
1118 """
1119 Get/set the value of this select (the selected option).
1120
1121 If this is a multi-select, this is a set-like object that
1122 represents all the selected options.
1123 """
1124 if self.multiple:
1125 return MultipleSelectOptions(self)
1126 for el in _options_xpath(self):
1127 if el.get('selected') is not None:
1128 value = el.get('value')
1129 if value is None:
1130 value = el.text or ''
1131 if value:
1132 value = value.strip()
1133 return value
1134 return None
1135
1137 if self.multiple:
1138 if isinstance(value, basestring):
1139 raise TypeError(
1140 "You must pass in a sequence")
1141 self.value.clear()
1142 self.value.update(value)
1143 return
1144 if value is not None:
1145 value = value.strip()
1146 for el in _options_xpath(self):
1147 opt_value = el.get('value')
1148 if opt_value is None:
1149 opt_value = el.text or ''
1150 if opt_value:
1151 opt_value = opt_value.strip()
1152 if opt_value == value:
1153 checked_option = el
1154 break
1155 else:
1156 raise ValueError(
1157 "There is no option with the value of %r" % value)
1158 for el in _options_xpath(self):
1159 if 'selected' in el.attrib:
1160 del el.attrib['selected']
1161 if value is not None:
1162 checked_option.set('selected', '')
1163
1170
1171 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1172
1187 value_options = property(value_options, doc=value_options.__doc__)
1188
1190 """
1191 Boolean attribute: is there a ``multiple`` attribute on this element.
1192 """
1193 return 'multiple' in self.attrib
1195 if value:
1196 self.set('multiple', '')
1197 elif 'multiple' in self.attrib:
1198 del self.attrib['multiple']
1199 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1200
1201 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1202
1204 """
1205 Represents all the selected options in a ``<select multiple>`` element.
1206
1207 You can add to this set-like option to select an option, or remove
1208 to unselect the option.
1209 """
1210
1212 self.select = select
1213
1215 """
1216 Iterator of all the ``<option>`` elements.
1217 """
1218 return iter(_options_xpath(self.select))
1219 options = property(options)
1220
1222 for option in self.options:
1223 if 'selected' in option.attrib:
1224 opt_value = option.get('value')
1225 if opt_value is None:
1226 opt_value = option.text or ''
1227 if opt_value:
1228 opt_value = opt_value.strip()
1229 yield opt_value
1230
def add(self, item):
    """
    Select the first option whose value equals ``item``.

    An option's value is its ``value`` attribute or, when that
    attribute is absent, its text; non-empty values are compared with
    surrounding whitespace stripped.  Raises ValueError if no option
    matches.
    """
    for candidate in self.options:
        candidate_value = candidate.get('value')
        if candidate_value is None:
            candidate_value = candidate.text or ''
        if candidate_value:
            candidate_value = candidate_value.strip()
        if candidate_value == item:
            candidate.set('selected', '')
            return
    raise ValueError(
        "There is no option with the value %r" % item)
1244
1246 for option in self.options:
1247 opt_value = option.get('value')
1248 if opt_value is None:
1249 opt_value = option.text or ''
1250 if opt_value:
1251 opt_value = opt_value.strip()
1252 if opt_value == item:
1253 if 'selected' in option.attrib:
1254 del option.attrib['selected']
1255 else:
1256 raise ValueError(
1257 "The option %r is not currently selected" % item)
1258 break
1259 else:
1260 raise ValueError(
1261 "There is not option with the value %r" % item)
1262
1264 return '<%s {%s} for select name=%r>' % (
1265 self.__class__.__name__,
1266 ', '.join([repr(v) for v in self]),
1267 self.select.name)
1268
1270 """
1271 This object represents several ``<input type=radio>`` elements
1272 that have the same name.
1273
1274 You can use this like a list, but also use the property
1275 ``.value`` to check/uncheck inputs. Also you can use
1276 ``.value_options`` to get the possible values.
1277 """
1278
1280 """
1281 Get/set the value, which checks the radio with that value (and
1282 unchecks any other value).
1283 """
1284 for el in self:
1285 if 'checked' in el.attrib:
1286 return el.get('value')
1287 return None
1288
1290 if value is not None:
1291 for el in self:
1292 if el.get('value') == value:
1293 checked_option = el
1294 break
1295 else:
1296 raise ValueError(
1297 "There is no radio input with the value %r" % value)
1298 for el in self:
1299 if 'checked' in el.attrib:
1300 del el.attrib['checked']
1301 if value is not None:
1302 checked_option.set('checked', '')
1303
1306
1307 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1308
1310 """
1311 Returns a list of all the possible values.
1312 """
1313 return [el.get('value') for el in self]
1314 value_options = property(value_options, doc=value_options.__doc__)
1315
1317 return '%s(%s)' % (
1318 self.__class__.__name__,
1319 list.__repr__(self))
1320
1322 """
1323 Represents a group of checkboxes (``<input type=checkbox>``) that
1324 have the same name.
1325
1326 In addition to using this like a list, the ``.value`` attribute
1327 returns a set-like object that you can add to or remove from to
1328 check and uncheck checkboxes. You can also use ``.value_options``
1329 to get the possible values.
1330 """
1331
1333 """
1334 Return a set-like object that can be modified to check or
1335 uncheck individual checkboxes according to their value.
1336 """
1337 return CheckboxValues(self)
1347 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1348
1350 """
1351 Returns a list of all the possible values.
1352 """
1353 return [el.get('value') for el in self]
1354 value_options = property(value_options, doc=value_options.__doc__)
1355
1357 return '%s(%s)' % (
1358 self.__class__.__name__, list.__repr__(self))
1359
1361
1362 """
1363 Represents the values of the checked checkboxes in a group of
1364 checkboxes with the same name.
1365 """
1366
1369
1371 return iter([
1372 el.get('value')
1373 for el in self.group
1374 if 'checked' in el.attrib])
1375
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.  Raises KeyError if the group has no such
    checkbox.
    """
    for checkbox in self.group:
        if checkbox.get('value') == value:
            checkbox.set('checked', '')
            return
    raise KeyError("No checkbox with value %r" % value)
1383
1385 for el in self.group:
1386 if el.get('value') == value:
1387 if 'checked' in el.attrib:
1388 del el.attrib['checked']
1389 else:
1390 raise KeyError(
1391 "The checkbox with value %r was already unchecked" % value)
1392 break
1393 else:
1394 raise KeyError(
1395 "No checkbox with value %r" % value)
1396
1398 return '<%s {%s} for checkboxes name=%r>' % (
1399 self.__class__.__name__,
1400 ', '.join([repr(v) for v in self]),
1401 self.group.name)
1402
1486
1487 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1488
1490 """
1491 Represents a ``<label>`` element.
1492
1493 Label elements are linked to other elements with their ``for``
1494 attribute. You can access this element with ``label.for_element``.
1495 """
1496
1498 """
1499 Get/set the element this label points to. Return None if it
1500 can't be found.
1501 """
1502 id = self.get('for')
1503 if not id:
1504 return None
1505 return self.body.get_element_by_id(id)
1507 id = other.get('id')
1508 if not id:
1509 raise TypeError(
1510 "Element %r has no id attribute" % other)
1511 self.set('for', id)
1515 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1516 doc=_for_element__get.__doc__)
1517
1518 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1519
1520
1521
1522
1523
1537
1539 """Convert all tags in an XHTML tree to HTML by removing their
1540 XHTML namespace.
1541 """
1542 try:
1543 xhtml = xhtml.getroot()
1544 except AttributeError:
1545 pass
1546 prefix = "{%s}" % XHTML_NAMESPACE
1547 prefix_len = len(prefix)
1548 for el in xhtml.iter(prefix + "*"):
1549 el.tag = el.tag[prefix_len:]
1550
1551
1552
1553 __str_replace_meta_content_type = re.compile(
1554 r'<meta http-equiv="Content-Type"[^>]*>').sub
1555 __bytes_replace_meta_content_type = re.compile(
1556 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1557
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)

    # The Content-Type <meta> tag is only stripped from HTML output, and
    # only when the caller did not ask to keep it.
    if method != 'html' or include_meta_content_type:
        return serialised

    # Pick the regex substitution that matches the result's string type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1630
1631 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1632
1634 """
1635 Open the HTML document in a web browser, saving it to a temporary
1636 file to open it. Note that this does not delete the file after
1637 use. This is mainly meant for debugging.
1638 """
1639 import os
1640 import webbrowser
1641 import tempfile
1642 if not isinstance(doc, etree._ElementTree):
1643 doc = etree.ElementTree(doc)
1644 handle, fn = tempfile.mkstemp(suffix='.html')
1645 f = os.fdopen(handle, 'wb')
1646 try:
1647 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1648 finally:
1649
1650 f.close()
1651 url = 'file://' + fn.replace(os.path.sep, '/')
1652 print(url)
1653 webbrowser.open(url)
1654
1655
1656
1657
1658
1660 """An HTML parser that is configured to return lxml.html Element
1661 objects.
1662 """
1666
1668 """An XML parser that is configured to return lxml.html Element
1669 objects.
1670
1671 Note that this parser is not really XHTML aware unless you let it
1672 load a DTD that declares the HTML entities. To do this, make sure
1673 you have the XHTML DTDs installed in your catalogs, and create the
1674 parser like this::
1675
1676 >>> parser = XHTMLParser(load_dtd=True)
1677
1678 If you additionally want to validate the document, use this::
1679
1680 >>> parser = XHTMLParser(dtd_validation=True)
1681
1682 For catalog support, see http://www.xmlsoft.org/catalog.html.
1683 """
1687
1689 """Create a new HTML Element.
1690
1691 This can also be used for XHTML documents.
1692 """
1693 v = html_parser.makeelement(*args, **kw)
1694 return v
1695
1696 html_parser = HTMLParser()
1697 xhtml_parser = XHTMLParser()
1698