1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import sys
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
45 try:
46 from collections import MutableMapping as DictMixin
47 except ImportError:
48
49 from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Names exported by ``from lxml.html import *``.  Each public name is listed
# exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
86
87 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
88
89 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
90 namespaces={'x':XHTML_NAMESPACE})
91 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
92 namespaces={'x':XHTML_NAMESPACE})
93 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
94 namespaces={'x':XHTML_NAMESPACE})
95
96 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
97 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
98 _collect_string_content = etree.XPath("string()")
99 _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
100 _css_import_re = re.compile(r'@import "(.*?)"')
101 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
102 namespaces={'x':XHTML_NAMESPACE})
103 _archive_re = re.compile(r'[^ ]+')
104
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
107 return s[1:-1], pos+1
108 else:
109 return s,pos
110
120
126
128
130 """
131 Returns the base URL, given when the page was parsed.
132
133 Use with ``urlparse.urljoin(el.base_url, href)`` to get
134 absolute URLs.
135 """
136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__)
138
144 forms = property(forms, doc=forms.__doc__)
145
147 """
148 Return the <body> element. Can be called from a child element
149 to get the document's body.
150 """
151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__)
153
155 """
156 Returns the <head> element. Can be called from a child
157 element to get the document's head.
158 """
159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__)
161
163 """
164 Get or set any <label> element associated with this element.
165 """
166 id = self.get('id')
167 if not id:
168 return None
169 result = _label_xpath(self, id=id)
170 if not result:
171 return None
172 else:
173 return result[0]
175 id = self.get('id')
176 if not id:
177 raise TypeError(
178 "You cannot set a label for an element (%r) that has no id"
179 % self)
180 if _nons(label.tag) != 'label':
181 raise TypeError(
182 "You can only assign label to a label element (not %r)"
183 % label)
184 label.set('for', id)
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
190
192 """
193 Removes this element from the tree, including its children and
194 text. The tail text is joined to the previous element or
195 parent.
196 """
197 parent = self.getparent()
198 assert parent is not None
199 if self.tail:
200 previous = self.getprevious()
201 if previous is None:
202 parent.text = (parent.text or '') + self.tail
203 else:
204 previous.tail = (previous.tail or '') + self.tail
205 parent.remove(self)
206
208 """
209 Remove the tag, but not its children or text. The children and text
210 are merged into the parent.
211
212 Example::
213
214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
215 >>> h.find('.//b').drop_tag()
216 >>> print(tostring(h, encoding=unicode))
217 <div>Hello World!</div>
218 """
219 parent = self.getparent()
220 assert parent is not None
221 previous = self.getprevious()
222 if self.text and isinstance(self.tag, basestring):
223
224 if previous is None:
225 parent.text = (parent.text or '') + self.text
226 else:
227 previous.tail = (previous.tail or '') + self.text
228 if self.tail:
229 if len(self):
230 last = self[-1]
231 last.tail = (last.tail or '') + self.tail
232 elif previous is None:
233 parent.text = (parent.text or '') + self.tail
234 else:
235 previous.tail = (previous.tail or '') + self.tail
236 index = parent.index(self)
237 parent[index:index+1] = self[:]
238
240 """
241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
242 """
243 rel = rel.lower()
244 return [el for el in _rel_links_xpath(self)
245 if el.get('rel').lower() == rel]
246
248 """
249 Find any elements with the given class name.
250 """
251 return _class_xpath(self, class_name=class_name)
252
254 """
255 Get the first element in a document with the given id. If none is
256 found, return the default argument if provided or raise KeyError
257 otherwise.
258
259 Note that there can be more than one element with the same id,
260 and this isn't uncommon in HTML documents found in the wild.
261 Browsers return only the first match, and this function does
262 the same.
263 """
264 try:
265
266
267 return _id_xpath(self, id=id)[0]
268 except IndexError:
269 if default:
270 return default[0]
271 else:
272 raise KeyError(id)
273
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its children, as computed by the XPath ``string()`` function.
    """
    # The pre-compiled XPath is evaluated with this element as context node.
    content = _collect_string_content(self)
    return content
279
def cssselect(self, expr, translator='html'):
    """
    Evaluate the CSS selector ``expr`` against this element and its
    children and return the list of matching results.

    This is the same as calling
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``.
    The selector is compiled on every call, so building a
    ``CSSSelector`` once and reusing it can be substantially faster.
    """
    # Function-level import: lxml.cssselect is only loaded when this
    # method is actually called.
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
292
293
294
295
296
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from), or if no ``base_url`` is given, then the ``.base_url``
    of the document.

    If ``resolve_base_href`` is true, then any ``<base href>``
    tags in the document are used *and* removed from the document.
    If it is false then any such tag is ignored (and left in place).

    If ``handle_failures`` is None (default), a failure to process
    a URL will abort the processing.  If set to 'ignore', errors
    are ignored.  If set to 'discard', failing URLs will be removed.

    Raises ``TypeError`` if neither the argument nor the document
    provides a base URL, and ``ValueError`` for an unknown
    ``handle_failures`` value.
    """
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")

    # Select the replacement function first, so that an invalid
    # ``handle_failures`` value is rejected before the document is modified.
    if handle_failures is None:
        def link_repl(href):
            return urljoin(base_url, href)
    elif handle_failures == 'ignore':
        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # keep the link that could not be joined as it is
                return href
    elif handle_failures == 'discard':
        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # None makes rewrite_links() remove the link
                return None
    else:
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)

    if resolve_base_href:
        self.resolve_base_href()

    # Any requested <base href> handling has happened above.  Pass
    # resolve_base_href=False explicitly: the default of rewrite_links()
    # is True, which would apply and remove <base href> tags even though
    # the caller asked for them to be ignored.
    self.rewrite_links(link_repl, resolve_base_href=False)
341
343 """
344 Find any ``<base href>`` tag in the document, and apply its
345 values to all links found in the document. Also remove the
346 tag once it has been applied.
347
348 If ``handle_failures`` is None (default), a failure to process
349 a URL will abort the processing. If set to 'ignore', errors
350 are ignored. If set to 'discard', failing URLs will be removed.
351 """
352 base_href = None
353 basetags = self.xpath('//base[@href]|//x:base[@href]',
354 namespaces={'x': XHTML_NAMESPACE})
355 for b in basetags:
356 base_href = b.get('href')
357 b.drop_tree()
358 if not base_href:
359 return
360 self.make_links_absolute(base_href, resolve_base_href=False,
361 handle_failures=handle_failures)
362
364 """
365 Yield (element, attribute, link, pos), where attribute may be None
366 (indicating the link is in the text). ``pos`` is the position
367 where the link occurs; often 0, but sometimes something else in
368 the case of links in stylesheets or style tags.
369
370 Note: <base href> is *not* taken into account in any way. The
371 link you get is exactly the link in the document.
372
373 Note: multiple links inside of a single text string or
374 attribute value are returned in reversed order. This makes it
375 possible to replace or delete them from the text string value
376 based on their reported text positions. Otherwise, a
377 modification at one text position can change the positions of
378 links reported later on.
379 """
380 link_attrs = defs.link_attrs
381 for el in self.iter():
382 attribs = el.attrib
383 tag = _nons(el.tag)
384 if tag != 'object':
385 for attrib in link_attrs:
386 if attrib in attribs:
387 yield (el, attrib, attribs[attrib], 0)
388 elif tag == 'object':
389 codebase = None
390
391
392 if 'codebase' in attribs:
393 codebase = el.get('codebase')
394 yield (el, 'codebase', codebase, 0)
395 for attrib in 'classid', 'data':
396 if attrib in attribs:
397 value = el.get(attrib)
398 if codebase is not None:
399 value = urljoin(codebase, value)
400 yield (el, attrib, value, 0)
401 if 'archive' in attribs:
402 for match in _archive_re.finditer(el.get('archive')):
403 value = match.group(0)
404 if codebase is not None:
405 value = urljoin(codebase, value)
406 yield (el, 'archive', value, match.start())
407 if tag == 'param':
408 valuetype = el.get('valuetype') or ''
409 if valuetype.lower() == 'ref':
410
411
412
413
414
415
416 yield (el, 'value', el.get('value'), 0)
417 if tag == 'style' and el.text:
418 urls = [
419 _unquote_match(match.group(1), match.start(1))
420 for match in _css_url_re.finditer(el.text)
421 ] + [
422 (match.group(1), match.start(1))
423 for match in _css_import_re.finditer(el.text)
424 ]
425 if urls:
426
427 urls = [ (start, url) for (url, start) in urls ]
428 urls.sort()
429
430
431 urls.reverse()
432 for start, url in urls:
433 yield (el, None, url, start)
434 if 'style' in attribs:
435 urls = list(_css_url_re.finditer(attribs['style']))
436 if urls:
437
438 for match in urls[::-1]:
439 url, start = _unquote_match(match.group(1), match.start(1))
440 yield (el, 'style', url, start)
441
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Further links that were
    found in the same (now removed) attribute or text are skipped.
    """
    if base_href is not None:
        # make_links_absolute() takes care of <base href> itself,
        # according to the resolve_base_href flag.
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue

        if attrib is None:
            # The link lives in the element text (e.g. a <style> tag).
            text = el.text
            if not text:
                # The text was already removed for an earlier link of
                # this element; the reported position is stale.
                continue
            if new_link is None:
                el.text = ''
            else:
                el.text = text[:pos] + new_link + text[pos+len(link):]
            continue

        cur = el.get(attrib)
        if cur is None:
            # The attribute was already deleted for an earlier link in
            # the same value (e.g. several URLs in 'archive' or 'style').
            continue
        if new_link is None:
            del el.attrib[attrib]
        elif not pos and len(cur) == len(link):
            # The link is the complete attribute value.
            el.set(attrib, new_link)
        else:
            # Replace only the slice of the value that holds the link.
            el.set(attrib, cur[:pos] + new_link + cur[pos+len(link):])
490
491
493 """
494 An object that represents a method on an element as a function;
495 the function takes either an element or an HTML string. It
496 returns whatever the function normally returns, or if the function
497 works in-place (and so returns None) it returns a serialized form
498 of the resulting document.
499 """
505 result_type = type(doc)
506 if isinstance(doc, basestring):
507 if 'copy' in kw:
508 raise TypeError(
509 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
510 doc = fromstring(doc, **kw)
511 else:
512 if 'copy' in kw:
513 make_a_copy = kw.pop('copy')
514 else:
515 make_a_copy = self.copy
516 if make_a_copy:
517 doc = copy.deepcopy(doc)
518 meth = getattr(doc, self.name)
519 result = meth(*args, **kw)
520
521 if result is None:
522
523 return _transform_result(result_type, doc)
524 else:
525 return result
526
527 find_rel_links = _MethodFunc('find_rel_links', copy=False)
528 find_class = _MethodFunc('find_class', copy=False)
529 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
530 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
531 iterlinks = _MethodFunc('iterlinks', copy=False)
532 rewrite_links = _MethodFunc('rewrite_links', copy=True)
533
536
539
542
545
546
548 """A lookup scheme for HTML Element classes.
549
550 To create a lookup instance with different Element classes, pass a tag
551 name mapping of Element classes in the ``classes`` keyword argument and/or
552 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
553 The special key '*' denotes a Mixin class that should be mixed into all
554 Element classes.
555 """
556 _default_element_classes = {}
557
558 - def __init__(self, classes=None, mixins=None):
575
576 - def lookup(self, node_type, document, namespace, name):
587
588
589
590
591
592 _looks_like_full_html_unicode = re.compile(
593 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
594 _looks_like_full_html_bytes = re.compile(
595 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
596
605
608 """
609 Parses several HTML elements, returning a list of elements.
610
611 The first item in the list may be a string (though leading
612 whitespace is removed). If no_leading_text is true, then it will
613 be an error if there is leading text, and it will always be a list
614 of only elements.
615
616 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
617 """
618 if parser is None:
619 parser = html_parser
620
621 if isinstance(html, bytes):
622 if not _looks_like_full_html_bytes(html):
623 html = '<html><body>%s</body></html>'.encode('ascii') % html
624 else:
625 if not _looks_like_full_html_unicode(html):
626 html = '<html><body>%s</body></html>' % html
627 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
628 assert _nons(doc.tag) == 'html'
629 bodies = [e for e in doc if _nons(e.tag) == 'body']
630 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
631 body = bodies[0]
632 elements = []
633 if no_leading_text and body.text and body.text.strip():
634 raise etree.ParserError(
635 "There is leading text: %r" % body.text)
636 if body.text and body.text.strip():
637 elements.append(body.text)
638 elements.extend(body)
639
640
641 return elements
642
645 """
646 Parses a single HTML element; it is an error if there is more than
647 one element, or if anything but whitespace precedes or follows the
648 element.
649
650 If create_parent is true (or is a tag name) then a parent node
651 will be created to encapsulate the HTML in a single element. In
652 this case, leading or trailing text is allowed.
653
654 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
655 """
656 if parser is None:
657 parser = html_parser
658
659 accept_leading_text = bool(create_parent)
660
661 elements = fragments_fromstring(
662 html, parser=parser, no_leading_text=not accept_leading_text,
663 base_url=base_url, **kw)
664
665 if create_parent:
666 if not isinstance(create_parent, basestring):
667 create_parent = 'div'
668 new_root = Element(create_parent)
669 if elements:
670 if isinstance(elements[0], basestring):
671 new_root.text = elements[0]
672 del elements[0]
673 new_root.extend(elements)
674 return new_root
675
676 if not elements:
677 raise etree.ParserError('No elements found')
678 if len(elements) > 1:
679 raise etree.ParserError(
680 "Multiple elements found (%s)"
681 % ', '.join([_element_name(e) for e in elements]))
682 el = elements[0]
683 if el.tail and el.tail.strip():
684 raise etree.ParserError(
685 "Element followed by text: %r" % el.tail)
686 el.tail = None
687 return el
688
689 -def fromstring(html, base_url=None, parser=None, **kw):
755
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The return value is a tree rather than an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL of the document,
    which is most useful when parsing from a file-like object.  When
    no ``parser`` is given, the module-level ``html_parser`` is used.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
768
770
771
772 for el in el.iter():
773 if _nons(el.tag) in defs.block_tags:
774 return True
775 return False
776
778 if isinstance(el, etree.CommentBase):
779 return 'comment'
780 elif isinstance(el, basestring):
781 return 'string'
782 else:
783 return _nons(el.tag)
784
785
786
787
788
893
894 HtmlElementClassLookup._default_element_classes['form'] = FormElement
895
932
934 if not url:
935 raise ValueError("cannot submit, no URL provided")
936
937 try:
938 from urllib import urlencode, urlopen
939 except ImportError:
940 from urllib.request import urlopen
941 from urllib.parse import urlencode
942 if method == 'GET':
943 if '?' in url:
944 url += '&'
945 else:
946 url += '?'
947 url += urlencode(values)
948 data = None
949 else:
950 data = urlencode(values)
951 return urlopen(url, data)
952
954
962 raise KeyError(
963 "You cannot remove keys from ElementDict")
967 return item in self.inputs
972
974 return '<%s for form %s>' % (
975 self.__class__.__name__,
976 self.inputs.form._name())
977
1043
1071
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace are serialised as XML,
        # everything else as HTML.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        pieces = [self.text or '']
        for child in self:
            # Child elements contribute their serialised markup.
            pieces.append(
                etree.tostring(child, method=method, encoding=unicode))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop all child elements, then store the new value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value leaves an empty textarea behind.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)
1098
1099 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1100
1102 """
1103 ``<select>`` element. You can get the name with ``.name``.
1104
1105 ``.value`` will be the value of the selected option, unless this
1106 is a multi-select element (``<select multiple>``), in which case
1107 it will be a set-like object. In either case ``.value_options``
1108 gives the possible values.
1109
1110 The boolean attribute ``.multiple`` shows if this is a
1111 multi-select.
1112 """
1113
1115 """
1116 Get/set the value of this select (the selected option).
1117
1118 If this is a multi-select, this is a set-like object that
1119 represents all the selected options.
1120 """
1121 if self.multiple:
1122 return MultipleSelectOptions(self)
1123 for el in _options_xpath(self):
1124 if el.get('selected') is not None:
1125 value = el.get('value')
1126 if value is None:
1127 value = el.text or ''
1128 if value:
1129 value = value.strip()
1130 return value
1131 return None
1132
1134 if self.multiple:
1135 if isinstance(value, basestring):
1136 raise TypeError(
1137 "You must pass in a sequence")
1138 self.value.clear()
1139 self.value.update(value)
1140 return
1141 if value is not None:
1142 value = value.strip()
1143 for el in _options_xpath(self):
1144 opt_value = el.get('value')
1145 if opt_value is None:
1146 opt_value = el.text or ''
1147 if opt_value:
1148 opt_value = opt_value.strip()
1149 if opt_value == value:
1150 checked_option = el
1151 break
1152 else:
1153 raise ValueError(
1154 "There is no option with the value of %r" % value)
1155 for el in _options_xpath(self):
1156 if 'selected' in el.attrib:
1157 del el.attrib['selected']
1158 if value is not None:
1159 checked_option.set('selected', '')
1160
1167
1168 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1169
1184 value_options = property(value_options, doc=value_options.__doc__)
1185
1187 """
1188 Boolean attribute: is there a ``multiple`` attribute on this element.
1189 """
1190 return 'multiple' in self.attrib
1192 if value:
1193 self.set('multiple', '')
1194 elif 'multiple' in self.attrib:
1195 del self.attrib['multiple']
1196 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1197
1198 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1199
1201 """
1202 Represents all the selected options in a ``<select multiple>`` element.
1203
1204 You can add to this set-like option to select an option, or remove
1205 to unselect the option.
1206 """
1207
1209 self.select = select
1210
1212 """
1213 Iterator of all the ``<option>`` elements.
1214 """
1215 return iter(_options_xpath(self.select))
1216 options = property(options)
1217
1219 for option in self.options:
1220 if 'selected' in option.attrib:
1221 opt_value = option.get('value')
1222 if opt_value is None:
1223 opt_value = option.text or ''
1224 if opt_value:
1225 opt_value = opt_value.strip()
1226 yield opt_value
1227
def add(self, item):
    """
    Mark the first option whose value equals ``item`` as selected.

    The value of an option is its ``value`` attribute or, when that
    attribute is absent, its text; a non-empty value is compared with
    surrounding whitespace stripped.  Raises ``ValueError`` when no
    option matches.
    """
    for candidate in self.options:
        current = candidate.get('value')
        if current is None:
            current = candidate.text or ''
        if current:
            current = current.strip()
        if current != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
1241
1243 for option in self.options:
1244 opt_value = option.get('value')
1245 if opt_value is None:
1246 opt_value = option.text or ''
1247 if opt_value:
1248 opt_value = opt_value.strip()
1249 if opt_value == item:
1250 if 'selected' in option.attrib:
1251 del option.attrib['selected']
1252 else:
1253 raise ValueError(
1254 "The option %r is not currently selected" % item)
1255 break
1256 else:
1257 raise ValueError(
1258 "There is not option with the value %r" % item)
1259
1261 return '<%s {%s} for select name=%r>' % (
1262 self.__class__.__name__,
1263 ', '.join([repr(v) for v in self]),
1264 self.select.name)
1265
1267 """
1268 This object represents several ``<input type=radio>`` elements
1269 that have the same name.
1270
1271 You can use this like a list, but also use the property
1272 ``.value`` to check/uncheck inputs. Also you can use
1273 ``.value_options`` to get the possible values.
1274 """
1275
1277 """
1278 Get/set the value, which checks the radio with that value (and
1279 unchecks any other value).
1280 """
1281 for el in self:
1282 if 'checked' in el.attrib:
1283 return el.get('value')
1284 return None
1285
1287 if value is not None:
1288 for el in self:
1289 if el.get('value') == value:
1290 checked_option = el
1291 break
1292 else:
1293 raise ValueError(
1294 "There is no radio input with the value %r" % value)
1295 for el in self:
1296 if 'checked' in el.attrib:
1297 del el.attrib['checked']
1298 if value is not None:
1299 checked_option.set('checked', '')
1300
1303
1304 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1305
1307 """
1308 Returns a list of all the possible values.
1309 """
1310 return [el.get('value') for el in self]
1311 value_options = property(value_options, doc=value_options.__doc__)
1312
1314 return '%s(%s)' % (
1315 self.__class__.__name__,
1316 list.__repr__(self))
1317
1319 """
1320 Represents a group of checkboxes (``<input type=checkbox>``) that
1321 have the same name.
1322
1323 In addition to using this like a list, the ``.value`` attribute
1324 returns a set-like object that you can add to or remove from to
1325 check and uncheck checkboxes. You can also use ``.value_options``
1326 to get the possible values.
1327 """
1328
1330 """
1331 Return a set-like object that can be modified to check or
1332 uncheck individual checkboxes according to their value.
1333 """
1334 return CheckboxValues(self)
1344 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1345
1347 """
1348 Returns a list of all the possible values.
1349 """
1350 return [el.get('value') for el in self]
1351 value_options = property(value_options, doc=value_options.__doc__)
1352
1354 return '%s(%s)' % (
1355 self.__class__.__name__, list.__repr__(self))
1356
1358
1359 """
1360 Represents the values of the checked checkboxes in a group of
1361 checkboxes with the same name.
1362 """
1363
1366
1368 return iter([
1369 el.get('value')
1370 for el in self.group
1371 if 'checked' in el.attrib])
1372
def add(self, value):
    """
    Check the first checkbox of the group whose ``value`` attribute
    equals ``value``.  Raises ``KeyError`` when the group has no such
    checkbox.
    """
    for box in self.group:
        if box.get('value') != value:
            continue
        box.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1380
1382 for el in self.group:
1383 if el.get('value') == value:
1384 if 'checked' in el.attrib:
1385 del el.attrib['checked']
1386 else:
1387 raise KeyError(
1388 "The checkbox with value %r was already unchecked" % value)
1389 break
1390 else:
1391 raise KeyError(
1392 "No checkbox with value %r" % value)
1393
1395 return '<%s {%s} for checkboxes name=%r>' % (
1396 self.__class__.__name__,
1397 ', '.join([repr(v) for v in self]),
1398 self.group.name)
1399
1483
1484 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1485
1487 """
1488 Represents a ``<label>`` element.
1489
1490 Label elements are linked to other elements with their ``for``
1491 attribute. You can access this element with ``label.for_element``.
1492 """
1493
1495 """
1496 Get/set the element this label points to. Return None if it
1497 can't be found.
1498 """
1499 id = self.get('for')
1500 if not id:
1501 return None
1502 return self.body.get_element_by_id(id)
1504 id = other.get('id')
1505 if not id:
1506 raise TypeError(
1507 "Element %r has no id attribute" % other)
1508 self.set('for', id)
1512 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1513 doc=_for_element__get.__doc__)
1514
1515 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1516
1517
1518
1519
1520
1535
1537 """Convert all tags in an XHTML tree to HTML by removing their
1538 XHTML namespace.
1539 """
1540 try:
1541 xhtml = xhtml.getroot()
1542 except AttributeError:
1543 pass
1544 prefix = "{%s}" % XHTML_NAMESPACE
1545 prefix_len = len(prefix)
1546 for el in xhtml.iter(prefix + "*"):
1547 el.tag = el.tag[prefix_len:]
1548
1549
1550
1551 __str_replace_meta_content_type = re.compile(
1552 r'<meta http-equiv="Content-Type"[^>]*>').sub
1553 __bytes_replace_meta_content_type = re.compile(
1554 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1555
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)

    # The Content-Type <meta> tag is only stripped from HTML output, and
    # only when the caller did not ask for it to be included.
    if include_meta_content_type or method != 'html':
        return serialised

    # Use the substitution function that matches the type of the result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1628
1629 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1630
1632 """
1633 Open the HTML document in a web browser, saving it to a temporary
1634 file to open it. Note that this does not delete the file after
1635 use. This is mainly meant for debugging.
1636 """
1637 import os
1638 import webbrowser
1639 import tempfile
1640 if not isinstance(doc, etree._ElementTree):
1641 doc = etree.ElementTree(doc)
1642 handle, fn = tempfile.mkstemp(suffix='.html')
1643 f = os.fdopen(handle, 'wb')
1644 try:
1645 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1646 finally:
1647
1648 f.close()
1649 url = 'file://' + fn.replace(os.path.sep, '/')
1650 print(url)
1651 webbrowser.open(url)
1652
1653
1654
1655
1656
1658 """An HTML parser that is configured to return lxml.html Element
1659 objects.
1660 """
1664
1666 """An XML parser that is configured to return lxml.html Element
1667 objects.
1668
1669 Note that this parser is not really XHTML aware unless you let it
1670 load a DTD that declares the HTML entities. To do this, make sure
1671 you have the XHTML DTDs installed in your catalogs, and create the
1672 parser like this::
1673
1674 >>> parser = XHTMLParser(load_dtd=True)
1675
1676 If you additionally want to validate the document, use this::
1677
1678 >>> parser = XHTMLParser(dtd_validation=True)
1679
1680 For catalog support, see http://www.xmlsoft.org/catalog.html.
1681 """
1685
1687 """Create a new HTML Element.
1688
1689 This can also be used for XHTML documents.
1690 """
1691 v = html_parser.makeelement(*args, **kw)
1692 return v
1693
1694 html_parser = HTMLParser()
1695 xhtml_parser = XHTMLParser()
1696