1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import sys
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
45 try:
46 from collections import MutableMapping as DictMixin
47 except ImportError:
48
49 from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Names exported by ``from lxml.html import *``.  Each name is listed
# exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse',
]
86
87 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
88
89 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
90 namespaces={'x':XHTML_NAMESPACE})
91 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
92 namespaces={'x':XHTML_NAMESPACE})
93 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
94 namespaces={'x':XHTML_NAMESPACE})
95
96 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
97 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
98 _collect_string_content = etree.XPath("string()")
99 _iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
100 _iter_css_imports = re.compile(r'@import "(.*?)"').finditer
101 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
102 namespaces={'x':XHTML_NAMESPACE})
103 _archive_re = re.compile(r'[^ ]+')
104 _parse_meta_refresh_url = re.compile(
105 r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
106
107
109 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
110 return s[1:-1], pos+1
111 else:
112 return s,pos
113
123
129
131
133 """
134 Returns the base URL, given when the page was parsed.
135
136 Use with ``urlparse.urljoin(el.base_url, href)`` to get
137 absolute URLs.
138 """
139 return self.getroottree().docinfo.URL
140 base_url = property(base_url, doc=base_url.__doc__)
141
147 forms = property(forms, doc=forms.__doc__)
148
150 """
151 Return the <body> element. Can be called from a child element
152 to get the document's head.
153 """
154 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
155 body = property(body, doc=body.__doc__)
156
158 """
159 Returns the <head> element. Can be called from a child
160 element to get the document's head.
161 """
162 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
163 head = property(head, doc=head.__doc__)
164
166 """
167 Get or set any <label> element associated with this element.
168 """
169 id = self.get('id')
170 if not id:
171 return None
172 result = _label_xpath(self, id=id)
173 if not result:
174 return None
175 else:
176 return result[0]
178 id = self.get('id')
179 if not id:
180 raise TypeError(
181 "You cannot set a label for an element (%r) that has no id"
182 % self)
183 if _nons(label.tag) != 'label':
184 raise TypeError(
185 "You can only assign label to a label element (not %r)"
186 % label)
187 label.set('for', id)
192 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
193
195 """
196 Removes this element from the tree, including its children and
197 text. The tail text is joined to the previous element or
198 parent.
199 """
200 parent = self.getparent()
201 assert parent is not None
202 if self.tail:
203 previous = self.getprevious()
204 if previous is None:
205 parent.text = (parent.text or '') + self.tail
206 else:
207 previous.tail = (previous.tail or '') + self.tail
208 parent.remove(self)
209
211 """
212 Remove the tag, but not its children or text. The children and text
213 are merged into the parent.
214
215 Example::
216
217 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
218 >>> h.find('.//b').drop_tag()
219 >>> print(tostring(h, encoding='unicode'))
220 <div>Hello World!</div>
221 """
222 parent = self.getparent()
223 assert parent is not None
224 previous = self.getprevious()
225 if self.text and isinstance(self.tag, basestring):
226
227 if previous is None:
228 parent.text = (parent.text or '') + self.text
229 else:
230 previous.tail = (previous.tail or '') + self.text
231 if self.tail:
232 if len(self):
233 last = self[-1]
234 last.tail = (last.tail or '') + self.tail
235 elif previous is None:
236 parent.text = (parent.text or '') + self.tail
237 else:
238 previous.tail = (previous.tail or '') + self.tail
239 index = parent.index(self)
240 parent[index:index+1] = self[:]
241
243 """
244 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
245 """
246 rel = rel.lower()
247 return [el for el in _rel_links_xpath(self)
248 if el.get('rel').lower() == rel]
249
251 """
252 Find any elements with the given class name.
253 """
254 return _class_xpath(self, class_name=class_name)
255
257 """
258 Get the first element in a document with the given id. If none is
259 found, return the default argument if provided or raise KeyError
260 otherwise.
261
262 Note that there can be more than one element with the same id,
263 and this isn't uncommon in HTML documents found in the wild.
264 Browsers return only the first match, and this function does
265 the same.
266 """
267 try:
268
269
270 return _id_xpath(self, id=id)[0]
271 except IndexError:
272 if default:
273 return default[0]
274 else:
275 raise KeyError(id)
276
def text_content(self):
    """
    Return the text content of this element, including the text of
    any children, as computed by the XPath ``string()`` function.
    """
    content = _collect_string_content(self)
    return content
282
def cssselect(self, expr, translator='html'):
    """
    Run the CSS expression ``expr`` on this element and its children,
    returning a list of the results.

    Equivalent to
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``.
    A new selector is built on every call, so pre-compiling the
    expression yourself can provide a substantial speedup.
    """
    # Deferred import: lxml.cssselect is only loaded when this method
    # is actually used.
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
295
296
297
298
299
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Turn every link in the document into an absolute URL.

    ``base_url`` is the full URL the document came from.  When it is
    not given, the document's own ``.base_url`` is used; a TypeError
    is raised if that is None as well.

    If ``resolve_base_href`` is true, ``self.resolve_base_href()`` is
    called first, so that any ``<base href>`` tags in the document
    are used *and* removed.  If it is false such tags are ignored.

    ``handle_failures`` selects what happens when joining a URL
    raises ValueError:

    * None (default): the error propagates and aborts processing.
    * 'ignore': the link is left unchanged.
    * 'discard': the link is removed.

    Any other value raises ValueError.
    """
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")

    if resolve_base_href:
        self.resolve_base_href()

    if handle_failures not in (None, 'ignore', 'discard'):
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)

    def link_repl(href):
        try:
            return urljoin(base_url, href)
        except ValueError:
            if handle_failures is None:
                # Default mode: let the failure abort the processing.
                raise
            if handle_failures == 'ignore':
                return href
            # 'discard': returning None makes rewrite_links() drop it.
            return None

    self.rewrite_links(link_repl)
344
346 """
347 Find any ``<base href>`` tag in the document, and apply its
348 values to all links found in the document. Also remove the
349 tag once it has been applied.
350
351 If ``handle_failures`` is None (default), a failure to process
352 a URL will abort the processing. If set to 'ignore', errors
353 are ignored. If set to 'discard', failing URLs will be removed.
354 """
355 base_href = None
356 basetags = self.xpath('//base[@href]|//x:base[@href]',
357 namespaces={'x': XHTML_NAMESPACE})
358 for b in basetags:
359 base_href = b.get('href')
360 b.drop_tree()
361 if not base_href:
362 return
363 self.make_links_absolute(base_href, resolve_base_href=False,
364 handle_failures=handle_failures)
365
367 """
368 Yield (element, attribute, link, pos), where attribute may be None
369 (indicating the link is in the text). ``pos`` is the position
370 where the link occurs; often 0, but sometimes something else in
371 the case of links in stylesheets or style tags.
372
373 Note: <base href> is *not* taken into account in any way. The
374 link you get is exactly the link in the document.
375
376 Note: multiple links inside of a single text string or
377 attribute value are returned in reversed order. This makes it
378 possible to replace or delete them from the text string value
379 based on their reported text positions. Otherwise, a
380 modification at one text position can change the positions of
381 links reported later on.
382 """
383 link_attrs = defs.link_attrs
384 for el in self.iter(etree.Element):
385 attribs = el.attrib
386 tag = _nons(el.tag)
387 if tag == 'object':
388 codebase = None
389
390
391 if 'codebase' in attribs:
392 codebase = el.get('codebase')
393 yield (el, 'codebase', codebase, 0)
394 for attrib in ('classid', 'data'):
395 if attrib in attribs:
396 value = el.get(attrib)
397 if codebase is not None:
398 value = urljoin(codebase, value)
399 yield (el, attrib, value, 0)
400 if 'archive' in attribs:
401 for match in _archive_re.finditer(el.get('archive')):
402 value = match.group(0)
403 if codebase is not None:
404 value = urljoin(codebase, value)
405 yield (el, 'archive', value, match.start())
406 else:
407 for attrib in link_attrs:
408 if attrib in attribs:
409 yield (el, attrib, attribs[attrib], 0)
410 if tag == 'meta':
411 http_equiv = attribs.get('http-equiv', '').lower()
412 if http_equiv == 'refresh':
413 content = attribs.get('content', '')
414 match = _parse_meta_refresh_url(content)
415 url = (match.group('url') if match else content).strip()
416
417
418 if url:
419 url, pos = _unquote_match(
420 url, match.start('url') if match else content.find(url))
421 yield (el, 'content', url, pos)
422 elif tag == 'param':
423 valuetype = el.get('valuetype') or ''
424 if valuetype.lower() == 'ref':
425
426
427
428
429
430
431 yield (el, 'value', el.get('value'), 0)
432 elif tag == 'style' and el.text:
433 urls = [
434
435 _unquote_match(match.group(1), match.start(1))[::-1]
436 for match in _iter_css_urls(el.text)
437 ] + [
438 (match.start(1), match.group(1))
439 for match in _iter_css_imports(el.text)
440 ]
441 if urls:
442
443
444
445 urls.sort(reverse=True)
446 for start, url in urls:
447 yield (el, None, url, start)
448 if 'style' in attribs:
449 urls = list(_iter_css_urls(attribs['style']))
450 if urls:
451
452 for match in urls[::-1]:
453 url, start = _unquote_match(match.group(1), match.start(1))
454 yield (el, 'style', url, start)
455
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link found by
    ``self.iterlinks()``, ``link_repl_func(link)`` is called with the
    whitespace-stripped link, and the return value replaces the old
    link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then ``make_links_absolute()`` is run
    with it first, so all links passed to ``link_repl_func()`` take
    it into account.  Otherwise, if ``resolve_base_href`` is true,
    ``self.resolve_base_href()`` is called before rewriting.

    If ``link_repl_func`` returns None, the attribute or tag text is
    removed completely.  When a single attribute holds several links
    (e.g. ``archive`` or an inline ``style``), removing one of them
    removes the whole attribute; the remaining links of that
    attribute are then skipped instead of raising an error.
    """
    if base_href is not None:
        # make_links_absolute() takes care of <base href> itself when
        # resolve_base_href is true.
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue

        if new_link is None:
            # Remove the attribute or the element content.
            if attrib is None:
                el.text = ''
            elif attrib in el.attrib:
                # The attribute may already be gone if an earlier link
                # from the same attribute value was discarded.
                del el.attrib[attrib]
            continue

        if attrib is None:
            el.text = el.text[:pos] + new_link + el.text[pos+len(link):]
            continue

        cur = el.get(attrib)
        if cur is None:
            # Attribute was discarded by an earlier link; nothing left
            # to splice the replacement into.
            continue
        # Splice the replacement over the old link.  When the link
        # covers the whole value this reduces to plain ``new_link``.
        el.set(attrib, cur[:pos] + new_link + cur[pos+len(link):])
504
505
507 """
508 An object that represents a method on an element as a function;
509 the function takes either an element or an HTML string. It
510 returns whatever the function normally returns, or if the function
511 works in-place (and so returns None) it returns a serialized form
512 of the resulting document.
513 """
519 result_type = type(doc)
520 if isinstance(doc, basestring):
521 if 'copy' in kw:
522 raise TypeError(
523 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
524 doc = fromstring(doc, **kw)
525 else:
526 if 'copy' in kw:
527 make_a_copy = kw.pop('copy')
528 else:
529 make_a_copy = self.copy
530 if make_a_copy:
531 doc = copy.deepcopy(doc)
532 meth = getattr(doc, self.name)
533 result = meth(*args, **kw)
534
535 if result is None:
536
537 return _transform_result(result_type, doc)
538 else:
539 return result
540
541 find_rel_links = _MethodFunc('find_rel_links', copy=False)
542 find_class = _MethodFunc('find_class', copy=False)
543 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
544 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
545 iterlinks = _MethodFunc('iterlinks', copy=False)
546 rewrite_links = _MethodFunc('rewrite_links', copy=True)
547
550
553
556
559
560
562 """A lookup scheme for HTML Element classes.
563
564 To create a lookup instance with different Element classes, pass a tag
565 name mapping of Element classes in the ``classes`` keyword argument and/or
566 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
567 The special key '*' denotes a Mixin class that should be mixed into all
568 Element classes.
569 """
570 _default_element_classes = {}
571
572 - def __init__(self, classes=None, mixins=None):
589
590 - def lookup(self, node_type, document, namespace, name):
601
602
603
604
605
606 _looks_like_full_html_unicode = re.compile(
607 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
608 _looks_like_full_html_bytes = re.compile(
609 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
610
623
626 """
627 Parses several HTML elements, returning a list of elements.
628
629 The first item in the list may be a string (though leading
630 whitespace is removed). If no_leading_text is true, then it will
631 be an error if there is leading text, and it will always be a list
632 of only elements.
633
634 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
635 """
636 if parser is None:
637 parser = html_parser
638
639 if isinstance(html, bytes):
640 if not _looks_like_full_html_bytes(html):
641
642 html = ('<html><body>'.encode('ascii') + html +
643 '</body></html>'.encode('ascii'))
644 else:
645 if not _looks_like_full_html_unicode(html):
646 html = '<html><body>%s</body></html>' % html
647 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
648 assert _nons(doc.tag) == 'html'
649 bodies = [e for e in doc if _nons(e.tag) == 'body']
650 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
651 body = bodies[0]
652 elements = []
653 if no_leading_text and body.text and body.text.strip():
654 raise etree.ParserError(
655 "There is leading text: %r" % body.text)
656 if body.text and body.text.strip():
657 elements.append(body.text)
658 elements.extend(body)
659
660
661 return elements
662
665 """
666 Parses a single HTML element; it is an error if there is more than
667 one element, or if anything but whitespace precedes or follows the
668 element.
669
670 If ``create_parent`` is true (or is a tag name) then a parent node
671 will be created to encapsulate the HTML in a single element. In this
672 case, leading or trailing text is also allowed, as are multiple elements
673 as result of the parsing.
674
675 Passing a ``base_url`` will set the document's ``base_url`` attribute
676 (and the tree's docinfo.URL).
677 """
678 if parser is None:
679 parser = html_parser
680
681 accept_leading_text = bool(create_parent)
682
683 elements = fragments_fromstring(
684 html, parser=parser, no_leading_text=not accept_leading_text,
685 base_url=base_url, **kw)
686
687 if create_parent:
688 if not isinstance(create_parent, basestring):
689 create_parent = 'div'
690 new_root = Element(create_parent)
691 if elements:
692 if isinstance(elements[0], basestring):
693 new_root.text = elements[0]
694 del elements[0]
695 new_root.extend(elements)
696 return new_root
697
698 if not elements:
699 raise etree.ParserError('No elements found')
700 if len(elements) > 1:
701 raise etree.ParserError(
702 "Multiple elements found (%s)"
703 % ', '.join([_element_name(e) for e in elements]))
704 el = elements[0]
705 if el.tail and el.tail.strip():
706 raise etree.ParserError(
707 "Element followed by text: %r" % el.tail)
708 el.tail = None
709 return el
710
711 -def fromstring(html, base_url=None, parser=None, **kw):
777
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    When no ``parser`` is given, the module-level ``html_parser`` is
    used.  The ``base_url`` keyword overrides the base URL, which is
    most useful when parsing from a file-like object.  Any further
    keyword arguments are passed on to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser,
                       base_url=base_url, **kw)
790
798
800 if isinstance(el, etree.CommentBase):
801 return 'comment'
802 elif isinstance(el, basestring):
803 return 'string'
804 else:
805 return _nons(el.tag)
806
807
808
809
810
915
916 HtmlElementClassLookup._default_element_classes['form'] = FormElement
917
954
956 if not url:
957 raise ValueError("cannot submit, no URL provided")
958
959 try:
960 from urllib import urlencode, urlopen
961 except ImportError:
962 from urllib.request import urlopen
963 from urllib.parse import urlencode
964 if method == 'GET':
965 if '?' in url:
966 url += '&'
967 else:
968 url += '?'
969 url += urlencode(values)
970 data = None
971 else:
972 data = urlencode(values)
973 return urlopen(url, data)
974
976
984 raise KeyError(
985 "You cannot remove keys from ElementDict")
989 return item in self.inputs
994
996 return '<%s for form %s>' % (
997 self.__class__.__name__,
998 self.inputs.form._name())
999
1065
1093
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace are serialised as XML,
        # everything else as HTML.
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        method = 'xml' if self.tag.startswith(xhtml_prefix) else 'html'
        # The value is the leading text followed by the serialised
        # form of every child element.
        pieces = [self.text or '']
        for child in self:
            pieces.append(
                etree.tostring(child, method=method, encoding='unicode'))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop all child elements, then store the new value as text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value empties the element: no text, no children.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)
1121
1122 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1123
1125 """
1126 ``<select>`` element. You can get the name with ``.name``.
1127
1128 ``.value`` will be the value of the selected option, unless this
1129 is a multi-select element (``<select multiple>``), in which case
1130 it will be a set-like object. In either case ``.value_options``
1131 gives the possible values.
1132
1133 The boolean attribute ``.multiple`` shows if this is a
1134 multi-select.
1135 """
1136
1138 """
1139 Get/set the value of this select (the selected option).
1140
1141 If this is a multi-select, this is a set-like object that
1142 represents all the selected options.
1143 """
1144 if self.multiple:
1145 return MultipleSelectOptions(self)
1146 for el in _options_xpath(self):
1147 if el.get('selected') is not None:
1148 value = el.get('value')
1149 if value is None:
1150 value = el.text or ''
1151 if value:
1152 value = value.strip()
1153 return value
1154 return None
1155
1157 if self.multiple:
1158 if isinstance(value, basestring):
1159 raise TypeError(
1160 "You must pass in a sequence")
1161 self.value.clear()
1162 self.value.update(value)
1163 return
1164 if value is not None:
1165 value = value.strip()
1166 for el in _options_xpath(self):
1167 opt_value = el.get('value')
1168 if opt_value is None:
1169 opt_value = el.text or ''
1170 if opt_value:
1171 opt_value = opt_value.strip()
1172 if opt_value == value:
1173 checked_option = el
1174 break
1175 else:
1176 raise ValueError(
1177 "There is no option with the value of %r" % value)
1178 for el in _options_xpath(self):
1179 if 'selected' in el.attrib:
1180 del el.attrib['selected']
1181 if value is not None:
1182 checked_option.set('selected', '')
1183
1190
1191 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1192
1207 value_options = property(value_options, doc=value_options.__doc__)
1208
1210 """
1211 Boolean attribute: is there a ``multiple`` attribute on this element.
1212 """
1213 return 'multiple' in self.attrib
1215 if value:
1216 self.set('multiple', '')
1217 elif 'multiple' in self.attrib:
1218 del self.attrib['multiple']
1219 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1220
1221 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1222
1224 """
1225 Represents all the selected options in a ``<select multiple>`` element.
1226
1227 You can add to this set-like option to select an option, or remove
1228 to unselect the option.
1229 """
1230
1232 self.select = select
1233
1235 """
1236 Iterator of all the ``<option>`` elements.
1237 """
1238 return iter(_options_xpath(self.select))
1239 options = property(options)
1240
1242 for option in self.options:
1243 if 'selected' in option.attrib:
1244 opt_value = option.get('value')
1245 if opt_value is None:
1246 opt_value = option.text or ''
1247 if opt_value:
1248 opt_value = opt_value.strip()
1249 yield opt_value
1250
def add(self, item):
    """
    Select the first option whose value equals ``item``.

    An option's value is its ``value`` attribute or, when that is
    missing, its text; either is compared with surrounding
    whitespace stripped.  Raises ValueError if no option matches.
    """
    for candidate in self.options:
        candidate_value = candidate.get('value')
        if candidate_value is None:
            candidate_value = candidate.text or ''
        if candidate_value:
            candidate_value = candidate_value.strip()
        if candidate_value == item:
            candidate.set('selected', '')
            return
    raise ValueError(
        "There is no option with the value %r" % item)
1264
1266 for option in self.options:
1267 opt_value = option.get('value')
1268 if opt_value is None:
1269 opt_value = option.text or ''
1270 if opt_value:
1271 opt_value = opt_value.strip()
1272 if opt_value == item:
1273 if 'selected' in option.attrib:
1274 del option.attrib['selected']
1275 else:
1276 raise ValueError(
1277 "The option %r is not currently selected" % item)
1278 break
1279 else:
1280 raise ValueError(
1281 "There is not option with the value %r" % item)
1282
1284 return '<%s {%s} for select name=%r>' % (
1285 self.__class__.__name__,
1286 ', '.join([repr(v) for v in self]),
1287 self.select.name)
1288
1290 """
1291 This object represents several ``<input type=radio>`` elements
1292 that have the same name.
1293
1294 You can use this like a list, but also use the property
1295 ``.value`` to check/uncheck inputs. Also you can use
1296 ``.value_options`` to get the possible values.
1297 """
1298
1300 """
1301 Get/set the value, which checks the radio with that value (and
1302 unchecks any other value).
1303 """
1304 for el in self:
1305 if 'checked' in el.attrib:
1306 return el.get('value')
1307 return None
1308
1310 if value is not None:
1311 for el in self:
1312 if el.get('value') == value:
1313 checked_option = el
1314 break
1315 else:
1316 raise ValueError(
1317 "There is no radio input with the value %r" % value)
1318 for el in self:
1319 if 'checked' in el.attrib:
1320 del el.attrib['checked']
1321 if value is not None:
1322 checked_option.set('checked', '')
1323
1326
1327 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1328
1330 """
1331 Returns a list of all the possible values.
1332 """
1333 return [el.get('value') for el in self]
1334 value_options = property(value_options, doc=value_options.__doc__)
1335
1337 return '%s(%s)' % (
1338 self.__class__.__name__,
1339 list.__repr__(self))
1340
1342 """
1343 Represents a group of checkboxes (``<input type=checkbox>``) that
1344 have the same name.
1345
1346 In addition to using this like a list, the ``.value`` attribute
1347 returns a set-like object that you can add to or remove from to
1348 check and uncheck checkboxes. You can also use ``.value_options``
1349 to get the possible values.
1350 """
1351
1353 """
1354 Return a set-like object that can be modified to check or
1355 uncheck individual checkboxes according to their value.
1356 """
1357 return CheckboxValues(self)
1367 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1368
1370 """
1371 Returns a list of all the possible values.
1372 """
1373 return [el.get('value') for el in self]
1374 value_options = property(value_options, doc=value_options.__doc__)
1375
1377 return '%s(%s)' % (
1378 self.__class__.__name__, list.__repr__(self))
1379
1381
1382 """
1383 Represents the values of the checked checkboxes in a group of
1384 checkboxes with the same name.
1385 """
1386
1389
1391 return iter([
1392 el.get('value')
1393 for el in self.group
1394 if 'checked' in el.attrib])
1395
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.  Raises KeyError if there is no such checkbox.
    """
    matching = next(
        (box for box in self.group if box.get('value') == value), None)
    if matching is None:
        raise KeyError("No checkbox with value %r" % value)
    matching.set('checked', '')
1403
1405 for el in self.group:
1406 if el.get('value') == value:
1407 if 'checked' in el.attrib:
1408 del el.attrib['checked']
1409 else:
1410 raise KeyError(
1411 "The checkbox with value %r was already unchecked" % value)
1412 break
1413 else:
1414 raise KeyError(
1415 "No checkbox with value %r" % value)
1416
1418 return '<%s {%s} for checkboxes name=%r>' % (
1419 self.__class__.__name__,
1420 ', '.join([repr(v) for v in self]),
1421 self.group.name)
1422
1506
1507 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1508
1510 """
1511 Represents a ``<label>`` element.
1512
1513 Label elements are linked to other elements with their ``for``
1514 attribute. You can access this element with ``label.for_element``.
1515 """
1516
1518 """
1519 Get/set the element this label points to. Return None if it
1520 can't be found.
1521 """
1522 id = self.get('for')
1523 if not id:
1524 return None
1525 return self.body.get_element_by_id(id)
1527 id = other.get('id')
1528 if not id:
1529 raise TypeError(
1530 "Element %r has no id attribute" % other)
1531 self.set('for', id)
1535 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1536 doc=_for_element__get.__doc__)
1537
1538 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1539
1540
1541
1542
1543
1557
1559 """Convert all tags in an XHTML tree to HTML by removing their
1560 XHTML namespace.
1561 """
1562 try:
1563 xhtml = xhtml.getroot()
1564 except AttributeError:
1565 pass
1566 prefix = "{%s}" % XHTML_NAMESPACE
1567 prefix_len = len(prefix)
1568 for el in xhtml.iter(prefix + "*"):
1569 el.tag = el.tag[prefix_len:]
1570
1571
1572
1573 __str_replace_meta_content_type = re.compile(
1574 r'<meta http-equiv="Content-Type"[^>]*>').sub
1575 __bytes_replace_meta_content_type = re.compile(
1576 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1577
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: with the 'html' method, any
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    serialised output unless ``include_meta_content_type`` is true.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The meta tag is only stripped from 'html' output, and only when
    # the caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # The serialiser returns text or bytes depending on ``encoding``;
    # use the substitution that matches the result type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1650
1651 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1652
1654 """
1655 Open the HTML document in a web browser, saving it to a temporary
1656 file to open it. Note that this does not delete the file after
1657 use. This is mainly meant for debugging.
1658 """
1659 import os
1660 import webbrowser
1661 import tempfile
1662 if not isinstance(doc, etree._ElementTree):
1663 doc = etree.ElementTree(doc)
1664 handle, fn = tempfile.mkstemp(suffix='.html')
1665 f = os.fdopen(handle, 'wb')
1666 try:
1667 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1668 finally:
1669
1670 f.close()
1671 url = 'file://' + fn.replace(os.path.sep, '/')
1672 print(url)
1673 webbrowser.open(url)
1674
1675
1676
1677
1678
1680 """An HTML parser that is configured to return lxml.html Element
1681 objects.
1682 """
1686
1688 """An XML parser that is configured to return lxml.html Element
1689 objects.
1690
1691 Note that this parser is not really XHTML aware unless you let it
1692 load a DTD that declares the HTML entities. To do this, make sure
1693 you have the XHTML DTDs installed in your catalogs, and create the
1694 parser like this::
1695
1696 >>> parser = XHTMLParser(load_dtd=True)
1697
1698 If you additionally want to validate the document, use this::
1699
1700 >>> parser = XHTMLParser(dtd_validation=True)
1701
1702 For catalog support, see http://www.xmlsoft.org/catalog.html.
1703 """
1707
1709 """Create a new HTML Element.
1710
1711 This can also be used for XHTML documents.
1712 """
1713 v = html_parser.makeelement(*args, **kw)
1714 return v
1715
1716 html_parser = HTMLParser()
1717 xhtml_parser = XHTMLParser()
1718