1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import sys
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
# Pick a mapping mixin base class that exists on the running interpreter.
try:
    # Python 3.3+: the ABCs live in collections.abc.  The aliases in
    # ``collections`` were removed in Python 3.10, so this must be tried first.
    from collections.abc import MutableMapping as DictMixin
except ImportError:
    try:
        # Python 2.6 / 2.7
        from collections import MutableMapping as DictMixin
    except ImportError:
        # Python < 2.6
        from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Public names exported by ``from lxml.html import *``.
# Each name is listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
86
# Namespace URI of XHTML; bound to the ``x`` prefix in the XPath
# expressions below so that both plain and namespaced tags match.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements (context node included) that carry a ``rel`` attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements at or below the context node.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose ``class`` attribute contains $class_name as one of its
# whitespace-separated tokens (the value is space-padded before matching).
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements at or below the context node whose ``id`` equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string() value of the context node.
_collect_string_content = etree.XPath("string()")
# CSS ``url(...)`` (case-insensitive); group 1 is the argument, still
# including any surrounding single or double quotes.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# CSS ``@import "..."``; group 1 is the text between the double quotes.
# Only the double-quoted form is matched.
_css_import_re = re.compile(r'@import "(.*?)"')
# <label> elements whose ``for`` attribute equals $id; the path is
# absolute (``//``), so it is evaluated from the document root.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One match per run of non-space characters; iterlinks() uses it to split
# the ``archive`` attribute of <object> into individual links.
_archive_re = re.compile(r'[^ ]+')
104
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
107 return s[1:-1], pos+1
108 else:
109 return s,pos
110
120
126
128
130 """
131 Returns the base URL, given when the page was parsed.
132
133 Use with ``urlparse.urljoin(el.base_url, href)`` to get
134 absolute URLs.
135 """
136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__)
138
144 forms = property(forms, doc=forms.__doc__)
145
147 """
148 Return the <body> element. Can be called from a child element
149 to get the document's head.
150 """
151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__)
153
155 """
156 Returns the <head> element. Can be called from a child
157 element to get the document's head.
158 """
159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__)
161
163 """
164 Get or set any <label> element associated with this element.
165 """
166 id = self.get('id')
167 if not id:
168 return None
169 result = _label_xpath(self, id=id)
170 if not result:
171 return None
172 else:
173 return result[0]
175 id = self.get('id')
176 if not id:
177 raise TypeError(
178 "You cannot set a label for an element (%r) that has no id"
179 % self)
180 if _nons(label.tag) != 'label':
181 raise TypeError(
182 "You can only assign label to a label element (not %r)"
183 % label)
184 label.set('for', id)
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
190
192 """
193 Removes this element from the tree, including its children and
194 text. The tail text is joined to the previous element or
195 parent.
196 """
197 parent = self.getparent()
198 assert parent is not None
199 if self.tail:
200 previous = self.getprevious()
201 if previous is None:
202 parent.text = (parent.text or '') + self.tail
203 else:
204 previous.tail = (previous.tail or '') + self.tail
205 parent.remove(self)
206
208 """
209 Remove the tag, but not its children or text. The children and text
210 are merged into the parent.
211
212 Example::
213
214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
215 >>> h.find('.//b').drop_tag()
216 >>> print(tostring(h, encoding=unicode))
217 <div>Hello World!</div>
218 """
219 parent = self.getparent()
220 assert parent is not None
221 previous = self.getprevious()
222 if self.text and isinstance(self.tag, basestring):
223
224 if previous is None:
225 parent.text = (parent.text or '') + self.text
226 else:
227 previous.tail = (previous.tail or '') + self.text
228 if self.tail:
229 if len(self):
230 last = self[-1]
231 last.tail = (last.tail or '') + self.tail
232 elif previous is None:
233 parent.text = (parent.text or '') + self.tail
234 else:
235 previous.tail = (previous.tail or '') + self.tail
236 index = parent.index(self)
237 parent[index:index+1] = self[:]
238
240 """
241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
242 """
243 rel = rel.lower()
244 return [el for el in _rel_links_xpath(self)
245 if el.get('rel').lower() == rel]
246
248 """
249 Find any elements with the given class name.
250 """
251 return _class_xpath(self, class_name=class_name)
252
254 """
255 Get the first element in a document with the given id. If none is
256 found, return the default argument if provided or raise KeyError
257 otherwise.
258
259 Note that there can be more than one element with the same id,
260 and this isn't uncommon in HTML documents found in the wild.
261 Browsers return only the first match, and this function does
262 the same.
263 """
264 try:
265
266
267 return _id_xpath(self, id=id)[0]
268 except IndexError:
269 if default:
270 return default[0]
271 else:
272 raise KeyError(id)
273
def text_content(self):
    """
    Return the text of this element together with the text found in
    any of its children, as computed by the XPath ``string()`` function.
    """
    collected = _collect_string_content(self)
    return collected
279
def cssselect(self, expr, translator='html'):
    """
    Evaluate the CSS selector ``expr`` against this element and its
    children and return the list of results.

    This is shorthand for
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``.
    The selector is built anew on every call; when the same expression
    is used repeatedly, pre-compiling it can be substantially faster.
    """
    # Imported at call time rather than when this module is loaded.
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
292
293
294
295
296
298 """
299 Make all links in the document absolute, given the
300 ``base_url`` for the document (the full URL where the document
301 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
302
303 If ``resolve_base_href`` is true, then any ``<base href>``
304 tags in the document are used *and* removed from the document.
305 If it is false then any such tag is ignored.
306 """
307 if base_url is None:
308 base_url = self.base_url
309 if base_url is None:
310 raise TypeError(
311 "No base_url given, and the document has no base_url")
312 if resolve_base_href:
313 self.resolve_base_href()
314 def link_repl(href):
315 return urljoin(base_url, href)
316 self.rewrite_links(link_repl)
317
319 """
320 Find any ``<base href>`` tag in the document, and apply its
321 values to all links found in the document. Also remove the
322 tag once it has been applied.
323 """
324 base_href = None
325 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
326 for b in basetags:
327 base_href = b.get('href')
328 b.drop_tree()
329 if not base_href:
330 return
331 self.make_links_absolute(base_href, resolve_base_href=False)
332
334 """
335 Yield (element, attribute, link, pos), where attribute may be None
336 (indicating the link is in the text). ``pos`` is the position
337 where the link occurs; often 0, but sometimes something else in
338 the case of links in stylesheets or style tags.
339
340 Note: <base href> is *not* taken into account in any way. The
341 link you get is exactly the link in the document.
342
343 Note: multiple links inside of a single text string or
344 attribute value are returned in reversed order. This makes it
345 possible to replace or delete them from the text string value
346 based on their reported text positions. Otherwise, a
347 modification at one text position can change the positions of
348 links reported later on.
349 """
350 link_attrs = defs.link_attrs
351 for el in self.iter():
352 attribs = el.attrib
353 tag = _nons(el.tag)
354 if tag != 'object':
355 for attrib in link_attrs:
356 if attrib in attribs:
357 yield (el, attrib, attribs[attrib], 0)
358 elif tag == 'object':
359 codebase = None
360
361
362 if 'codebase' in attribs:
363 codebase = el.get('codebase')
364 yield (el, 'codebase', codebase, 0)
365 for attrib in 'classid', 'data':
366 if attrib in attribs:
367 value = el.get(attrib)
368 if codebase is not None:
369 value = urljoin(codebase, value)
370 yield (el, attrib, value, 0)
371 if 'archive' in attribs:
372 for match in _archive_re.finditer(el.get('archive')):
373 value = match.group(0)
374 if codebase is not None:
375 value = urljoin(codebase, value)
376 yield (el, 'archive', value, match.start())
377 if tag == 'param':
378 valuetype = el.get('valuetype') or ''
379 if valuetype.lower() == 'ref':
380
381
382
383
384
385
386 yield (el, 'value', el.get('value'), 0)
387 if tag == 'style' and el.text:
388 urls = [
389 _unquote_match(match.group(1), match.start(1))
390 for match in _css_url_re.finditer(el.text)
391 ] + [
392 (match.group(1), match.start(1))
393 for match in _css_import_re.finditer(el.text)
394 ]
395 if urls:
396
397 urls = [ (start, url) for (url, start) in urls ]
398 urls.sort()
399
400
401 urls.reverse()
402 for start, url in urls:
403 yield (el, None, url, start)
404 if 'style' in attribs:
405 urls = list(_css_url_re.finditer(attribs['style']))
406 if urls:
407
408 for match in urls[::-1]:
409 url, start = _unquote_match(match.group(1), match.start(1))
410 yield (el, 'style', url, start)
411
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called (with surrounding
    whitespace stripped from the link), and the return value will
    replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.  Otherwise, if
    ``resolve_base_href`` is true, ``resolve_base_href()`` is applied
    to the document first.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Further links reported from
    that same attribute or text are still passed to
    ``link_repl_func``, but there is nothing left to rewrite, so
    their return values are ignored.
    """
    if base_href is not None:
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # (element, attribute) whose content was removed most recently.
    # iterlinks() yields all links of one attribute/text back to back,
    # so remembering only the last removal is enough.
    removed = None
    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if removed is not None and removed[0] is el and removed[1] == attrib:
            # The container of this link is already gone.  Touching it
            # again would raise KeyError (attribute) or replace the
            # cleared text with a lone link.
            continue
        if new_link is None:
            # Remove the attribute or the element text as a whole.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed = (el, attrib)
            continue
        if attrib is None:
            # The link sits inside the element text at offset ``pos``.
            el.text = el.text[:pos] + new_link + el.text[pos+len(link):]
        else:
            cur = el.attrib[attrib]
            if not pos and len(cur) == len(link):
                # The link is the whole attribute value.
                el.attrib[attrib] = new_link
            else:
                # Splice the new link into the surrounding attribute text.
                el.attrib[attrib] = cur[:pos] + new_link + cur[pos+len(link):]
458
459
461 """
462 An object that represents a method on an element as a function;
463 the function takes either an element or an HTML string. It
464 returns whatever the function normally returns, or if the function
465 works in-place (and so returns None) it returns a serialized form
466 of the resulting document.
467 """
473 result_type = type(doc)
474 if isinstance(doc, basestring):
475 if 'copy' in kw:
476 raise TypeError(
477 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
478 doc = fromstring(doc, **kw)
479 else:
480 if 'copy' in kw:
481 make_a_copy = kw.pop('copy')
482 else:
483 make_a_copy = self.copy
484 if make_a_copy:
485 doc = copy.deepcopy(doc)
486 meth = getattr(doc, self.name)
487 result = meth(*args, **kw)
488
489 if result is None:
490
491 return _transform_result(result_type, doc)
492 else:
493 return result
494
495 find_rel_links = _MethodFunc('find_rel_links', copy=False)
496 find_class = _MethodFunc('find_class', copy=False)
497 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
498 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
499 iterlinks = _MethodFunc('iterlinks', copy=False)
500 rewrite_links = _MethodFunc('rewrite_links', copy=True)
501
504
507
510
513
514
516 """A lookup scheme for HTML Element classes.
517
518 To create a lookup instance with different Element classes, pass a tag
519 name mapping of Element classes in the ``classes`` keyword argument and/or
520 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
521 The special key '*' denotes a Mixin class that should be mixed into all
522 Element classes.
523 """
524 _default_element_classes = {}
525
526 - def __init__(self, classes=None, mixins=None):
543
544 - def lookup(self, node_type, document, namespace, name):
555
556
557
558
559
560 _looks_like_full_html_unicode = re.compile(
561 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
562 _looks_like_full_html_bytes = re.compile(
563 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
564
573
576 """
577 Parses several HTML elements, returning a list of elements.
578
579 The first item in the list may be a string (though leading
580 whitespace is removed). If no_leading_text is true, then it will
581 be an error if there is leading text, and it will always be a list
582 of only elements.
583
584 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
585 """
586 if parser is None:
587 parser = html_parser
588
589 if isinstance(html, bytes):
590 if not _looks_like_full_html_bytes(html):
591 html = '<html><body>%s</body></html>'.encode('ascii') % html
592 else:
593 if not _looks_like_full_html_unicode(html):
594 html = '<html><body>%s</body></html>' % html
595 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
596 assert _nons(doc.tag) == 'html'
597 bodies = [e for e in doc if _nons(e.tag) == 'body']
598 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
599 body = bodies[0]
600 elements = []
601 if no_leading_text and body.text and body.text.strip():
602 raise etree.ParserError(
603 "There is leading text: %r" % body.text)
604 if body.text and body.text.strip():
605 elements.append(body.text)
606 elements.extend(body)
607
608
609 return elements
610
613 """
614 Parses a single HTML element; it is an error if there is more than
615 one element, or if anything but whitespace precedes or follows the
616 element.
617
618 If create_parent is true (or is a tag name) then a parent node
619 will be created to encapsulate the HTML in a single element. In
620 this case, leading or trailing text is allowed.
621
622 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
623 """
624 if parser is None:
625 parser = html_parser
626
627 accept_leading_text = bool(create_parent)
628
629 elements = fragments_fromstring(
630 html, parser=parser, no_leading_text=not accept_leading_text,
631 base_url=base_url, **kw)
632
633 if create_parent:
634 if not isinstance(create_parent, basestring):
635 create_parent = 'div'
636 new_root = Element(create_parent)
637 if elements:
638 if isinstance(elements[0], basestring):
639 new_root.text = elements[0]
640 del elements[0]
641 new_root.extend(elements)
642 return new_root
643
644 if not elements:
645 raise etree.ParserError('No elements found')
646 if len(elements) > 1:
647 raise etree.ParserError(
648 "Multiple elements found (%s)"
649 % ', '.join([_element_name(e) for e in elements]))
650 el = elements[0]
651 if el.tail and el.tail.strip():
652 raise etree.ParserError(
653 "Element followed by text: %r" % el.tail)
654 el.tail = None
655 return el
656
657 -def fromstring(html, base_url=None, parser=None, **kw):
723
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse an HTML document from a filename, a URL, or a file-like
    object and return the resulting document *tree* (not an element).
    Call ``.getroot()`` on the result to obtain the root element.

    ``parser`` defaults to the module-level ``html_parser``.

    ``base_url`` overrides the base URL of the document, which is
    mostly of use when the input is a file-like object.
    """
    if parser is not None:
        chosen_parser = parser
    else:
        chosen_parser = html_parser
    tree = etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
    return tree
736
738
739
740 for el in el.iter():
741 if _nons(el.tag) in defs.block_tags:
742 return True
743 return False
744
746 if isinstance(el, etree.CommentBase):
747 return 'comment'
748 elif isinstance(el, basestring):
749 return 'string'
750 else:
751 return _nons(el.tag)
752
753
754
755
756
861
862 HtmlElementClassLookup._default_element_classes['form'] = FormElement
863
900
902 if not url:
903 raise ValueError("cannot submit, no URL provided")
904
905 try:
906 from urllib import urlencode, urlopen
907 except ImportError:
908 from urllib.request import urlopen
909 from urllib.parse import urlencode
910 if method == 'GET':
911 if '?' in url:
912 url += '&'
913 else:
914 url += '?'
915 url += urlencode(values)
916 data = None
917 else:
918 data = urlencode(values)
919 return urlopen(url, data)
920
922
930 raise KeyError(
931 "You cannot remove keys from ElementDict")
935 return item in self.inputs
940
942 return '<%s for form %s>' % (
943 self.__class__.__name__,
944 self.inputs.form._name())
945
1011
1039
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    value can be read and assigned through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace have their children serialised
        # as XML, all others as HTML.
        in_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        if in_xhtml:
            method = 'xml'
        else:
            method = 'html'
        # Leading text first, then each child serialised as markup.
        pieces = [self.text or '']
        for child in self:
            pieces.append(etree.tostring(child, method=method, encoding=unicode))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop any child elements, then store the new value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value leaves an empty element behind.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1066
1067 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1068
1070 """
1071 ``<select>`` element. You can get the name with ``.name``.
1072
1073 ``.value`` will be the value of the selected option, unless this
1074 is a multi-select element (``<select multiple>``), in which case
1075 it will be a set-like object. In either case ``.value_options``
1076 gives the possible values.
1077
1078 The boolean attribute ``.multiple`` shows if this is a
1079 multi-select.
1080 """
1081
1083 """
1084 Get/set the value of this select (the selected option).
1085
1086 If this is a multi-select, this is a set-like object that
1087 represents all the selected options.
1088 """
1089 if self.multiple:
1090 return MultipleSelectOptions(self)
1091 for el in _options_xpath(self):
1092 if el.get('selected') is not None:
1093 value = el.get('value')
1094 if value is None:
1095 value = el.text or ''
1096 if value:
1097 value = value.strip()
1098 return value
1099 return None
1100
1102 if self.multiple:
1103 if isinstance(value, basestring):
1104 raise TypeError(
1105 "You must pass in a sequence")
1106 self.value.clear()
1107 self.value.update(value)
1108 return
1109 if value is not None:
1110 value = value.strip()
1111 for el in _options_xpath(self):
1112 opt_value = el.get('value')
1113 if opt_value is None:
1114 opt_value = el.text or ''
1115 if opt_value:
1116 opt_value = opt_value.strip()
1117 if opt_value == value:
1118 checked_option = el
1119 break
1120 else:
1121 raise ValueError(
1122 "There is no option with the value of %r" % value)
1123 for el in _options_xpath(self):
1124 if 'selected' in el.attrib:
1125 del el.attrib['selected']
1126 if value is not None:
1127 checked_option.set('selected', '')
1128
1135
1136 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1137
1152 value_options = property(value_options, doc=value_options.__doc__)
1153
1155 """
1156 Boolean attribute: is there a ``multiple`` attribute on this element.
1157 """
1158 return 'multiple' in self.attrib
1160 if value:
1161 self.set('multiple', '')
1162 elif 'multiple' in self.attrib:
1163 del self.attrib['multiple']
1164 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1165
1166 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1167
1169 """
1170 Represents all the selected options in a ``<select multiple>`` element.
1171
1172 You can add to this set-like option to select an option, or remove
1173 to unselect the option.
1174 """
1175
1177 self.select = select
1178
1180 """
1181 Iterator of all the ``<option>`` elements.
1182 """
1183 return iter(_options_xpath(self.select))
1184 options = property(options)
1185
1187 for option in self.options:
1188 if 'selected' in option.attrib:
1189 opt_value = option.get('value')
1190 if opt_value is None:
1191 opt_value = option.text or ''
1192 if opt_value:
1193 opt_value = opt_value.strip()
1194 yield opt_value
1195
def add(self, item):
    """
    Select the first ``<option>`` whose value equals ``item``.

    An option's value is its ``value`` attribute or, when that
    attribute is absent, its text; either is stripped of surrounding
    whitespace before comparing.  Raises ValueError if no option
    matches.
    """
    for candidate in self.options:
        candidate_value = candidate.get('value')
        if candidate_value is None:
            candidate_value = candidate.text or ''
        if candidate_value:
            candidate_value = candidate_value.strip()
        if candidate_value != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
1209
1211 for option in self.options:
1212 opt_value = option.get('value')
1213 if opt_value is None:
1214 opt_value = option.text or ''
1215 if opt_value:
1216 opt_value = opt_value.strip()
1217 if opt_value == item:
1218 if 'selected' in option.attrib:
1219 del option.attrib['selected']
1220 else:
1221 raise ValueError(
1222 "The option %r is not currently selected" % item)
1223 break
1224 else:
1225 raise ValueError(
1226 "There is not option with the value %r" % item)
1227
1229 return '<%s {%s} for select name=%r>' % (
1230 self.__class__.__name__,
1231 ', '.join([repr(v) for v in self]),
1232 self.select.name)
1233
1235 """
1236 This object represents several ``<input type=radio>`` elements
1237 that have the same name.
1238
1239 You can use this like a list, but also use the property
1240 ``.value`` to check/uncheck inputs. Also you can use
1241 ``.value_options`` to get the possible values.
1242 """
1243
1245 """
1246 Get/set the value, which checks the radio with that value (and
1247 unchecks any other value).
1248 """
1249 for el in self:
1250 if 'checked' in el.attrib:
1251 return el.get('value')
1252 return None
1253
1255 if value is not None:
1256 for el in self:
1257 if el.get('value') == value:
1258 checked_option = el
1259 break
1260 else:
1261 raise ValueError(
1262 "There is no radio input with the value %r" % value)
1263 for el in self:
1264 if 'checked' in el.attrib:
1265 del el.attrib['checked']
1266 if value is not None:
1267 checked_option.set('checked', '')
1268
1271
1272 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1273
1275 """
1276 Returns a list of all the possible values.
1277 """
1278 return [el.get('value') for el in self]
1279 value_options = property(value_options, doc=value_options.__doc__)
1280
1282 return '%s(%s)' % (
1283 self.__class__.__name__,
1284 list.__repr__(self))
1285
1287 """
1288 Represents a group of checkboxes (``<input type=checkbox>``) that
1289 have the same name.
1290
1291 In addition to using this like a list, the ``.value`` attribute
1292 returns a set-like object that you can add to or remove from to
1293 check and uncheck checkboxes. You can also use ``.value_options``
1294 to get the possible values.
1295 """
1296
1298 """
1299 Return a set-like object that can be modified to check or
1300 uncheck individual checkboxes according to their value.
1301 """
1302 return CheckboxValues(self)
1312 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1313
1315 """
1316 Returns a list of all the possible values.
1317 """
1318 return [el.get('value') for el in self]
1319 value_options = property(value_options, doc=value_options.__doc__)
1320
1322 return '%s(%s)' % (
1323 self.__class__.__name__, list.__repr__(self))
1324
1326
1327 """
1328 Represents the values of the checked checkboxes in a group of
1329 checkboxes with the same name.
1330 """
1331
1334
1336 return iter([
1337 el.get('value')
1338 for el in self.group
1339 if 'checked' in el.attrib])
1340
def add(self, value):
    """
    Check the first checkbox of the group whose ``value`` attribute
    equals ``value``.  Raises KeyError if the group has no such
    checkbox.
    """
    for box in self.group:
        if box.get('value') != value:
            continue
        box.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1348
1350 for el in self.group:
1351 if el.get('value') == value:
1352 if 'checked' in el.attrib:
1353 del el.attrib['checked']
1354 else:
1355 raise KeyError(
1356 "The checkbox with value %r was already unchecked" % value)
1357 break
1358 else:
1359 raise KeyError(
1360 "No checkbox with value %r" % value)
1361
1363 return '<%s {%s} for checkboxes name=%r>' % (
1364 self.__class__.__name__,
1365 ', '.join([repr(v) for v in self]),
1366 self.group.name)
1367
1451
1452 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1453
1455 """
1456 Represents a ``<label>`` element.
1457
1458 Label elements are linked to other elements with their ``for``
1459 attribute. You can access this element with ``label.for_element``.
1460 """
1461
1463 """
1464 Get/set the element this label points to. Return None if it
1465 can't be found.
1466 """
1467 id = self.get('for')
1468 if not id:
1469 return None
1470 return self.body.get_element_by_id(id)
1472 id = other.get('id')
1473 if not id:
1474 raise TypeError(
1475 "Element %r has no id attribute" % other)
1476 self.set('for', id)
1480 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1481 doc=_for_element__get.__doc__)
1482
1483 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1484
1485
1486
1487
1488
1503
1505 """Convert all tags in an XHTML tree to HTML by removing their
1506 XHTML namespace.
1507 """
1508 try:
1509 xhtml = xhtml.getroot()
1510 except AttributeError:
1511 pass
1512 prefix = "{%s}" % XHTML_NAMESPACE
1513 prefix_len = len(prefix)
1514 for el in xhtml.iter(prefix + "*"):
1515 el.tag = el.tag[prefix_len:]
1516
1517
1518
1519 __str_replace_meta_content_type = re.compile(
1520 r'<meta http-equiv="Content-Type"[^>]*>').sub
1521 __bytes_replace_meta_content_type = re.compile(
1522 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1523
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if ``include_meta_content_type`` is false (the default) and
    ``method`` is ``'html'``, any ``<meta http-equiv="Content-Type" ...>``
    tag is stripped from the serialised output.  Otherwise the result
    of ``etree.tostring()`` is returned unchanged.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Use the substitution that matches the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1596
1597 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1598
1600 """
1601 Open the HTML document in a web browser, saving it to a temporary
1602 file to open it. Note that this does not delete the file after
1603 use. This is mainly meant for debugging.
1604 """
1605 import os
1606 import webbrowser
1607 import tempfile
1608 if not isinstance(doc, etree._ElementTree):
1609 doc = etree.ElementTree(doc)
1610 handle, fn = tempfile.mkstemp(suffix='.html')
1611 f = os.fdopen(handle, 'wb')
1612 try:
1613 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1614 finally:
1615
1616 f.close()
1617 url = 'file://' + fn.replace(os.path.sep, '/')
1618 print(url)
1619 webbrowser.open(url)
1620
1621
1622
1623
1624
1626 """An HTML parser that is configured to return lxml.html Element
1627 objects.
1628 """
1632
1634 """An XML parser that is configured to return lxml.html Element
1635 objects.
1636
1637 Note that this parser is not really XHTML aware unless you let it
1638 load a DTD that declares the HTML entities. To do this, make sure
1639 you have the XHTML DTDs installed in your catalogs, and create the
1640 parser like this::
1641
1642 >>> parser = XHTMLParser(load_dtd=True)
1643
1644 If you additionally want to validate the document, use this::
1645
1646 >>> parser = XHTMLParser(dtd_validation=True)
1647
1648 For catalog support, see http://www.xmlsoft.org/catalog.html.
1649 """
1653
1655 """Create a new HTML Element.
1656
1657 This can also be used for XHTML documents.
1658 """
1659 v = html_parser.makeelement(*args, **kw)
1660 return v
1661
1662 html_parser = HTMLParser()
1663 xhtml_parser = XHTMLParser()
1664