1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import threading
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
45 try:
46 from collections import MutableMapping as DictMixin
47 except ImportError:
48
49 from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Names exported by ``from lxml.html import *``.  Each name is listed once
# ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse',
]
86
87 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
88
89 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
90 namespaces={'x':XHTML_NAMESPACE})
91 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
92 namespaces={'x':XHTML_NAMESPACE})
93 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
94 namespaces={'x':XHTML_NAMESPACE})
95
96 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
97 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
98 _collect_string_content = etree.XPath("string()")
99 _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
100 _css_import_re = re.compile(r'@import "(.*?)"')
101 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
102 namespaces={'x':XHTML_NAMESPACE})
103 _archive_re = re.compile(r'[^ ]+')
104
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
107 return s[1:-1], pos+1
108 else:
109 return s,pos
110
120
126
128
130 """
131 Returns the base URL, given when the page was parsed.
132
133 Use with ``urlparse.urljoin(el.base_url, href)`` to get
134 absolute URLs.
135 """
136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__)
138
144 forms = property(forms, doc=forms.__doc__)
145
147 """
148 Return the <body> element. Can be called from a child element
149 to get the document's body.
150 """
151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__)
153
155 """
156 Returns the <head> element. Can be called from a child
157 element to get the document's head.
158 """
159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__)
161
163 """
164 Get or set any <label> element associated with this element.
165 """
166 id = self.get('id')
167 if not id:
168 return None
169 result = _label_xpath(self, id=id)
170 if not result:
171 return None
172 else:
173 return result[0]
175 id = self.get('id')
176 if not id:
177 raise TypeError(
178 "You cannot set a label for an element (%r) that has no id"
179 % self)
180 if _nons(label.tag) != 'label':
181 raise TypeError(
182 "You can only assign label to a label element (not %r)"
183 % label)
184 label.set('for', id)
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
190
192 """
193 Removes this element from the tree, including its children and
194 text. The tail text is joined to the previous element or
195 parent.
196 """
197 parent = self.getparent()
198 assert parent is not None
199 if self.tail:
200 previous = self.getprevious()
201 if previous is None:
202 parent.text = (parent.text or '') + self.tail
203 else:
204 previous.tail = (previous.tail or '') + self.tail
205 parent.remove(self)
206
208 """
209 Remove the tag, but not its children or text. The children and text
210 are merged into the parent.
211
212 Example::
213
214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
215 >>> h.find('.//b').drop_tag()
216 >>> print(tostring(h, encoding=unicode))
217 <div>Hello World!</div>
218 """
219 parent = self.getparent()
220 assert parent is not None
221 previous = self.getprevious()
222 if self.text and isinstance(self.tag, basestring):
223
224 if previous is None:
225 parent.text = (parent.text or '') + self.text
226 else:
227 previous.tail = (previous.tail or '') + self.text
228 if self.tail:
229 if len(self):
230 last = self[-1]
231 last.tail = (last.tail or '') + self.tail
232 elif previous is None:
233 parent.text = (parent.text or '') + self.tail
234 else:
235 previous.tail = (previous.tail or '') + self.tail
236 index = parent.index(self)
237 parent[index:index+1] = self[:]
238
240 """
241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
242 """
243 rel = rel.lower()
244 return [el for el in _rel_links_xpath(self)
245 if el.get('rel').lower() == rel]
246
248 """
249 Find any elements with the given class name.
250 """
251 return _class_xpath(self, class_name=class_name)
252
254 """
255 Get the first element in a document with the given id. If none is
256 found, return the default argument if provided or raise KeyError
257 otherwise.
258
259 Note that there can be more than one element with the same id,
260 and this isn't uncommon in HTML documents found in the wild.
261 Browsers return only the first match, and this function does
262 the same.
263 """
264 try:
265
266
267 return _id_xpath(self, id=id)[0]
268 except IndexError:
269 if default:
270 return default[0]
271 else:
272 raise KeyError(id)
273
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its descendants, as produced by the XPath ``string()`` function.
    """
    collected = _collect_string_content(self)
    return collected
279
def cssselect(self, expr, translator='html'):
    """
    Evaluate the CSS selector ``expr`` against this element and its
    children and return the result of the selector call.

    This is the same as calling
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``.
    The selector is compiled on every call, so pre-compiling a
    ``CSSSelector`` yourself can be substantially faster when the same
    expression is used repeatedly.
    """
    # Imported here rather than at module level, so lxml.cssselect is
    # only loaded when this method is actually called.
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
292
293
294
295
296
298 """
299 Make all links in the document absolute, given the
300 ``base_url`` for the document (the full URL where the document
301 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
302
303 If ``resolve_base_href`` is true, then any ``<base href>``
304 tags in the document are used *and* removed from the document.
305 If it is false then any such tag is ignored.
306 """
307 if base_url is None:
308 base_url = self.base_url
309 if base_url is None:
310 raise TypeError(
311 "No base_url given, and the document has no base_url")
312 if resolve_base_href:
313 self.resolve_base_href()
314 def link_repl(href):
315 return urljoin(base_url, href)
316 self.rewrite_links(link_repl)
317
319 """
320 Find any ``<base href>`` tag in the document, and apply its
321 values to all links found in the document. Also remove the
322 tag once it has been applied.
323 """
324 base_href = None
325 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
326 for b in basetags:
327 base_href = b.get('href')
328 b.drop_tree()
329 if not base_href:
330 return
331 self.make_links_absolute(base_href, resolve_base_href=False)
332
334 """
335 Yield (element, attribute, link, pos), where attribute may be None
336 (indicating the link is in the text). ``pos`` is the position
337 where the link occurs; often 0, but sometimes something else in
338 the case of links in stylesheets or style tags.
339
340 Note: <base href> is *not* taken into account in any way. The
341 link you get is exactly the link in the document.
342
343 Note: multiple links inside of a single text string or
344 attribute value are returned in reversed order. This makes it
345 possible to replace or delete them from the text string value
346 based on their reported text positions. Otherwise, a
347 modification at one text position can change the positions of
348 links reported later on.
349 """
350 link_attrs = defs.link_attrs
351 for el in self.iter():
352 attribs = el.attrib
353 tag = _nons(el.tag)
354 if tag != 'object':
355 for attrib in link_attrs:
356 if attrib in attribs:
357 yield (el, attrib, attribs[attrib], 0)
358 elif tag == 'object':
359 codebase = None
360
361
362 if 'codebase' in attribs:
363 codebase = el.get('codebase')
364 yield (el, 'codebase', codebase, 0)
365 for attrib in 'classid', 'data':
366 if attrib in attribs:
367 value = el.get(attrib)
368 if codebase is not None:
369 value = urljoin(codebase, value)
370 yield (el, attrib, value, 0)
371 if 'archive' in attribs:
372 for match in _archive_re.finditer(el.get('archive')):
373 value = match.group(0)
374 if codebase is not None:
375 value = urljoin(codebase, value)
376 yield (el, 'archive', value, match.start())
377 if tag == 'param':
378 valuetype = el.get('valuetype') or ''
379 if valuetype.lower() == 'ref':
380
381
382
383
384
385
386 yield (el, 'value', el.get('value'), 0)
387 if tag == 'style' and el.text:
388 urls = [
389 _unquote_match(match.group(1), match.start(1))
390 for match in _css_url_re.finditer(el.text)
391 ] + [
392 (match.group(1), match.start(1))
393 for match in _css_import_re.finditer(el.text)
394 ]
395 if urls:
396
397 urls = [ (start, url) for (url, start) in urls ]
398 urls.sort()
399
400
401 urls.reverse()
402 for start, url in urls:
403 yield (el, None, url, start)
404 if 'style' in attribs:
405 urls = list(_css_url_re.finditer(attribs['style']))
406 if urls:
407
408 for match in urls[::-1]:
409 url, start = _unquote_match(match.group(1), match.start(1))
410 yield (el, 'style', url, start)
411
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called (with surrounding
    whitespace stripped from the link), and the return value will
    replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Any further links that were
    found in that same attribute or text are still passed to
    ``link_repl_func()``, but their return values are ignored because
    there is nothing left to rewrite.
    """
    if base_href is not None:
        # make_links_absolute() takes care of <base href> itself when
        # resolve_base_href is true, so it must not be resolved twice.
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # (element, attribute) pairs whose content was removed because the
    # replacement function returned None.
    removed = set()

    # Take a snapshot of the links before touching the tree:
    # iterlinks() is a generator over the live document, and the loop
    # below changes attributes and text while processing its results.
    for el, attrib, link, pos in list(self.iterlinks()):
        new_link = link_repl_func(link.strip())
        if (el, attrib) in removed:
            # The attribute/text holding this link is already gone;
            # positions into it are meaningless now.
            continue
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or the element text altogether.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed.add((el, attrib))
            continue
        if attrib is None:
            # Link inside the element text: splice at the reported offset.
            el.text = el.text[:pos] + new_link + el.text[pos+len(link):]
        else:
            cur = el.attrib[attrib]
            if not pos and len(cur) == len(link):
                # Most common case: the link is the whole attribute value.
                el.attrib[attrib] = new_link
            else:
                el.attrib[attrib] = cur[:pos] + new_link + cur[pos+len(link):]
458
459
461 """
462 An object that represents a method on an element as a function;
463 the function takes either an element or an HTML string. It
464 returns whatever the function normally returns, or if the function
465 works in-place (and so returns None) it returns a serialized form
466 of the resulting document.
467 """
473 result_type = type(doc)
474 if isinstance(doc, basestring):
475 if 'copy' in kw:
476 raise TypeError(
477 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
478 doc = fromstring(doc, **kw)
479 else:
480 if 'copy' in kw:
481 make_a_copy = kw.pop('copy')
482 else:
483 make_a_copy = self.copy
484 if make_a_copy:
485 doc = copy.deepcopy(doc)
486 meth = getattr(doc, self.name)
487 result = meth(*args, **kw)
488
489 if result is None:
490
491 return _transform_result(result_type, doc)
492 else:
493 return result
494
495 find_rel_links = _MethodFunc('find_rel_links', copy=False)
496 find_class = _MethodFunc('find_class', copy=False)
497 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
498 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
499 iterlinks = _MethodFunc('iterlinks', copy=False)
500 rewrite_links = _MethodFunc('rewrite_links', copy=True)
501
504
507
510
513
514
516 """A lookup scheme for HTML Element classes.
517
518 To create a lookup instance with different Element classes, pass a tag
519 name mapping of Element classes in the ``classes`` keyword argument and/or
520 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
521 The special key '*' denotes a Mixin class that should be mixed into all
522 Element classes.
523 """
524 _default_element_classes = {}
525
526 - def __init__(self, classes=None, mixins=None):
543
544 - def lookup(self, node_type, document, namespace, name):
555
556
557
558
559
568
602
605 """
606 Parses a single HTML element; it is an error if there is more than
607 one element, or if anything but whitespace precedes or follows the
608 element.
609
610 If create_parent is true (or is a tag name) then a parent node
611 will be created to encapsulate the HTML in a single element. In
612 this case, leading or trailing text is allowed.
613
614 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
615 """
616 if parser is None:
617 parser = html_parser
618
619 accept_leading_text = bool(create_parent)
620
621 elements = fragments_fromstring(
622 html, parser=parser, no_leading_text=not accept_leading_text,
623 base_url=base_url, **kw)
624
625 if create_parent:
626 if not isinstance(create_parent, basestring):
627 create_parent = 'div'
628 new_root = Element(create_parent)
629 if elements:
630 if isinstance(elements[0], basestring):
631 new_root.text = elements[0]
632 del elements[0]
633 new_root.extend(elements)
634 return new_root
635
636 if not elements:
637 raise etree.ParserError('No elements found')
638 if len(elements) > 1:
639 raise etree.ParserError(
640 "Multiple elements found (%s)"
641 % ', '.join([_element_name(e) for e in elements]))
642 el = elements[0]
643 if el.tail and el.tail.strip():
644 raise etree.ParserError(
645 "Element followed by text: %r" % el.tail)
646 el.tail = None
647 return el
648
649 -def fromstring(html, base_url=None, parser=None, **kw):
711
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The return value is a tree rather than an element; call
    ``parse(...).getroot()`` to obtain the document root.

    When no ``parser`` is given, the module-level ``html_parser`` is
    used.  The ``base_url`` keyword overrides the base URL, which is
    most useful when parsing from a file-like object.  Any further
    keyword arguments are passed on to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
724
726
727
728 for el in el.iter():
729 if _nons(el.tag) in defs.block_tags:
730 return True
731 return False
732
734 if isinstance(el, etree.CommentBase):
735 return 'comment'
736 elif isinstance(el, basestring):
737 return 'string'
738 else:
739 return _nons(el.tag)
740
741
742
743
744
849
850 HtmlElementClassLookup._default_element_classes['form'] = FormElement
851
888
890 if not url:
891 raise ValueError("cannot submit, no URL provided")
892
893 try:
894 from urllib import urlencode, urlopen
895 except ImportError:
896 from urllib.request import urlopen
897 from urllib.parse import urlencode
898 if method == 'GET':
899 if '?' in url:
900 url += '&'
901 else:
902 url += '?'
903 url += urlencode(values)
904 data = None
905 else:
906 data = urlencode(values)
907 return urlopen(url, data)
908
910
918 raise KeyError(
919 "You cannot remove keys from ElementDict")
923 return item in self.inputs
928
930 return '<%s for form %s>' % (
931 self.__class__.__name__,
932 self.inputs.form._name())
933
999
1027
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read, assigned and deleted through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace have their children serialised
        # as XML, all others as HTML.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        pieces = [self.text or '']
        for child in self:
            # Each child is serialised in full (markup and all) and
            # appended after the leading text.
            pieces.append(
                etree.tostring(child, method=method, encoding=unicode))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop all child elements, then store the new value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value leaves an empty textarea behind.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)
1054
1055 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1056
1058 """
1059 ``<select>`` element. You can get the name with ``.name``.
1060
1061 ``.value`` will be the value of the selected option, unless this
1062 is a multi-select element (``<select multiple>``), in which case
1063 it will be a set-like object. In either case ``.value_options``
1064 gives the possible values.
1065
1066 The boolean attribute ``.multiple`` shows if this is a
1067 multi-select.
1068 """
1069
1071 """
1072 Get/set the value of this select (the selected option).
1073
1074 If this is a multi-select, this is a set-like object that
1075 represents all the selected options.
1076 """
1077 if self.multiple:
1078 return MultipleSelectOptions(self)
1079 for el in _options_xpath(self):
1080 if el.get('selected') is not None:
1081 value = el.get('value')
1082 if value is None:
1083 value = el.text or ''
1084 if value:
1085 value = value.strip()
1086 return value
1087 return None
1088
1090 if self.multiple:
1091 if isinstance(value, basestring):
1092 raise TypeError(
1093 "You must pass in a sequence")
1094 self.value.clear()
1095 self.value.update(value)
1096 return
1097 if value is not None:
1098 value = value.strip()
1099 for el in _options_xpath(self):
1100 opt_value = el.get('value')
1101 if opt_value is None:
1102 opt_value = el.text or ''
1103 if opt_value:
1104 opt_value = opt_value.strip()
1105 if opt_value == value:
1106 checked_option = el
1107 break
1108 else:
1109 raise ValueError(
1110 "There is no option with the value of %r" % value)
1111 for el in _options_xpath(self):
1112 if 'selected' in el.attrib:
1113 del el.attrib['selected']
1114 if value is not None:
1115 checked_option.set('selected', '')
1116
1123
1124 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1125
1140 value_options = property(value_options, doc=value_options.__doc__)
1141
1143 """
1144 Boolean attribute: is there a ``multiple`` attribute on this element.
1145 """
1146 return 'multiple' in self.attrib
1148 if value:
1149 self.set('multiple', '')
1150 elif 'multiple' in self.attrib:
1151 del self.attrib['multiple']
1152 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1153
1154 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1155
1157 """
1158 Represents all the selected options in a ``<select multiple>`` element.
1159
1160 You can add to this set-like object to select an option, or remove
1161 from it to unselect the option.
1162 """
1163
1165 self.select = select
1166
1168 """
1169 Iterator of all the ``<option>`` elements.
1170 """
1171 return iter(_options_xpath(self.select))
1172 options = property(options)
1173
1175 for option in self.options:
1176 if 'selected' in option.attrib:
1177 opt_value = option.get('value')
1178 if opt_value is None:
1179 opt_value = option.text or ''
1180 if opt_value:
1181 opt_value = opt_value.strip()
1182 yield opt_value
1183
def add(self, item):
    """
    Select the first option whose value equals ``item``.

    An option's value is its ``value`` attribute or, when that
    attribute is absent, its text; a non-empty value is stripped of
    surrounding whitespace before the comparison.  Raises ValueError
    if no option matches.
    """
    for candidate in self.options:
        candidate_value = candidate.get('value')
        if candidate_value is None:
            candidate_value = candidate.text or ''
        if candidate_value:
            candidate_value = candidate_value.strip()
        if candidate_value != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
1197
1199 for option in self.options:
1200 opt_value = option.get('value')
1201 if opt_value is None:
1202 opt_value = option.text or ''
1203 if opt_value:
1204 opt_value = opt_value.strip()
1205 if opt_value == item:
1206 if 'selected' in option.attrib:
1207 del option.attrib['selected']
1208 else:
1209 raise ValueError(
1210 "The option %r is not currently selected" % item)
1211 break
1212 else:
1213 raise ValueError(
1214 "There is not option with the value %r" % item)
1215
1217 return '<%s {%s} for select name=%r>' % (
1218 self.__class__.__name__,
1219 ', '.join([repr(v) for v in self]),
1220 self.select.name)
1221
1223 """
1224 This object represents several ``<input type=radio>`` elements
1225 that have the same name.
1226
1227 You can use this like a list, but also use the property
1228 ``.value`` to check/uncheck inputs. Also you can use
1229 ``.value_options`` to get the possible values.
1230 """
1231
1233 """
1234 Get/set the value, which checks the radio with that value (and
1235 unchecks any other value).
1236 """
1237 for el in self:
1238 if 'checked' in el.attrib:
1239 return el.get('value')
1240 return None
1241
1243 if value is not None:
1244 for el in self:
1245 if el.get('value') == value:
1246 checked_option = el
1247 break
1248 else:
1249 raise ValueError(
1250 "There is no radio input with the value %r" % value)
1251 for el in self:
1252 if 'checked' in el.attrib:
1253 del el.attrib['checked']
1254 if value is not None:
1255 checked_option.set('checked', '')
1256
1259
1260 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1261
1263 """
1264 Returns a list of all the possible values.
1265 """
1266 return [el.get('value') for el in self]
1267 value_options = property(value_options, doc=value_options.__doc__)
1268
1270 return '%s(%s)' % (
1271 self.__class__.__name__,
1272 list.__repr__(self))
1273
1275 """
1276 Represents a group of checkboxes (``<input type=checkbox>``) that
1277 have the same name.
1278
1279 In addition to using this like a list, the ``.value`` attribute
1280 returns a set-like object that you can add to or remove from to
1281 check and uncheck checkboxes. You can also use ``.value_options``
1282 to get the possible values.
1283 """
1284
1286 """
1287 Return a set-like object that can be modified to check or
1288 uncheck individual checkboxes according to their value.
1289 """
1290 return CheckboxValues(self)
1300 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1301
1303 """
1304 Returns a list of all the possible values.
1305 """
1306 return [el.get('value') for el in self]
1307 value_options = property(value_options, doc=value_options.__doc__)
1308
1310 return '%s(%s)' % (
1311 self.__class__.__name__, list.__repr__(self))
1312
1314
1315 """
1316 Represents the values of the checked checkboxes in a group of
1317 checkboxes with the same name.
1318 """
1319
1322
1324 return iter([
1325 el.get('value')
1326 for el in self.group
1327 if 'checked' in el.attrib])
1328
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.  Raises KeyError if the group has no such
    checkbox.
    """
    for checkbox in self.group:
        if checkbox.get('value') != value:
            continue
        checkbox.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1336
1338 for el in self.group:
1339 if el.get('value') == value:
1340 if 'checked' in el.attrib:
1341 del el.attrib['checked']
1342 else:
1343 raise KeyError(
1344 "The checkbox with value %r was already unchecked" % value)
1345 break
1346 else:
1347 raise KeyError(
1348 "No checkbox with value %r" % value)
1349
1351 return '<%s {%s} for checkboxes name=%r>' % (
1352 self.__class__.__name__,
1353 ', '.join([repr(v) for v in self]),
1354 self.group.name)
1355
1439
1440 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1441
1443 """
1444 Represents a ``<label>`` element.
1445
1446 Label elements are linked to other elements with their ``for``
1447 attribute. You can access this element with ``label.for_element``.
1448 """
1449
1451 """
1452 Get/set the element this label points to. Return None if it
1453 can't be found.
1454 """
1455 id = self.get('for')
1456 if not id:
1457 return None
1458 return self.body.get_element_by_id(id)
1460 id = other.get('id')
1461 if not id:
1462 raise TypeError(
1463 "Element %r has no id attribute" % other)
1464 self.set('for', id)
1468 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1469 doc=_for_element__get.__doc__)
1470
1471 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1472
1473
1474
1475
1476
1491
1493 """Convert all tags in an XHTML tree to HTML by removing their
1494 XHTML namespace.
1495 """
1496 try:
1497 xhtml = xhtml.getroot()
1498 except AttributeError:
1499 pass
1500 prefix = "{%s}" % XHTML_NAMESPACE
1501 prefix_len = len(prefix)
1502 for el in xhtml.iter(prefix + "*"):
1503 el.tag = el.tag[prefix_len:]
1504
1505
1506
1507 __str_replace_meta_content_type = re.compile(
1508 r'<meta http-equiv="Content-Type"[^>]*>').sub
1509 __bytes_replace_meta_content_type = re.compile(
1510 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1511
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: unless include_meta_content_type is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    serialised result when the 'html' method is used.  With other
    methods, or when include_meta_content_type is true, the output of
    the serialiser is returned unchanged.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Use the substitution that matches the type of the serialised data.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1584
1585 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1586
1588 """
1589 Open the HTML document in a web browser, saving it to a temporary
1590 file to open it. Note that this does not delete the file after
1591 use. This is mainly meant for debugging.
1592 """
1593 import os
1594 import webbrowser
1595 import tempfile
1596 if not isinstance(doc, etree._ElementTree):
1597 doc = etree.ElementTree(doc)
1598 handle, fn = tempfile.mkstemp(suffix='.html')
1599 f = os.fdopen(handle, 'wb')
1600 try:
1601 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1602 finally:
1603
1604 f.close()
1605 url = 'file://' + fn.replace(os.path.sep, '/')
1606 print(url)
1607 webbrowser.open(url)
1608
1609
1610
1611
1612
1614 """An HTML parser that is configured to return lxml.html Element
1615 objects.
1616 """
1620
1622 """An XML parser that is configured to return lxml.html Element
1623 objects.
1624
1625 Note that this parser is not really XHTML aware unless you let it
1626 load a DTD that declares the HTML entities. To do this, make sure
1627 you have the XHTML DTDs installed in your catalogs, and create the
1628 parser like this::
1629
1630 >>> parser = XHTMLParser(load_dtd=True)
1631
1632 If you additionally want to validate the document, use this::
1633
1634 >>> parser = XHTMLParser(dtd_validation=True)
1635
1636 For catalog support, see http://www.xmlsoft.org/catalog.html.
1637 """
1641
1643 """Create a new HTML Element.
1644
1645 This can also be used for XHTML documents.
1646 """
1647 v = html_parser.makeelement(*args, **kw)
1648 return v
1649
1650 html_parser = HTMLParser()
1651 xhtml_parser = XHTMLParser()
1652