1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import sys
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
45 try:
46 from collections import MutableMapping as DictMixin
47 except ImportError:
48
49 from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Public API of the module, as exported by ``from lxml.html import *``.
# Each name is listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
86
# Namespace URI used to match XHTML-namespaced elements; bound to the ``x``
# prefix in the XPath expressions below.
87 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
88
# Precompiled XPath expressions.  Each of the next three matches the plain
# tag as well as its XHTML-namespaced twin, on the context element itself
# or on any of its descendants.
# <a> elements that carry a ``rel`` attribute:
89 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
90 namespaces={'x':XHTML_NAMESPACE})
# <option> elements:
91 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
92 namespaces={'x':XHTML_NAMESPACE})
# <form> elements:
93 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
94 namespaces={'x':XHTML_NAMESPACE})
95
# Elements whose ``class`` attribute contains $class_name as a whole
# whitespace-separated token: the attribute is whitespace-normalised and
# padded with spaces so that a substring of a longer class name cannot match.
96 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements (context element or descendants) whose ``id`` equals $id.
97 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath ``string()``: the string value of the context node.
98 _collect_string_content = etree.XPath("string()")
# CSS ``url(...)`` references, matched case-insensitively.  Group 1 is the
# content between the parentheses: a double-quoted string, a single-quoted
# string, or an unquoted run up to ``)``.  Quotes, when present, are part of
# the group.
99 _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# CSS ``@import "..."`` rules; only the double-quoted form is matched, and
# group 1 is the text between the quotes.
100 _css_import_re = re.compile(r'@import "(.*?)"')
# <label> elements (plain or XHTML) whose ``for`` attribute equals $id.
# The ``//`` paths are absolute, so the whole document is searched.
101 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
102 namespaces={'x':XHTML_NAMESPACE})
# Runs of non-space characters; applied to the ``archive`` attribute of
# <object> elements to pick out each entry together with its offset.
103 _archive_re = re.compile(r'[^ ]+')
104
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
107 return s[1:-1], pos+1
108 else:
109 return s,pos
110
120
126
128
130 """
131 Returns the base URL, given when the page was parsed.
132
133 Use with ``urlparse.urljoin(el.base_url, href)`` to get
134 absolute URLs.
135 """
136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__)
138
144 forms = property(forms, doc=forms.__doc__)
145
147 """
148 Return the <body> element. Can be called from a child element
149 to get the document's body.
150 """
151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__)
153
155 """
156 Returns the <head> element. Can be called from a child
157 element to get the document's head.
158 """
159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__)
161
163 """
164 Get or set any <label> element associated with this element.
165 """
166 id = self.get('id')
167 if not id:
168 return None
169 result = _label_xpath(self, id=id)
170 if not result:
171 return None
172 else:
173 return result[0]
175 id = self.get('id')
176 if not id:
177 raise TypeError(
178 "You cannot set a label for an element (%r) that has no id"
179 % self)
180 if _nons(label.tag) != 'label':
181 raise TypeError(
182 "You can only assign label to a label element (not %r)"
183 % label)
184 label.set('for', id)
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
190
192 """
193 Removes this element from the tree, including its children and
194 text. The tail text is joined to the previous element or
195 parent.
196 """
197 parent = self.getparent()
198 assert parent is not None
199 if self.tail:
200 previous = self.getprevious()
201 if previous is None:
202 parent.text = (parent.text or '') + self.tail
203 else:
204 previous.tail = (previous.tail or '') + self.tail
205 parent.remove(self)
206
208 """
209 Remove the tag, but not its children or text. The children and text
210 are merged into the parent.
211
212 Example::
213
214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
215 >>> h.find('.//b').drop_tag()
216 >>> print(tostring(h, encoding=unicode))
217 <div>Hello World!</div>
218 """
219 parent = self.getparent()
220 assert parent is not None
221 previous = self.getprevious()
222 if self.text and isinstance(self.tag, basestring):
223
224 if previous is None:
225 parent.text = (parent.text or '') + self.text
226 else:
227 previous.tail = (previous.tail or '') + self.text
228 if self.tail:
229 if len(self):
230 last = self[-1]
231 last.tail = (last.tail or '') + self.tail
232 elif previous is None:
233 parent.text = (parent.text or '') + self.tail
234 else:
235 previous.tail = (previous.tail or '') + self.tail
236 index = parent.index(self)
237 parent[index:index+1] = self[:]
238
240 """
241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
242 """
243 rel = rel.lower()
244 return [el for el in _rel_links_xpath(self)
245 if el.get('rel').lower() == rel]
246
248 """
249 Find any elements with the given class name.
250 """
251 return _class_xpath(self, class_name=class_name)
252
254 """
255 Get the first element in a document with the given id. If none is
256 found, return the default argument if provided or raise KeyError
257 otherwise.
258
259 Note that there can be more than one element with the same id,
260 and this isn't uncommon in HTML documents found in the wild.
261 Browsers return only the first match, and this function does
262 the same.
263 """
264 try:
265
266
267 return _id_xpath(self, id=id)[0]
268 except IndexError:
269 if default:
270 return default[0]
271 else:
272 raise KeyError(id)
273
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its descendants.

    The result is obtained by evaluating the precompiled XPath
    ``string()`` expression with this element as the context node.
    """
    collected = _collect_string_content(self)
    return collected
279
def cssselect(self, expr, translator='html'):
    """
    Evaluate the CSS selector ``expr`` against this element and its
    children and return the matches.

    This is the same as calling
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``.
    A new selector object is built on every call, so when the same
    expression is applied many times, compiling a ``CSSSelector`` once
    and reusing it can be substantially faster.
    """
    # Imported at call time rather than at module import time.
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
292
293
294
295
296
298 """
299 Make all links in the document absolute, given the
300 ``base_url`` for the document (the full URL where the document
301 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
302
303 If ``resolve_base_href`` is true, then any ``<base href>``
304 tags in the document are used *and* removed from the document.
305 If it is false then any such tag is ignored.
306 """
307 if base_url is None:
308 base_url = self.base_url
309 if base_url is None:
310 raise TypeError(
311 "No base_url given, and the document has no base_url")
312 if resolve_base_href:
313 self.resolve_base_href()
314 def link_repl(href):
315 return urljoin(base_url, href)
316 self.rewrite_links(link_repl)
317
319 """
320 Find any ``<base href>`` tag in the document, and apply its
321 values to all links found in the document. Also remove the
322 tag once it has been applied.
323 """
324 base_href = None
325 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
326 for b in basetags:
327 base_href = b.get('href')
328 b.drop_tree()
329 if not base_href:
330 return
331 self.make_links_absolute(base_href, resolve_base_href=False)
332
334 """
335 Yield (element, attribute, link, pos), where attribute may be None
336 (indicating the link is in the text). ``pos`` is the position
337 where the link occurs; often 0, but sometimes something else in
338 the case of links in stylesheets or style tags.
339
340 Note: <base href> is *not* taken into account in any way. The
341 link you get is exactly the link in the document.
342
343 Note: multiple links inside of a single text string or
344 attribute value are returned in reversed order. This makes it
345 possible to replace or delete them from the text string value
346 based on their reported text positions. Otherwise, a
347 modification at one text position can change the positions of
348 links reported later on.
349 """
350 link_attrs = defs.link_attrs
351 for el in self.iter():
352 attribs = el.attrib
353 tag = _nons(el.tag)
354 if tag != 'object':
355 for attrib in link_attrs:
356 if attrib in attribs:
357 yield (el, attrib, attribs[attrib], 0)
358 elif tag == 'object':
359 codebase = None
360
361
362 if 'codebase' in attribs:
363 codebase = el.get('codebase')
364 yield (el, 'codebase', codebase, 0)
365 for attrib in 'classid', 'data':
366 if attrib in attribs:
367 value = el.get(attrib)
368 if codebase is not None:
369 value = urljoin(codebase, value)
370 yield (el, attrib, value, 0)
371 if 'archive' in attribs:
372 for match in _archive_re.finditer(el.get('archive')):
373 value = match.group(0)
374 if codebase is not None:
375 value = urljoin(codebase, value)
376 yield (el, 'archive', value, match.start())
377 if tag == 'param':
378 valuetype = el.get('valuetype') or ''
379 if valuetype.lower() == 'ref':
380
381
382
383
384
385
386 yield (el, 'value', el.get('value'), 0)
387 if tag == 'style' and el.text:
388 urls = [
389 _unquote_match(match.group(1), match.start(1))
390 for match in _css_url_re.finditer(el.text)
391 ] + [
392 (match.group(1), match.start(1))
393 for match in _css_import_re.finditer(el.text)
394 ]
395 if urls:
396
397 urls = [ (start, url) for (url, start) in urls ]
398 urls.sort()
399
400
401 urls.reverse()
402 for start, url in urls:
403 yield (el, None, url, start)
404 if 'style' in attribs:
405 urls = list(_css_url_re.finditer(attribs['style']))
406 if urls:
407
408 for match in urls[::-1]:
409 url, start = _unquote_match(match.group(1), match.start(1))
410 yield (el, 'style', url, start)
411
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Replace every link in the document with the result of
    ``link_repl_func(link)``.

    The function receives each link with surrounding whitespace
    stripped.  Links are passed as found in the document: they may be
    relative (unless ``make_links_absolute()`` was called before),
    internal (``'#anchor'``), or use schemes such as ``'mailto:email'``
    or ``'javascript:expr'``.

    If ``base_href`` is given, the links are first made absolute
    against it (``resolve_base_href`` is forwarded to
    ``make_links_absolute()``).  Otherwise, if ``resolve_base_href`` is
    true, ``resolve_base_href()`` is applied to the document first.

    Return values of ``link_repl_func``:

    * a value equal to the original link leaves the document untouched;
    * ``None`` deletes the attribute holding the link, or, for a link
      found in element text, sets the whole text of that element to
      the empty string;
    * any other string is written in place of the old link.
    """
    if base_href is not None:
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for element, attr_name, old_link, offset in self.iterlinks():
        replacement = link_repl_func(old_link.strip())
        if replacement == old_link:
            # Nothing changed for this link.
            continue

        in_text = attr_name is None
        if replacement is None:
            # Drop the link: clear the text or remove the attribute.
            if in_text:
                element.text = ''
            else:
                del element.attrib[attr_name]
            continue

        # End offset of the old link inside its text/attribute value.
        end = offset + len(old_link)
        if in_text:
            element.text = element.text[:offset] + replacement + element.text[end:]
            continue

        current = element.attrib[attr_name]
        if offset or len(current) != len(old_link):
            # The link is only part of the attribute value: splice it in.
            replacement = current[:offset] + replacement + current[end:]
        element.attrib[attr_name] = replacement
458
459
461 """
462 An object that represents a method on an element as a function;
463 the function takes either an element or an HTML string. It
464 returns whatever the function normally returns, or if the function
465 works in-place (and so returns None) it returns a serialized form
466 of the resulting document.
467 """
473 result_type = type(doc)
474 if isinstance(doc, basestring):
475 if 'copy' in kw:
476 raise TypeError(
477 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
478 doc = fromstring(doc, **kw)
479 else:
480 if 'copy' in kw:
481 make_a_copy = kw.pop('copy')
482 else:
483 make_a_copy = self.copy
484 if make_a_copy:
485 doc = copy.deepcopy(doc)
486 meth = getattr(doc, self.name)
487 result = meth(*args, **kw)
488
489 if result is None:
490
491 return _transform_result(result_type, doc)
492 else:
493 return result
494
# Module-level function forms of the element methods of the same name,
# wrapped by _MethodFunc so they accept either an element or an HTML string.
# The ``copy`` flag is the default for element inputs (a call can override
# it with a ``copy=`` keyword): with copy=True the element is deep-copied
# and the method runs on the copy; with copy=False the method runs on the
# element that was passed in.
# NOTE(review): the _MethodFunc constructor is not visible here; presumably
# it stores these arguments as ``self.name`` and ``self.copy`` -- confirm.
495 find_rel_links = _MethodFunc('find_rel_links', copy=False)
496 find_class = _MethodFunc('find_class', copy=False)
497 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
498 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
499 iterlinks = _MethodFunc('iterlinks', copy=False)
500 rewrite_links = _MethodFunc('rewrite_links', copy=True)
501
504
507
510
513
514
516 """A lookup scheme for HTML Element classes.
517
518 To create a lookup instance with different Element classes, pass a tag
519 name mapping of Element classes in the ``classes`` keyword argument and/or
520 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
521 The special key '*' denotes a Mixin class that should be mixed into all
522 Element classes.
523 """
524 _default_element_classes = {}
525
526 - def __init__(self, classes=None, mixins=None):
543
544 - def lookup(self, node_type, document, namespace, name):
555
556
557
558
559
568
604
607 """
608 Parses a single HTML element; it is an error if there is more than
609 one element, or if anything but whitespace precedes or follows the
610 element.
611
612 If create_parent is true (or is a tag name) then a parent node
613 will be created to encapsulate the HTML in a single element. In
614 this case, leading or trailing text is allowed.
615
616 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
617 """
618 if parser is None:
619 parser = html_parser
620
621 accept_leading_text = bool(create_parent)
622
623 elements = fragments_fromstring(
624 html, parser=parser, no_leading_text=not accept_leading_text,
625 base_url=base_url, **kw)
626
627 if create_parent:
628 if not isinstance(create_parent, basestring):
629 create_parent = 'div'
630 new_root = Element(create_parent)
631 if elements:
632 if isinstance(elements[0], basestring):
633 new_root.text = elements[0]
634 del elements[0]
635 new_root.extend(elements)
636 return new_root
637
638 if not elements:
639 raise etree.ParserError('No elements found')
640 if len(elements) > 1:
641 raise etree.ParserError(
642 "Multiple elements found (%s)"
643 % ', '.join([_element_name(e) for e in elements]))
644 el = elements[0]
645 if el.tail and el.tail.strip():
646 raise etree.ParserError(
647 "Element followed by text: %r" % el.tail)
648 el.tail = None
649 return el
650
651 -def fromstring(html, base_url=None, parser=None, **kw):
715
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.

    The return value is the tree, not its root element; call
    ``parse(...).getroot()`` to obtain the document root.

    ``parser`` defaults to the module-level ``html_parser``.  The
    ``base_url`` keyword overrides the base URL of the document, which
    is most useful when parsing from a file-like object.  Any further
    keyword arguments are passed on to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
728
730
731
732 for el in el.iter():
733 if _nons(el.tag) in defs.block_tags:
734 return True
735 return False
736
738 if isinstance(el, etree.CommentBase):
739 return 'comment'
740 elif isinstance(el, basestring):
741 return 'string'
742 else:
743 return _nons(el.tag)
744
745
746
747
748
853
854 HtmlElementClassLookup._default_element_classes['form'] = FormElement
855
892
894 if not url:
895 raise ValueError("cannot submit, no URL provided")
896
897 try:
898 from urllib import urlencode, urlopen
899 except ImportError:
900 from urllib.request import urlopen
901 from urllib.parse import urlencode
902 if method == 'GET':
903 if '?' in url:
904 url += '&'
905 else:
906 url += '?'
907 url += urlencode(values)
908 data = None
909 else:
910 data = urlencode(values)
911 return urlopen(url, data)
912
914
922 raise KeyError(
923 "You cannot remove keys from ElementDict")
927 return item in self.inputs
932
934 return '<%s for form %s>' % (
935 self.__class__.__name__,
936 self.inputs.form._name())
937
1003
1031
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name``; the
    value can be read, assigned and deleted through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # XHTML-namespaced textareas serialise their children as XML,
        # all others as HTML.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        # Leading text first, then every child element serialised to text.
        pieces = [self.text or '']
        pieces.extend([
            etree.tostring(child, method=method, encoding=unicode)
            for child in self])
        return ''.join(pieces)

    def _value__set(self, value):
        # Remove all child elements, then store the new value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Reset to an empty textarea: no text and no children.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1058
1059 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1060
1062 """
1063 ``<select>`` element. You can get the name with ``.name``.
1064
1065 ``.value`` will be the value of the selected option, unless this
1066 is a multi-select element (``<select multiple>``), in which case
1067 it will be a set-like object. In either case ``.value_options``
1068 gives the possible values.
1069
1070 The boolean attribute ``.multiple`` shows if this is a
1071 multi-select.
1072 """
1073
1075 """
1076 Get/set the value of this select (the selected option).
1077
1078 If this is a multi-select, this is a set-like object that
1079 represents all the selected options.
1080 """
1081 if self.multiple:
1082 return MultipleSelectOptions(self)
1083 for el in _options_xpath(self):
1084 if el.get('selected') is not None:
1085 value = el.get('value')
1086 if value is None:
1087 value = el.text or ''
1088 if value:
1089 value = value.strip()
1090 return value
1091 return None
1092
1094 if self.multiple:
1095 if isinstance(value, basestring):
1096 raise TypeError(
1097 "You must pass in a sequence")
1098 self.value.clear()
1099 self.value.update(value)
1100 return
1101 if value is not None:
1102 value = value.strip()
1103 for el in _options_xpath(self):
1104 opt_value = el.get('value')
1105 if opt_value is None:
1106 opt_value = el.text or ''
1107 if opt_value:
1108 opt_value = opt_value.strip()
1109 if opt_value == value:
1110 checked_option = el
1111 break
1112 else:
1113 raise ValueError(
1114 "There is no option with the value of %r" % value)
1115 for el in _options_xpath(self):
1116 if 'selected' in el.attrib:
1117 del el.attrib['selected']
1118 if value is not None:
1119 checked_option.set('selected', '')
1120
1127
1128 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1129
1144 value_options = property(value_options, doc=value_options.__doc__)
1145
1147 """
1148 Boolean attribute: is there a ``multiple`` attribute on this element.
1149 """
1150 return 'multiple' in self.attrib
1152 if value:
1153 self.set('multiple', '')
1154 elif 'multiple' in self.attrib:
1155 del self.attrib['multiple']
1156 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1157
1158 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1159
1161 """
1162 Represents all the selected options in a ``<select multiple>`` element.
1163
1164 You can add to this set-like object to select an option, or remove
1165 from it to unselect the option.
1166 """
1167
1169 self.select = select
1170
1172 """
1173 Iterator of all the ``<option>`` elements.
1174 """
1175 return iter(_options_xpath(self.select))
1176 options = property(options)
1177
1179 for option in self.options:
1180 if 'selected' in option.attrib:
1181 opt_value = option.get('value')
1182 if opt_value is None:
1183 opt_value = option.text or ''
1184 if opt_value:
1185 opt_value = opt_value.strip()
1186 yield opt_value
1187
def add(self, item):
    """
    Select the first option whose value equals ``item``.

    An option's value is its ``value`` attribute or, when that
    attribute is absent, its text (empty string if it has none);
    non-empty values are whitespace-stripped before the comparison.
    Raises ValueError if no option matches.
    """
    for candidate in self.options:
        raw = candidate.get('value')
        if raw is None:
            raw = candidate.text or ''
        candidate_value = raw.strip() if raw else raw
        if candidate_value == item:
            candidate.set('selected', '')
            return
    raise ValueError(
        "There is no option with the value %r" % item)
1201
1203 for option in self.options:
1204 opt_value = option.get('value')
1205 if opt_value is None:
1206 opt_value = option.text or ''
1207 if opt_value:
1208 opt_value = opt_value.strip()
1209 if opt_value == item:
1210 if 'selected' in option.attrib:
1211 del option.attrib['selected']
1212 else:
1213 raise ValueError(
1214 "The option %r is not currently selected" % item)
1215 break
1216 else:
1217 raise ValueError(
1218 "There is not option with the value %r" % item)
1219
1221 return '<%s {%s} for select name=%r>' % (
1222 self.__class__.__name__,
1223 ', '.join([repr(v) for v in self]),
1224 self.select.name)
1225
1227 """
1228 This object represents several ``<input type=radio>`` elements
1229 that have the same name.
1230
1231 You can use this like a list, but also use the property
1232 ``.value`` to check/uncheck inputs. Also you can use
1233 ``.value_options`` to get the possible values.
1234 """
1235
1237 """
1238 Get/set the value, which checks the radio with that value (and
1239 unchecks any other value).
1240 """
1241 for el in self:
1242 if 'checked' in el.attrib:
1243 return el.get('value')
1244 return None
1245
1247 if value is not None:
1248 for el in self:
1249 if el.get('value') == value:
1250 checked_option = el
1251 break
1252 else:
1253 raise ValueError(
1254 "There is no radio input with the value %r" % value)
1255 for el in self:
1256 if 'checked' in el.attrib:
1257 del el.attrib['checked']
1258 if value is not None:
1259 checked_option.set('checked', '')
1260
1263
1264 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1265
1267 """
1268 Returns a list of all the possible values.
1269 """
1270 return [el.get('value') for el in self]
1271 value_options = property(value_options, doc=value_options.__doc__)
1272
1274 return '%s(%s)' % (
1275 self.__class__.__name__,
1276 list.__repr__(self))
1277
1279 """
1280 Represents a group of checkboxes (``<input type=checkbox>``) that
1281 have the same name.
1282
1283 In addition to using this like a list, the ``.value`` attribute
1284 returns a set-like object that you can add to or remove from to
1285 check and uncheck checkboxes. You can also use ``.value_options``
1286 to get the possible values.
1287 """
1288
1290 """
1291 Return a set-like object that can be modified to check or
1292 uncheck individual checkboxes according to their value.
1293 """
1294 return CheckboxValues(self)
1304 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1305
1307 """
1308 Returns a list of all the possible values.
1309 """
1310 return [el.get('value') for el in self]
1311 value_options = property(value_options, doc=value_options.__doc__)
1312
1314 return '%s(%s)' % (
1315 self.__class__.__name__, list.__repr__(self))
1316
1318
1319 """
1320 Represents the values of the checked checkboxes in a group of
1321 checkboxes with the same name.
1322 """
1323
1326
1328 return iter([
1329 el.get('value')
1330 for el in self.group
1331 if 'checked' in el.attrib])
1332
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.

    Raises KeyError if the group has no checkbox with that value.
    """
    for box in self.group:
        if box.get('value') != value:
            continue
        box.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1340
1342 for el in self.group:
1343 if el.get('value') == value:
1344 if 'checked' in el.attrib:
1345 del el.attrib['checked']
1346 else:
1347 raise KeyError(
1348 "The checkbox with value %r was already unchecked" % value)
1349 break
1350 else:
1351 raise KeyError(
1352 "No checkbox with value %r" % value)
1353
1355 return '<%s {%s} for checkboxes name=%r>' % (
1356 self.__class__.__name__,
1357 ', '.join([repr(v) for v in self]),
1358 self.group.name)
1359
1443
1444 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1445
1447 """
1448 Represents a ``<label>`` element.
1449
1450 Label elements are linked to other elements with their ``for``
1451 attribute. You can access this element with ``label.for_element``.
1452 """
1453
1455 """
1456 Get/set the element this label points to. Return None if it
1457 can't be found.
1458 """
1459 id = self.get('for')
1460 if not id:
1461 return None
1462 return self.body.get_element_by_id(id)
1464 id = other.get('id')
1465 if not id:
1466 raise TypeError(
1467 "Element %r has no id attribute" % other)
1468 self.set('for', id)
1472 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1473 doc=_for_element__get.__doc__)
1474
1475 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1476
1477
1478
1479
1480
1495
1497 """Convert all tags in an XHTML tree to HTML by removing their
1498 XHTML namespace.
1499 """
1500 try:
1501 xhtml = xhtml.getroot()
1502 except AttributeError:
1503 pass
1504 prefix = "{%s}" % XHTML_NAMESPACE
1505 prefix_len = len(prefix)
1506 for el in xhtml.iter(prefix + "*"):
1507 el.tag = el.tag[prefix_len:]
1508
1509
1510
# Bound ``sub`` methods of two compiled patterns that match a
# ``<meta http-equiv="Content-Type" ...>`` tag.  The first pattern is a
# text pattern; the second is the same pattern encoded to ASCII.
# tostring() uses them to strip the tag from str and non-str output
# respectively.
1511 __str_replace_meta_content_type = re.compile(
1512 r'<meta http-equiv="Content-Type"[^>]*>').sub
1513 __bytes_replace_meta_content_type = re.compile(
1514 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1515
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: when serialising with ``method='html'`` and
    include_meta_content_type is false (the default), any
    ``<meta http-equiv="Content-Type" ...>`` tag found in the serialised
    output is stripped from it.  If include_meta_content_type is true,
    or for any other output method, the serialiser output is returned
    unmodified.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding, with_tail=with_tail,
                          doctype=doctype)
    # Only HTML output with the meta tag unwanted needs post-processing.
    if method != 'html' or include_meta_content_type:
        return html
    # Use the substitution helper that matches the type of the output.
    if isinstance(html, str):
        return __str_replace_meta_content_type('', html)
    return __bytes_replace_meta_content_type(bytes(), html)
1588
1589 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1590
1592 """
1593 Open the HTML document in a web browser, saving it to a temporary
1594 file to open it. Note that this does not delete the file after
1595 use. This is mainly meant for debugging.
1596 """
1597 import os
1598 import webbrowser
1599 import tempfile
1600 if not isinstance(doc, etree._ElementTree):
1601 doc = etree.ElementTree(doc)
1602 handle, fn = tempfile.mkstemp(suffix='.html')
1603 f = os.fdopen(handle, 'wb')
1604 try:
1605 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1606 finally:
1607
1608 f.close()
1609 url = 'file://' + fn.replace(os.path.sep, '/')
1610 print(url)
1611 webbrowser.open(url)
1612
1613
1614
1615
1616
1618 """An HTML parser that is configured to return lxml.html Element
1619 objects.
1620 """
1624
1626 """An XML parser that is configured to return lxml.html Element
1627 objects.
1628
1629 Note that this parser is not really XHTML aware unless you let it
1630 load a DTD that declares the HTML entities. To do this, make sure
1631 you have the XHTML DTDs installed in your catalogs, and create the
1632 parser like this::
1633
1634 >>> parser = XHTMLParser(load_dtd=True)
1635
1636 If you additionally want to validate the document, use this::
1637
1638 >>> parser = XHTMLParser(dtd_validation=True)
1639
1640 For catalog support, see http://www.xmlsoft.org/catalog.html.
1641 """
1645
1647 """Create a new HTML Element.
1648
1649 This can also be used for XHTML documents.
1650 """
1651 v = html_parser.makeelement(*args, **kw)
1652 return v
1653
1654 html_parser = HTMLParser()
1655 xhtml_parser = XHTMLParser()
1656