1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 try:
7 from urlparse import urljoin
8 except ImportError:
9
10 from urllib.parse import urljoin
11 import copy
12 from lxml import etree
13 from lxml.html import defs
14 from lxml import cssselect
15 from lxml.html._setmixin import SetMixin
16 try:
17 from UserDict import DictMixin
18 except ImportError:
19
20 from lxml.html._dictmixin import DictMixin
21 try:
22 set
23 except NameError:
24
25 from sets import Set as set
26 try:
27 bytes = __builtins__["bytes"]
28 except (KeyError, NameError):
29
30 bytes = str
31 try:
32 unicode = __builtins__["unicode"]
33 except (KeyError, NameError):
34
35 unicode = str
36 try:
37 basestring = __builtins__["basestring"]
38 except (KeyError, NameError):
39
40 basestring = (str, bytes)
41
43 if not s:
44 return s
45 import sys
46 if sys.version_info[0] >= 3:
47 sub = re.compile(r"^(\s*)u'", re.M).sub
48 else:
49 sub = re.compile(r"^(\s*)b'", re.M).sub
50 return sub(r"\1'", s)
51
# Names exported by ``from lxml.html import *``.  Each public name is
# listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
57
58 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
59
60 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
61 namespaces={'x':XHTML_NAMESPACE})
62 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
63 namespaces={'x':XHTML_NAMESPACE})
64 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
65 namespaces={'x':XHTML_NAMESPACE})
66
67 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
68 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
69 _collect_string_content = etree.XPath("string()")
70 _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
71 _css_import_re = re.compile(r'@import "(.*?)"')
72 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
73 namespaces={'x':XHTML_NAMESPACE})
74 _archive_re = re.compile(r'[^ ]+')
75
77 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
78 return s[1:-1], pos+1
79 else:
80 return s,pos
81
91
97
99
101 """
102 Returns the base URL, given when the page was parsed.
103
104 Use with ``urlparse.urljoin(el.base_url, href)`` to get
105 absolute URLs.
106 """
107 return self.getroottree().docinfo.URL
108 base_url = property(base_url, doc=base_url.__doc__)
109
115 forms = property(forms, doc=forms.__doc__)
116
118 """
119 Return the <body> element. Can be called from a child element
120 to get the document's head.
121 """
122 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
123 body = property(body, doc=body.__doc__)
124
126 """
127 Returns the <head> element. Can be called from a child
128 element to get the document's head.
129 """
130 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
131 head = property(head, doc=head.__doc__)
132
134 """
135 Get or set any <label> element associated with this element.
136 """
137 id = self.get('id')
138 if not id:
139 return None
140 result = _label_xpath(self, id=id)
141 if not result:
142 return None
143 else:
144 return result[0]
146 id = self.get('id')
147 if not id:
148 raise TypeError(
149 "You cannot set a label for an element (%r) that has no id"
150 % self)
151 if _nons(label.tag) != 'label':
152 raise TypeError(
153 "You can only assign label to a label element (not %r)"
154 % label)
155 label.set('for', id)
160 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
161
163 """
164 Removes this element from the tree, including its children and
165 text. The tail text is joined to the previous element or
166 parent.
167 """
168 parent = self.getparent()
169 assert parent is not None
170 if self.tail:
171 previous = self.getprevious()
172 if previous is None:
173 parent.text = (parent.text or '') + self.tail
174 else:
175 previous.tail = (previous.tail or '') + self.tail
176 parent.remove(self)
177
179 """
180 Remove the tag, but not its children or text. The children and text
181 are merged into the parent.
182
183 Example::
184
185 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
186 >>> h.find('.//b').drop_tag()
187 >>> print(tostring(h, encoding=unicode))
188 <div>Hello World!</div>
189 """
190 parent = self.getparent()
191 assert parent is not None
192 previous = self.getprevious()
193 if self.text and isinstance(self.tag, basestring):
194
195 if previous is None:
196 parent.text = (parent.text or '') + self.text
197 else:
198 previous.tail = (previous.tail or '') + self.text
199 if self.tail:
200 if len(self):
201 last = self[-1]
202 last.tail = (last.tail or '') + self.tail
203 elif previous is None:
204 parent.text = (parent.text or '') + self.tail
205 else:
206 previous.tail = (previous.tail or '') + self.tail
207 index = parent.index(self)
208 parent[index:index+1] = self[:]
209
211 """
212 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
213 """
214 rel = rel.lower()
215 return [el for el in _rel_links_xpath(self)
216 if el.get('rel').lower() == rel]
217
219 """
220 Find any elements with the given class name.
221 """
222 return _class_xpath(self, class_name=class_name)
223
225 """
226 Get the first element in a document with the given id. If none is
227 found, return the default argument if provided or raise KeyError
228 otherwise.
229
230 Note that there can be more than one element with the same id,
231 and this isn't uncommon in HTML documents found in the wild.
232 Browsers return only the first match, and this function does
233 the same.
234 """
235 try:
236
237
238 return _id_xpath(self, id=id)[0]
239 except IndexError:
240 if default:
241 return default[0]
242 else:
243 raise KeyError(id)
244
def text_content(self):
    """
    Return the text content of this element, including the text found
    in any of its children.
    """
    # Evaluates the precompiled ``string()`` XPath against this element.
    collected = _collect_string_content(self)
    return collected
250
252 """
253 Run the CSS expression on this element and its children,
254 returning a list of the results.
255
256 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
257 that pre-compiling the expression can provide a substantial
258 speedup.
259 """
260 return cssselect.CSSSelector(expr)(self)
261
262
263
264
265
267 """
268 Make all links in the document absolute, given the
269 ``base_url`` for the document (the full URL where the document
270 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
271
272 If ``resolve_base_href`` is true, then any ``<base href>``
273 tags in the document are used *and* removed from the document.
274 If it is false then any such tag is ignored.
275 """
276 if base_url is None:
277 base_url = self.base_url
278 if base_url is None:
279 raise TypeError(
280 "No base_url given, and the document has no base_url")
281 if resolve_base_href:
282 self.resolve_base_href()
283 def link_repl(href):
284 return urljoin(base_url, href)
285 self.rewrite_links(link_repl)
286
288 """
289 Find any ``<base href>`` tag in the document, and apply its
290 values to all links found in the document. Also remove the
291 tag once it has been applied.
292 """
293 base_href = None
294 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
295 for b in basetags:
296 base_href = b.get('href')
297 b.drop_tree()
298 if not base_href:
299 return
300 self.make_links_absolute(base_href, resolve_base_href=False)
301
303 """
304 Yield (element, attribute, link, pos), where attribute may be None
305 (indicating the link is in the text). ``pos`` is the position
306 where the link occurs; often 0, but sometimes something else in
307 the case of links in stylesheets or style tags.
308
309 Note: <base href> is *not* taken into account in any way. The
310 link you get is exactly the link in the document.
311
312 Note: multiple links inside of a single text string or
313 attribute value are returned in reversed order. This makes it
314 possible to replace or delete them from the text string value
315 based on their reported text positions. Otherwise, a
316 modification at one text position can change the positions of
317 links reported later on.
318 """
319 link_attrs = defs.link_attrs
320 for el in self.iter():
321 attribs = el.attrib
322 tag = _nons(el.tag)
323 if tag != 'object':
324 for attrib in link_attrs:
325 if attrib in attribs:
326 yield (el, attrib, attribs[attrib], 0)
327 elif tag == 'object':
328 codebase = None
329
330
331 if 'codebase' in attribs:
332 codebase = el.get('codebase')
333 yield (el, 'codebase', codebase, 0)
334 for attrib in 'classid', 'data':
335 if attrib in attribs:
336 value = el.get(attrib)
337 if codebase is not None:
338 value = urljoin(codebase, value)
339 yield (el, attrib, value, 0)
340 if 'archive' in attribs:
341 for match in _archive_re.finditer(el.get('archive')):
342 value = match.group(0)
343 if codebase is not None:
344 value = urljoin(codebase, value)
345 yield (el, 'archive', value, match.start())
346 if tag == 'param':
347 valuetype = el.get('valuetype') or ''
348 if valuetype.lower() == 'ref':
349
350
351
352
353
354
355 yield (el, 'value', el.get('value'), 0)
356 if tag == 'style' and el.text:
357 urls = [
358 _unquote_match(match.group(1), match.start(1))
359 for match in _css_url_re.finditer(el.text)
360 ] + [
361 (match.group(1), match.start(1))
362 for match in _css_import_re.finditer(el.text)
363 ]
364 if urls:
365
366 urls = [ (start, url) for (url, start) in urls ]
367 urls.sort()
368
369
370 urls.reverse()
371 for start, url in urls:
372 yield (el, None, url, start)
373 if 'style' in attribs:
374 urls = list(_css_url_re.finditer(attribs['style']))
375 if urls:
376
377 for match in urls[::-1]:
378 url, start = _unquote_match(match.group(1), match.start(1))
379 yield (el, 'style', url, start)
380
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Any further links that were
    found in that same attribute or text are then skipped, because the
    value that contained them no longer exists.

    ``link_repl_func`` receives each link with surrounding whitespace
    stripped.  Returns None; the tree is modified in place.
    """
    if base_href is not None:
        # Resolve everything against the given base first, so that the
        # callback below only ever sees links relative to ``base_href``.
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()
    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if attrib is None:
            # The link lives in the element's text (e.g. a <style> body).
            text = el.text
            if not text:
                # The text was already removed for an earlier link of
                # this element; splicing at ``pos`` would produce garbage.
                continue
            if new_link is None:
                el.text = ''
            else:
                el.text = text[:pos] + new_link + text[pos+len(link):]
            continue
        if attrib not in el.attrib:
            # The attribute was already deleted for an earlier link in
            # the same value (several URLs can share one attribute);
            # deleting or reading it again would raise KeyError.
            continue
        if new_link is None:
            del el.attrib[attrib]
            continue
        cur = el.attrib[attrib]
        if not pos and len(cur) == len(link):
            # Most common case: the link is the entire attribute value.
            el.attrib[attrib] = new_link
        else:
            el.attrib[attrib] = cur[:pos] + new_link + cur[pos+len(link):]
427
428
430 """
431 An object that represents a method on an element as a function;
432 the function takes either an element or an HTML string. It
433 returns whatever the function normally returns, or if the function
434 works in-place (and so returns None) it returns a serialized form
435 of the resulting document.
436 """
442 result_type = type(doc)
443 if isinstance(doc, basestring):
444 if 'copy' in kw:
445 raise TypeError(
446 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
447 doc = fromstring(doc, **kw)
448 else:
449 if 'copy' in kw:
450 make_a_copy = kw.pop('copy')
451 else:
452 make_a_copy = self.copy
453 if make_a_copy:
454 doc = copy.deepcopy(doc)
455 meth = getattr(doc, self.name)
456 result = meth(*args, **kw)
457
458 if result is None:
459
460 return _transform_result(result_type, doc)
461 else:
462 return result
463
464 find_rel_links = _MethodFunc('find_rel_links', copy=False)
465 find_class = _MethodFunc('find_class', copy=False)
466 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
467 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
468 iterlinks = _MethodFunc('iterlinks', copy=False)
469 rewrite_links = _MethodFunc('rewrite_links', copy=True)
470
473
476
479
482
483
485 """A lookup scheme for HTML Element classes.
486
487 To create a lookup instance with different Element classes, pass a tag
488 name mapping of Element classes in the ``classes`` keyword argument and/or
489 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
490 The special key '*' denotes a Mixin class that should be mixed into all
491 Element classes.
492 """
493 _default_element_classes = {}
494
495 - def __init__(self, classes=None, mixins=None):
512
513 - def lookup(self, node_type, document, namespace, name):
524
525
526
527
528
537
540 """
541 Parses several HTML elements, returning a list of elements.
542
543 The first item in the list may be a string (though leading
544 whitespace is removed). If no_leading_text is true, then it will
545 be an error if there is leading text, and it will always be a list
546 of only elements.
547
548 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
549 """
550 if parser is None:
551 parser = html_parser
552
553 start = html[:20].lstrip().lower()
554 if not start.startswith('<html') and not start.startswith('<!doctype'):
555 html = '<html><body>%s</body></html>' % html
556 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
557 assert _nons(doc.tag) == 'html'
558 bodies = [e for e in doc if _nons(e.tag) == 'body']
559 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
560 body = bodies[0]
561 elements = []
562 if no_leading_text and body.text and body.text.strip():
563 raise etree.ParserError(
564 "There is leading text: %r" % body.text)
565 if body.text and body.text.strip():
566 elements.append(body.text)
567 elements.extend(body)
568
569
570 return elements
571
574 """
575 Parses a single HTML element; it is an error if there is more than
576 one element, or if anything but whitespace precedes or follows the
577 element.
578
579 If create_parent is true (or is a tag name) then a parent node
580 will be created to encapsulate the HTML in a single element. In
581 this case, leading or trailing text is allowed.
582
583 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
584 """
585 if parser is None:
586 parser = html_parser
587
588 accept_leading_text = bool(create_parent)
589
590 elements = fragments_fromstring(
591 html, parser=parser, no_leading_text=not accept_leading_text,
592 base_url=base_url, **kw)
593
594 if create_parent:
595 if not isinstance(create_parent, basestring):
596 create_parent = 'div'
597 new_root = Element(create_parent)
598 if elements:
599 if isinstance(elements[0], basestring):
600 new_root.text = elements[0]
601 del elements[0]
602 new_root.extend(elements)
603 return new_root
604
605 if not elements:
606 raise etree.ParserError('No elements found')
607 if len(elements) > 1:
608 raise etree.ParserError(
609 "Multiple elements found (%s)"
610 % ', '.join([_element_name(e) for e in elements]))
611 el = elements[0]
612 if el.tail and el.tail.strip():
613 raise etree.ParserError(
614 "Element followed by text: %r" % el.tail)
615 el.tail = None
616 return el
617
618 -def fromstring(html, base_url=None, parser=None, **kw):
680
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The result is a tree rather than an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL of the document,
    which is mostly useful when parsing from a file-like object.
    When no ``parser`` is given, the module-level ``html_parser`` is
    used.  Extra keyword arguments are passed on to ``etree.parse()``.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    tree = etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
    return tree
693
701
703 if isinstance(el, etree.CommentBase):
704 return 'comment'
705 elif isinstance(el, basestring):
706 return 'string'
707 else:
708 return _nons(el.tag)
709
710
711
712
713
818
819 HtmlElementClassLookup._default_element_classes['form'] = FormElement
820
857
859 if not url:
860 raise ValueError("cannot submit, no URL provided")
861
862 try:
863 from urllib import urlencode, urlopen
864 except ImportError:
865 from urllib.request import urlopen
866 from urllib.parse import urlencode
867 if method == 'GET':
868 if '?' in url:
869 url += '&'
870 else:
871 url += '?'
872 url += urlencode(values)
873 data = None
874 else:
875 data = urlencode(values)
876 return urlopen(url, data)
877
879
887 raise KeyError(
888 "You cannot remove keys from ElementDict")
892 return item in self.inputs
893
895 return '<%s for form %s>' % (
896 self.__class__.__name__,
897 self.inputs.form._name())
898
964
992
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    value can be read and written through ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace are serialised as XML,
        # everything else as HTML.
        method = 'html'
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        pieces = [self.text or '']
        for child in self:
            # Child elements are serialised back to markup and appended
            # after the leading text.
            pieces.append(etree.tostring(child, method=method, encoding=unicode))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop all child elements, then store the new value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value leaves an empty, childless element.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1019
1020 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1021
1023 """
1024 ``<select>`` element. You can get the name with ``.name``.
1025
1026 ``.value`` will be the value of the selected option, unless this
1027 is a multi-select element (``<select multiple>``), in which case
1028 it will be a set-like object. In either case ``.value_options``
1029 gives the possible values.
1030
1031 The boolean attribute ``.multiple`` shows if this is a
1032 multi-select.
1033 """
1034
1036 """
1037 Get/set the value of this select (the selected option).
1038
1039 If this is a multi-select, this is a set-like object that
1040 represents all the selected options.
1041 """
1042 if self.multiple:
1043 return MultipleSelectOptions(self)
1044 for el in _options_xpath(self):
1045 if el.get('selected') is not None:
1046 value = el.get('value')
1047 if value is None:
1048 value = el.text or ''
1049 if value:
1050 value = value.strip()
1051 return value
1052 return None
1053
1055 if self.multiple:
1056 if isinstance(value, basestring):
1057 raise TypeError(
1058 "You must pass in a sequence")
1059 self.value.clear()
1060 self.value.update(value)
1061 return
1062 if value is not None:
1063 value = value.strip()
1064 for el in _options_xpath(self):
1065 opt_value = el.get('value')
1066 if opt_value is None:
1067 opt_value = el.text or ''
1068 if opt_value:
1069 opt_value = opt_value.strip()
1070 if opt_value == value:
1071 checked_option = el
1072 break
1073 else:
1074 raise ValueError(
1075 "There is no option with the value of %r" % value)
1076 for el in _options_xpath(self):
1077 if 'selected' in el.attrib:
1078 del el.attrib['selected']
1079 if value is not None:
1080 checked_option.set('selected', '')
1081
1088
1089 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1090
1105 value_options = property(value_options, doc=value_options.__doc__)
1106
1108 """
1109 Boolean attribute: is there a ``multiple`` attribute on this element.
1110 """
1111 return 'multiple' in self.attrib
1113 if value:
1114 self.set('multiple', '')
1115 elif 'multiple' in self.attrib:
1116 del self.attrib['multiple']
1117 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1118
1119 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1120
1122 """
1123 Represents all the selected options in a ``<select multiple>`` element.
1124
1125 You can add to this set-like option to select an option, or remove
1126 to unselect the option.
1127 """
1128
1130 self.select = select
1131
1133 """
1134 Iterator of all the ``<option>`` elements.
1135 """
1136 return iter(_options_xpath(self.select))
1137 options = property(options)
1138
1140 for option in self.options:
1141 if 'selected' in option.attrib:
1142 opt_value = option.get('value')
1143 if opt_value is None:
1144 opt_value = option.text or ''
1145 if opt_value:
1146 opt_value = opt_value.strip()
1147 yield opt_value
1148
def add(self, item):
    """
    Mark the first option whose value equals ``item`` as selected.

    An option's value is its ``value`` attribute or, when that is
    missing, its text; either one is compared after stripping
    surrounding whitespace.  Raises ValueError when no option matches.
    """
    for candidate in self.options:
        raw = candidate.get('value')
        if raw is None:
            raw = candidate.text or ''
        if raw:
            raw = raw.strip()
        if raw != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
1162
1164 for option in self.options:
1165 opt_value = option.get('value')
1166 if opt_value is None:
1167 opt_value = option.text or ''
1168 if opt_value:
1169 opt_value = opt_value.strip()
1170 if opt_value == item:
1171 if 'selected' in option.attrib:
1172 del option.attrib['selected']
1173 else:
1174 raise ValueError(
1175 "The option %r is not currently selected" % item)
1176 break
1177 else:
1178 raise ValueError(
1179 "There is not option with the value %r" % item)
1180
1182 return '<%s {%s} for select name=%r>' % (
1183 self.__class__.__name__,
1184 ', '.join([repr(v) for v in self]),
1185 self.select.name)
1186
1188 """
1189 This object represents several ``<input type=radio>`` elements
1190 that have the same name.
1191
1192 You can use this like a list, but also use the property
1193 ``.value`` to check/uncheck inputs. Also you can use
1194 ``.value_options`` to get the possible values.
1195 """
1196
1198 """
1199 Get/set the value, which checks the radio with that value (and
1200 unchecks any other value).
1201 """
1202 for el in self:
1203 if 'checked' in el.attrib:
1204 return el.get('value')
1205 return None
1206
1208 if value is not None:
1209 for el in self:
1210 if el.get('value') == value:
1211 checked_option = el
1212 break
1213 else:
1214 raise ValueError(
1215 "There is no radio input with the value %r" % value)
1216 for el in self:
1217 if 'checked' in el.attrib:
1218 del el.attrib['checked']
1219 if value is not None:
1220 checked_option.set('checked', '')
1221
1224
1225 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1226
1228 """
1229 Returns a list of all the possible values.
1230 """
1231 return [el.get('value') for el in self]
1232 value_options = property(value_options, doc=value_options.__doc__)
1233
1235 return '%s(%s)' % (
1236 self.__class__.__name__,
1237 list.__repr__(self))
1238
1240 """
1241 Represents a group of checkboxes (``<input type=checkbox>``) that
1242 have the same name.
1243
1244 In addition to using this like a list, the ``.value`` attribute
1245 returns a set-like object that you can add to or remove from to
1246 check and uncheck checkboxes. You can also use ``.value_options``
1247 to get the possible values.
1248 """
1249
1251 """
1252 Return a set-like object that can be modified to check or
1253 uncheck individual checkboxes according to their value.
1254 """
1255 return CheckboxValues(self)
1265 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1266
1268 return '%s(%s)' % (
1269 self.__class__.__name__, list.__repr__(self))
1270
1272
1273 """
1274 Represents the values of the checked checkboxes in a group of
1275 checkboxes with the same name.
1276 """
1277
1280
1282 return iter([
1283 el.get('value')
1284 for el in self.group
1285 if 'checked' in el.attrib])
1286
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.  Raises KeyError when the group has no such
    checkbox.
    """
    for checkbox in self.group:
        if checkbox.get('value') != value:
            continue
        checkbox.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1294
1296 for el in self.group:
1297 if el.get('value') == value:
1298 if 'checked' in el.attrib:
1299 del el.attrib['checked']
1300 else:
1301 raise KeyError(
1302 "The checkbox with value %r was already unchecked" % value)
1303 break
1304 else:
1305 raise KeyError(
1306 "No checkbox with value %r" % value)
1307
1309 return '<%s {%s} for checkboxes name=%r>' % (
1310 self.__class__.__name__,
1311 ', '.join([repr(v) for v in self]),
1312 self.group.name)
1313
1397
1398 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1399
1401 """
1402 Represents a ``<label>`` element.
1403
1404 Label elements are linked to other elements with their ``for``
1405 attribute. You can access this element with ``label.for_element``.
1406 """
1407
1409 """
1410 Get/set the element this label points to. Return None if it
1411 can't be found.
1412 """
1413 id = self.get('for')
1414 if not id:
1415 return None
1416 return self.body.get_element_by_id(id)
1418 id = other.get('id')
1419 if not id:
1420 raise TypeError(
1421 "Element %r has no id attribute" % other)
1422 self.set('for', id)
1426 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1427 doc=_for_element__get.__doc__)
1428
1429 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1430
1431
1432
1433
1434
1436 """Convert all tags in an HTML tree to XHTML by moving them to the
1437 XHTML namespace.
1438 """
1439 try:
1440 html = html.getroot()
1441 except AttributeError:
1442 pass
1443 prefix = "{%s}" % XHTML_NAMESPACE
1444 for el in html.iter():
1445 tag = el.tag
1446 if isinstance(tag, basestring):
1447 if tag[0] != '{':
1448 el.tag = prefix + tag
1449
1451 """Convert all tags in an XHTML tree to HTML by removing their
1452 XHTML namespace.
1453 """
1454 try:
1455 xhtml = xhtml.getroot()
1456 except AttributeError:
1457 pass
1458 prefix = "{%s}" % XHTML_NAMESPACE
1459 prefix_len = len(prefix)
1460 for el in xhtml.iter(prefix + "*"):
1461 el.tag = el.tag[prefix_len:]
1462
1463
1464
1465 __str_replace_meta_content_type = re.compile(
1466 r'<meta http-equiv="Content-Type"[^>]*>').sub
1467 __bytes_replace_meta_content_type = re.compile(
1468 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1469
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The Content-Type <meta> tag is only stripped from HTML output, and
    # only when the caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Use the substitution whose pattern type matches the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1542
1543 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1544
1546 """
1547 Open the HTML document in a web browser, saving it to a temporary
1548 file to open it. Note that this does not delete the file after
1549 use. This is mainly meant for debugging.
1550 """
1551 import os
1552 import webbrowser
1553 import tempfile
1554 if not isinstance(doc, etree._ElementTree):
1555 doc = etree.ElementTree(doc)
1556 handle, fn = tempfile.mkstemp(suffix='.html')
1557 f = os.fdopen(handle, 'wb')
1558 try:
1559 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1560 finally:
1561
1562 f.close()
1563 url = 'file://' + fn.replace(os.path.sep, '/')
1564 print(url)
1565 webbrowser.open(url)
1566
1567
1568
1569
1570
1572 """An HTML parser that is configured to return lxml.html Element
1573 objects.
1574 """
1578
1580 """An XML parser that is configured to return lxml.html Element
1581 objects.
1582
1583 Note that this parser is not really XHTML aware unless you let it
1584 load a DTD that declares the HTML entities. To do this, make sure
1585 you have the XHTML DTDs installed in your catalogs, and create the
1586 parser like this::
1587
1588 >>> parser = XHTMLParser(load_dtd=True)
1589
1590 If you additionally want to validate the document, use this::
1591
1592 >>> parser = XHTMLParser(dtd_validation=True)
1593
1594 For catalog support, see http://www.xmlsoft.org/catalog.html.
1595 """
1599
1601 """Create a new HTML Element.
1602
1603 This can also be used for XHTML documents.
1604 """
1605 v = html_parser.makeelement(*args, **kw)
1606 return v
1607
1608 html_parser = HTMLParser()
1609 xhtml_parser = XHTMLParser()
1610