1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 try:
7 from urlparse import urljoin
8 except ImportError:
9
10 from urllib.parse import urljoin
11 import copy
12 from lxml import etree
13 from lxml.html import defs
14 from lxml import cssselect
15 from lxml.html._setmixin import SetMixin
16 try:
17 from UserDict import DictMixin
18 except ImportError:
19
20 from lxml.html._dictmixin import DictMixin
21 try:
22 set
23 except NameError:
24
25 from sets import Set as set
26 try:
27 bytes = __builtins__["bytes"]
28 except (KeyError, NameError):
29
30 bytes = str
31 try:
32 unicode = __builtins__["unicode"]
33 except (KeyError, NameError):
34
35 unicode = str
36 try:
37 basestring = __builtins__["basestring"]
38 except (KeyError, NameError):
39
40 basestring = (str, bytes)
41
43 if not s:
44 return s
45 import sys
46 if sys.version_info[0] >= 3:
47 sub = re.compile(r"^(\s*)u'", re.M).sub
48 else:
49 sub = re.compile(r"^(\s*)b'", re.M).sub
50 return sub(r"\1'", s)
51
# Public names exported by ``from lxml.html import *``.  Each name is
# listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
57
58 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
59
60 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
61 namespaces={'x':XHTML_NAMESPACE})
62 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
63 namespaces={'x':XHTML_NAMESPACE})
64 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
65 namespaces={'x':XHTML_NAMESPACE})
66
67 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
68 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
69 _collect_string_content = etree.XPath("string()")
70 _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
71 _css_import_re = re.compile(r'@import "(.*?)"')
72 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
73 namespaces={'x':XHTML_NAMESPACE})
74 _archive_re = re.compile(r'[^ ]+')
75
77 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
78 return s[1:-1], pos+1
79 else:
80 return s,pos
81
91
97
99
101 """
102 Returns the base URL, given when the page was parsed.
103
104 Use with ``urlparse.urljoin(el.base_url, href)`` to get
105 absolute URLs.
106 """
107 return self.getroottree().docinfo.URL
108 base_url = property(base_url, doc=base_url.__doc__)
109
115 forms = property(forms, doc=forms.__doc__)
116
118 """
119 Return the <body> element. Can be called from a child element
120 to get the document's body.
121 """
122 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
123 body = property(body, doc=body.__doc__)
124
126 """
127 Returns the <head> element. Can be called from a child
128 element to get the document's head.
129 """
130 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
131 head = property(head, doc=head.__doc__)
132
134 """
135 Get or set any <label> element associated with this element.
136 """
137 id = self.get('id')
138 if not id:
139 return None
140 result = _label_xpath(self, id=id)
141 if not result:
142 return None
143 else:
144 return result[0]
146 id = self.get('id')
147 if not id:
148 raise TypeError(
149 "You cannot set a label for an element (%r) that has no id"
150 % self)
151 if _nons(label.tag) != 'label':
152 raise TypeError(
153 "You can only assign label to a label element (not %r)"
154 % label)
155 label.set('for', id)
160 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
161
163 """
164 Removes this element from the tree, including its children and
165 text. The tail text is joined to the previous element or
166 parent.
167 """
168 parent = self.getparent()
169 assert parent is not None
170 if self.tail:
171 previous = self.getprevious()
172 if previous is None:
173 parent.text = (parent.text or '') + self.tail
174 else:
175 previous.tail = (previous.tail or '') + self.tail
176 parent.remove(self)
177
179 """
180 Remove the tag, but not its children or text. The children and text
181 are merged into the parent.
182
183 Example::
184
185 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
186 >>> h.find('.//b').drop_tag()
187 >>> print(tostring(h, encoding=unicode))
188 <div>Hello World!</div>
189 """
190 parent = self.getparent()
191 assert parent is not None
192 previous = self.getprevious()
193 if self.text and isinstance(self.tag, basestring):
194
195 if previous is None:
196 parent.text = (parent.text or '') + self.text
197 else:
198 previous.tail = (previous.tail or '') + self.text
199 if self.tail:
200 if len(self):
201 last = self[-1]
202 last.tail = (last.tail or '') + self.tail
203 elif previous is None:
204 parent.text = (parent.text or '') + self.tail
205 else:
206 previous.tail = (previous.tail or '') + self.tail
207 index = parent.index(self)
208 parent[index:index+1] = self[:]
209
211 """
212 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
213 """
214 rel = rel.lower()
215 return [el for el in _rel_links_xpath(self)
216 if el.get('rel').lower() == rel]
217
219 """
220 Find any elements with the given class name.
221 """
222 return _class_xpath(self, class_name=class_name)
223
225 """
226 Get the first element in a document with the given id. If none is
227 found, return the default argument if provided or raise KeyError
228 otherwise.
229
230 Note that there can be more than one element with the same id,
231 and this isn't uncommon in HTML documents found in the wild.
232 Browsers return only the first match, and this function does
233 the same.
234 """
235 try:
236
237
238 return _id_xpath(self, id=id)[0]
239 except IndexError:
240 if default:
241 return default[0]
242 else:
243 raise KeyError(id)
244
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its descendants, as computed by the XPath ``string()`` function.
    """
    content = _collect_string_content(self)
    return content
250
252 """
253 Run the CSS expression on this element and its children,
254 returning a list of the results.
255
256 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
257 that pre-compiling the expression can provide a substantial
258 speedup.
259 """
260 return cssselect.CSSSelector(expr)(self)
261
262
263
264
265
267 """
268 Make all links in the document absolute, given the
269 ``base_url`` for the document (the full URL where the document
270 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
271
272 If ``resolve_base_href`` is true, then any ``<base href>``
273 tags in the document are used *and* removed from the document.
274 If it is false then any such tag is ignored.
275 """
276 if base_url is None:
277 base_url = self.base_url
278 if base_url is None:
279 raise TypeError(
280 "No base_url given, and the document has no base_url")
281 if resolve_base_href:
282 self.resolve_base_href()
283 def link_repl(href):
284 return urljoin(base_url, href)
285 self.rewrite_links(link_repl)
286
288 """
289 Find any ``<base href>`` tag in the document, and apply its
290 values to all links found in the document. Also remove the
291 tag once it has been applied.
292 """
293 base_href = None
294 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
295 for b in basetags:
296 base_href = b.get('href')
297 b.drop_tree()
298 if not base_href:
299 return
300 self.make_links_absolute(base_href, resolve_base_href=False)
301
303 """
304 Yield (element, attribute, link, pos), where attribute may be None
305 (indicating the link is in the text). ``pos`` is the position
306 where the link occurs; often 0, but sometimes something else in
307 the case of links in stylesheets or style tags.
308
309 Note: <base href> is *not* taken into account in any way. The
310 link you get is exactly the link in the document.
311 """
312 link_attrs = defs.link_attrs
313 for el in self.iter():
314 attribs = el.attrib
315 tag = _nons(el.tag)
316 if tag != 'object':
317 for attrib in link_attrs:
318 if attrib in attribs:
319 yield (el, attrib, attribs[attrib], 0)
320 elif tag == 'object':
321 codebase = None
322
323
324 if 'codebase' in attribs:
325 codebase = el.get('codebase')
326 yield (el, 'codebase', codebase, 0)
327 for attrib in 'classid', 'data':
328 if attrib in attribs:
329 value = el.get(attrib)
330 if codebase is not None:
331 value = urljoin(codebase, value)
332 yield (el, attrib, value, 0)
333 if 'archive' in attribs:
334 for match in _archive_re.finditer(el.get('archive')):
335 value = match.group(0)
336 if codebase is not None:
337 value = urljoin(codebase, value)
338 yield (el, 'archive', value, match.start())
339 if tag == 'param':
340 valuetype = el.get('valuetype') or ''
341 if valuetype.lower() == 'ref':
342
343
344
345
346
347
348 yield (el, 'value', el.get('value'), 0)
349 if tag == 'style' and el.text:
350 for match in _css_url_re.finditer(el.text):
351 url, start = _unquote_match(match.group(1), match.start(1))
352 yield (el, None, url, start)
353 for match in _css_import_re.finditer(el.text):
354 yield (el, None, match.group(1), match.start(1))
355 if 'style' in attribs:
356 for match in _css_url_re.finditer(attribs['style']):
357 url, start = _unquote_match(match.group(1), match.start(1))
358 yield (el, 'style', url, start)
359
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Any further links that were
    found in the same attribute or text are then left alone (the
    function is still called for them, but its result is ignored).

    Several links may live in one text or attribute value (for example
    multiple ``url(...)`` references in a stylesheet).  The positions
    reported by ``iterlinks()`` refer to the unmodified value, so this
    method keeps track of the length changes it has already made and
    shifts later positions accordingly.
    """
    if base_href is not None:
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # Take a snapshot first: every reported position then refers to the
    # document as it was before any replacement, and the elements stay
    # referenced so that ``id(el)`` is a stable key below.
    links = list(self.iterlinks())
    # (id(el), attrib) -> [(original position, length delta), ...]
    edits = {}
    # (id(el), attrib) keys whose attribute/text was removed entirely
    removed = set()

    for el, attrib, link, pos in links:
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        key = (id(el), attrib)
        if key in removed:
            # Nothing left to rewrite in this attribute/text.
            continue
        if new_link is None:
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed.add(key)
            continue

        done = edits.setdefault(key, [])
        # Shift by every earlier replacement located before this link.
        start = pos + sum([delta for at, delta in done if at < pos])
        end = start + len(link)
        if attrib is None:
            cur = el.text
            el.text = cur[:start] + new_link + cur[end:]
        else:
            cur = el.attrib[attrib]
            el.attrib[attrib] = cur[:start] + new_link + cur[end:]
        done.append((pos, len(new_link) - len(link)))
406
407
409 """
410 An object that represents a method on an element as a function;
411 the function takes either an element or an HTML string. It
412 returns whatever the function normally returns, or if the function
413 works in-place (and so returns None) it returns a serialized form
414 of the resulting document.
415 """
421 result_type = type(doc)
422 if isinstance(doc, basestring):
423 if 'copy' in kw:
424 raise TypeError(
425 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
426 doc = fromstring(doc, **kw)
427 else:
428 if 'copy' in kw:
429 copy = kw.pop('copy')
430 else:
431 copy = self.copy
432 if copy:
433 doc = copy.deepcopy(doc)
434 meth = getattr(doc, self.name)
435 result = meth(*args, **kw)
436
437 if result is None:
438
439 return _transform_result(result_type, doc)
440 else:
441 return result
442
443 find_rel_links = _MethodFunc('find_rel_links', copy=False)
444 find_class = _MethodFunc('find_class', copy=False)
445 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
446 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
447 iterlinks = _MethodFunc('iterlinks', copy=False)
448 rewrite_links = _MethodFunc('rewrite_links', copy=True)
449
452
455
458
461
462
464 """A lookup scheme for HTML Element classes.
465
466 To create a lookup instance with different Element classes, pass a tag
467 name mapping of Element classes in the ``classes`` keyword argument and/or
468 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
469 The special key '*' denotes a Mixin class that should be mixed into all
470 Element classes.
471 """
472 _default_element_classes = {}
473
474 - def __init__(self, classes=None, mixins=None):
491
492 - def lookup(self, node_type, document, namespace, name):
503
504
505
506
507
516
519 """
520 Parses several HTML elements, returning a list of elements.
521
522 The first item in the list may be a string (though leading
523 whitespace is removed). If no_leading_text is true, then it will
524 be an error if there is leading text, and it will always be a list
525 of only elements.
526
527 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
528 """
529 if parser is None:
530 parser = html_parser
531
532 start = html[:20].lstrip().lower()
533 if not start.startswith('<html') and not start.startswith('<!doctype'):
534 html = '<html><body>%s</body></html>' % html
535 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
536 assert _nons(doc.tag) == 'html'
537 bodies = [e for e in doc if _nons(e.tag) == 'body']
538 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
539 body = bodies[0]
540 elements = []
541 if no_leading_text and body.text and body.text.strip():
542 raise etree.ParserError(
543 "There is leading text: %r" % body.text)
544 if body.text and body.text.strip():
545 elements.append(body.text)
546 elements.extend(body)
547
548
549 return elements
550
553 """
554 Parses a single HTML element; it is an error if there is more than
555 one element, or if anything but whitespace precedes or follows the
556 element.
557
558 If create_parent is true (or is a tag name) then a parent node
559 will be created to encapsulate the HTML in a single element.
560
561 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
562 """
563 if parser is None:
564 parser = html_parser
565 if create_parent:
566 if not isinstance(create_parent, basestring):
567 create_parent = 'div'
568 return fragment_fromstring('<%s>%s</%s>' % (
569 create_parent, html, create_parent),
570 parser=parser, base_url=base_url, **kw)
571 elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
572 base_url=base_url, **kw)
573 if not elements:
574 raise etree.ParserError(
575 "No elements found")
576 if len(elements) > 1:
577 raise etree.ParserError(
578 "Multiple elements found (%s)"
579 % ', '.join([_element_name(e) for e in elements]))
580 el = elements[0]
581 if el.tail and el.tail.strip():
582 raise etree.ParserError(
583 "Element followed by text: %r" % el.tail)
584 el.tail = None
585 return el
586
587 -def fromstring(html, base_url=None, parser=None, **kw):
649
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse an HTML document from a filename, a URL, or a file-like
    object and return the resulting *tree* (not an element); call
    ``parse(...).getroot()`` to obtain the document root.

    ``parser`` defaults to the module-level ``html_parser``.

    ``base_url`` overrides the base URL of the document, which is
    mostly useful when the input is a file-like object.  Any other
    keyword arguments are handed on to ``etree.parse()``.
    """
    if parser is not None:
        return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
    return etree.parse(filename_or_url, html_parser, base_url=base_url, **kw)
662
670
672 if isinstance(el, etree.CommentBase):
673 return 'comment'
674 elif isinstance(el, basestring):
675 return 'string'
676 else:
677 return _nons(el.tag)
678
679
680
681
682
787
788 HtmlElementClassLookup._default_element_classes['form'] = FormElement
789
822
824
825 try:
826 from urllib import urlencode, urlopen
827 except ImportError:
828 from urllib.request import urlopen
829 from urllib.parse import urlencode
830 if method == 'GET':
831 if '?' in url:
832 url += '&'
833 else:
834 url += '?'
835 url += urlencode(values)
836 data = None
837 else:
838 data = urlencode(values)
839 return urlopen(url, data)
840
842
850 raise KeyError(
851 "You cannot remove keys from ElementDict")
855 return item in self.inputs
856
858 return '<%s for form %s>' % (
859 self.__class__.__name__,
860 self.inputs.form._name())
861
927
955
956 -class TextareaElement(InputMixin, HtmlElement):
957 """
958 ``<textarea>`` element. You can get the name with ``.name`` and
959 get/set the value with ``.value``
960 """
961
962 - def _value__get(self):
963 """
964 Get/set the value (which is the contents of this element)
965 """
966 return self.text or ''
967 - def _value__set(self, value):
969 - def _value__del(self):
971 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
972
973 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
974
976 """
977 ``<select>`` element. You can get the name with ``.name``.
978
979 ``.value`` will be the value of the selected option, unless this
980 is a multi-select element (``<select multiple>``), in which case
981 it will be a set-like object. In either case ``.value_options``
982 gives the possible values.
983
984 The boolean attribute ``.multiple`` shows if this is a
985 multi-select.
986 """
987
989 """
990 Get/set the value of this select (the selected option).
991
992 If this is a multi-select, this is a set-like object that
993 represents all the selected options.
994 """
995 if self.multiple:
996 return MultipleSelectOptions(self)
997 for el in _options_xpath(self):
998 if el.get('selected') is not None:
999 value = el.get('value')
1000 if value is None:
1001 value = el.text or ''
1002 if value:
1003 value = value.strip()
1004 return value
1005 return None
1006
1008 if self.multiple:
1009 if isinstance(value, basestring):
1010 raise TypeError(
1011 "You must pass in a sequence")
1012 self.value.clear()
1013 self.value.update(value)
1014 return
1015 if value is not None:
1016 value = value.strip()
1017 for el in _options_xpath(self):
1018 opt_value = el.get('value')
1019 if opt_value is None:
1020 opt_value = el.text or ''
1021 if opt_value:
1022 opt_value = opt_value.strip()
1023 if opt_value == value:
1024 checked_option = el
1025 break
1026 else:
1027 raise ValueError(
1028 "There is no option with the value of %r" % value)
1029 for el in _options_xpath(self):
1030 if 'selected' in el.attrib:
1031 del el.attrib['selected']
1032 if value is not None:
1033 checked_option.set('selected', '')
1034
1041
1042 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1043
1058 value_options = property(value_options, doc=value_options.__doc__)
1059
1061 """
1062 Boolean attribute: is there a ``multiple`` attribute on this element.
1063 """
1064 return 'multiple' in self.attrib
1066 if value:
1067 self.set('multiple', '')
1068 elif 'multiple' in self.attrib:
1069 del self.attrib['multiple']
1070 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1071
1072 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1073
1075 """
1076 Represents all the selected options in a ``<select multiple>`` element.
1077
1078 You can add to this set-like object to select an option, or remove
1079 from it to unselect the option.
1080 """
1081
1083 self.select = select
1084
1086 """
1087 Iterator of all the ``<option>`` elements.
1088 """
1089 return iter(_options_xpath(self.select))
1090 options = property(options)
1091
1093 for option in self.options:
1094 yield option.get('value')
1095
def add(self, item):
    """
    Select the option whose ``value`` attribute equals ``item``.

    The first matching option gets an empty ``selected`` attribute;
    later options are not examined.  Raises ValueError when no
    option carries that value.
    """
    for candidate in self.options:
        if candidate.get('value') != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
1104
1106 for option in self.options:
1107 if option.get('value') == item:
1108 if 'selected' in option.attrib:
1109 del option.attrib['selected']
1110 else:
1111 raise ValueError(
1112 "The option %r is not currently selected" % item)
1113 break
1114 else:
1115 raise ValueError(
1116 "There is not option with the value %r" % item)
1117
1119 return '<%s {%s} for select name=%r>' % (
1120 self.__class__.__name__,
1121 ', '.join([repr(v) for v in self]),
1122 self.select.name)
1123
1125 """
1126 This object represents several ``<input type=radio>`` elements
1127 that have the same name.
1128
1129 You can use this like a list, but also use the property
1130 ``.value`` to check/uncheck inputs. Also you can use
1131 ``.value_options`` to get the possible values.
1132 """
1133
1135 """
1136 Get/set the value, which checks the radio with that value (and
1137 unchecks any other value).
1138 """
1139 for el in self:
1140 if 'checked' in el.attrib:
1141 return el.get('value')
1142 return None
1143
1145 if value is not None:
1146 for el in self:
1147 if el.get('value') == value:
1148 checked_option = el
1149 break
1150 else:
1151 raise ValueError(
1152 "There is no radio input with the value %r" % value)
1153 for el in self:
1154 if 'checked' in el.attrib:
1155 del el.attrib['checked']
1156 if value is not None:
1157 checked_option.set('checked', '')
1158
1161
1162 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1163
1165 """
1166 Returns a list of all the possible values.
1167 """
1168 return [el.get('value') for el in self]
1169 value_options = property(value_options, doc=value_options.__doc__)
1170
1172 return '%s(%s)' % (
1173 self.__class__.__name__,
1174 list.__repr__(self))
1175
1177 """
1178 Represents a group of checkboxes (``<input type=checkbox>``) that
1179 have the same name.
1180
1181 In addition to using this like a list, the ``.value`` attribute
1182 returns a set-like object that you can add to or remove from to
1183 check and uncheck checkboxes. You can also use ``.value_options``
1184 to get the possible values.
1185 """
1186
1188 """
1189 Return a set-like object that can be modified to check or
1190 uncheck individual checkboxes according to their value.
1191 """
1192 return CheckboxValues(self)
1202 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1203
1205 return '%s(%s)' % (
1206 self.__class__.__name__, list.__repr__(self))
1207
1209
1210 """
1211 Represents the values of the checked checkboxes in a group of
1212 checkboxes with the same name.
1213 """
1214
1217
1219 return iter([
1220 el.get('value')
1221 for el in self.group
1222 if 'checked' in el.attrib])
1223
def add(self, value):
    """
    Check the checkbox of the group whose ``value`` attribute equals
    ``value``.

    The first matching checkbox gets an empty ``checked`` attribute;
    later checkboxes are not examined.  Raises KeyError when the
    group has no checkbox with that value.
    """
    for box in self.group:
        if box.get('value') != value:
            continue
        box.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1231
1233 for el in self.group:
1234 if el.get('value') == value:
1235 if 'checked' in el.attrib:
1236 del el.attrib['checked']
1237 else:
1238 raise KeyError(
1239 "The checkbox with value %r was already unchecked" % value)
1240 break
1241 else:
1242 raise KeyError(
1243 "No checkbox with value %r" % value)
1244
1246 return '<%s {%s} for checkboxes name=%r>' % (
1247 self.__class__.__name__,
1248 ', '.join([repr(v) for v in self]),
1249 self.group.name)
1250
1334
1335 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1336
1338 """
1339 Represents a ``<label>`` element.
1340
1341 Label elements are linked to other elements with their ``for``
1342 attribute. You can access this element with ``label.for_element``.
1343 """
1344
1346 """
1347 Get/set the element this label points to. Return None if it
1348 can't be found.
1349 """
1350 id = self.get('for')
1351 if not id:
1352 return None
1353 return self.body.get_element_by_id(id)
1355 id = other.get('id')
1356 if not id:
1357 raise TypeError(
1358 "Element %r has no id attribute" % other)
1359 self.set('for', id)
1363 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1364 doc=_for_element__get.__doc__)
1365
1366 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1367
1368
1369
1370
1371
1373 """Convert all tags in an HTML tree to XHTML by moving them to the
1374 XHTML namespace.
1375 """
1376 try:
1377 html = html.getroot()
1378 except AttributeError:
1379 pass
1380 prefix = "{%s}" % XHTML_NAMESPACE
1381 for el in html.iter():
1382 tag = el.tag
1383 if isinstance(tag, basestring):
1384 if tag[0] != '{':
1385 el.tag = prefix + tag
1386
1388 """Convert all tags in an XHTML tree to HTML by removing their
1389 XHTML namespace.
1390 """
1391 try:
1392 xhtml = xhtml.getroot()
1393 except AttributeError:
1394 pass
1395 prefix = "{%s}" % XHTML_NAMESPACE
1396 prefix_len = len(prefix)
1397 for el in xhtml.iter(prefix + "*"):
1398 el.tag = el.tag[prefix_len:]
1399
1400
1401
1402 __str_replace_meta_content_type = re.compile(
1403 r'<meta http-equiv="Content-Type"[^>]*>').sub
1404 __bytes_replace_meta_content_type = re.compile(
1405 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1406
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialized = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialized
    # Strip the Content-Type <meta> tag with the regex that matches the
    # type of the serialised result (text string vs. byte string).
    if isinstance(serialized, str):
        return __str_replace_meta_content_type('', serialized)
    return __bytes_replace_meta_content_type(bytes(), serialized)
1453
1454 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1455
1457 """
1458 Open the HTML document in a web browser (saving it to a temporary
1459 file to open it).
1460 """
1461 import os
1462 import webbrowser
1463 try:
1464 write_doc = doc.write
1465 except AttributeError:
1466 write_doc = etree.ElementTree(element=doc).write
1467 fn = os.tempnam() + '.html'
1468 write_doc(fn, method="html")
1469 url = 'file://' + fn.replace(os.path.sep, '/')
1470 print(url)
1471 webbrowser.open(url)
1472
1473
1474
1475
1476
1481
1486
1488 """Create a new HTML Element.
1489
1490 This can also be used for XHTML documents.
1491 """
1492 v = html_parser.makeelement(*args, **kw)
1493 return v
1494
1495 html_parser = HTMLParser()
1496 xhtml_parser = XHTMLParser()
1497