1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 try:
7 from urlparse import urljoin
8 except ImportError:
9
10 from urllib.parse import urljoin
11 import copy
12 from lxml import etree
13 from lxml.html import defs
14 from lxml import cssselect
15 from lxml.html._setmixin import SetMixin
16 try:
17 from UserDict import DictMixin
18 except ImportError:
19
20 from lxml.html._dictmixin import DictMixin
21 try:
22 set
23 except NameError:
24
25 from sets import Set as set
26 try:
27 bytes = __builtins__["bytes"]
28 except (KeyError, NameError):
29
30 bytes = str
31 try:
32 unicode = __builtins__["unicode"]
33 except (KeyError, NameError):
34
35 unicode = str
36 try:
37 basestring = __builtins__["basestring"]
38 except (KeyError, NameError):
39
40 basestring = (str, bytes)
41
43 if not s:
44 return s
45 import sys
46 if sys.version_info[0] >= 3:
47 sub = re.compile(r"^(\s*)u'", re.M).sub
48 else:
49 sub = re.compile(r"^(\s*)b'", re.M).sub
50 return sub(r"\1'", s)
51
# Public names exported by ``from lxml.html import *``.  Each name is listed
# exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse',
]
57
58 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
59
60 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
61 namespaces={'x':XHTML_NAMESPACE})
62 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
63 namespaces={'x':XHTML_NAMESPACE})
64 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
65 namespaces={'x':XHTML_NAMESPACE})
66
67 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
68 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
69 _collect_string_content = etree.XPath("string()")
70 _css_url_re = re.compile(r'url\((.*?)\)', re.I)
71 _css_import_re = re.compile(r'@import "(.*?)"')
72 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
73 namespaces={'x':XHTML_NAMESPACE})
74 _archive_re = re.compile(r'[^ ]+')
75
85
91
93
95 """
96 Returns the base URL, given when the page was parsed.
97
98 Use with ``urlparse.urljoin(el.base_url, href)`` to get
99 absolute URLs.
100 """
101 return self.getroottree().docinfo.URL
102 base_url = property(base_url, doc=base_url.__doc__)
103
109 forms = property(forms, doc=forms.__doc__)
110
112 """
113 Return the <body> element. Can be called from a child element
114 to get the document's head.
115 """
116 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
117 body = property(body, doc=body.__doc__)
118
120 """
121 Returns the <head> element. Can be called from a child
122 element to get the document's head.
123 """
124 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
125 head = property(head, doc=head.__doc__)
126
128 """
129 Get or set any <label> element associated with this element.
130 """
131 id = self.get('id')
132 if not id:
133 return None
134 result = _label_xpath(self, id=id)
135 if not result:
136 return None
137 else:
138 return result[0]
140 id = self.get('id')
141 if not id:
142 raise TypeError(
143 "You cannot set a label for an element (%r) that has no id"
144 % self)
145 if _nons(label.tag) != 'label':
146 raise TypeError(
147 "You can only assign label to a label element (not %r)"
148 % label)
149 label.set('for', id)
154 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
155
157 """
158 Removes this element from the tree, including its children and
159 text. The tail text is joined to the previous element or
160 parent.
161 """
162 parent = self.getparent()
163 assert parent is not None
164 if self.tail:
165 previous = self.getprevious()
166 if previous is None:
167 parent.text = (parent.text or '') + self.tail
168 else:
169 previous.tail = (previous.tail or '') + self.tail
170 parent.remove(self)
171
173 """
174 Remove the tag, but not its children or text. The children and text
175 are merged into the parent.
176
177 Example::
178
179 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
180 >>> h.find('.//b').drop_tag()
181 >>> print(tostring(h, encoding=unicode))
182 <div>Hello World!</div>
183 """
184 parent = self.getparent()
185 assert parent is not None
186 previous = self.getprevious()
187 if self.text and isinstance(self.tag, basestring):
188
189 if previous is None:
190 parent.text = (parent.text or '') + self.text
191 else:
192 previous.tail = (previous.tail or '') + self.text
193 if self.tail:
194 if len(self):
195 last = self[-1]
196 last.tail = (last.tail or '') + self.tail
197 elif previous is None:
198 parent.text = (parent.text or '') + self.tail
199 else:
200 previous.tail = (previous.tail or '') + self.tail
201 index = parent.index(self)
202 parent[index:index+1] = self[:]
203
205 """
206 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
207 """
208 rel = rel.lower()
209 return [el for el in _rel_links_xpath(self)
210 if el.get('rel').lower() == rel]
211
213 """
214 Find any elements with the given class name.
215 """
216 return _class_xpath(self, class_name=class_name)
217
219 """
220 Get the first element in a document with the given id. If none is
221 found, return the default argument if provided or raise KeyError
222 otherwise.
223
224 Note that there can be more than one element with the same id,
225 and this isn't uncommon in HTML documents found in the wild.
226 Browsers return only the first match, and this function does
227 the same.
228 """
229 try:
230
231
232 return _id_xpath(self, id=id)[0]
233 except IndexError:
234 if default:
235 return default[0]
236 else:
237 raise KeyError(id)
238
def text_content(self):
    """
    Return the text of this element together with the text found in
    all of its descendants, as a single string.
    """
    # _collect_string_content is the precompiled XPath ``string()``.
    collected = _collect_string_content(self)
    return collected
244
246 """
247 Run the CSS expression on this element and its children,
248 returning a list of the results.
249
250 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
251 that pre-compiling the expression can provide a substantial
252 speedup.
253 """
254 return cssselect.CSSSelector(expr)(self)
255
256
257
258
259
261 """
262 Make all links in the document absolute, given the
263 ``base_url`` for the document (the full URL where the document
264 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
265
266 If ``resolve_base_href`` is true, then any ``<base href>``
267 tags in the document are used *and* removed from the document.
268 If it is false then any such tag is ignored.
269 """
270 if base_url is None:
271 base_url = self.base_url
272 if base_url is None:
273 raise TypeError(
274 "No base_url given, and the document has no base_url")
275 if resolve_base_href:
276 self.resolve_base_href()
277 def link_repl(href):
278 return urljoin(base_url, href)
279 self.rewrite_links(link_repl)
280
282 """
283 Find any ``<base href>`` tag in the document, and apply its
284 values to all links found in the document. Also remove the
285 tag once it has been applied.
286 """
287 base_href = None
288 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
289 for b in basetags:
290 base_href = b.get('href')
291 b.drop_tree()
292 if not base_href:
293 return
294 self.make_links_absolute(base_href, resolve_base_href=False)
295
297 """
298 Yield (element, attribute, link, pos), where attribute may be None
299 (indicating the link is in the text). ``pos`` is the position
300 where the link occurs; often 0, but sometimes something else in
301 the case of links in stylesheets or style tags.
302
303 Note: <base href> is *not* taken into account in any way. The
304 link you get is exactly the link in the document.
305 """
306 link_attrs = defs.link_attrs
307 for el in self.iter():
308 attribs = el.attrib
309 tag = _nons(el.tag)
310 if tag != 'object':
311 for attrib in link_attrs:
312 if attrib in attribs:
313 yield (el, attrib, attribs[attrib], 0)
314 elif tag == 'object':
315 codebase = None
316
317
318 if 'codebase' in attribs:
319 codebase = el.get('codebase')
320 yield (el, 'codebase', codebase, 0)
321 for attrib in 'classid', 'data':
322 if attrib in attribs:
323 value = el.get(attrib)
324 if codebase is not None:
325 value = urljoin(codebase, value)
326 yield (el, attrib, value, 0)
327 if 'archive' in attribs:
328 for match in _archive_re.finditer(el.get('archive')):
329 value = match.group(0)
330 if codebase is not None:
331 value = urljoin(codebase, value)
332 yield (el, 'archive', value, match.start())
333 if tag == 'param':
334 valuetype = el.get('valuetype') or ''
335 if valuetype.lower() == 'ref':
336
337
338
339
340
341
342 yield (el, 'value', el.get('value'), 0)
343 if tag == 'style' and el.text:
344 for match in _css_url_re.finditer(el.text):
345 yield (el, None, match.group(1), match.start(1))
346 for match in _css_import_re.finditer(el.text):
347 yield (el, None, match.group(1), match.start(1))
348 if 'style' in attribs:
349 for match in _css_url_re.finditer(attribs['style']):
350 yield (el, 'style', match.group(1), match.start(1))
351
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    Several links may live in the same attribute or in the same
    element text (e.g. several ``url(...)`` in one stylesheet).  Each
    of them is replaced at its own location, even when an earlier
    replacement changed the length of the surrounding text.  Once an
    attribute or text has been removed, the remaining links that were
    found in it are skipped (the callback is still called for them).
    """
    if base_href is not None:
        # make_links_absolute() takes care of <base href> itself when
        # resolve_base_href is true.
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # (element, attribute) -> [(reported position, length change), ...]
    # for every replacement already applied to that text.
    shifts = {}
    # (element, attribute) pairs whose text/attribute has been removed.
    removed = set()

    # Take a snapshot of the links first: every position then refers to the
    # untouched text, no matter in which order the links are reported.
    for el, attrib, link, pos in list(self.iterlinks()):
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        key = (el, attrib)
        if key in removed:
            # Nothing left to rewrite; the original code would have failed
            # with a KeyError (attribute) or written garbage (text) here.
            continue
        if new_link is None:
            # Remove the attribute or the element text altogether.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed.add(key)
            continue

        if attrib is None:
            cur = el.text
        else:
            cur = el.attrib[attrib]
        applied = shifts.setdefault(key, [])
        # Move the reported position by the length changes of all earlier
        # replacements that happened in front of it.
        start = pos + sum([delta for at, delta in applied if at < pos])
        new = cur[:start] + new_link + cur[start + len(link):]
        if attrib is None:
            el.text = new
        else:
            el.attrib[attrib] = new
        applied.append((pos, len(new) - len(cur)))
398
399
401 """
402 An object that represents a method on an element as a function;
403 the function takes either an element or an HTML string. It
404 returns whatever the function normally returns, or if the function
405 works in-place (and so returns None) it returns a serialized form
406 of the resulting document.
407 """
413 result_type = type(doc)
414 if isinstance(doc, basestring):
415 if 'copy' in kw:
416 raise TypeError(
417 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
418 doc = fromstring(doc, **kw)
419 else:
420 if 'copy' in kw:
421 copy = kw.pop('copy')
422 else:
423 copy = self.copy
424 if copy:
425 doc = copy.deepcopy(doc)
426 meth = getattr(doc, self.name)
427 result = meth(*args, **kw)
428
429 if result is None:
430
431 return _transform_result(result_type, doc)
432 else:
433 return result
434
435 find_rel_links = _MethodFunc('find_rel_links', copy=False)
436 find_class = _MethodFunc('find_class', copy=False)
437 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
438 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
439 iterlinks = _MethodFunc('iterlinks', copy=False)
440 rewrite_links = _MethodFunc('rewrite_links', copy=True)
441
444
447
450
453
454
456 """A lookup scheme for HTML Element classes.
457
458 To create a lookup instance with different Element classes, pass a tag
459 name mapping of Element classes in the ``classes`` keyword argument and/or
460 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
461 The special key '*' denotes a Mixin class that should be mixed into all
462 Element classes.
463 """
464 _default_element_classes = {}
465
466 - def __init__(self, classes=None, mixins=None):
483
484 - def lookup(self, node_type, document, namespace, name):
495
496
497
498
499
508
511 """
512 Parses several HTML elements, returning a list of elements.
513
514 The first item in the list may be a string (though leading
515 whitespace is removed). If no_leading_text is true, then it will
516 be an error if there is leading text, and it will always be a list
517 of only elements.
518
519 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
520 """
521 if parser is None:
522 parser = html_parser
523
524 start = html[:20].lstrip().lower()
525 if not start.startswith('<html') and not start.startswith('<!doctype'):
526 html = '<html><body>%s</body></html>' % html
527 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
528 assert _nons(doc.tag) == 'html'
529 bodies = [e for e in doc if _nons(e.tag) == 'body']
530 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
531 body = bodies[0]
532 elements = []
533 if no_leading_text and body.text and body.text.strip():
534 raise etree.ParserError(
535 "There is leading text: %r" % body.text)
536 if body.text and body.text.strip():
537 elements.append(body.text)
538 elements.extend(body)
539
540
541 return elements
542
545 """
546 Parses a single HTML element; it is an error if there is more than
547 one element, or if anything but whitespace precedes or follows the
548 element.
549
550 If create_parent is true (or is a tag name) then a parent node
551 will be created to encapsulate the HTML in a single element.
552
553 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
554 """
555 if parser is None:
556 parser = html_parser
557 if create_parent:
558 if not isinstance(create_parent, basestring):
559 create_parent = 'div'
560 return fragment_fromstring('<%s>%s</%s>' % (
561 create_parent, html, create_parent),
562 parser=parser, base_url=base_url, **kw)
563 elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
564 base_url=base_url, **kw)
565 if not elements:
566 raise etree.ParserError(
567 "No elements found")
568 if len(elements) > 1:
569 raise etree.ParserError(
570 "Multiple elements found (%s)"
571 % ', '.join([_element_name(e) for e in elements]))
572 el = elements[0]
573 if el.tail and el.tail.strip():
574 raise etree.ParserError(
575 "Element followed by text: %r" % el.tail)
576 el.tail = None
577 return el
578
579 -def fromstring(html, base_url=None, parser=None, **kw):
641
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse an HTML document from a filename, a URL or a file-like
    object and return the resulting document *tree* (not an element).
    Call ``parse(...).getroot()`` to obtain the root element.

    The ``base_url`` keyword overrides the base URL of the document,
    which is mostly of interest when reading from a file-like object.
    """
    # Fall back to the module-wide HTML parser when none was supplied.
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
654
662
664 if isinstance(el, etree.CommentBase):
665 return 'comment'
666 elif isinstance(el, basestring):
667 return 'string'
668 else:
669 return _nons(el.tag)
670
671
672
673
674
779
780 HtmlElementClassLookup._default_element_classes['form'] = FormElement
781
814
816 import urllib
817
818 if method == 'GET':
819 if '?' in url:
820 url += '&'
821 else:
822 url += '?'
823 url += urllib.urlencode(values)
824 data = None
825 else:
826 data = urllib.urlencode(values)
827 return urllib.urlopen(url, data)
828
830
838 raise KeyError(
839 "You cannot remove keys from ElementDict")
843 return item in self.inputs
844
846 return '<%s for form %s>' % (
847 self.__class__.__name__,
848 self.inputs.form._name())
849
915
943
944 -class TextareaElement(InputMixin, HtmlElement):
945 """
946 ``<textarea>`` element. You can get the name with ``.name`` and
947 get/set the value with ``.value``
948 """
949
950 - def _value__get(self):
951 """
952 Get/set the value (which is the contents of this element)
953 """
954 return self.text or ''
955 - def _value__set(self, value):
957 - def _value__del(self):
959 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
960
961 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
962
964 """
965 ``<select>`` element. You can get the name with ``.name``.
966
967 ``.value`` will be the value of the selected option, unless this
968 is a multi-select element (``<select multiple>``), in which case
969 it will be a set-like object. In either case ``.value_options``
970 gives the possible values.
971
972 The boolean attribute ``.multiple`` shows if this is a
973 multi-select.
974 """
975
977 """
978 Get/set the value of this select (the selected option).
979
980 If this is a multi-select, this is a set-like object that
981 represents all the selected options.
982 """
983 if self.multiple:
984 return MultipleSelectOptions(self)
985 for el in _options_xpath(self):
986 if 'selected' in el.attrib:
987 value = el.get('value')
988
989 return value
990 return None
991
993 if self.multiple:
994 if isinstance(value, basestring):
995 raise TypeError(
996 "You must pass in a sequence")
997 self.value.clear()
998 self.value.update(value)
999 return
1000 if value is not None:
1001 for el in _options_xpath(self):
1002
1003 if el.get('value') == value:
1004 checked_option = el
1005 break
1006 else:
1007 raise ValueError(
1008 "There is no option with the value of %r" % value)
1009 for el in _options_xpath(self):
1010 if 'selected' in el.attrib:
1011 del el.attrib['selected']
1012 if value is not None:
1013 checked_option.set('selected', '')
1014
1021
1022 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1023
1025 """
1026 All the possible values this select can have (the ``value``
1027 attribute of all the ``<option>`` elements.
1028 """
1029 return [el.get('value') for el in _options_xpath(self)]
1030 value_options = property(value_options, doc=value_options.__doc__)
1031
1033 """
1034 Boolean attribute: is there a ``multiple`` attribute on this element.
1035 """
1036 return 'multiple' in self.attrib
1038 if value:
1039 self.set('multiple', '')
1040 elif 'multiple' in self.attrib:
1041 del self.attrib['multiple']
1042 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1043
1044 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1045
1047 """
1048 Represents all the selected options in a ``<select multiple>`` element.
1049
1050 You can add to this set-like option to select an option, or remove
1051 to unselect the option.
1052 """
1053
1055 self.select = select
1056
1058 """
1059 Iterator of all the ``<option>`` elements.
1060 """
1061 return iter(_options_xpath(self.select))
1062 options = property(options)
1063
1065 for option in self.options:
1066 yield option.get('value')
1067
def add(self, item):
    """
    Select the first option whose ``value`` equals ``item``.

    Raises ValueError if no option carries that value.
    """
    found = False
    for candidate in self.options:
        if candidate.get('value') != item:
            continue
        # Only the first matching option gets selected.
        candidate.set('selected', '')
        found = True
        break
    if not found:
        raise ValueError(
            "There is no option with the value %r" % item)
1076
1078 for option in self.options:
1079 if option.get('value') == item:
1080 if 'selected' in option.attrib:
1081 del option.attrib['selected']
1082 else:
1083 raise ValueError(
1084 "The option %r is not currently selected" % item)
1085 break
1086 else:
1087 raise ValueError(
1088 "There is not option with the value %r" % item)
1089
1091 return '<%s {%s} for select name=%r>' % (
1092 self.__class__.__name__,
1093 ', '.join([repr(v) for v in self]),
1094 self.select.name)
1095
1097 """
1098 This object represents several ``<input type=radio>`` elements
1099 that have the same name.
1100
1101 You can use this like a list, but also use the property
1102 ``.value`` to check/uncheck inputs. Also you can use
1103 ``.value_options`` to get the possible values.
1104 """
1105
1107 """
1108 Get/set the value, which checks the radio with that value (and
1109 unchecks any other value).
1110 """
1111 for el in self:
1112 if 'checked' in el.attrib:
1113 return el.get('value')
1114 return None
1115
1117 if value is not None:
1118 for el in self:
1119 if el.get('value') == value:
1120 checked_option = el
1121 break
1122 else:
1123 raise ValueError(
1124 "There is no radio input with the value %r" % value)
1125 for el in self:
1126 if 'checked' in el.attrib:
1127 del el.attrib['checked']
1128 if value is not None:
1129 checked_option.set('checked', '')
1130
1133
1134 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1135
1137 """
1138 Returns a list of all the possible values.
1139 """
1140 return [el.get('value') for el in self]
1141 value_options = property(value_options, doc=value_options.__doc__)
1142
1144 return '%s(%s)' % (
1145 self.__class__.__name__,
1146 list.__repr__(self))
1147
1149 """
1150 Represents a group of checkboxes (``<input type=checkbox>``) that
1151 have the same name.
1152
1153 In addition to using this like a list, the ``.value`` attribute
1154 returns a set-like object that you can add to or remove from to
1155 check and uncheck checkboxes. You can also use ``.value_options``
1156 to get the possible values.
1157 """
1158
1160 """
1161 Return a set-like object that can be modified to check or
1162 uncheck individual checkboxes according to their value.
1163 """
1164 return CheckboxValues(self)
1174 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1175
1177 return '%s(%s)' % (
1178 self.__class__.__name__, list.__repr__(self))
1179
1181
1182 """
1183 Represents the values of the checked checkboxes in a group of
1184 checkboxes with the same name.
1185 """
1186
1189
1191 return iter([
1192 el.get('value')
1193 for el in self.group
1194 if 'checked' in el.attrib])
1195
def add(self, value):
    """
    Check the first checkbox of the group whose ``value`` equals
    ``value``.

    Raises KeyError if the group has no checkbox with that value.
    """
    for checkbox in self.group:
        if checkbox.get('value') == value:
            checkbox.set('checked', '')
            # Stop at the first match; later duplicates stay untouched.
            return
    raise KeyError("No checkbox with value %r" % value)
1203
1205 for el in self.group:
1206 if el.get('value') == value:
1207 if 'checked' in el.attrib:
1208 del el.attrib['checked']
1209 else:
1210 raise KeyError(
1211 "The checkbox with value %r was already unchecked" % value)
1212 break
1213 else:
1214 raise KeyError(
1215 "No checkbox with value %r" % value)
1216
1218 return '<%s {%s} for checkboxes name=%r>' % (
1219 self.__class__.__name__,
1220 ', '.join([repr(v) for v in self]),
1221 self.group.name)
1222
1306
1307 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1308
1310 """
1311 Represents a ``<label>`` element.
1312
1313 Label elements are linked to other elements with their ``for``
1314 attribute. You can access this element with ``label.for_element``.
1315 """
1316
1318 """
1319 Get/set the element this label points to. Return None if it
1320 can't be found.
1321 """
1322 id = self.get('for')
1323 if not id:
1324 return None
1325 return self.body.get_element_by_id(id)
1327 id = other.get('id')
1328 if not id:
1329 raise TypeError(
1330 "Element %r has no id attribute" % other)
1331 self.set('for', id)
1335 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1336 doc=_for_element__get.__doc__)
1337
1338 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1339
1340
1341
1342
1343
1345 """Convert all tags in an HTML tree to XHTML by moving them to the
1346 XHTML namespace.
1347 """
1348 try:
1349 html = html.getroot()
1350 except AttributeError:
1351 pass
1352 prefix = "{%s}" % XHTML_NAMESPACE
1353 for el in html.iter():
1354 tag = el.tag
1355 if isinstance(tag, basestring):
1356 if tag[0] != '{':
1357 el.tag = prefix + tag
1358
1360 """Convert all tags in an XHTML tree to HTML by removing their
1361 XHTML namespace.
1362 """
1363 try:
1364 xhtml = xhtml.getroot()
1365 except AttributeError:
1366 pass
1367 prefix = "{%s}" % XHTML_NAMESPACE
1368 prefix_len = len(prefix)
1369 for el in xhtml.iter(prefix + "*"):
1370 el.tag = el.tag[prefix_len:]
1371
1372
1373
# Substitution functions that strip <meta http-equiv="Content-Type" ...> tags
# from serialised output.  The serialiser hands back a byte string unless a
# text encoding is requested, and on Python 3 a pattern compiled from text
# cannot be applied to bytes (and vice versa), so one function per string type
# is kept.
__meta_content_type_pattern = r'<meta http-equiv="Content-Type"[^>]*>'
__replace_meta_content_type = re.compile(__meta_content_type_pattern).sub
__replace_meta_content_type_bytes = re.compile(
    __meta_content_type_pattern.encode('ascii')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    The result has the string type produced by the serialiser: a byte
    string by default, a unicode string when ``encoding=unicode``.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding)
    if not include_meta_content_type:
        # Pick the substitution that matches the type of the serialised
        # result; mixing text patterns with bytes raises TypeError on
        # Python 3.
        if isinstance(html, bytes):
            html = __replace_meta_content_type_bytes(bytes(), html)
        else:
            html = __replace_meta_content_type('', html)
    return html
1420
1421 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1422
1424 """
1425 Open the HTML document in a web browser (saving it to a temporary
1426 file to open it).
1427 """
1428 import os
1429 import webbrowser
1430 try:
1431 write_doc = doc.write
1432 except AttributeError:
1433 write_doc = etree.ElementTree(element=doc).write
1434 fn = os.tempnam() + '.html'
1435 write_doc(fn, method="html")
1436 url = 'file://' + fn.replace(os.path.sep, '/')
1437 print(url)
1438 webbrowser.open(url)
1439
1440
1441
1442
1443
1448
1453
1455 """Create a new HTML Element.
1456
1457 This can also be used for XHTML documents.
1458 """
1459 v = html_parser.makeelement(*args, **kw)
1460 return v
1461
1462 html_parser = HTMLParser()
1463 xhtml_parser = XHTMLParser()
1464