1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 try:
7 from urlparse import urljoin
8 except ImportError:
9
10 from urllib.parse import urljoin
11 import copy
12 from lxml import etree
13 from lxml.html import defs
14 from lxml import cssselect
15 from lxml.html._setmixin import SetMixin
16 try:
17 from UserDict import DictMixin
18 except ImportError:
19
20 from lxml.html._dictmixin import DictMixin
21 try:
22 set
23 except NameError:
24
25 from sets import Set as set
26 try:
27 bytes = __builtins__["bytes"]
28 except (KeyError, NameError):
29
30 bytes = str
31 try:
32 unicode = __builtins__["unicode"]
33 except (KeyError, NameError):
34
35 unicode = str
36 try:
37 basestring = __builtins__["basestring"]
38 except (KeyError, NameError):
39
40 basestring = (str, bytes)
41
43 import sys
44 if sys.version_info[0] >= 3:
45 sub = re.compile(r"^(\s*)u'", re.M).sub
46 else:
47 sub = re.compile(r"^(\s*)b'", re.M).sub
48 return sub(r"\1'", s)
49
# Public names exported by ``from lxml.html import *``.  Each name is listed
# exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring',
    'fragment_fromstring',
    'fragments_fromstring',
    'fromstring',
    'tostring',
    'Element',
    'defs',
    'open_in_browser',
    'submit_form',
    'find_rel_links',
    'find_class',
    'make_links_absolute',
    'resolve_base_href',
    'iterlinks',
    'rewrite_links',
]
55
56 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
57
58 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
59 namespaces={'x':XHTML_NAMESPACE})
60 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
61 namespaces={'x':XHTML_NAMESPACE})
62 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
63 namespaces={'x':XHTML_NAMESPACE})
64
65 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
66 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
67 _collect_string_content = etree.XPath("string()")
68 _css_url_re = re.compile(r'url\((.*?)\)', re.I)
69 _css_import_re = re.compile(r'@import "(.*?)"')
70 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
71 namespaces={'x':XHTML_NAMESPACE})
72 _archive_re = re.compile(r'[^ ]+')
73
83
89
91
93 """
94 Returns the base URL, given when the page was parsed.
95
96 Use with ``urlparse.urljoin(el.base_url, href)`` to get
97 absolute URLs.
98 """
99 return self.getroottree().docinfo.URL
100 base_url = property(base_url, doc=base_url.__doc__)
101
107 forms = property(forms, doc=forms.__doc__)
108
110 """
111 Return the <body> element. Can be called from a child element
112 to get the document's body.
113 """
114 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
115 body = property(body, doc=body.__doc__)
116
118 """
119 Returns the <head> element. Can be called from a child
120 element to get the document's head.
121 """
122 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
123 head = property(head, doc=head.__doc__)
124
126 """
127 Get or set any <label> element associated with this element.
128 """
129 id = self.get('id')
130 if not id:
131 return None
132 result = _label_xpath(self, id=id)
133 if not result:
134 return None
135 else:
136 return result[0]
138 id = self.get('id')
139 if not id:
140 raise TypeError(
141 "You cannot set a label for an element (%r) that has no id"
142 % self)
143 if _nons(label.tag) != 'label':
144 raise TypeError(
145 "You can only assign label to a label element (not %r)"
146 % label)
147 label.set('for', id)
152 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
153
155 """
156 Removes this element from the tree, including its children and
157 text. The tail text is joined to the previous element or
158 parent.
159 """
160 parent = self.getparent()
161 assert parent is not None
162 if self.tail:
163 previous = self.getprevious()
164 if previous is None:
165 parent.text = (parent.text or '') + self.tail
166 else:
167 previous.tail = (previous.tail or '') + self.tail
168 parent.remove(self)
169
171 """
172 Remove the tag, but not its children or text. The children and text
173 are merged into the parent.
174
175 Example::
176
177 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
178 >>> h.find('.//b').drop_tag()
179 >>> print(tostring(h, encoding=unicode))
180 <div>Hello World!</div>
181 """
182 parent = self.getparent()
183 assert parent is not None
184 previous = self.getprevious()
185 if self.text and isinstance(self.tag, basestring):
186
187 if previous is None:
188 parent.text = (parent.text or '') + self.text
189 else:
190 previous.tail = (previous.tail or '') + self.text
191 if self.tail:
192 if len(self):
193 last = self[-1]
194 last.tail = (last.tail or '') + self.tail
195 elif previous is None:
196 parent.text = (parent.text or '') + self.tail
197 else:
198 previous.tail = (previous.tail or '') + self.tail
199 index = parent.index(self)
200 parent[index:index+1] = self[:]
201
203 """
204 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
205 """
206 rel = rel.lower()
207 return [el for el in _rel_links_xpath(self)
208 if el.get('rel').lower() == rel]
209
211 """
212 Find any elements with the given class name.
213 """
214 return _class_xpath(self, class_name=class_name)
215
217 """
218 Get the first element in a document with the given id. If none is
219 found, return the default argument if provided or raise KeyError
220 otherwise.
221
222 Note that there can be more than one element with the same id,
223 and this isn't uncommon in HTML documents found in the wild.
224 Browsers return only the first match, and this function does
225 the same.
226 """
227 try:
228
229
230 return _id_xpath(self, id=id)[0]
231 except IndexError:
232 if default:
233 return default[0]
234 else:
235 raise KeyError(id)
236
def text_content(self):
    """
    Return the text held by this element together with the text of
    everything nested inside it.
    """
    # ``_collect_string_content`` is the precompiled XPath ``string()``.
    content = _collect_string_content(self)
    return content
242
244 """
245 Run the CSS expression on this element and its children,
246 returning a list of the results.
247
248 Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
249 that pre-compiling the expression can provide a substantial
250 speedup.
251 """
252 return cssselect.CSSSelector(expr)(self)
253
254
255
256
257
259 """
260 Make all links in the document absolute, given the
261 ``base_url`` for the document (the full URL where the document
262 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
263
264 If ``resolve_base_href`` is true, then any ``<base href>``
265 tags in the document are used *and* removed from the document.
266 If it is false then any such tag is ignored.
267 """
268 if base_url is None:
269 base_url = self.base_url
270 if base_url is None:
271 raise TypeError(
272 "No base_url given, and the document has no base_url")
273 if resolve_base_href:
274 self.resolve_base_href()
275 def link_repl(href):
276 return urljoin(base_url, href)
277 self.rewrite_links(link_repl)
278
280 """
281 Find any ``<base href>`` tag in the document, and apply its
282 values to all links found in the document. Also remove the
283 tag once it has been applied.
284 """
285 base_href = None
286 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
287 for b in basetags:
288 base_href = b.get('href')
289 b.drop_tree()
290 if not base_href:
291 return
292 self.make_links_absolute(base_href, resolve_base_href=False)
293
295 """
296 Yield (element, attribute, link, pos), where attribute may be None
297 (indicating the link is in the text). ``pos`` is the position
298 where the link occurs; often 0, but sometimes something else in
299 the case of links in stylesheets or style tags.
300
301 Note: <base href> is *not* taken into account in any way. The
302 link you get is exactly the link in the document.
303 """
304 link_attrs = defs.link_attrs
305 for el in self.iter():
306 attribs = el.attrib
307 tag = _nons(el.tag)
308 if tag != 'object':
309 for attrib in link_attrs:
310 if attrib in attribs:
311 yield (el, attrib, attribs[attrib], 0)
312 elif tag == 'object':
313 codebase = None
314
315
316 if 'codebase' in attribs:
317 codebase = el.get('codebase')
318 yield (el, 'codebase', codebase, 0)
319 for attrib in 'classid', 'data':
320 if attrib in attribs:
321 value = el.get(attrib)
322 if codebase is not None:
323 value = urljoin(codebase, value)
324 yield (el, attrib, value, 0)
325 if 'archive' in attribs:
326 for match in _archive_re.finditer(el.get('archive')):
327 value = match.group(0)
328 if codebase is not None:
329 value = urljoin(codebase, value)
330 yield (el, 'archive', value, match.start())
331 if tag == 'param':
332 valuetype = el.get('valuetype') or ''
333 if valuetype.lower() == 'ref':
334
335
336
337
338
339
340 yield (el, 'value', el.get('value'), 0)
341 if tag == 'style' and el.text:
342 for match in _css_url_re.finditer(el.text):
343 yield (el, None, match.group(1), match.start(1))
344 for match in _css_import_re.finditer(el.text):
345 yield (el, None, match.group(1), match.start(1))
346 if 'style' in attribs:
347 for match in _css_url_re.finditer(attribs['style']):
348 yield (el, 'style', match.group(1), match.start(1))
349
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Pass every link of the document through ``link_repl_func`` and
    store whatever it returns in place of the old link.

    The callback receives the whitespace-stripped link.  Links are not
    necessarily absolute (call ``make_links_absolute()`` first if you
    need that), may be purely internal (``'#anchor'``) and may use
    schemes such as ``'mailto:email'`` or ``'javascript:expr'``.

    When ``base_href`` is given, the links are made absolute against it
    before the callback sees them; otherwise, if ``resolve_base_href``
    is true, any ``<base href>`` of the document is applied first.

    A return value of None removes the attribute, or empties the
    element text when the link lives in text.
    """
    if base_href is not None:
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for el, attrib, link, pos in self.iterlinks():
        replacement = link_repl_func(link.strip())
        if replacement == link:
            # Nothing to rewrite for this link.
            continue

        in_text = attrib is None

        if replacement is None:
            # The callback asked for the link to be dropped entirely.
            if in_text:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue

        end = pos + len(link)

        if in_text:
            current = el.text
            el.text = current[:pos] + replacement + current[end:]
            continue

        current = el.attrib[attrib]
        if pos or len(current) != len(link):
            # The link is only part of the attribute value: splice it in.
            replacement = current[:pos] + replacement + current[end:]
        el.attrib[attrib] = replacement
396
397
399 """
400 An object that represents a method on an element as a function;
401 the function takes either an element or an HTML string. It
402 returns whatever the function normally returns, or if the function
403 works in-place (and so returns None) it returns a serialized form
404 of the resulting document.
405 """
411 result_type = type(doc)
412 if isinstance(doc, basestring):
413 if 'copy' in kw:
414 raise TypeError(
415 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
416 doc = fromstring(doc, **kw)
417 else:
418 if 'copy' in kw:
419 copy = kw.pop('copy')
420 else:
421 copy = self.copy
422 if copy:
423 doc = copy.deepcopy(doc)
424 meth = getattr(doc, self.name)
425 result = meth(*args, **kw)
426
427 if result is None:
428
429 return _transform_result(result_type, doc)
430 else:
431 return result
432
433 find_rel_links = _MethodFunc('find_rel_links', copy=False)
434 find_class = _MethodFunc('find_class', copy=False)
435 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
436 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
437 iterlinks = _MethodFunc('iterlinks', copy=False)
438 rewrite_links = _MethodFunc('rewrite_links', copy=True)
439
442
445
448
451
452
454 """A lookup scheme for HTML Element classes.
455
456 To create a lookup instance with different Element classes, pass a tag
457 name mapping of Element classes in the ``classes`` keyword argument and/or
458 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
459 The special key '*' denotes a Mixin class that should be mixed into all
460 Element classes.
461 """
462 _default_element_classes = {}
463
464 - def __init__(self, classes=None, mixins=None):
481
482 - def lookup(self, node_type, document, namespace, name):
493
494
495
496
497
506
509 """
510 Parses several HTML elements, returning a list of elements.
511
512 The first item in the list may be a string (though leading
513 whitespace is removed). If no_leading_text is true, then it will
514 be an error if there is leading text, and it will always be a list
515 of only elements.
516
517 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
518 """
519 if parser is None:
520 parser = html_parser
521
522 start = html[:20].lstrip().lower()
523 if not start.startswith('<html') and not start.startswith('<!doctype'):
524 html = '<html><body>%s</body></html>' % html
525 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
526 assert _nons(doc.tag) == 'html'
527 bodies = [e for e in doc if _nons(e.tag) == 'body']
528 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
529 body = bodies[0]
530 elements = []
531 if no_leading_text and body.text and body.text.strip():
532 raise etree.ParserError(
533 "There is leading text: %r" % body.text)
534 if body.text and body.text.strip():
535 elements.append(body.text)
536 elements.extend(body)
537
538
539 return elements
540
543 """
544 Parses a single HTML element; it is an error if there is more than
545 one element, or if anything but whitespace precedes or follows the
546 element.
547
548 If create_parent is true (or is a tag name) then a parent node
549 will be created to encapsulate the HTML in a single element.
550
551 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
552 """
553 if parser is None:
554 parser = html_parser
555 if create_parent:
556 if not isinstance(create_parent, basestring):
557 create_parent = 'div'
558 return fragment_fromstring('<%s>%s</%s>' % (
559 create_parent, html, create_parent),
560 parser=parser, base_url=base_url, **kw)
561 elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
562 base_url=base_url, **kw)
563 if not elements:
564 raise etree.ParserError(
565 "No elements found")
566 if len(elements) > 1:
567 raise etree.ParserError(
568 "Multiple elements found (%s)"
569 % ', '.join([_element_name(e) for e in elements]))
570 el = elements[0]
571 if el.tail and el.tail.strip():
572 raise etree.ParserError(
573 "Element followed by text: %r" % el.tail)
574 el.tail = None
575 return el
576
577 -def fromstring(html, base_url=None, parser=None, **kw):
639
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Build an HTML document tree from a filename, a URL or a file-like
    object.

    The result is a tree rather than an element; call
    ``parse(...).getroot()`` to reach the document root.

    ``base_url`` overrides the base URL of the document, which is
    mostly useful when the input is a file-like object.  When no
    ``parser`` is passed, the module-level ``html_parser`` is used.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
652
660
662 if isinstance(el, etree.CommentBase):
663 return 'comment'
664 elif isinstance(el, basestring):
665 return 'string'
666 else:
667 return _nons(el.tag)
668
669
670
671
672
777
778 HtmlElementClassLookup._default_element_classes['form'] = FormElement
779
812
814 import urllib
815
816 if method == 'GET':
817 if '?' in url:
818 url += '&'
819 else:
820 url += '?'
821 url += urllib.urlencode(values)
822 data = None
823 else:
824 data = urllib.urlencode(values)
825 return urllib.urlopen(url, data)
826
828
836 raise KeyError(
837 "You cannot remove keys from ElementDict")
841 return item in self.inputs
842
844 return '<%s for form %s>' % (
845 self.__class__.__name__,
846 self.inputs.form._name())
847
913
941
942 -class TextareaElement(InputMixin, HtmlElement):
943 """
944 ``<textarea>`` element. You can get the name with ``.name`` and
945 get/set the value with ``.value``
946 """
947
948 - def _value__get(self):
949 """
950 Get/set the value (which is the contents of this element)
951 """
952 return self.text or ''
953 - def _value__set(self, value):
955 - def _value__del(self):
957 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
958
959 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
960
962 """
963 ``<select>`` element. You can get the name with ``.name``.
964
965 ``.value`` will be the value of the selected option, unless this
966 is a multi-select element (``<select multiple>``), in which case
967 it will be a set-like object. In either case ``.value_options``
968 gives the possible values.
969
970 The boolean attribute ``.multiple`` shows if this is a
971 multi-select.
972 """
973
975 """
976 Get/set the value of this select (the selected option).
977
978 If this is a multi-select, this is a set-like object that
979 represents all the selected options.
980 """
981 if self.multiple:
982 return MultipleSelectOptions(self)
983 for el in _options_xpath(self):
984 if 'selected' in el.attrib:
985 value = el.get('value')
986
987 return value
988 return None
989
991 if self.multiple:
992 if isinstance(value, basestring):
993 raise TypeError(
994 "You must pass in a sequence")
995 self.value.clear()
996 self.value.update(value)
997 return
998 if value is not None:
999 for el in _options_xpath(self):
1000
1001 if el.get('value') == value:
1002 checked_option = el
1003 break
1004 else:
1005 raise ValueError(
1006 "There is no option with the value of %r" % value)
1007 for el in _options_xpath(self):
1008 if 'selected' in el.attrib:
1009 del el.attrib['selected']
1010 if value is not None:
1011 checked_option.set('selected', '')
1012
1019
1020 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1021
1023 """
1024 All the possible values this select can have (the ``value``
1025 attribute of all the ``<option>`` elements).
1026 """
1027 return [el.get('value') for el in _options_xpath(self)]
1028 value_options = property(value_options, doc=value_options.__doc__)
1029
1031 """
1032 Boolean attribute: is there a ``multiple`` attribute on this element.
1033 """
1034 return 'multiple' in self.attrib
1036 if value:
1037 self.set('multiple', '')
1038 elif 'multiple' in self.attrib:
1039 del self.attrib['multiple']
1040 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1041
1042 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1043
1045 """
1046 Represents all the selected options in a ``<select multiple>`` element.
1047
1048 You can add to this set-like object to select an option, or remove
1049 to unselect the option.
1050 """
1051
1053 self.select = select
1054
1056 """
1057 Iterator of all the ``<option>`` elements.
1058 """
1059 return iter(_options_xpath(self.select))
1060 options = property(options)
1061
1063 for option in self.options:
1064 yield option.get('value')
1065
def add(self, item):
    """
    Mark the first option whose ``value`` attribute equals ``item`` as
    selected.

    Raises ValueError when none of the options carries that value.
    """
    for candidate in self.options:
        if candidate.get('value') == item:
            candidate.set('selected', '')
            return
    raise ValueError(
        "There is no option with the value %r" % item)
1074
1076 for option in self.options:
1077 if option.get('value') == item:
1078 if 'selected' in option.attrib:
1079 del option.attrib['selected']
1080 else:
1081 raise ValueError(
1082 "The option %r is not currently selected" % item)
1083 break
1084 else:
1085 raise ValueError(
1086 "There is not option with the value %r" % item)
1087
1089 return '<%s {%s} for select name=%r>' % (
1090 self.__class__.__name__,
1091 ', '.join([repr(v) for v in self]),
1092 self.select.name)
1093
1095 """
1096 This object represents several ``<input type=radio>`` elements
1097 that have the same name.
1098
1099 You can use this like a list, but also use the property
1100 ``.value`` to check/uncheck inputs. Also you can use
1101 ``.value_options`` to get the possible values.
1102 """
1103
1105 """
1106 Get/set the value, which checks the radio with that value (and
1107 unchecks any other value).
1108 """
1109 for el in self:
1110 if 'checked' in el.attrib:
1111 return el.get('value')
1112 return None
1113
1115 if value is not None:
1116 for el in self:
1117 if el.get('value') == value:
1118 checked_option = el
1119 break
1120 else:
1121 raise ValueError(
1122 "There is no radio input with the value %r" % value)
1123 for el in self:
1124 if 'checked' in el.attrib:
1125 del el.attrib['checked']
1126 if value is not None:
1127 checked_option.set('checked', '')
1128
1131
1132 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1133
1135 """
1136 Returns a list of all the possible values.
1137 """
1138 return [el.get('value') for el in self]
1139 value_options = property(value_options, doc=value_options.__doc__)
1140
1142 return '%s(%s)' % (
1143 self.__class__.__name__,
1144 list.__repr__(self))
1145
1147 """
1148 Represents a group of checkboxes (``<input type=checkbox>``) that
1149 have the same name.
1150
1151 In addition to using this like a list, the ``.value`` attribute
1152 returns a set-like object that you can add to or remove from to
1153 check and uncheck checkboxes. You can also use ``.value_options``
1154 to get the possible values.
1155 """
1156
1158 """
1159 Return a set-like object that can be modified to check or
1160 uncheck individual checkboxes according to their value.
1161 """
1162 return CheckboxValues(self)
1172 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1173
1175 return '%s(%s)' % (
1176 self.__class__.__name__, list.__repr__(self))
1177
1179
1180 """
1181 Represents the values of the checked checkboxes in a group of
1182 checkboxes with the same name.
1183 """
1184
1187
1189 return iter([
1190 el.get('value')
1191 for el in self.group
1192 if 'checked' in el.attrib])
1193
def add(self, value):
    """
    Check the first checkbox of the group whose ``value`` attribute
    equals ``value``.

    Raises KeyError when the group has no checkbox with that value.
    """
    for checkbox in self.group:
        if checkbox.get('value') == value:
            checkbox.set('checked', '')
            return
    raise KeyError("No checkbox with value %r" % value)
1201
1203 for el in self.group:
1204 if el.get('value') == value:
1205 if 'checked' in el.attrib:
1206 del el.attrib['checked']
1207 else:
1208 raise KeyError(
1209 "The checkbox with value %r was already unchecked" % value)
1210 break
1211 else:
1212 raise KeyError(
1213 "No checkbox with value %r" % value)
1214
1216 return '<%s {%s} for checkboxes name=%r>' % (
1217 self.__class__.__name__,
1218 ', '.join([repr(v) for v in self]),
1219 self.group.name)
1220
1304
1305 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1306
1308 """
1309 Represents a ``<label>`` element.
1310
1311 Label elements are linked to other elements with their ``for``
1312 attribute. You can access this element with ``label.for_element``.
1313 """
1314
1316 """
1317 Get/set the element this label points to. Return None if it
1318 can't be found.
1319 """
1320 id = self.get('for')
1321 if not id:
1322 return None
1323 return self.body.get_element_by_id(id)
1325 id = other.get('id')
1326 if not id:
1327 raise TypeError(
1328 "Element %r has no id attribute" % other)
1329 self.set('for', id)
1333 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1334 doc=_for_element__get.__doc__)
1335
1336 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1337
1338
1339
1340
1341
1343 """Convert all tags in an HTML tree to XHTML by moving them to the
1344 XHTML namespace.
1345 """
1346 try:
1347 html = html.getroot()
1348 except AttributeError:
1349 pass
1350 prefix = "{%s}" % XHTML_NAMESPACE
1351 for el in html.iter():
1352 tag = el.tag
1353 if isinstance(tag, basestring):
1354 if tag[0] != '{':
1355 el.tag = prefix + tag
1356
1358 """Convert all tags in an XHTML tree to HTML by removing their
1359 XHTML namespace.
1360 """
1361 try:
1362 xhtml = xhtml.getroot()
1363 except AttributeError:
1364 pass
1365 prefix = "{%s}" % XHTML_NAMESPACE
1366 prefix_len = len(prefix)
1367 for el in xhtml.iter(prefix + "*"):
1368 el.tag = el.tag[prefix_len:]
1369
1370
1371
1372 __replace_meta_content_type = re.compile(
1373 r'<meta http-equiv="Content-Type".*?>').sub
1374
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Unless ``include_meta_content_type`` is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag found in the serialised
    output is stripped from it.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding)
    if not include_meta_content_type:
        if isinstance(html, unicode):
            # Text output: the module-level (text) pattern applies directly.
            html = __replace_meta_content_type('', html)
        else:
            # Byte output: on Python 3 a text pattern cannot be applied to
            # bytes (TypeError), so use a byte-string copy of the same
            # pattern.  On Python 2 ``.encode('ascii')`` yields a plain
            # ``str`` again, so behaviour there is unchanged.
            meta_pattern = r'<meta http-equiv="Content-Type".*?>'.encode('ascii')
            html = re.sub(meta_pattern, ''.encode('ascii'), html)
    return html
1416
1417 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1418
1420 """
1421 Open the HTML document in a web browser (saving it to a temporary
1422 file to open it).
1423 """
1424 import os
1425 import webbrowser
1426 try:
1427 write_doc = doc.write
1428 except AttributeError:
1429 write_doc = etree.ElementTree(element=doc).write
1430 fn = os.tempnam() + '.html'
1431 write_doc(fn, method="html")
1432 url = 'file://' + fn.replace(os.path.sep, '/')
1433 print(url)
1434 webbrowser.open(url)
1435
1436
1437
1438
1439
1444
1449
1451 """Create a new HTML Element.
1452
1453 This can also be used for XHTML documents.
1454 """
1455 v = html_parser.makeelement(*args, **kw)
1456 return v
1457
1458 html_parser = HTMLParser()
1459 xhtml_parser = XHTMLParser()
1460