1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 import urlparse
7 import copy
8 from lxml import etree
9 from lxml.html import defs
10 from lxml import cssselect
11 from lxml.html._setmixin import SetMixin
12 try:
13 from UserDict import DictMixin
14 except ImportError:
15
16 from lxml.html._dictmixin import DictMixin
17 import sets
18
# Names exported by a star-import of this module.  Each name appears once
# (the list used to carry 'open_in_browser' twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']
24
# Pre-compiled XPath expressions and regular expressions shared by the
# helpers in this module.

# <a> elements (the context node or any of its descendants) that carry a
# ``rel`` attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")

# Elements whose whitespace-normalized ``class`` attribute contains
# $class_name as one complete space-separated token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements (context node or descendants) whose ``id`` attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# Captures whatever sits between the parentheses of a CSS ``url(...)``;
# matched case-insensitively, and any quotes inside the parentheses are
# part of the captured group.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# Captures the double-quoted target of a CSS ``@import "..."`` rule.
_css_import_re = re.compile(r'@import "(.*?)"')
# <label> elements anywhere in the document whose ``for`` attribute equals $id.
_label_xpath = etree.XPath("//label[@for=$id]")
# One run of non-space characters; used to pick the individual entries out
# of an ``archive`` attribute value.
_archive_re = re.compile(r'[^ ]+')
34
36
38 """
39 Returns the base URL, given when the page was parsed.
40
41 Use with ``urlparse.urljoin(el.base_url, href)`` to get
42 absolute URLs.
43 """
44 return self.getroottree().docinfo.URL
45 base_url = property(base_url, doc=base_url.__doc__)
46
52 forms = property(forms, doc=forms.__doc__)
53
55 """
56 Return the <body> element. Can be called from a child element
57 to get the document's body.
58 """
59 return self.xpath('//body')[0]
60 body = property(body, doc=body.__doc__)
61
63 """
64 Returns the <head> element. Can be called from a child
65 element to get the document's head.
66 """
67 return self.xpath('//head')[0]
68 head = property(head, doc=head.__doc__)
69
71 """
72 Get or set any <label> element associated with this element.
73 """
74 id = self.get('id')
75 if not id:
76 return None
77 result = _label_xpath(self, id=id)
78 if not result:
79 return None
80 else:
81 return result[0]
83 id = self.get('id')
84 if not id:
85 raise TypeError(
86 "You cannot set a label for an element (%r) that has no id"
87 % self)
88 if not label.tag == 'label':
89 raise TypeError(
90 "You can only assign label to a label element (not %r)"
91 % label)
92 label.set('for', id)
94 label = self.label
95 if label is not None:
96 del label.attrib['for']
97 label = property(label__get, label__set, label__del, doc=label__get.__doc__)
98
100 """
101 Removes this element from the tree, including its children and
102 text. The tail text is joined to the previous element or
103 parent.
104 """
105 parent = self.getparent()
106 assert parent is not None
107 if self.tail:
108 previous = self.getprevious()
109 if previous is None:
110 parent.text = (parent.text or '') + self.tail
111 else:
112 previous.tail = (previous.tail or '') + self.tail
113 parent.remove(self)
114
116 """
117 Remove the tag, but not its children or text. The children and text
118 are merged into the parent.
119
120 Example::
121
122 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
123 >>> h.find('//b').drop_tag()
124 >>> print tostring(h)
125 <div>Hello World!</div>
126 """
127 parent = self.getparent()
128 assert parent is not None
129 previous = self.getprevious()
130 if self.text and isinstance(self.tag, basestring):
131
132 if previous is None:
133 parent.text = (parent.text or '') + self.text
134 else:
135 previous.tail = (previous.tail or '') + self.text
136 if self.tail:
137 if len(self):
138 last = self[-1]
139 last.tail = (last.tail or '') + self.tail
140 elif previous is None:
141 parent.text = (parent.text or '') + self.tail
142 else:
143 previous.tail = (previous.tail or '') + self.tail
144 index = parent.index(self)
145 parent[index:index+1] = self[:]
146
148 """
149 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
150 """
151 rel = rel.lower()
152 return [el for el in _rel_links_xpath(self)
153 if el.get('rel').lower() == rel]
154
156 """
157 Find any elements with the given class name.
158 """
159 return _class_xpath(self, class_name=class_name)
160
162 """
163 Get the first element in a document with the given id. If none is
164 found, return the default argument if provided or raise KeyError
165 otherwise.
166
167 Note that there can be more than one element with the same id,
168 and this isn't uncommon in HTML documents found in the wild.
169 Browsers return only the first match, and this function does
170 the same.
171 """
172 try:
173
174
175 return _id_xpath(self, id=id)[0]
176 except IndexError:
177 if default:
178 return default[0]
179 else:
180 raise KeyError, id
181
def text_content(self):
    """
    Return the XPath string value of this element: its own text together
    with the text found in any of its children.
    """
    content = _collect_string_content(self)
    return content
187
189 """
190 Run the CSS expression on this element and its children,
191 returning a list of the results.
192
193 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
194 that pre-compiling the expression can provide a substantial
195 speedup.
196 """
197 return cssselect.CSSSelect(expr)(self)
198
199
200
201
202
204 """
205 Make all links in the document absolute, given the
206 ``base_url`` for the document (the full URL where the document
207 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
208
209 If ``resolve_base_href`` is true, then any ``<base href>``
210 tags in the document are used *and* removed from the document.
211 If it is false then any such tag is ignored.
212 """
213 if base_url is None:
214 base_url = self.base_url
215 if base_url is None:
216 raise TypeError(
217 "No base_url given, and the document has no base_url")
218 if resolve_base_href:
219 self.resolve_base_href()
220 def link_repl(href):
221 return urlparse.urljoin(base_url, href)
222 self.rewrite_links(link_repl)
223
225 """
226 Find any ``<base href>`` tag in the document, and apply its
227 values to all links found in the document. Also remove the
228 tag once it has been applied.
229 """
230 base_href = None
231 basetags = self.xpath('//base[@href]')
232 for b in basetags:
233 base_href = b.get('href')
234 b.drop_tree()
235 if not base_href:
236 return
237 self.make_links_absolute(base_href, resolve_base_href=False)
238
240 """
241 Yield (element, attribute, link, pos), where attribute may be None
242 (indicating the link is in the text). ``pos`` is the position
243 where the link occurs; often 0, but sometimes something else in
244 the case of links in stylesheets or style tags.
245
246 Note: <base href> is *not* taken into account in any way. The
247 link you get is exactly the link in the document.
248 """
249 link_attrs = defs.link_attrs
250 for el in self.getiterator():
251 attribs = el.attrib
252 if el.tag != 'object':
253 for attrib in link_attrs:
254 if attrib in attribs:
255 yield (el, attrib, attribs[attrib], 0)
256 elif el.tag == 'object':
257 codebase = None
258
259
260 if 'codebase' in attribs:
261 codebase = el.get('codebase')
262 yield (el, 'codebase', codebase, 0)
263 for attrib in 'classid', 'data':
264 if attrib in attribs:
265 value = el.get(attrib)
266 if codebase is not None:
267 value = urlparse.urljoin(codebase, value)
268 yield (el, attrib, value, 0)
269 if 'archive' in attribs:
270 for match in _archive_re.finditer(el.get('archive')):
271 value = match.group(0)
272 if codebase is not None:
273 value = urlparse.urljoin(codebase, value)
274 yield (el, 'archive', value, match.start())
275 if el.tag == 'param':
276 valuetype = el.get('valuetype') or ''
277 if valuetype.lower() == 'ref':
278
279
280
281
282
283
284 yield (el, 'value', el.get('value'), 0)
285 if el.tag == 'style' and el.text:
286 for match in _css_url_re.finditer(el.text):
287 yield (el, None, match.group(1), match.start(1))
288 for match in _css_import_re.finditer(el.text):
289 yield (el, None, match.group(1), match.start(1))
290 if 'style' in attribs:
291 for match in _css_url_re.finditer(attribs['style']):
292 yield (el, 'style', match.group(1), match.start(1))
293
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link reported by
    ``iterlinks()``, ``link_repl_func(link)`` is called and its return
    value replaces the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then the links are first made absolute
    against it, so ``link_repl_func()`` sees the absolute form.
    Otherwise, if ``resolve_base_href`` is true, ``resolve_base_href()``
    is applied first.

    If ``link_repl_func`` returns None for a link, the attribute holding
    it is deleted, or, for a link found in element text, that element's
    text is set to ``''``.  A return value equal to the link leaves it
    untouched.

    All replacements are requested first, in ``iterlinks()`` order, and
    only then written back.  Several links sharing one attribute or one
    text node are written back from the highest offset to the lowest, so
    that a replacement of a different length cannot invalidate the
    offsets of the links that are still pending.

    NOTE(review): for ``<object>`` attributes ``iterlinks()`` can report
    a link already joined with the codebase, so ``len(link)`` may differ
    from the text actually stored; this method does not compensate.
    """
    if base_href is not None:
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # Pass 1: snapshot the links and collect every replacement before the
    # document is modified.
    targets = []      # [(element, attribute, edits)] in first-seen order
    by_target = {}    # (id(element), attribute) -> that target's edits list
    seq = 0
    for el, attrib, link, pos in list(self.iterlinks()):
        new_link = link_repl_func(link)
        if new_link == link:
            continue
        key = (id(el), attrib)
        edits = by_target.get(key)
        if edits is None:
            edits = by_target[key] = []
            # Holding ``el`` here keeps id(el) valid for the whole call.
            targets.append((el, attrib, edits))
        # ``seq`` is unique, so sorting never has to compare the links.
        edits.append((pos, seq, link, new_link))
        seq += 1

    # Pass 2: write the replacements back, one target at a time.
    for el, attrib, edits in targets:
        removed = False
        for edit in edits:
            if edit[3] is None:
                removed = True
                break
        if removed:
            # One None result drops the whole attribute / text, exactly
            # once, no matter how many links it contained.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue

        # Highest offset first keeps the remaining offsets valid.
        edits.sort()
        edits.reverse()
        if attrib is None:
            value = el.text
        else:
            value = el.attrib[attrib]
        for pos, _seq, link, new_link in edits:
            value = value[:pos] + new_link + value[pos + len(link):]
        if attrib is None:
            el.text = value
        else:
            el.attrib[attrib] = value
340
341
343 """
344 An object that represents a method on an element as a function;
345 the function takes either an element or an HTML string. It
346 returns whatever the function normally returns, or if the function
347 works in-place (and so returns None) it returns a serialized form
348 of the resulting document.
349 """
350 - def __init__(self, name, copy=False, source_class=HtmlMixin):
355 if isinstance(doc, basestring):
356 if 'copy' in kw:
357 raise TypeError(
358 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
359 return_string = True
360 doc = fromstring(doc, **kw)
361 else:
362 if 'copy' in kw:
363 copy = kw.pop('copy')
364 else:
365 copy = self.copy
366 return_string = False
367 if copy:
368 doc = copy.deepcopy(doc)
369 meth = getattr(doc, self.name)
370 result = meth(*args, **kw)
371
372 if result is None:
373
374 if return_string:
375 return tostring(doc)
376 else:
377 return doc
378 else:
379 return result
380
# Function-style entry points for the element methods of the same name.
# Each _MethodFunc accepts either an element or an HTML string (see the
# _MethodFunc docstring).  ``copy`` is the default for the per-call ``copy``
# keyword: when it is true, the wrapper's call body deep-copies an element
# input before invoking the method on it.
# NOTE(review): that call body also binds a local named ``copy`` before
# calling ``copy.deepcopy(doc)``, which looks like it shadows the ``copy``
# module -- confirm the copy=True wrappers work on element input.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
387
390
393
396
399
400
402 """A lookup scheme for HTML Element classes.
403
404 To create a lookup instance with different Element classes, pass a tag
405 name mapping of Element classes in the ``classes`` keyword argument and/or
406 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
407 The special key '*' denotes a Mixin class that should be mixed into all
408 Element classes.
409 """
410 _default_element_classes = {}
411
def __init__(self, classes=None, mixins=None):
    """
    Build the tag-name -> Element class table used by ``lookup()``.

    ``classes`` maps lower-case tag names to Element classes; when it is
    None a copy of ``_default_element_classes`` is used.  A dict passed
    in by the caller is used as-is and is updated in place when mixins
    are applied.

    ``mixins`` is either a mapping of tag name to Mixin class (the
    documented form) or an iterable of ``(tag name, Mixin class)`` pairs
    (the form historically accepted, which also allows several mixins
    for one tag).  The special name ``'*'`` mixes the class into every
    tag currently present in ``classes``.
    """
    etree.CustomElementClassLookup.__init__(self)
    if classes is None:
        classes = self._default_element_classes.copy()
    if mixins:
        if hasattr(mixins, 'items'):
            # Iterating a mapping directly would yield bare keys and
            # break the pair unpacking below.
            mixins = mixins.items()
        mixers = {}
        for name, value in mixins:
            if name == '*':
                for n in classes.keys():
                    mixers.setdefault(n, []).append(value)
            else:
                mixers.setdefault(name, []).append(value)
        for name, mix_bases in mixers.items():
            cur = classes.get(name, HtmlElement)
            # Mixins go first so they take precedence over the element
            # class in the method resolution order.
            bases = tuple(mix_bases + [cur])
            classes[name] = type(cur.__name__, bases, {})
    self._element_classes = classes
429
def lookup(self, node_type, document, namespace, name):
    """
    Pick the class to use for a node.

    Elements are resolved through the tag-name table built by
    ``__init__`` (the name is lower-cased first), with ``HtmlElement`` as
    the fallback.  Comments, processing instructions and entities map to
    their fixed Html* classes.  Any other node type yields None --
    presumably so that lxml continues with its normal lookup (confirm).
    ``document`` and ``namespace`` are not consulted.
    """
    if node_type == 'element':
        tag = name.lower()
        return self._element_classes.get(tag, HtmlElement)
    if node_type == 'comment':
        return HtmlComment
    if node_type == 'PI':
        return HtmlProcessingInstruction
    if node_type == 'entity':
        return HtmlEntity
    return None
441
442
443
444
445
452
454 """
455 Parses several HTML elements, returning a list of elements.
456
457 The first item in the list may be a string (though leading
458 whitespace is removed). If no_leading_text is true, then it will
459 be an error if there is leading text, and it will always be a list
460 of only elements.
461 """
462
463 start = html[:20].lstrip().lower()
464 if not start.startswith('<html') and not start.startswith('<!doctype'):
465 html = '<html><body>%s</body></html>' % html
466 doc = document_fromstring(html, **kw)
467 assert doc.tag == 'html'
468 bodies = [e for e in doc if e.tag == 'body']
469 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
470 body = bodies[0]
471 elements = []
472 if no_leading_text and body.text and body.text.strip():
473 raise etree.ParserError(
474 "There is leading text: %r" % body.text)
475 if body.text and body.text.strip():
476 elements.append(body.text)
477 elements.extend(body)
478
479
480 return elements
481
483 """
484 Parses a single HTML element; it is an error if there is more than
485 one element, or if anything but whitespace precedes or follows the
486 element.
487
488 If create_parent is true (or is a tag name) then a parent node
489 will be created to encapsulate the HTML in a single element.
490 """
491 if create_parent:
492 if not isinstance(create_parent, basestring):
493 create_parent = 'div'
494 return fragment_fromstring('<%s>%s</%s>' % (
495 create_parent, html, create_parent), **kw)
496 elements = fragments_fromstring(html, no_leading_text=True)
497 if not elements:
498 raise etree.ParserError(
499 "No elements found")
500 if len(elements) > 1:
501 raise etree.ParserError(
502 "Multiple elements found (%s)"
503 % ', '.join([_element_name(e) for e in elements]))
504 el = elements[0]
505 if el.tail and el.tail.strip():
506 raise etree.ParserError(
507 "Element followed by text: %r" % el.tail)
508 el.tail = None
509 return el
510
512 """
513 Parse the html, returning a single element/document.
514
515 This tries to minimally parse the chunk of text, without knowing if it
516 is a fragment or a document.
517 """
518 start = html[:10].lstrip().lower()
519 if start.startswith('<html') or start.startswith('<!doctype'):
520
521 return document_fromstring(html, **kw)
522
523 doc = document_fromstring(html, **kw)
524 bodies = doc.findall('body')
525 if bodies:
526 body = bodies[0]
527 if len(bodies) > 1:
528
529
530 for other_body in bodies[1:]:
531 if other_body.text:
532 if len(body):
533 body[-1].tail = (body[-1].tail or '') + other_body.text
534 else:
535 body.text = (body.text or '') + other_body.text
536 body.extend(other_body)
537
538
539 other_body.drop_tree()
540 else:
541 body = None
542 heads = doc.findall('head')
543 if heads:
544
545 head = heads[0]
546 if len(heads) > 1:
547 for other_head in heads[1:]:
548 head.extend(other_head)
549
550 other_head.drop_tree()
551 return doc
552 if (len(body) == 1 and (not body.text or not body.text.strip())
553 and (not body[-1].tail or not body[-1].tail.strip())):
554
555
556 return body[0]
557
558
559
560 if _contains_block_level_tag(body):
561 body.tag = 'div'
562 else:
563 body.tag = 'span'
564 return body
565
def parse(filename, parser=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document.

    The module-level ``html_parser`` is used unless ``parser`` is given.
    Extra keyword arguments are handed on to ``etree.parse``; for
    example ``base_url='http://...'`` sets the base URL.
    """
    if parser is not None:
        return etree.parse(filename, parser, **kw)
    return etree.parse(filename, html_parser, **kw)
576
578
579
580 for el in el.getiterator():
581 if el.tag in defs.block_tags:
582 return True
583 return False
584
586 if isinstance(el, etree.CommentBase):
587 return 'comment'
588 elif isinstance(el, basestring):
589 return 'string'
590 else:
591 return el.tag
592
593
594
595
596
697
# Register FormElement as the default Element class for the 'form' tag.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
699
732
734 import urllib
735
736 if method == 'GET':
737 if '?' in url:
738 url += '&'
739 else:
740 url += '?'
741 url += urllib.urlencode(values)
742 data = None
743 else:
744 data = urllib.urlencode(values)
745 return urllib.urlopen(url, data)
746
748
752 return self.inputs[item].value
754 self.inputs[item].value = value
756 raise KeyError(
757 "You cannot remove keys from ElementDict")
759 return self.inputs.keys()
761 return item in self.inputs
762
764 return '<%s for form %s>' % (
765 self.__class__.__name__,
766 self.inputs.form._name())
767
832
860
861 -class TextareaElement(InputMixin, HtmlElement):
862 """
863 ``<textarea>`` element. You can get the name with ``.name`` and
864 get/set the value with ``.value``
865 """
866
867 - def value__get(self):
868 """
869 Get/set the value (which is the contents of this element)
870 """
871 return self.text or ''
872 - def value__set(self, value):
874 - def value__del(self):
876 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
877
# Register TextareaElement as the default Element class for the 'textarea' tag.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
879
881 """
882 ``<select>`` element. You can get the name with ``.name``.
883
884 ``.value`` will be the value of the selected option, unless this
885 is a multi-select element (``<select multiple>``), in which case
886 it will be a set-like object. In either case ``.value_options``
887 gives the possible values.
888
889 The boolean attribute ``.multiple`` shows if this is a
890 multi-select.
891 """
892
894 """
895 Get/set the value of this select (the selected option).
896
897 If this is a multi-select, this is a set-like object that
898 represents all the selected options.
899 """
900 if self.multiple:
901 return MultipleSelectOptions(self)
902 for el in self.getiterator('option'):
903 if 'selected' in el.attrib:
904 value = el.get('value')
905
906 return value
907 return None
908
910 if self.multiple:
911 if isinstance(value, basestring):
912 raise TypeError(
913 "You must pass in a sequence")
914 self.value.clear()
915 self.value.update(value)
916 return
917 if value is not None:
918 for el in self.getiterator('option'):
919
920 if el.get('value') == value:
921 checked_option = el
922 break
923 else:
924 raise ValueError(
925 "There is no option with the value of %r" % value)
926 for el in self.getiterator('option'):
927 if 'selected' in el.attrib:
928 del el.attrib['selected']
929 if value is not None:
930 checked_option.set('selected', '')
931
933
934 if self.multiple:
935 self.value.clear()
936 else:
937 self.value = None
938
939 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
940
942 """
943 All the possible values this select can have (the ``value``
944 attribute of all the ``<option>`` elements).
945 """
946 return [el.get('value') for el in self.getiterator('option')]
947 value_options = property(value_options, doc=value_options.__doc__)
948
950 """
951 Boolean attribute: is there a ``multiple`` attribute on this element.
952 """
953 return 'multiple' in self.attrib
955 if value:
956 self.set('multiple', '')
957 elif 'multiple' in self.attrib:
958 del self.attrib['multiple']
959 multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)
960
# Register SelectElement as the default Element class for the 'select' tag.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
962
964 """
965 Represents all the selected options in a ``<select multiple>`` element.
966
967 You can add to this set-like object to select an option, or remove
968 to unselect the option.
969 """
970
973
975 """
976 Iterator of all the ``<option>`` elements.
977 """
978 return self.select.getiterator('option')
979 options = property(options)
980
982 for option in self.options:
983 yield option.get('value')
984
def add(self, item):
    """
    Select the first ``<option>`` whose ``value`` attribute equals
    ``item`` by setting an empty ``selected`` attribute on it.

    Raises ValueError when no option carries that value.
    """
    for candidate in self.options:
        if candidate.get('value') != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
993
995 for option in self.options:
996 if option.get('value') == item:
997 if 'selected' in option.attrib:
998 del option.attrib['selected']
999 else:
1000 raise ValueError(
1001 "The option %r is not currently selected" % item)
1002 break
1003 else:
1004 raise ValueError(
1005 "There is not option with the value %r" % item)
1006
1008 return '<%s {%s} for select name=%r>' % (
1009 self.__class__.__name__,
1010 ', '.join([repr(v) for v in self]),
1011 self.select.name)
1012
1014 """
1015 This object represents several ``<input type=radio>`` elements
1016 that have the same name.
1017
1018 You can use this like a list, but also use the property
1019 ``.value`` to check/uncheck inputs. Also you can use
1020 ``.value_options`` to get the possible values.
1021 """
1022
1024 """
1025 Get/set the value, which checks the radio with that value (and
1026 unchecks any other value).
1027 """
1028 for el in self:
1029 if 'checked' in el.attrib:
1030 return el.get('value')
1031 return None
1032
1034 if value is not None:
1035 for el in self:
1036 if el.get('value') == value:
1037 checked_option = el
1038 break
1039 else:
1040 raise ValueError(
1041 "There is no radio input with the value %r" % value)
1042 for el in self:
1043 if 'checked' in el.attrib:
1044 del el.attrib['checked']
1045 if value is not None:
1046 checked_option.set('checked', '')
1047
1050
1051 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1052
1054 """
1055 Returns a list of all the possible values.
1056 """
1057 return [el.get('value') for el in self]
1058 value_options = property(value_options, doc=value_options.__doc__)
1059
1061 return '%s(%s)' % (
1062 self.__class__.__name__,
1063 list.__repr__(self))
1064
1066 """
1067 Represents a group of checkboxes (``<input type=checkbox>``) that
1068 have the same name.
1069
1070 In addition to using this like a list, the ``.value`` attribute
1071 returns a set-like object that you can add to or remove from to
1072 check and uncheck checkboxes. You can also use ``.value_options``
1073 to get the possible values.
1074 """
1075
1077 """
1078 Return a set-like object that can be modified to check or
1079 uncheck individual checkboxes according to their value.
1080 """
1081 return CheckboxValues(self)
1083 self.value.clear()
1084 if not hasattr(value, '__iter__'):
1085 raise ValueError(
1086 "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
1087 % (self[0].name, value))
1088 self.value.update(value)
1091 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1092
1094 return '%s(%s)' % (
1095 self.__class__.__name__, list.__repr__(self))
1096
1098
1099 """
1100 Represents the values of the checked checkboxes in a group of
1101 checkboxes with the same name.
1102 """
1103
1106
1108 return iter([
1109 el.get('value')
1110 for el in self.group
1111 if 'checked' in el.attrib])
1112
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value`` by setting an empty ``checked`` attribute on it.

    Raises KeyError when the group has no checkbox with that value.
    """
    for box in self.group:
        if box.get('value') != value:
            continue
        box.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1120
1122 for el in self.group:
1123 if el.get('value') == value:
1124 if 'checked' in el.attrib:
1125 del el.attrib['checked']
1126 else:
1127 raise KeyError(
1128 "The checkbox with value %r was already unchecked" % value)
1129 break
1130 else:
1131 raise KeyError(
1132 "No checkbox with value %r" % value)
1133
1135 return '<%s {%s} for checkboxes name=%r>' % (
1136 self.__class__.__name__,
1137 ', '.join([repr(v) for v in self]),
1138 self.group.name)
1139
1223
# Register InputElement as the default Element class for the 'input' tag.
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1225
1227 """
1228 Represents a ``<label>`` element.
1229
1230 Label elements are linked to other elements with their ``for``
1231 attribute. You can access this element with ``label.for_element``.
1232 """
1233
1235 """
1236 Get/set the element this label points to. Return None if it
1237 can't be found.
1238 """
1239 id = self.get('for')
1240 if not id:
1241 return None
1242 return self.body.get_element_by_id(id)
1244 id = other.get('id')
1245 if not id:
1246 raise TypeError(
1247 "Element %r has no id attribute" % other)
1248 self.set('for', id)
1252 for_element = property(for_element__get, for_element__set, for_element__del,
1253 doc=for_element__get.__doc__)
1254
# Register LabelElement as the default Element class for the 'label' tag.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1256
1257
1258
1259
1260
1261
1262
# Bound ``sub`` of a pattern matching a ``<meta http-equiv="Content-Type" ...>``
# tag up to its first ``>``.  The match is case-sensitive and requires the
# http-equiv attribute to come first, in double quotes.  tostring() uses it
# to strip that tag from serialized output.
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub
1265
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None):
    """
    Return the HTML string representation of the given document.

    The document is serialized with ``etree.tostring(..., method="html")``,
    passing ``pretty_print`` and ``encoding`` through.

    Unless ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag found in the serialized
    output is removed.  When it is true the output is returned as
    serialized; this function itself never adds such a tag.

    ``doc`` must not be None (checked with an assertion).
    """
    assert doc is not None
    serialized = etree.tostring(
        doc, method="html", pretty_print=pretty_print, encoding=encoding)
    if include_meta_content_type:
        return serialized
    return __replace_meta_content_type('', serialized)
1281
1283 """
1284 Open the HTML document in a web browser (saving it to a temporary
1285 file to open it).
1286 """
1287 import os
1288 import webbrowser
1289 try:
1290 write_doc = doc.write
1291 except AttributeError:
1292 write_doc = etree.ElementTree(element=doc).write
1293 fn = os.tempnam() + '.html'
1294 write_doc(fn, method="html")
1295 url = 'file://' + fn.replace(os.path.sep, '/')
1296 print url
1297 webbrowser.open(url)
1298
1299
1300
1301
1302
1307
1311
# Shared module-level parser instance; parse() falls back to it when the
# caller does not pass a parser.
html_parser = HTMLParser()
1313