1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 import urlparse
7 import copy
8 from lxml import etree
9 from lxml.html import defs
10 from lxml import cssselect
11 from lxml.html._setmixin import SetMixin
12 try:
13 from UserDict import DictMixin
14 except ImportError:
15
16 from lxml.html._dictmixin import DictMixin
17 try:
18 set
19 except NameError:
20 from sets import Set as set
21
# Public names exported by ``from lxml.html import *``.  Each name is
# listed exactly once.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']
27
# Pre-compiled XPath expressions, evaluated relative to the element they are
# called with.

# <a> elements (the element itself or any descendant) that carry a ``rel``
# attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")

# Elements whose whitespace-separated ``class`` attribute contains the
# ``$class_name`` variable as one complete token.
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains("
    "concat(' ', normalize-space(@class), ' '), "
    "concat(' ', $class_name, ' '))]")

# Elements whose ``id`` attribute equals the ``$id`` variable.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")

# String value of the context element (XPath ``string()``).
_collect_string_content = etree.XPath("string()")

# <label> elements anywhere in the document whose ``for`` attribute equals
# the ``$id`` variable.
_label_xpath = etree.XPath("//label[@for=$id]")

# Pre-compiled regular expressions.

# ``url(...)`` references in CSS; group 1 is the text between the parentheses.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)

# Double-quoted ``@import "..."`` rules in CSS; group 1 is the quoted text.
_css_import_re = re.compile(r'@import "(.*?)"')

# One run of non-space characters (a single entry of a space-separated list).
_archive_re = re.compile(r'[^ ]+')
37
39
41 """
42 Returns the base URL, given when the page was parsed.
43
44 Use with ``urlparse.urljoin(el.base_url, href)`` to get
45 absolute URLs.
46 """
47 return self.getroottree().docinfo.URL
48 base_url = property(base_url, doc=base_url.__doc__)
49
55 forms = property(forms, doc=forms.__doc__)
56
58 """
59 Return the <body> element. Can be called from a child element
60 to get the document's body.
61 """
62 return self.xpath('//body')[0]
63 body = property(body, doc=body.__doc__)
64
66 """
67 Returns the <head> element. Can be called from a child
68 element to get the document's head.
69 """
70 return self.xpath('//head')[0]
71 head = property(head, doc=head.__doc__)
72
74 """
75 Get or set any <label> element associated with this element.
76 """
77 id = self.get('id')
78 if not id:
79 return None
80 result = _label_xpath(self, id=id)
81 if not result:
82 return None
83 else:
84 return result[0]
86 id = self.get('id')
87 if not id:
88 raise TypeError(
89 "You cannot set a label for an element (%r) that has no id"
90 % self)
91 if not label.tag == 'label':
92 raise TypeError(
93 "You can only assign label to a label element (not %r)"
94 % label)
95 label.set('for', id)
97 label = self.label
98 if label is not None:
99 del label.attrib['for']
100 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
101
103 """
104 Removes this element from the tree, including its children and
105 text. The tail text is joined to the previous element or
106 parent.
107 """
108 parent = self.getparent()
109 assert parent is not None
110 if self.tail:
111 previous = self.getprevious()
112 if previous is None:
113 parent.text = (parent.text or '') + self.tail
114 else:
115 previous.tail = (previous.tail or '') + self.tail
116 parent.remove(self)
117
119 """
120 Remove the tag, but not its children or text. The children and text
121 are merged into the parent.
122
123 Example::
124
125 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
126 >>> h.find('.//b').drop_tag()
127 >>> print tostring(h)
128 <div>Hello World!</div>
129 """
130 parent = self.getparent()
131 assert parent is not None
132 previous = self.getprevious()
133 if self.text and isinstance(self.tag, basestring):
134
135 if previous is None:
136 parent.text = (parent.text or '') + self.text
137 else:
138 previous.tail = (previous.tail or '') + self.text
139 if self.tail:
140 if len(self):
141 last = self[-1]
142 last.tail = (last.tail or '') + self.tail
143 elif previous is None:
144 parent.text = (parent.text or '') + self.tail
145 else:
146 previous.tail = (previous.tail or '') + self.tail
147 index = parent.index(self)
148 parent[index:index+1] = self[:]
149
151 """
152 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
153 """
154 rel = rel.lower()
155 return [el for el in _rel_links_xpath(self)
156 if el.get('rel').lower() == rel]
157
159 """
160 Find any elements with the given class name.
161 """
162 return _class_xpath(self, class_name=class_name)
163
165 """
166 Get the first element in a document with the given id. If none is
167 found, return the default argument if provided or raise KeyError
168 otherwise.
169
170 Note that there can be more than one element with the same id,
171 and this isn't uncommon in HTML documents found in the wild.
172 Browsers return only the first match, and this function does
173 the same.
174 """
175 try:
176
177
178 return _id_xpath(self, id=id)[0]
179 except IndexError:
180 if default:
181 return default[0]
182 else:
183 raise KeyError, id
184
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its descendants, as computed by the XPath ``string()`` function.
    """
    collected = _collect_string_content(self)
    return collected
190
192 """
193 Run the CSS expression on this element and its children,
194 returning a list of the results.
195
196 Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
197 that pre-compiling the expression can provide a substantial
198 speedup.
199 """
200 return cssselect.CSSSelector(expr)(self)
201
202
203
204
205
207 """
208 Make all links in the document absolute, given the
209 ``base_url`` for the document (the full URL where the document
210 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
211
212 If ``resolve_base_href`` is true, then any ``<base href>``
213 tags in the document are used *and* removed from the document.
214 If it is false then any such tag is ignored.
215 """
216 if base_url is None:
217 base_url = self.base_url
218 if base_url is None:
219 raise TypeError(
220 "No base_url given, and the document has no base_url")
221 if resolve_base_href:
222 self.resolve_base_href()
223 def link_repl(href):
224 return urlparse.urljoin(base_url, href)
225 self.rewrite_links(link_repl)
226
228 """
229 Find any ``<base href>`` tag in the document, and apply its
230 values to all links found in the document. Also remove the
231 tag once it has been applied.
232 """
233 base_href = None
234 basetags = self.xpath('//base[@href]')
235 for b in basetags:
236 base_href = b.get('href')
237 b.drop_tree()
238 if not base_href:
239 return
240 self.make_links_absolute(base_href, resolve_base_href=False)
241
243 """
244 Yield (element, attribute, link, pos), where attribute may be None
245 (indicating the link is in the text). ``pos`` is the position
246 where the link occurs; often 0, but sometimes something else in
247 the case of links in stylesheets or style tags.
248
249 Note: <base href> is *not* taken into account in any way. The
250 link you get is exactly the link in the document.
251 """
252 link_attrs = defs.link_attrs
253 for el in self.getiterator():
254 attribs = el.attrib
255 if el.tag != 'object':
256 for attrib in link_attrs:
257 if attrib in attribs:
258 yield (el, attrib, attribs[attrib], 0)
259 elif el.tag == 'object':
260 codebase = None
261
262
263 if 'codebase' in attribs:
264 codebase = el.get('codebase')
265 yield (el, 'codebase', codebase, 0)
266 for attrib in 'classid', 'data':
267 if attrib in attribs:
268 value = el.get(attrib)
269 if codebase is not None:
270 value = urlparse.urljoin(codebase, value)
271 yield (el, attrib, value, 0)
272 if 'archive' in attribs:
273 for match in _archive_re.finditer(el.get('archive')):
274 value = match.group(0)
275 if codebase is not None:
276 value = urlparse.urljoin(codebase, value)
277 yield (el, 'archive', value, match.start())
278 if el.tag == 'param':
279 valuetype = el.get('valuetype') or ''
280 if valuetype.lower() == 'ref':
281
282
283
284
285
286
287 yield (el, 'value', el.get('value'), 0)
288 if el.tag == 'style' and el.text:
289 for match in _css_url_re.finditer(el.text):
290 yield (el, None, match.group(1), match.start(1))
291 for match in _css_import_re.finditer(el.text):
292 yield (el, None, match.group(1), match.start(1))
293 if 'style' in attribs:
294 for match in _css_url_re.finditer(attribs['style']):
295 yield (el, 'style', match.group(1), match.start(1))
296
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called (with surrounding
    whitespace stripped from the link), and the return value will
    replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.  Otherwise, if
    ``resolve_base_href`` is true, ``self.resolve_base_href()`` is
    called first.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely; any other link found in
    that same attribute or text is then left alone.

    All links are collected (and ``link_repl_func`` is called for each
    of them, in the order ``iterlinks()`` yields them) before the
    document is modified.  The replacements are then applied from the
    highest position to the lowest, so that several links inside one
    text or attribute value (e.g. several ``url(...)`` references in
    a stylesheet) are each replaced at the right place even when the
    new links differ in length from the old ones.
    """
    if base_href is not None:
        self.make_links_absolute(base_href,
                                 resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # First pass: ask the callback about every link without touching the
    # document.  The positions reported by iterlinks() refer to the
    # unmodified text, so nothing may be rewritten yet.
    pending = []
    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        # ``len(pending)`` is a unique tie-breaker, so sorting never has to
        # compare the elements themselves.
        pending.append((pos, len(pending), el, attrib, link, new_link))

    # Second pass: apply the edits from the highest position down, which
    # keeps the positions of the edits still to come valid.
    pending.sort()
    pending.reverse()
    removed = set()
    for pos, _order, el, attrib, link, new_link in pending:
        # The elements stay referenced from ``pending``, so id() is a stable
        # key for the duration of this loop.
        target = (id(el), attrib)
        if target in removed:
            # The whole text/attribute is already gone.
            continue
        if new_link is None:
            removed.add(target)
            if attrib is None:
                el.text = ''
            elif attrib in el.attrib:
                del el.attrib[attrib]
            continue
        if attrib is None:
            el.text = el.text[:pos] + new_link + el.text[pos+len(link):]
        else:
            cur = el.attrib[attrib]
            # When the link is the entire attribute value this reduces to
            # plain ``new_link``.
            el.attrib[attrib] = cur[:pos] + new_link + cur[pos+len(link):]
343
344
346 """
347 An object that represents a method on an element as a function;
348 the function takes either an element or an HTML string. It
349 returns whatever the function normally returns, or if the function
350 works in-place (and so returns None) it returns a serialized form
351 of the resulting document.
352 """
353 - def __init__(self, name, copy=False, source_class=HtmlMixin):
def __call__(self, doc, *args, **kw):
    """
    Call the method named ``self.name`` on ``doc``.

    ``doc`` is either an element/document or a string.  A string is
    parsed with ``fromstring(doc, **kw)`` first; the ``copy`` keyword
    is rejected with a TypeError in that case.  For element input the
    ``copy`` keyword (default: ``self.copy``) decides whether the
    method is applied to a deep copy instead of ``doc`` itself.

    Returns the method's result if it is not None.  Otherwise returns
    the (possibly copied) document, serialized with ``tostring()``
    when the input was a string.
    """
    if isinstance(doc, basestring):
        if 'copy' in kw:
            raise TypeError(
                "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
        return_string = True
        # The document parsed below is created by this call, so there is
        # nothing to protect with a copy.
        make_a_copy = False
        doc = fromstring(doc, **kw)
    else:
        # The flag is deliberately not named ``copy``: that would shadow
        # the ``copy`` module needed for deepcopy() below.
        if 'copy' in kw:
            make_a_copy = kw.pop('copy')
        else:
            make_a_copy = self.copy
        return_string = False
    if make_a_copy:
        doc = copy.deepcopy(doc)
    meth = getattr(doc, self.name)
    result = meth(*args, **kw)
    if result is None:
        # The method worked in place; hand back the modified document.
        if return_string:
            return tostring(doc)
        else:
            return doc
    else:
        return result
383
384 find_rel_links = _MethodFunc('find_rel_links', copy=False)
385 find_class = _MethodFunc('find_class', copy=False)
386 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
387 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
388 iterlinks = _MethodFunc('iterlinks', copy=False)
389 rewrite_links = _MethodFunc('rewrite_links', copy=True)
390
393
396
399
402
403
405 """A lookup scheme for HTML Element classes.
406
407 To create a lookup instance with different Element classes, pass a tag
408 name mapping of Element classes in the ``classes`` keyword argument and/or
409 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
410 The special key '*' denotes a Mixin class that should be mixed into all
411 Element classes.
412 """
413 _default_element_classes = {}
414
def __init__(self, classes=None, mixins=None):
    """
    Set up the tag name -> Element class table used by ``lookup()``.

    ``classes`` maps tag names to Element classes; when it is None a
    copy of ``_default_element_classes`` is used.  A dict passed in
    here is stored as is and updated in place when mixins are applied.

    ``mixins`` is either a mapping of tag names to Mixin classes or a
    sequence of ``(tag name, Mixin class)`` pairs.  The special name
    ``'*'`` applies the Mixin to every tag name present in ``classes``.
    For each affected tag a new class is created that derives from the
    Mixin classes followed by the current class for that tag
    (``HtmlElement`` if the tag has none).
    """
    etree.CustomElementClassLookup.__init__(self)
    if classes is None:
        classes = self._default_element_classes.copy()
    if mixins:
        # Iterating a mapping directly would yield only its keys, so take
        # its items; a sequence of pairs is used unchanged.
        if hasattr(mixins, 'items'):
            mixin_pairs = mixins.items()
        else:
            mixin_pairs = mixins
        mixers = {}
        for name, value in mixin_pairs:
            if name == '*':
                for n in classes.keys():
                    mixers.setdefault(n, []).append(value)
            else:
                mixers.setdefault(name, []).append(value)
        for name, mix_bases in mixers.items():
            cur = classes.get(name, HtmlElement)
            bases = tuple(mix_bases + [cur])
            classes[name] = type(cur.__name__, bases, {})
    self._element_classes = classes
432
def lookup(self, node_type, document, namespace, name):
    """
    Return the class to use for a node of the given ``node_type``.

    Elements are looked up by lower-cased tag name in the table built
    by ``__init__``, with ``HtmlElement`` as the fallback.  Comments,
    processing instructions and entities each map to one fixed class.
    Any other node type yields None.
    """
    if node_type == 'element':
        tag_classes = self._element_classes
        return tag_classes.get(name.lower(), HtmlElement)
    if node_type == 'comment':
        return HtmlComment
    if node_type == 'PI':
        return HtmlProcessingInstruction
    if node_type == 'entity':
        return HtmlEntity
    # No class is provided for any other node type.
    return None
444
445
446
447
448
455
457 """
458 Parses several HTML elements, returning a list of elements.
459
460 The first item in the list may be a string (though leading
461 whitespace is removed). If no_leading_text is true, then it will
462 be an error if there is leading text, and it will always be a list
463 of only elements.
464
465 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
466 """
467
468 start = html[:20].lstrip().lower()
469 if not start.startswith('<html') and not start.startswith('<!doctype'):
470 html = '<html><body>%s</body></html>' % html
471 doc = document_fromstring(html, base_url=base_url, **kw)
472 assert doc.tag == 'html'
473 bodies = [e for e in doc if e.tag == 'body']
474 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
475 body = bodies[0]
476 elements = []
477 if no_leading_text and body.text and body.text.strip():
478 raise etree.ParserError(
479 "There is leading text: %r" % body.text)
480 if body.text and body.text.strip():
481 elements.append(body.text)
482 elements.extend(body)
483
484
485 return elements
486
488 """
489 Parses a single HTML element; it is an error if there is more than
490 one element, or if anything but whitespace precedes or follows the
491 element.
492
493 If create_parent is true (or is a tag name) then a parent node
494 will be created to encapsulate the HTML in a single element.
495
496 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
497 """
498 if create_parent:
499 if not isinstance(create_parent, basestring):
500 create_parent = 'div'
501 return fragment_fromstring('<%s>%s</%s>' % (
502 create_parent, html, create_parent), base_url=base_url, **kw)
503 elements = fragments_fromstring(html, no_leading_text=True, base_url=base_url, **kw)
504 if not elements:
505 raise etree.ParserError(
506 "No elements found")
507 if len(elements) > 1:
508 raise etree.ParserError(
509 "Multiple elements found (%s)"
510 % ', '.join([_element_name(e) for e in elements]))
511 el = elements[0]
512 if el.tail and el.tail.strip():
513 raise etree.ParserError(
514 "Element followed by text: %r" % el.tail)
515 el.tail = None
516 return el
517
519 """
520 Parse the html, returning a single element/document.
521
522 This tries to minimally parse the chunk of text, without knowing if it
523 is a fragment or a document.
524
525 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
526 """
527 start = html[:10].lstrip().lower()
528 if start.startswith('<html') or start.startswith('<!doctype'):
529
530 return document_fromstring(html, base_url=base_url, **kw)
531
532 doc = document_fromstring(html, base_url=base_url, **kw)
533 bodies = doc.findall('body')
534 if bodies:
535 body = bodies[0]
536 if len(bodies) > 1:
537
538
539 for other_body in bodies[1:]:
540 if other_body.text:
541 if len(body):
542 body[-1].tail = (body[-1].tail or '') + other_body.text
543 else:
544 body.text = (body.text or '') + other_body.text
545 body.extend(other_body)
546
547
548 other_body.drop_tree()
549 else:
550 body = None
551 heads = doc.findall('head')
552 if heads:
553
554 head = heads[0]
555 if len(heads) > 1:
556 for other_head in heads[1:]:
557 head.extend(other_head)
558
559 other_head.drop_tree()
560 return doc
561 if (len(body) == 1 and (not body.text or not body.text.strip())
562 and (not body[-1].tail or not body[-1].tail.strip())):
563
564
565 return body[0]
566
567
568
569 if _contains_block_level_tag(body):
570 body.tag = 'div'
571 else:
572 body.tag = 'span'
573 return body
574
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The result is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    ``parser`` defaults to the module-level ``html_parser``.

    The ``base_url`` keyword overrides the base URL, which is most
    useful when parsing from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser,
                       base_url=base_url, **kw)
587
589
590
591 for el in el.getiterator():
592 if el.tag in defs.block_tags:
593 return True
594 return False
595
597 if isinstance(el, etree.CommentBase):
598 return 'comment'
599 elif isinstance(el, basestring):
600 return 'string'
601 else:
602 return el.tag
603
604
605
606
607
708
709 HtmlElementClassLookup._default_element_classes['form'] = FormElement
710
743
745 import urllib
746
747 if method == 'GET':
748 if '?' in url:
749 url += '&'
750 else:
751 url += '?'
752 url += urllib.urlencode(values)
753 data = None
754 else:
755 data = urllib.urlencode(values)
756 return urllib.urlopen(url, data)
757
759
763 return self.inputs[item].value
765 self.inputs[item].value = value
767 raise KeyError(
768 "You cannot remove keys from ElementDict")
770 return self.inputs.keys()
772 return item in self.inputs
773
775 return '<%s for form %s>' % (
776 self.__class__.__name__,
777 self.inputs.form._name())
778
843
871
872 -class TextareaElement(InputMixin, HtmlElement):
873 """
874 ``<textarea>`` element. You can get the name with ``.name`` and
875 get/set the value with ``.value``
876 """
877
878 - def _value__get(self):
879 """
880 Get/set the value (which is the contents of this element)
881 """
882 return self.text or ''
883 - def _value__set(self, value):
885 - def _value__del(self):
887 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
888
889 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
890
892 """
893 ``<select>`` element. You can get the name with ``.name``.
894
895 ``.value`` will be the value of the selected option, unless this
896 is a multi-select element (``<select multiple>``), in which case
897 it will be a set-like object. In either case ``.value_options``
898 gives the possible values.
899
900 The boolean attribute ``.multiple`` shows if this is a
901 multi-select.
902 """
903
905 """
906 Get/set the value of this select (the selected option).
907
908 If this is a multi-select, this is a set-like object that
909 represents all the selected options.
910 """
911 if self.multiple:
912 return MultipleSelectOptions(self)
913 for el in self.getiterator('option'):
914 if 'selected' in el.attrib:
915 value = el.get('value')
916
917 return value
918 return None
919
921 if self.multiple:
922 if isinstance(value, basestring):
923 raise TypeError(
924 "You must pass in a sequence")
925 self.value.clear()
926 self.value.update(value)
927 return
928 if value is not None:
929 for el in self.getiterator('option'):
930
931 if el.get('value') == value:
932 checked_option = el
933 break
934 else:
935 raise ValueError(
936 "There is no option with the value of %r" % value)
937 for el in self.getiterator('option'):
938 if 'selected' in el.attrib:
939 del el.attrib['selected']
940 if value is not None:
941 checked_option.set('selected', '')
942
944
945 if self.multiple:
946 self.value.clear()
947 else:
948 self.value = None
949
950 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
951
953 """
954 All the possible values this select can have (the ``value``
955 attribute of all the ``<option>`` elements.
956 """
957 return [el.get('value') for el in self.getiterator('option')]
958 value_options = property(value_options, doc=value_options.__doc__)
959
961 """
962 Boolean attribute: is there a ``multiple`` attribute on this element.
963 """
964 return 'multiple' in self.attrib
966 if value:
967 self.set('multiple', '')
968 elif 'multiple' in self.attrib:
969 del self.attrib['multiple']
970 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
971
972 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
973
975 """
976 Represents all the selected options in a ``<select multiple>`` element.
977
978 You can add to this set-like object to select an option, or remove
979 to unselect the option.
980 """
981
984
986 """
987 Iterator of all the ``<option>`` elements.
988 """
989 return self.select.getiterator('option')
990 options = property(options)
991
993 for option in self.options:
994 yield option.get('value')
995
def add(self, item):
    """
    Select the first option whose ``value`` equals ``item`` by setting
    its ``selected`` attribute.

    Raises ValueError if no option has that value.
    """
    target = None
    for candidate in self.options:
        if candidate.get('value') == item:
            target = candidate
            break
    if target is None:
        raise ValueError(
            "There is no option with the value %r" % item)
    target.set('selected', '')
1004
1006 for option in self.options:
1007 if option.get('value') == item:
1008 if 'selected' in option.attrib:
1009 del option.attrib['selected']
1010 else:
1011 raise ValueError(
1012 "The option %r is not currently selected" % item)
1013 break
1014 else:
1015 raise ValueError(
1016 "There is not option with the value %r" % item)
1017
1019 return '<%s {%s} for select name=%r>' % (
1020 self.__class__.__name__,
1021 ', '.join([repr(v) for v in self]),
1022 self.select.name)
1023
1025 """
1026 This object represents several ``<input type=radio>`` elements
1027 that have the same name.
1028
1029 You can use this like a list, but also use the property
1030 ``.value`` to check/uncheck inputs. Also you can use
1031 ``.value_options`` to get the possible values.
1032 """
1033
1035 """
1036 Get/set the value, which checks the radio with that value (and
1037 unchecks any other value).
1038 """
1039 for el in self:
1040 if 'checked' in el.attrib:
1041 return el.get('value')
1042 return None
1043
1045 if value is not None:
1046 for el in self:
1047 if el.get('value') == value:
1048 checked_option = el
1049 break
1050 else:
1051 raise ValueError(
1052 "There is no radio input with the value %r" % value)
1053 for el in self:
1054 if 'checked' in el.attrib:
1055 del el.attrib['checked']
1056 if value is not None:
1057 checked_option.set('checked', '')
1058
1061
1062 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1063
1065 """
1066 Returns a list of all the possible values.
1067 """
1068 return [el.get('value') for el in self]
1069 value_options = property(value_options, doc=value_options.__doc__)
1070
1072 return '%s(%s)' % (
1073 self.__class__.__name__,
1074 list.__repr__(self))
1075
1077 """
1078 Represents a group of checkboxes (``<input type=checkbox>``) that
1079 have the same name.
1080
1081 In addition to using this like a list, the ``.value`` attribute
1082 returns a set-like object that you can add to or remove from to
1083 check and uncheck checkboxes. You can also use ``.value_options``
1084 to get the possible values.
1085 """
1086
1088 """
1089 Return a set-like object that can be modified to check or
1090 uncheck individual checkboxes according to their value.
1091 """
1092 return CheckboxValues(self)
1094 self.value.clear()
1095 if not hasattr(value, '__iter__'):
1096 raise ValueError(
1097 "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
1098 % (self[0].name, value))
1099 self.value.update(value)
1102 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1103
1105 return '%s(%s)' % (
1106 self.__class__.__name__, list.__repr__(self))
1107
1109
1110 """
1111 Represents the values of the checked checkboxes in a group of
1112 checkboxes with the same name.
1113 """
1114
1117
1119 return iter([
1120 el.get('value')
1121 for el in self.group
1122 if 'checked' in el.attrib])
1123
def add(self, value):
    """
    Check the first checkbox of the group whose ``value`` equals
    ``value`` by setting its ``checked`` attribute.

    Raises KeyError if the group has no checkbox with that value.
    """
    match = None
    for box in self.group:
        if box.get('value') == value:
            match = box
            break
    if match is None:
        raise KeyError("No checkbox with value %r" % value)
    match.set('checked', '')
1131
1133 for el in self.group:
1134 if el.get('value') == value:
1135 if 'checked' in el.attrib:
1136 del el.attrib['checked']
1137 else:
1138 raise KeyError(
1139 "The checkbox with value %r was already unchecked" % value)
1140 break
1141 else:
1142 raise KeyError(
1143 "No checkbox with value %r" % value)
1144
1146 return '<%s {%s} for checkboxes name=%r>' % (
1147 self.__class__.__name__,
1148 ', '.join([repr(v) for v in self]),
1149 self.group.name)
1150
1234
1235 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1236
1238 """
1239 Represents a ``<label>`` element.
1240
1241 Label elements are linked to other elements with their ``for``
1242 attribute. You can access this element with ``label.for_element``.
1243 """
1244
1246 """
1247 Get/set the element this label points to. Return None if it
1248 can't be found.
1249 """
1250 id = self.get('for')
1251 if not id:
1252 return None
1253 return self.body.get_element_by_id(id)
1255 id = other.get('id')
1256 if not id:
1257 raise TypeError(
1258 "Element %r has no id attribute" % other)
1259 self.set('for', id)
1263 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1264 doc=_for_element__get.__doc__)
1265
1266 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1267
1268
1269
1270
1271
1272
1273
1274 __replace_meta_content_type = re.compile(
1275 r'<meta http-equiv="Content-Type".*?>').sub
1276
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    The document is serialized with ``etree.tostring()``.  Unless
    ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag is then stripped from
    the serialized output; if it is true the output is returned
    unchanged.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        '<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        '<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        '<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialized = etree.tostring(doc, method=method,
                                pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialized
    # Drop the Content-Type <meta> tag from the serialized markup.
    return __replace_meta_content_type('', serialized)
1320
1322 """
1323 Open the HTML document in a web browser (saving it to a temporary
1324 file to open it).
1325 """
1326 import os
1327 import webbrowser
1328 try:
1329 write_doc = doc.write
1330 except AttributeError:
1331 write_doc = etree.ElementTree(element=doc).write
1332 fn = os.tempnam() + '.html'
1333 write_doc(fn, method="html")
1334 url = 'file://' + fn.replace(os.path.sep, '/')
1335 print url
1336 webbrowser.open(url)
1337
1338
1339
1340
1341
1346
1350
1351 html_parser = HTMLParser()
1352