1 import threading
2 import re
3 import urlparse
4 import copy
5 from lxml import etree
6 from lxml.html import defs
7 from lxml import cssselect
8 from lxml.html.setmixin import SetMixin
9 try:
10 from UserDict import DictMixin
11 except ImportError:
12
13 from lxml.html._dictmixin import DictMixin
14 import sets
15
16 __all__ = ['document_fromstring', 'tostring', 'Element', 'defs',
17 'find_rel_links', 'find_class', 'make_links_absolute',
18 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser']
19
20 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
21
22 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
23 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
24 _collect_string_content = etree.XPath("string()")
25 _css_url_re = re.compile(r'url\((.*?)\)', re.I)
26 _css_import_re = re.compile(r'@import "(.*?)"')
27 _label_xpath = etree.XPath("//label[@for=$id]")
28
30
32 """
33 Returns the base URL, given when the page was parsed.
34
35 Use with ``urlparse.urljoin(el.base_url, href)`` to get
36 absolute URLs.
37 """
38 return self.getroottree().docinfo.URL
39 base_url = property(base_url, doc=base_url.__doc__)
40
46 forms = property(forms, doc=forms.__doc__)
47
49 """
50 Return the <body> element. Can be called from a child element
51 to get the document's body.
52 """
53 return self.xpath('//body')[0]
54 body = property(body, doc=body.__doc__)
55
57 """
58 Returns the <head> element. Can be called from a child
59 element to get the document's head.
60 """
61 return self.xpath('//head')[0]
62 head = property(head, doc=head.__doc__)
63
65 """
66 Get or set any <label> element associated with this element.
67 """
68 id = self.get('id')
69 if not id:
70 return None
71 result = _label_xpath(self, id=id)
72 if not result:
73 return None
74 else:
75 return result[0]
77 id = self.get('id')
78 if not id:
79 raise TypeError(
80 "You cannot set a label for an element (%r) that has no id"
81 % self)
82 if not label.tag == 'label':
83 raise TypeError(
84 "You can only assign label to a label element (not %r)"
85 % label)
86 label.set('for', id)
91 label = property(label__get, label__set, label__del, doc=label__get.__doc__)
92
94 """
95 Removes this element from the tree, including its children and
96 text. The tail text is joined to the previous element or
97 parent.
98 """
99 parent = self.getparent()
100 assert parent is not None
101 if self.tail:
102 previous = self.getprevious()
103 if previous is None:
104 parent.text = (parent.text or '') + self.tail
105 else:
106 previous.tail = (previous.tail or '') + self.tail
107 parent.remove(self)
108
110 """
111 Remove the tag, but not its children or text. The children and text
112 are merged into the parent.
113
114 Example::
115
116 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
117 >>> h.find('//b').drop_tag()
118 >>> print tostring(h)
119 <div>Hello World!</div>
120 """
121 parent = self.getparent()
122 assert parent is not None
123 previous = self.getprevious()
124 if self.text and isinstance(self.tag, basestring):
125
126 if previous is None:
127 parent.text = (parent.text or '') + self.text
128 else:
129 previous.tail = (previous.tail or '') + self.text
130 if self.tail:
131 if len(self):
132 last = self[-1]
133 last.tail = (last.tail or '') + self.tail
134 elif previous is None:
135 parent.text = (parent.text or '') + self.tail
136 else:
137 previous.tail = (previous.tail or '') + self.tail
138 index = parent.index(self)
139 parent[index:index+1] = self[:]
140
142 """
143 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
144 """
145 rel = rel.lower()
146 return [el for el in _rel_links_xpath(self)
147 if el.get('rel').lower() == rel]
148
150 """
151 Find any elements with the given class name.
152 """
153 return _class_xpath(self, class_name=class_name)
154
156 """
157 Get the first element in a document with the given id. If none is
158 found, return the default argument if provided or raise KeyError
159 otherwise.
160
161 Note that there can be more than one element with the same id,
162 and this isn't uncommon in HTML documents found in the wild.
163 Browsers return only the first match, and this function does
164 the same.
165 """
166 try:
167
168
169 return _id_xpath(self, id=id)[0]
170 except IndexError:
171 if default:
172 return default[0]
173 else:
174 raise KeyError, id
175
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its descendants.
    """
    # The module-level ``_collect_string_content`` evaluator does the
    # actual gathering; this method only applies it to ``self``.
    collected = _collect_string_content(self)
    return collected
181
183 """
184 Run the CSS expression on this element and its children,
185 returning a list of the results.
186
187 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
188 that pre-compiling the expression can provide a substantial
189 speedup.
190 """
191 return cssselect.CSSSelect(expr)(self)
192
193
194
195
196
198 """
199 Make all links in the document absolute, given the
200 ``base_url`` for the document (the full URL where the document
201 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
202
203 If ``resolve_base_href`` is true, then any ``<base href>``
204 tags in the document are used *and* removed from the document.
205 If it is false then any such tag is ignored.
206 """
207 if base_url is None:
208 base_url = self.base_url
209 if base_url is None:
210 raise TypeError(
211 "No base_url given, and the document has no base_url")
212 if resolve_base_href:
213 self.resolve_base_href()
214 def link_repl(href):
215 return urlparse.urljoin(base_url, href)
216 self.rewrite_links(link_repl)
217
219 """
220 Find any ``<base href>`` tag in the document, and apply its
221 values to all links found in the document. Also remove the
222 tag once it has been applied.
223 """
224 base_href = None
225 basetags = self.xpath('//base[@href]')
226 for b in basetags:
227 base_href = b.get('href')
228 b.drop_tree()
229 if not base_href:
230 return
231 self.make_links_absolute(base_href, resolve_base_href=False)
232
234 """
235 Yield (element, attribute, link, pos), where attribute may be None
236 (indicating the link is in the text). ``pos`` is the position
237 where the link occurs; often 0, but sometimes something else in
238 the case of links in stylesheets or style tags.
239
240 Note: <base href> is *not* taken into account in any way. The
241 link you get is exactly the link in the document.
242 """
243 link_attrs = defs.link_attrs
244 for el in self.getiterator():
245 attribs = el.attrib
246 for attrib in link_attrs:
247 if attrib in attribs:
248 yield (el, attrib, attribs[attrib], 0)
249 if el.tag == 'style' and el.text:
250 for match in _css_url_re.finditer(el.text):
251 yield (el, None, match.group(1), match.start(1))
252 for match in _css_import_re.finditer(el.text):
253 yield (el, None, match.group(1), match.start(1))
254 if 'style' in attribs:
255 for match in _css_url_re.finditer(attribs['style']):
256 yield (el, 'style', match.group(1), match.start(1))
257
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Any further links that were
    found in that same attribute or text are still passed to
    ``link_repl_func`` but are otherwise skipped, since there is
    nothing left to rewrite.

    Several links may live in one piece of text (for example several
    ``url(...)`` references in a ``<style>`` element).  The positions
    reported by ``iterlinks()`` refer to the text before any rewriting,
    so each replacement is shifted by the length changes of the
    replacements already made earlier in that text.
    """
    if base_href is not None:
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # Take a snapshot before touching anything, so that every reported
    # position refers to the same, unmodified text.
    links = list(self.iterlinks())
    # (element, attribute) -> list of (original position, length change)
    # for the replacements already applied to that text or attribute.
    applied = {}
    # (element, attribute) pairs whose text or attribute has been removed.
    removed = {}
    for el, attrib, link, pos in links:
        new_link = link_repl_func(link)
        if new_link == link:
            continue
        key = (el, attrib)
        if key in removed:
            continue
        if new_link is None:
            # Remove the attribute or the element text entirely.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed[key] = True
            continue
        earlier = applied.setdefault(key, [])
        # Only replacements that started before this link move it; links
        # are not guaranteed to arrive in increasing position order.
        shift = sum([delta for start, delta in earlier if start < pos])
        start_at = pos + shift
        if attrib is None:
            current = el.text
        else:
            current = el.attrib[attrib]
        updated = current[:start_at] + new_link + current[start_at + len(link):]
        if attrib is None:
            el.text = updated
        else:
            el.attrib[attrib] = updated
        earlier.append((pos, len(new_link) - len(link)))
304
305
307 """
308 An object that represents a method on an element as a function;
309 the function takes either an element or an HTML string. It
310 returns whatever the function normally returns, or if the function
311 works in-place (and so returns None) it returns a serialized form
312 of the resulting document.
313 """
def __call__(self, doc, *args, **kw):
    """
    Call the wrapped element method (``self.name``) on ``doc``.

    ``doc`` may be an element or an HTML string.  A string is parsed
    with ``fromstring(doc, **kw)`` first; the ``copy`` keyword is
    rejected for string input with a TypeError.  For element input,
    ``copy`` (defaulting to ``self.copy``) decides whether the method
    runs on a deep copy rather than on ``doc`` itself.

    Returns whatever the method returns.  If the method returns None
    (it worked in place), the document is returned instead: serialized
    with ``tostring()`` when the input was a string, otherwise as the
    element that was operated on.
    """
    # NOTE(review): the ``def`` line is not visible in this view; the
    # signature is reconstructed from the body's use of doc/args/kw --
    # confirm against the full file.
    if isinstance(doc, basestring):
        if 'copy' in kw:
            raise TypeError(
                "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
        return_string = True
        doc = fromstring(doc, **kw)
    else:
        # The flag must not be called ``copy``: that would hide the
        # ``copy`` module whose deepcopy() is needed right below.
        if 'copy' in kw:
            make_a_copy = kw.pop('copy')
        else:
            make_a_copy = self.copy
        return_string = False
        if make_a_copy:
            doc = copy.deepcopy(doc)
    meth = getattr(doc, self.name)
    result = meth(*args, **kw)
    if result is None:
        # In-place method: hand back the document it operated on.
        if return_string:
            return tostring(doc)
        else:
            return doc
    else:
        return result
344
345 find_rel_links = _MethodFunc('find_rel_links', copy=False)
346 find_class = _MethodFunc('find_class', copy=False)
347 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
348 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
349 iterlinks = _MethodFunc('iterlinks', copy=False)
350 rewrite_links = _MethodFunc('rewrite_links', copy=True)
351
354
357
360
363
364
366 """A lookup scheme for HTML Element classes.
367
368 To create a lookup instance with different Element classes, pass a tag
369 name mapping of Element classes in the ``classes`` keyword argument and/or
370 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
371 The special key '*' denotes a Mixin class that should be mixed into all
372 Element classes.
373 """
374 _default_element_classes = {}
375
376 - def __init__(self, classes=None, mixins=None):
393
394 - def lookup(self, node_type, document, namespace, name):
405
406
407 html_parser = etree.HTMLParser()
408
415
417 """
418 Parses several HTML elements, returning a list of elements.
419
420 The first item in the list may be a string (though leading
421 whitespace is removed). If no_leading_text is true, then it will
422 be an error if there is leading text, and it will always be a list
423 of only elements.
424 """
425
426 start = html[:20].lstrip().lower()
427 if not start.startswith('<html') and not start.startswith('<!doctype'):
428 html = '<html><body>%s</body></html>' % html
429 doc = document_fromstring(html, **kw)
430 assert doc.tag == 'html'
431 bodies = [e for e in doc if e.tag == 'body']
432 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
433 body = bodies[0]
434 elements = []
435 if no_leading_text and body.text and body.text.strip():
436 raise etree.ParserError(
437 "There is leading text: %r" % body.text)
438 if body.text and body.text.strip():
439 elements.append(body.text)
440 elements.extend(body)
441
442
443 return elements
444
def fragment_fromstring(html, create_parent=False, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element; a
    true value that is not a string means ``'div'``.

    Extra keyword arguments are passed on to ``fragments_fromstring()``.

    Raises etree.ParserError if no element is found, if several are
    found, or if the element is followed by non-whitespace text.
    """
    # NOTE(review): the ``def`` line is not visible in this view; the
    # signature (including the create_parent default) is reconstructed
    # from the body and docstring -- confirm against the full file.
    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        return fragment_fromstring('<%s>%s</%s>' % (
            create_parent, html, create_parent), **kw)
    # Forward **kw here as well; otherwise options given by the caller
    # (and passed along by the create_parent branch above) are lost.
    elements = fragments_fromstring(html, no_leading_text=True, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Only whitespace can be left in the tail at this point; drop it.
    el.tail = None
    return el
473
528
def parse(filename, **kw):
    """
    Read an HTML document from a filename, URL, or file-like object.

    Keyword arguments are handed on to ``etree.parse``; for instance
    ``base_url='http://...'`` sets the base URL.
    """
    source = filename
    document = etree.parse(source, html_parser, **kw)
    return document
537
545
547 if isinstance(el, etree.CommentBase):
548 return 'comment'
549 elif isinstance(el, basestring):
550 return 'string'
551 else:
552 return el.tag
553
557
657
658 HtmlElementClassLookup._default_element_classes['form'] = FormElement
659
692
694 import urllib
695
696 if method == 'GET':
697 if '?' in url:
698 url += '&'
699 else:
700 url += '?'
701 url += urllib.urlencode(values)
702 data = None
703 else:
704 data = urllib.urlencode(values)
705 return urllib.urlopen(url, data)
706
708
716 raise KeyError(
717 "You cannot remove keys from ElementDict")
721 return item in self.inputs
722
724 return '<%s for form %s>' % (
725 self.__class__.__name__,
726 self.inputs.form._name())
727
791
819
820 -class TextareaElement(InputMixin, HtmlElement):
821 """
822 ``<textarea>`` element. You can get the name with ``.name`` and
823 get/set the value with ``.value``
824 """
825
826 - def value__get(self):
827 """
828 Get/set the value (which is the contents of this element)
829 """
830 return self.text or ''
831 - def value__set(self, value):
833 - def value__del(self):
835 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
836
837 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
838
840 """
841 ``<select>`` element. You can get the name with ``.name``.
842
843 ``.value`` will be the value of the selected option, unless this
844 is a multi-select element (``<select multiple>``), in which case
845 it will be a set-like object. In either case ``.value_options``
846 gives the possible values.
847
848 The boolean attribute ``.multiple`` shows if this is a
849 multi-select.
850 """
851
853 """
854 Get/set the value of this select (the selected option).
855
856 If this is a multi-select, this is a set-like object that
857 represents all the selected options.
858 """
859 if self.multiple:
860 return MultipleSelectOptions(self)
861 for el in self.getiterator('option'):
862 if 'selected' in el.attrib:
863 value = el.get('value')
864
865 return value
866 return None
867
869 if self.multiple:
870 if isinstance(value, basestring):
871 raise TypeError(
872 "You must pass in a sequence")
873 self.value.clear()
874 self.value.update(value)
875 return
876 if value is not None:
877 for el in self.getiterator('option'):
878
879 if el.get('value') == value:
880 checked_option = el
881 break
882 else:
883 raise ValueError(
884 "There is no option with the value of %r" % value)
885 for el in self.getiterator('option'):
886 if 'selected' in el.attrib:
887 del el.attrib['selected']
888 if value is not None:
889 checked_option.set('selected', '')
890
897
898 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
899
901 """
902 All the possible values this select can have (the ``value``
903 attribute of all the ``<option>`` elements).
904 """
905 return [el.get('value') for el in self.getiterator('option')]
906 value_options = property(value_options, doc=value_options.__doc__)
907
909 """
910 Boolean attribute: is there a ``multiple`` attribute on this element.
911 """
912 return 'multiple' in self.attrib
914 if value:
915 self.set('multiple', '')
916 elif 'multiple' in self.attrib:
917 del self.attrib['multiple']
918 multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)
919
920 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
921
923 """
924 Represents all the selected options in a ``<select multiple>`` element.
925
926 You can add to this set-like object to select an option, or remove
927 to unselect the option.
928 """
929
932
934 """
935 Iterator of all the ``<option>`` elements.
936 """
937 return self.select.getiterator('option')
938 options = property(options)
939
941 for option in self.options:
942 yield option.get('value')
943
def add(self, item):
    """
    Select the first option whose ``value`` attribute equals ``item``.

    Raises ValueError when none of ``self.options`` has that value.
    """
    for candidate in self.options:
        if candidate.get('value') != item:
            continue
        # First match wins; later options with the same value are left alone.
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
952
954 for option in self.options:
955 if option.get('value') == item:
956 if 'selected' in option.attrib:
957 del option.attrib['selected']
958 else:
959 raise ValueError(
960 "The option %r is not currently selected" % item)
961 break
962 else:
963 raise ValueError(
964 "There is not option with the value %r" % item)
965
967 return '<%s {%s} for select name=%r>' % (
968 self.__class__.__name__,
969 ', '.join([repr(v) for v in self]),
970 self.select.name)
971
973 """
974 This object represents several ``<input type=radio>`` elements
975 that have the same name.
976
977 You can use this like a list, but also use the property
978 ``.value`` to check/uncheck inputs. Also you can use
979 ``.value_options`` to get the possible values.
980 """
981
983 """
984 Get/set the value, which checks the radio with that value (and
985 unchecks any other value).
986 """
987 for el in self:
988 if 'checked' in el.attrib:
989 return el.get('value')
990 return None
991
993 if value is not None:
994 for el in self:
995 if el.get('value') == value:
996 checked_option = el
997 break
998 else:
999 raise ValueError(
1000 "There is no radio input with the value %r" % value)
1001 for el in self:
1002 if 'checked' in el.attrib:
1003 del el.attrib['checked']
1004 if value is not None:
1005 checked_option.set('checked', '')
1006
1009
1010 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1011
1013 """
1014 Returns a list of all the possible values.
1015 """
1016 return [el.get('value') for el in self]
1017 value_options = property(value_options, doc=value_options.__doc__)
1018
1020 return '%s(%s)' % (
1021 self.__class__.__name__,
1022 list.__repr__(self))
1023
1025 """
1026 Represents a group of checkboxes (``<input type=checkbox>``) that
1027 have the same name.
1028
1029 In addition to using this like a list, the ``.value`` attribute
1030 returns a set-like object that you can add to or remove from to
1031 check and uncheck checkboxes. You can also use ``.value_options``
1032 to get the possible values.
1033 """
1034
1036 """
1037 Return a set-like object that can be modified to check or
1038 uncheck individual checkboxes according to their value.
1039 """
1040 return CheckboxValues(self)
1046 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1047
1049 return '%s(%s)' % (
1050 self.__class__.__name__, list.__repr__(self))
1051
1053
1054 """
1055 Represents the values of the checked checkboxes in a group of
1056 checkboxes with the same name.
1057 """
1058
1061
1063 return iter([
1064 el.get('value')
1065 for el in self.group
1066 if 'checked' in el.attrib])
1067
def add(self, value):
    """
    Check the first checkbox in ``self.group`` whose ``value`` attribute
    equals ``value``.

    Raises KeyError when the group has no checkbox with that value.
    """
    target = None
    for box in self.group:
        if box.get('value') == value:
            target = box
            break
    if target is None:
        raise KeyError("No checkbox with value %r" % value)
    target.set('checked', '')
1075
1077 for el in self.group:
1078 if el.get('value') == value:
1079 if 'checked' in el.attrib:
1080 del el.attrib['checked']
1081 else:
1082 raise KeyError(
1083 "The checkbox with value %r was already unchecked" % value)
1084 break
1085 else:
1086 raise KeyError(
1087 "No checkbox with value %r" % value)
1088
1090 return '<%s {%s} for checkboxes name=%r>' % (
1091 self.__class__.__name__,
1092 ', '.join([repr(v) for v in self]),
1093 self.group.name)
1094
1178
1179 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1180
1182 """
1183 Represents a ``<label>`` element.
1184
1185 Label elements are linked to other elements with their ``for``
1186 attribute. You can access this element with ``label.for_element``.
1187 """
1188
1190 """
1191 Get/set the element this label points to. Return None if it
1192 can't be found.
1193 """
1194 id = self.get('for')
1195 if not id:
1196 return None
1197 return self.body.get_element_by_id(id)
1199 id = other.get('id')
1200 if not id:
1201 raise TypeError(
1202 "Element %r has no id attribute" % other)
1203 self.set('for', id)
1207 for_element = property(for_element__get, for_element__set, for_element__del,
1208 doc=for_element__get.__doc__)
1209
1210 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1211
1212
1213
1214
1215
1216 _html_xsl = """\
1217 <xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
1218 <xsl:output method="html" encoding="UTF-8" />
1219 <xsl:template match="/">
1220 <xsl:copy-of select="."/>
1221 </xsl:template>
1222 </xsl:transform>
1223 """
1224
1225 _pretty_html_xsl = """\
1226 <xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
1227 <xsl:output method="html" encoding="UTF-8" indent="yes" />
1228 <xsl:template match="/">
1229 <xsl:copy-of select="."/>
1230 </xsl:template>
1231 </xsl:transform>
1232 """
1233
class _LocalTransforms(threading.local):
    """
    Thread-local holder for the two compiled XSLT serialisation
    stylesheets (plain and indented HTML output).
    """

    def __init__(self):
        # A threading.local subclass has its __init__ run once in every
        # thread that uses the instance, so each thread gets its own
        # transforms.  Setting the attributes on a bare threading.local()
        # at import time would define them for the importing thread only
        # and leave every other thread with an AttributeError.
        self.html_transform = etree.XSLT(etree.XML(_html_xsl))
        self.pretty_html_transform = etree.XSLT(etree.XML(_pretty_html_xsl))


# Same name and attributes as before.
# NOTE(review): presumably read by tostring(), whose body is not visible here.
_local_transforms = _LocalTransforms()
1238
1239
1240 __replace_meta_content_type = re.compile(
1241 r'<meta http-equiv="Content-Type".*?>').sub
1242
1243 -def tostring(doc, pretty=False, include_meta_content_type=False):
1266
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    The document is serialized with
    ``tostring(doc, include_meta_content_type=True)``.  The ``file://``
    URL of the temporary file is printed before the browser is
    launched.  The temporary file is not deleted afterwards.
    """
    # NOTE(review): the ``def`` line is not visible in this view; the
    # signature is reconstructed from ``__all__`` and the body's use of
    # ``doc`` -- confirm against the full file.
    import os
    import tempfile
    import webbrowser
    # mkstemp() creates and opens the file in one step under a unique
    # name; os.tempnam() only suggests a name, which another process
    # could claim before the file is opened.
    handle, fn = tempfile.mkstemp(suffix='.html')
    f = os.fdopen(handle, 'wb')
    try:
        f.write(tostring(doc, include_meta_content_type=True))
    finally:
        # Close the file even if serialisation or writing fails.
        f.close()
    url = 'file://' + fn.replace(os.path.sep, '/')
    # Parenthesised form prints the same text under Python 2 and is also
    # valid Python 3 syntax.
    print(url)
    webbrowser.open(url)
1281
1282
1283
1284
1285 html_parser.setElementClassLookup(HtmlElementClassLookup())
1286